summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/Kconfig1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/Makefile2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c43
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cik_int.h22
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cik_regs.h175
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h1377
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_chardev.c1374
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_crat.c1163
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_crat.h40
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c972
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h66
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c247
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h313
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c283
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c500
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h29
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c106
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c15
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c522
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.h3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c79
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c6
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c31
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_module.c30
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c92
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c227
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c211
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h120
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h268
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c517
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c105
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_queue.c4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_rdma.c296
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.c858
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.h23
38 files changed, 8232 insertions, 1896 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
index e13c67c8d2c0..ac495328dae3 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -5,5 +5,6 @@
config HSA_AMD
tristate "HSA kernel driver for AMD GPU devices"
depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64
+ select DRM_AMDGPU_USERPTR
help
Enable this if you want to use HSA features on AMD GPU devices.
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 7fc9b0f444cb..c8fa422585ec 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -14,6 +14,6 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
kfd_process_queue_manager.o kfd_device_queue_manager.o \
kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \
kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
- kfd_dbgdev.o kfd_dbgmgr.o
+ kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o
obj-$(CONFIG_HSA_AMD) += amdkfd.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 211fc48697fa..02a908249023 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,40 +24,59 @@
#include "kfd_events.h"
#include "cik_int.h"
-static bool cik_event_interrupt_isr(struct kfd_dev *dev,
+static bool is_cpc_vm_fault(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
{
- unsigned int pasid;
const struct cik_ih_ring_entry *ihre =
(const struct cik_ih_ring_entry *)ih_ring_entry;
- pasid = (ihre->ring_id & 0xffff0000) >> 16;
+ if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) &&
+ ihre->vmid >= dev->vm_info.first_vmid_kfd &&
+ ihre->vmid <= dev->vm_info.last_vmid_kfd)
+ return true;
+ return false;
+}
+static bool cik_event_interrupt_isr(struct kfd_dev *dev,
+ const uint32_t *ih_ring_entry)
+{
+ const struct cik_ih_ring_entry *ihre =
+ (const struct cik_ih_ring_entry *)ih_ring_entry;
/* Do not process in ISR, just request it to be forwarded to WQ. */
- return (pasid != 0) &&
+ return (ihre->pasid != 0) &&
(ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE);
+ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
+ is_cpc_vm_fault(dev, ih_ring_entry));
}
static void cik_event_interrupt_wq(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
{
- unsigned int pasid;
const struct cik_ih_ring_entry *ihre =
(const struct cik_ih_ring_entry *)ih_ring_entry;
- pasid = (ihre->ring_id & 0xffff0000) >> 16;
-
- if (pasid == 0)
+ if (ihre->pasid == 0)
return;
if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE)
- kfd_signal_event_interrupt(pasid, 0, 0);
+ kfd_signal_event_interrupt(ihre->pasid, 0, 0);
else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG)
- kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8);
+ kfd_signal_event_interrupt(ihre->pasid, ihre->data & 0xFF, 8);
else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
- kfd_signal_hw_exception_event(pasid);
+ kfd_signal_hw_exception_event(ihre->pasid);
+ else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
+ struct kfd_vm_fault_info info;
+
+ dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
+ kfd_process_vm_fault(dev->dqm, ihre->pasid);
+ if (info.vmid == ihre->vmid)
+ kfd_signal_vm_fault_event(dev, ihre->pasid, &info);
+ else
+ kfd_signal_vm_fault_event(dev, ihre->pasid, NULL);
+ }
}
const struct kfd_event_interrupt_class event_interrupt_class_cik = {
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h
index 79a16d24c1b8..feb3c2428d53 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h
@@ -26,16 +26,30 @@
#include <linux/types.h>
struct cik_ih_ring_entry {
- uint32_t source_id;
- uint32_t data;
- uint32_t ring_id;
- uint32_t reserved;
+ uint32_t source_id:8;
+ uint32_t reserved1:8;
+ uint32_t reserved2:16;
+
+ uint32_t data:28;
+ uint32_t reserved3:4;
+
+ /* pipeid, meid and reserved4 are officially called RINGID,
+ * but for our purposes, they always decode into pipe and ME. */
+ uint32_t pipeid:2;
+ uint32_t meid:2;
+ uint32_t reserved4:4;
+ uint32_t vmid:8;
+ uint32_t pasid:16;
+
+ uint32_t reserved5;
};
#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
#define CIK_INTSRC_CP_END_OF_PIPE 0xB5
#define CIK_INTSRC_CP_BAD_OPCODE 0xB7
#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF
+#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92
+#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h
index 48769d12dd7b..607fc5ceadbe 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h
+++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h
@@ -23,11 +23,33 @@
#ifndef CIK_REGS_H
#define CIK_REGS_H
+#define IH_VMID_0_LUT 0x3D40u
+
+#define BIF_DOORBELL_CNTL 0x530Cu
+
+#define SRBM_GFX_CNTL 0xE44
+#define PIPEID(x) ((x) << 0)
+#define MEID(x) ((x) << 2)
+#define VMID(x) ((x) << 4)
+#define QUEUEID(x) ((x) << 8)
+
+#define SQ_CONFIG 0x8C00
+
+#define SH_MEM_BASES 0x8C28
/* if PTR32, these are the bases for scratch and lds */
#define PRIVATE_BASE(x) ((x) << 0) /* scratch */
#define SHARED_BASE(x) ((x) << 16) /* LDS */
+#define SH_MEM_APE1_BASE 0x8C2C
+/* if PTR32, this is the base location of GPUVM */
+#define SH_MEM_APE1_LIMIT 0x8C30
+/* if PTR32, this is the upper limit of GPUVM */
+#define SH_MEM_CONFIG 0x8C34
#define PTR32 (1 << 0)
+#define PRIVATE_ATC (1 << 1)
#define ALIGNMENT_MODE(x) ((x) << 2)
+#define SH_MEM_ALIGNMENT_MODE_DWORD 0
+#define SH_MEM_ALIGNMENT_MODE_DWORD_STRICT 1
+#define SH_MEM_ALIGNMENT_MODE_STRICT 2
#define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3
#define DEFAULT_MTYPE(x) ((x) << 4)
#define APE1_MTYPE(x) ((x) << 7)
@@ -36,37 +58,164 @@
#define MTYPE_CACHED 0
#define MTYPE_NONCACHED 3
+
+#define SH_STATIC_MEM_CONFIG 0x9604u
+
+#define TC_CFG_L1_LOAD_POLICY0 0xAC68
+#define TC_CFG_L1_LOAD_POLICY1 0xAC6C
+#define TC_CFG_L1_STORE_POLICY 0xAC70
+#define TC_CFG_L2_LOAD_POLICY0 0xAC74
+#define TC_CFG_L2_LOAD_POLICY1 0xAC78
+#define TC_CFG_L2_STORE_POLICY0 0xAC7C
+#define TC_CFG_L2_STORE_POLICY1 0xAC80
+#define TC_CFG_L2_ATOMIC_POLICY 0xAC84
+#define TC_CFG_L1_VOLATILE 0xAC88
+#define TC_CFG_L2_VOLATILE 0xAC8C
+
+#define CP_PQ_WPTR_POLL_CNTL 0xC20C
+#define WPTR_POLL_EN (1 << 31)
+
+#define CPC_INT_CNTL 0xC2D0
+#define CP_ME1_PIPE0_INT_CNTL 0xC214
+#define CP_ME1_PIPE1_INT_CNTL 0xC218
+#define CP_ME1_PIPE2_INT_CNTL 0xC21C
+#define CP_ME1_PIPE3_INT_CNTL 0xC220
+#define CP_ME2_PIPE0_INT_CNTL 0xC224
+#define CP_ME2_PIPE1_INT_CNTL 0xC228
+#define CP_ME2_PIPE2_INT_CNTL 0xC22C
+#define CP_ME2_PIPE3_INT_CNTL 0xC230
+#define DEQUEUE_REQUEST_INT_ENABLE (1 << 13)
+#define WRM_POLL_TIMEOUT_INT_ENABLE (1 << 17)
+#define PRIV_REG_INT_ENABLE (1 << 23)
+#define TIME_STAMP_INT_ENABLE (1 << 26)
+#define GENERIC2_INT_ENABLE (1 << 29)
+#define GENERIC1_INT_ENABLE (1 << 30)
+#define GENERIC0_INT_ENABLE (1 << 31)
+#define CP_ME1_PIPE0_INT_STATUS 0xC214
+#define CP_ME1_PIPE1_INT_STATUS 0xC218
+#define CP_ME1_PIPE2_INT_STATUS 0xC21C
+#define CP_ME1_PIPE3_INT_STATUS 0xC220
+#define CP_ME2_PIPE0_INT_STATUS 0xC224
+#define CP_ME2_PIPE1_INT_STATUS 0xC228
+#define CP_ME2_PIPE2_INT_STATUS 0xC22C
+#define CP_ME2_PIPE3_INT_STATUS 0xC230
+#define DEQUEUE_REQUEST_INT_STATUS (1 << 13)
+#define WRM_POLL_TIMEOUT_INT_STATUS (1 << 17)
+#define PRIV_REG_INT_STATUS (1 << 23)
+#define TIME_STAMP_INT_STATUS (1 << 26)
+#define GENERIC2_INT_STATUS (1 << 29)
+#define GENERIC1_INT_STATUS (1 << 30)
+#define GENERIC0_INT_STATUS (1 << 31)
+
+#define CP_HPD_EOP_BASE_ADDR 0xC904
+#define CP_HPD_EOP_BASE_ADDR_HI 0xC908
+#define CP_HPD_EOP_VMID 0xC90C
+#define CP_HPD_EOP_CONTROL 0xC910
+#define EOP_SIZE(x) ((x) << 0)
+#define EOP_SIZE_MASK (0x3f << 0)
+#define CP_MQD_BASE_ADDR 0xC914
+#define CP_MQD_BASE_ADDR_HI 0xC918
+#define CP_HQD_ACTIVE 0xC91C
+#define CP_HQD_VMID 0xC920
+
+#define CP_HQD_PERSISTENT_STATE 0xC924u
#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8)
#define PRELOAD_REQ (1 << 0)
-#define MQD_CONTROL_PRIV_STATE_EN (1U << 8)
-
-#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20)
-
-#define IB_ATC_EN (1U << 23)
-
+#define CP_HQD_PIPE_PRIORITY 0xC928u
+#define CP_HQD_QUEUE_PRIORITY 0xC92Cu
+#define CP_HQD_QUANTUM 0xC930u
#define QUANTUM_EN 1U
#define QUANTUM_SCALE_1MS (1U << 4)
#define QUANTUM_DURATION(x) ((x) << 8)
+#define CP_HQD_PQ_BASE 0xC934
+#define CP_HQD_PQ_BASE_HI 0xC938
+#define CP_HQD_PQ_RPTR 0xC93C
+#define CP_HQD_PQ_RPTR_REPORT_ADDR 0xC940
+#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI 0xC944
+#define CP_HQD_PQ_WPTR_POLL_ADDR 0xC948
+#define CP_HQD_PQ_WPTR_POLL_ADDR_HI 0xC94C
+#define CP_HQD_PQ_DOORBELL_CONTROL 0xC950
+#define DOORBELL_OFFSET(x) ((x) << 2)
+#define DOORBELL_OFFSET_MASK (0x1fffff << 2)
+#define DOORBELL_SOURCE (1 << 28)
+#define DOORBELL_SCHD_HIT (1 << 29)
+#define DOORBELL_EN (1 << 30)
+#define DOORBELL_HIT (1 << 31)
+#define CP_HQD_PQ_WPTR 0xC954
+#define CP_HQD_PQ_CONTROL 0xC958
+#define QUEUE_SIZE(x) ((x) << 0)
+#define QUEUE_SIZE_MASK (0x3f << 0)
#define RPTR_BLOCK_SIZE(x) ((x) << 8)
+#define RPTR_BLOCK_SIZE_MASK (0x3f << 8)
#define MIN_AVAIL_SIZE(x) ((x) << 20)
+#define PQ_ATC_EN (1 << 23)
+#define PQ_VOLATILE (1 << 26)
+#define NO_UPDATE_RPTR (1 << 27)
+#define UNORD_DISPATCH (1 << 28)
+#define ROQ_PQ_IB_FLIP (1 << 29)
+#define PRIV_STATE (1 << 30)
+#define KMD_QUEUE (1 << 31)
+
#define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5)
#define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3)
-#define PQ_ATC_EN (1 << 23)
-#define NO_UPDATE_RPTR (1 << 27)
+#define CP_HQD_IB_BASE_ADDR 0xC95Cu
+#define CP_HQD_IB_BASE_ADDR_HI 0xC960u
+#define CP_HQD_IB_RPTR 0xC964u
+#define CP_HQD_IB_CONTROL 0xC968u
+#define IB_ATC_EN (1U << 23)
+#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20)
-#define DOORBELL_OFFSET(x) ((x) << 2)
-#define DOORBELL_EN (1 << 30)
+#define CP_HQD_DEQUEUE_REQUEST 0xC974
+#define DEQUEUE_REQUEST_DRAIN 1
+#define DEQUEUE_REQUEST_RESET 2
+#define DEQUEUE_INT (1U << 8)
-#define PRIV_STATE (1 << 30)
-#define KMD_QUEUE (1 << 31)
+#define CP_HQD_SEMA_CMD 0xC97Cu
+#define CP_HQD_MSG_TYPE 0xC980u
+#define CP_HQD_ATOMIC0_PREOP_LO 0xC984u
+#define CP_HQD_ATOMIC0_PREOP_HI 0xC988u
+#define CP_HQD_ATOMIC1_PREOP_LO 0xC98Cu
+#define CP_HQD_ATOMIC1_PREOP_HI 0xC990u
+#define CP_HQD_HQ_SCHEDULER0 0xC994u
+#define CP_HQD_HQ_SCHEDULER1 0xC998u
-#define AQL_ENABLE 1
+
+#define CP_MQD_CONTROL 0xC99C
+#define MQD_VMID(x) ((x) << 0)
+#define MQD_VMID_MASK (0xf << 0)
+#define MQD_CONTROL_PRIV_STATE_EN (1U << 8)
#define GRBM_GFX_INDEX 0x30800
+#define INSTANCE_INDEX(x) ((x) << 0)
+#define SH_INDEX(x) ((x) << 8)
+#define SE_INDEX(x) ((x) << 16)
+#define SH_BROADCAST_WRITES (1 << 29)
+#define INSTANCE_BROADCAST_WRITES (1 << 30)
+#define SE_BROADCAST_WRITES (1 << 31)
+#define SQC_CACHES 0x30d20
+#define SQC_POLICY 0x8C38u
+#define SQC_VOLATILE 0x8C3Cu
+
+#define CP_PERFMON_CNTL 0x36020
+
+#define ATC_VMID0_PASID_MAPPING 0x339Cu
+#define ATC_VMID_PASID_MAPPING_UPDATE_STATUS 0x3398u
#define ATC_VMID_PASID_MAPPING_VALID (1U << 31)
+#define ATC_VM_APERTURE0_CNTL 0x3310u
+#define ATS_ACCESS_MODE_NEVER 0
+#define ATS_ACCESS_MODE_ALWAYS 1
+
+#define ATC_VM_APERTURE0_CNTL2 0x3318u
+#define ATC_VM_APERTURE0_HIGH_ADDR 0x3308u
+#define ATC_VM_APERTURE0_LOW_ADDR 0x3300u
+#define ATC_VM_APERTURE1_CNTL 0x3314u
+#define ATC_VM_APERTURE1_CNTL2 0x331Cu
+#define ATC_VM_APERTURE1_HIGH_ADDR 0x330Cu
+#define ATC_VM_APERTURE1_LOW_ADDR 0x3304u
+
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h
new file mode 100644
index 000000000000..1880dc0b0fcb
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_carrizo.h
@@ -0,0 +1,1377 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if 0
+ HW (CARRIZO) source code for CWSR trap handler
+
+var G8SR_WDMEM_HWREG_OFFSET = 0
+var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
+
+// Keep these definitions the same as in the app shader. These 2 time stamps are part of the app shader; they should be taken before any save and after restore.
+
+var G8SR_DEBUG_TIMESTAMP = 0
+var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
+var s_g8sr_ts_save_s = s[34:35] // save start
+var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi
+var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ
+var s_g8sr_ts_save_d = s[40:41] // save end
+var s_g8sr_ts_restore_s = s[42:43] // restore start
+var s_g8sr_ts_restore_d = s[44:45] // restore end
+
+var G8SR_VGPR_SR_IN_DWX4 = 0
+var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes
+var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
+
+
+/*************************************************************************/
+/* control on how to run the shader */
+/*************************************************************************/
+//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
+var EMU_RUN_HACK = 0
+var EMU_RUN_HACK_RESTORE_NORMAL = 0
+var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
+var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
+var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+var SAVE_LDS = 1
+var WG_BASE_ADDR_LO = 0x9000a000
+var WG_BASE_ADDR_HI = 0x0
+var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
+var CTX_SAVE_CONTROL = 0x0
+var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
+var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
+var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
+var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
+var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
+
+/**************************************************************************/
+/* variables */
+/**************************************************************************/
+var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
+var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
+
+var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
+var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
+var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
+var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
+var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
+
+var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
+var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME
+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
+
+var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
+var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
+
+
+/* Save */
+var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
+var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+
+var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+var S_SAVE_SPI_INIT_ATC_SHIFT = 27
+var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
+var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+
+var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
+var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+
+var s_save_spi_init_lo = exec_lo
+var s_save_spi_init_hi = exec_hi
+
+ //tba_lo and tba_hi need to be saved/restored
+var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+var s_save_pc_hi = ttmp1
+var s_save_exec_lo = ttmp2
+var s_save_exec_hi = ttmp3
+var s_save_status = ttmp4
+var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+var s_save_xnack_mask_lo = ttmp6
+var s_save_xnack_mask_hi = ttmp7
+var s_save_buf_rsrc0 = ttmp8
+var s_save_buf_rsrc1 = ttmp9
+var s_save_buf_rsrc2 = ttmp10
+var s_save_buf_rsrc3 = ttmp11
+
+var s_save_mem_offset = tma_lo
+var s_save_alloc_size = s_save_trapsts //conflict
+var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
+var s_save_m0 = tma_hi
+
+/* Restore */
+var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
+var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
+
+var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
+var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
+var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+
+var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
+var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
+var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
+
+var s_restore_spi_init_lo = exec_lo
+var s_restore_spi_init_hi = exec_hi
+
+var s_restore_mem_offset = ttmp2
+var s_restore_alloc_size = ttmp3
+var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored
+var s_restore_mem_offset_save = s_restore_tmp //no conflict
+
+var s_restore_m0 = s_restore_alloc_size //no conflict
+
+var s_restore_mode = ttmp7
+
+var s_restore_pc_lo = ttmp0
+var s_restore_pc_hi = ttmp1
+var s_restore_exec_lo = tma_lo //no conflict
+var s_restore_exec_hi = tma_hi //no conflict
+var s_restore_status = ttmp4
+var s_restore_trapsts = ttmp5
+var s_restore_xnack_mask_lo = xnack_mask_lo
+var s_restore_xnack_mask_hi = xnack_mask_hi
+var s_restore_buf_rsrc0 = ttmp8
+var s_restore_buf_rsrc1 = ttmp9
+var s_restore_buf_rsrc2 = ttmp10
+var s_restore_buf_rsrc3 = ttmp11
+
+/**************************************************************************/
+/* trap handler entry points */
+/**************************************************************************/
+/* Shader Main*/
+
+shader main
+ asic(CARRIZO)
+ type(CS)
+
+
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
+ //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+ s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
+ s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
+ s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
+ //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
+ s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
+ else
+ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
+ end
+
+L_JUMP_TO_RESTORE:
+ s_branch L_RESTORE //restore
+
+L_SKIP_RESTORE:
+
+ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
+ s_cbranch_scc1 L_SAVE //this is the operation for save
+
+ // ********* Handle non-CWSR traps *******************
+if (!EMU_RUN_HACK)
+ /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
+ s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0
+ s_waitcnt lgkmcnt(0)
+ s_or_b32 ttmp7, ttmp8, ttmp9
+ s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler has not been set
+ s_mov_b32 tma_lo, ttmp10 //set tma_lo/hi for next level trap handler
+ s_mov_b32 tma_hi, ttmp11
+ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+ s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler
+
+L_NO_NEXT_TRAP:
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
+ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
+ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
+ s_addc_u32 ttmp1, ttmp1, 0
+L_EXCP_CASE:
+ s_and_b32 ttmp1, ttmp1, 0xFFFF
+ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+ s_rfe_b64 [ttmp0, ttmp1]
+end
+ // ********* End handling of non-CWSR traps *******************
+
+/**************************************************************************/
+/* save routine */
+/**************************************************************************/
+
+L_SAVE:
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_save_s
+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+end
+
+ //check whether there is mem_viol
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+ s_cbranch_scc0 L_NO_PC_REWIND
+
+ //if so, need rewind PC assuming GDS operation gets NACKed
+ s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+ s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
+ s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
+
+L_NO_PC_REWIND:
+ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
+
+ s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
+ s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //XNACK_MASK must be saved before any memory operation
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
+ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+
+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
+
+ /* inform SPI the readiness and wait for SPI's go signal */
+ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
+ s_mov_b32 s_save_exec_hi, exec_hi
+ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_sq_save_msg
+ s_waitcnt lgkmcnt(0)
+end
+
+ if (EMU_RUN_HACK)
+
+ else
+ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
+ end
+
+ L_SLEEP:
+ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
+
+ if (EMU_RUN_HACK)
+
+ else
+ s_cbranch_execz L_SLEEP
+ end
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_spi_wrexec
+ s_waitcnt lgkmcnt(0)
+end
+
+ /* setup Resource Constants */
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+ //calculate wd_addr using absolute thread id
+ v_readlane_b32 s_save_tmp, v9, 0
+ s_lshr_b32 s_save_tmp, s_save_tmp, 6
+ s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+ else
+ end
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+ else
+ end
+
+
+ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
+ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+ s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily inited
+ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
+
+ //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
+ s_mov_b32 s_save_m0, m0 //save M0
+
+ /* global mem offset */
+ s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
+
+
+
+
+ /* save HW registers */
+ //////////////////////////////
+
+ L_SAVE_HWREG:
+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
+ get_vgpr_size_bytes(s_save_mem_offset)
+ get_sgpr_size_bytes(s_save_tmp)
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+
+
+ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+
+ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
+
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+ s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
+ s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
+ end
+
+ write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
+ write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
+ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
+ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
+ // Save the tma_lo and tma_hi content from exec_lo and ttmp5
+ s_mov_b32 s_save_exec_lo, exec_lo
+ s_mov_b32 s_save_exec_hi, ttmp5
+ write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
+
+ //s_save_trapsts conflicts with s_save_alloc_size
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
+
+ write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
+ write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
+
+ //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+ write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO
+ write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI
+ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //TMA_LO
+ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) //TMA_HI
+
+ /* the first wave in the threadgroup */
+ // save first_wave bit in s_save_exec_hi bit 26 (the tba_hi variant below is commented out)
+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
+ //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26]
+ s_mov_b32 s_save_exec_hi, 0x0
+ s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
+
+
+ /* save SGPRs */
+ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
+ //////////////////////////////
+
+ // SGPR SR memory offset : size(VGPR)
+ get_vgpr_size_bytes(s_save_mem_offset)
+ // TODO, change RSRC word to rearrange memory layout for SGPRS
+
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+
+ if (SGPR_SAVE_USE_SQC)
+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
+ else
+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
+ end
+
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+
+ // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
+ //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
+ s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
+ s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
+
+ s_mov_b32 m0, 0x0 //SGPR initial index value =0
+ L_SAVE_SGPR_LOOP:
+ // SGPR is allocated in 16 SGPR granularity
+ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
+ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
+ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
+ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
+ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
+ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
+ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
+ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
+
+ write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
+ s_add_u32 m0, m0, 16 //next sgpr index
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
+ // restore s_save_buf_rsrc0,1
+ //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
+ s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
+
+
+
+
+ /* save first 4 VGPR, then LDS save could use */
+ // each wave will alloc 4 vgprs at least...
+ /////////////////////////////////////////////////////////////////////////////////////
+
+ s_mov_b32 s_save_mem_offset, 0
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+
+ // VGPR Allocated in 4-GPR granularity
+
+if G8SR_VGPR_SR_IN_DWX4
+ // the const stride for DWx4 is 4*4 bytes
+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+
+ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+
+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+else
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+end
+
+
+
+ /* save LDS */
+ //////////////////////////////
+
+ L_SAVE_LDS:
+
+ // Change EXEC to all threads...
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
+ s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE
+
+ s_barrier //LDS is used? wait for other waves in the same TG
+ //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+ s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+ s_cbranch_scc0 L_SAVE_LDS_DONE
+
+ // first wave do LDS save;
+
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
+ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
+
+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+ //
+ get_vgpr_size_bytes(s_save_mem_offset)
+ get_sgpr_size_bytes(s_save_tmp)
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
+
+
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_mov_b32 m0, 0x0 //lds_offset initial value = 0
+
+
+var LDS_DMA_ENABLE = 0
+var UNROLL = 0
+if UNROLL==0 && LDS_DMA_ENABLE==1
+ s_mov_b32 s3, 256*2
+ s_nop 0
+ s_nop 0
+ s_nop 0
+ L_SAVE_LDS_LOOP:
+ //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.???
+ if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity
+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+ end
+
+ s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
+
+elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss
+ // store from highest LDS address to lowest
+ s_mov_b32 s3, 256*2
+ s_sub_u32 m0, s_save_alloc_size, s3
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
+ s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128-DW chunks...
+ s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest
+ s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc
+ s_nop 0
+ s_nop 0
+ s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
+ s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
+ s_add_u32 s0, s0,s_save_alloc_size
+ s_addc_u32 s1, s1, 0
+ s_setpc_b64 s[0:1]
+
+
+ for var i =0; i< 128; i++
+ // be careful to make here a 64Byte aligned address, which could improve performance...
+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+
+ if i!=127
+ s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline
+ s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
+ end
+ end
+
+else // BUFFER_STORE
+ v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
+ v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
+ v_mul_i32_i24 v2, v3, 8 // tid*8
+ v_mov_b32 v3, 256*2
+ s_mov_b32 m0, 0x10000
+ s_mov_b32 s0, s_save_buf_rsrc3
+ s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
+
+L_SAVE_LDS_LOOP_VECTOR:
+ ds_read_b64 v[0:1], v2 //x =LDS[a], byte address
+ s_waitcnt lgkmcnt(0)
+ buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
+// s_waitcnt vmcnt(0)
+ v_add_u32 v2, vcc[0:1], v2, v3
+ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+ s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
+
+ // restore rsrc3
+ s_mov_b32 s_save_buf_rsrc3, s0
+
+end
+
+L_SAVE_LDS_DONE:
+
+
+ /* save VGPRs - set the Rest VGPRs */
+ //////////////////////////////////////////////////////////////////////////////////////
+ L_SAVE_VGPR:
+ // VGPR SR memory offset: 0
+ // TODO rearrange the RSRC words to use swizzle for VGPR save...
+
+ s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+
+ // VGPR Allocated in 4-GPR granularity
+
+if G8SR_VGPR_SR_IN_DWX4
+ // the const stride for DWx4 is 4*4 bytes
+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+
+ s_mov_b32 m0, 4 // skip first 4 VGPRs
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs
+
+ s_set_gpr_idx_on m0, 0x1 // This will change M0
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0
+L_SAVE_VGPR_LOOP:
+ v_mov_b32 v0, v0 // v0 = v[0+m0]
+ v_mov_b32 v1, v1
+ v_mov_b32 v2, v2
+ v_mov_b32 v3, v3
+
+
+ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ s_add_u32 m0, m0, 4
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
+ s_set_gpr_idx_off
+L_SAVE_VGPR_LOOP_END:
+
+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+else
+ // VGPR store using dw burst
+ s_mov_b32 m0, 0x4 //VGPR initial index value = 4 (the first 4 VGPRs were saved earlier)
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc0 L_SAVE_VGPR_END
+
+
+ s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
+
+ L_SAVE_VGPR_LOOP:
+ v_mov_b32 v0, v0 //v0 = v[0+m0]
+ v_mov_b32 v1, v1 //v1 = v[1+m0]
+ v_mov_b32 v2, v2 //v2 = v[2+m0]
+ v_mov_b32 v3, v3 //v3 = v[3+m0]
+
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+ end
+
+ s_add_u32 m0, m0, 4 //next vgpr index
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
+ s_set_gpr_idx_off
+end
+
+L_SAVE_VGPR_END:
+
+
+
+
+
+
+ /* S_PGM_END_SAVED */ //FIXME graphics ONLY
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+ s_rfe_b64 s_save_pc_lo //Return to the main shader program
+ else
+ end
+
+// Save Done timestamp
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_save_d
+ // SGPR SR memory offset : size(VGPR)
+ get_vgpr_size_bytes(s_save_mem_offset)
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+ // Need reset rsrc2??
+ s_mov_b32 m0, s_save_mem_offset
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
+end
+
+
+ s_branch L_END_PGM
+
+
+
+/**************************************************************************/
+/* restore routine */
+/**************************************************************************/
+
+L_RESTORE:
+ /* Setup Resource Constants */
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+ //calculate wd_addr using absolute thread id
+ v_readlane_b32 s_restore_tmp, v9, 0
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
+ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
+ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
+ else
+ end
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_restore_s
+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+ // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case...
+ s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
+ s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, since exec will be finally restored..
+end
+
+
+
+ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
+ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
+ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
+
+ /* global mem offset */
+// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
+
+ /* the first wave in the threadgroup */
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+ s_cbranch_scc0 L_RESTORE_VGPR
+
+ /* restore LDS */
+ //////////////////////////////
+ L_RESTORE_LDS:
+
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
+ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
+ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
+
+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+ //
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ get_sgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???
+
+
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+ s_mov_b32 m0, 0x0 //lds_offset initial value = 0
+
+ L_RESTORE_LDS_LOOP:
+ if (SAVE_LDS)
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
+ end
+ s_add_u32 m0, m0, 256*2 // 128 DW
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
+
+
+ /* restore VGPRs */
+ //////////////////////////////
+ L_RESTORE_VGPR:
+ // VGPR SR memory offset : 0
+ s_mov_b32 s_restore_mem_offset, 0x0
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+if G8SR_VGPR_SR_IN_DWX4
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+
+ // the const stride for DWx4 is 4*4 bytes
+ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+
+ s_mov_b32 m0, s_restore_alloc_size
+ s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0
+
+L_RESTORE_VGPR_LOOP:
+ buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+ s_waitcnt vmcnt(0)
+ s_sub_u32 m0, m0, 4
+ v_mov_b32 v0, v0 // v[0+m0] = v0
+ v_mov_b32 v1, v1
+ v_mov_b32 v2, v2
+ v_mov_b32 v3, v3
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+ s_cmp_eq_u32 m0, 0x8000
+ s_cbranch_scc0 L_RESTORE_VGPR_LOOP
+ s_set_gpr_idx_off
+
+ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+
+else
+ // VGPR load using dw burst
+ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore starts with v4; v0-v3 are restored last
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+ s_mov_b32 m0, 4 //VGPR initial index value = 4
+ s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
+
+ L_RESTORE_VGPR_LOOP:
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
+ end
+ s_waitcnt vmcnt(0) //ensure data ready
+ v_mov_b32 v0, v0 //v[0+m0] = v0
+ v_mov_b32 v1, v1
+ v_mov_b32 v2, v2
+ v_mov_b32 v3, v3
+ s_add_u32 m0, m0, 4 //next vgpr index
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
+ s_set_gpr_idx_off
+ /* VGPR restore on v0 */
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
+ end
+
+end
+
+ /* restore SGPRs */
+ //////////////////////////////
+
+ // SGPR SR memory offset : size(VGPR)
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ get_sgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group
+ // TODO, change RSRC word to rearrange memory layout for SGPRS
+
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+
+ if (SGPR_SAVE_USE_SQC)
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
+ else
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
+ end
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ /* If 112 SGPRs are allocated, 4 SGPRs are not used: TBA(108,109), TMA(110,111).
+ However, it is safe to restore these 4 SGPRs anyway, since TBA and TMA will later be restored by HWREG
+ */
+ s_mov_b32 m0, s_restore_alloc_size
+
+ L_RESTORE_SGPR_LOOP:
+ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made
+ s_waitcnt lgkmcnt(0) //ensure data ready
+
+ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
+
+ s_movreld_b64 s0, s0 //s[0+m0] = s0
+ s_movreld_b64 s2, s2
+ s_movreld_b64 s4, s4
+ s_movreld_b64 s6, s6
+ s_movreld_b64 s8, s8
+ s_movreld_b64 s10, s10
+ s_movreld_b64 s12, s12
+ s_movreld_b64 s14, s14
+
+ s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
+ s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore is complete? if not, loop again
+
+ /* restore HW registers */
+ //////////////////////////////
+ L_RESTORE_HWREG:
+
+
+if G8SR_DEBUG_TIMESTAMP
+ s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
+ s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
+end
+
+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ get_sgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+
+
+ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
+ read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
+ read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+ read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
+ read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+ read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
+ read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
+ read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
+ read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
+ read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE
+ read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO
+ read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI
+
+ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+
+ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+
+ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+ end
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+ end
+
+ s_mov_b32 m0, s_restore_m0
+ s_mov_b32 exec_lo, s_restore_exec_lo
+ s_mov_b32 exec_hi, s_restore_exec_hi
+
+ read_hwreg_from_mem(tma_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_lo
+ read_hwreg_from_mem(tma_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //tma_hi
+ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
+ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
+ //reuse s_restore_m0 as a temp register
+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+ s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+
+ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
+ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
+
+ s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_restore_d
+ s_waitcnt lgkmcnt(0)
+end
+
+// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
+ s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
+
+
+/**************************************************************************/
+/* the END */
+/**************************************************************************/
+L_END_PGM:
+ s_endpgm
+
+end
+
+
+/**************************************************************************/
+/* the helper functions */
+/**************************************************************************/
+
+//Save path only: store the dword in s to buffer s_rsrc at byte offset s_mem_offset, then advance s_mem_offset by 4. m0 is preserved across the call; exec_lo is overwritten.
+function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
+ s_mov_b32 exec_lo, m0 //stash m0 in exec_lo; assumes exec_lo is not needed anymore from this point on
+ s_mov_b32 m0, s_mem_offset //m0 carries the store offset
+ s_buffer_store_dword s, s_rsrc, m0 glc:0
+ s_add_u32 s_mem_offset, s_mem_offset, 4 //advance to the next dword slot
+ s_mov_b32 m0, exec_lo //put the caller's m0 back
+end
+
+//Save path only: store the dword in s to buffer s_rsrc at offset offset_imm. No running offset is advanced here. m0 is preserved across the call; exec_lo is overwritten.
+function write_tma_to_mem(s, s_rsrc, offset_imm)
+ s_mov_b32 exec_lo, m0 //stash m0 in exec_lo; assumes exec_lo is not needed anymore from this point on
+ s_mov_b32 m0, offset_imm //m0 carries the store offset
+ s_buffer_store_dword s, s_rsrc, m0 glc:0
+ s_mov_b32 m0, exec_lo //put the caller's m0 back
+end
+
+// Store 16 SGPRs starting at s as four dwordx4 stores at offsets 0/16/32/48 of s_rsrc, then add 64 to s_rsrc[0] with carry into s_rsrc[1]. s_mem_offset is not referenced in the body. HWREG are saved before SGPRs, so all HWREG could be used.
+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
+
+ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:0 //s[0..3] at offset 0
+ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:0 //s[4..7] at offset 16
+ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:0 //s[8..11] at offset 32
+ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:0 //s[12..15] at offset 48
+ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 //move the descriptor's first word forward by 16 dwords (64 bytes) for the next group
+ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc (carry out of the add above)
+end
+
+
+function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) //load one dword from s_rsrc at s_mem_offset into s, then advance s_mem_offset by 4
+ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 //no wait is issued here; callers in this file follow up with s_waitcnt lgkmcnt(0) before using s
+ s_add_u32 s_mem_offset, s_mem_offset, 4 //advance to the next dword slot
+end
+
+function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) //load 16 dwords from s_rsrc at s_mem_offset into the registers starting at s, then step s_mem_offset back by 64 bytes
+ s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 //no wait is issued here; the caller waits on lgkmcnt before using the data
+ s_sub_u32 s_mem_offset, s_mem_offset, 4*16 //offset walks downward: the SGPR restore loop goes from S[n] to S[0]
+end
+
+
+
+function get_lds_size_bytes(s_lds_size_byte) //write the wave's LDS allocation size, in bytes, to s_lds_size_byte
+ // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
+ s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
+ s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in bytes = lds_size * 64 dwords * 4 bytes (shift by 8 = *256) // granularity 64DW
+end
+
+function get_vgpr_size_bytes(s_vgpr_size_byte) //write the size in bytes of the wave's VGPR save area to s_vgpr_size_byte
+ s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size
+ s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 //field is biased by one: allocation is (vgpr_size + 1) groups
+ s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //VGPR bytes = (vgpr_size + 1) * 4 VGPRs * 64 threads * 4 bytes (non-zero value) //FIXME for GFX, zero is possible
+end
+
+function get_sgpr_size_bytes(s_sgpr_size_byte) //write the size in bytes of the wave's SGPR save area to s_sgpr_size_byte
+ s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
+ s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 //field is biased by one: allocation is (sgpr_size + 1) groups
+ s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //SGPR bytes = (sgpr_size + 1) * 16 SGPRs * 4 bytes (non-zero value)
+end
+
+function get_hwreg_size_bytes //constant size of the HWREG save area; the LDS offset computations in this file add it after size(VGPR)+size(SGPR)
+ return 128 //HWREG size 128 bytes
+end
+
+#endif
+
+static const uint32_t cwsr_trap_carrizo_hex[] = {
+ 0xbf820001, 0xbf820131,
+ 0xb8f4f802, 0xb8f5f803,
+ 0x8675ff75, 0x00000400,
+ 0xbf850013, 0xc00a1e37,
+ 0x00000000, 0xbf8c007f,
+ 0x87777978, 0xbf840004,
+ 0xbeee007a, 0xbeef007b,
+ 0xb974f802, 0xbe801d78,
+ 0xb8f5f803, 0x8675ff75,
+ 0x000001ff, 0xbf850002,
+ 0x80708470, 0x82718071,
+ 0x8671ff71, 0x0000ffff,
+ 0xb974f802, 0xbe801f70,
+ 0xb8f5f803, 0x8675ff75,
+ 0x00000100, 0xbf840006,
+ 0xbefa0080, 0xb97a0203,
+ 0x8671ff71, 0x0000ffff,
+ 0x80f08870, 0x82f18071,
+ 0xbefa0080, 0xb97a0283,
+ 0xbef60068, 0xbef70069,
+ 0xb8fa1c07, 0x8e7a9c7a,
+ 0x87717a71, 0xb8fa03c7,
+ 0x8e7a9b7a, 0x87717a71,
+ 0xb8faf807, 0x867aff7a,
+ 0x00007fff, 0xb97af807,
+ 0xbef2007e, 0xbef3007f,
+ 0xbefe0180, 0xbf900004,
+ 0xbf8e0002, 0xbf88fffe,
+ 0xbef8007e, 0x8679ff7f,
+ 0x0000ffff, 0x8779ff79,
+ 0x00040000, 0xbefa0080,
+ 0xbefb00ff, 0x00807fac,
+ 0x867aff7f, 0x08000000,
+ 0x8f7a837a, 0x877b7a7b,
+ 0x867aff7f, 0x70000000,
+ 0x8f7a817a, 0x877b7a7b,
+ 0xbeef007c, 0xbeee0080,
+ 0xb8ee2a05, 0x806e816e,
+ 0x8e6e8a6e, 0xb8fa1605,
+ 0x807a817a, 0x8e7a867a,
+ 0x806e7a6e, 0xbefa0084,
+ 0xbefa00ff, 0x01000000,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601bfc, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601c3c, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601c7c, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601cbc, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601cfc, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbef2007e, 0xbef30075,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601d3c, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xb8f5f803, 0xbefe007c,
+ 0xbefc006e, 0xc0601d7c,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc006e, 0xc0601dbc,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc006e, 0xc0601dfc,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xb8eff801,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601bfc, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601b3c, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601b7c, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601cbc, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0601cfc, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0x867aff7f, 0x04000000,
+ 0xbef30080, 0x8773737a,
+ 0xb8ee2a05, 0x806e816e,
+ 0x8e6e8a6e, 0xb8f51605,
+ 0x80758175, 0x8e758475,
+ 0x8e7a8275, 0xbefa00ff,
+ 0x01000000, 0xbef60178,
+ 0x80786e78, 0xbefc0080,
+ 0xbe802b00, 0xbe822b02,
+ 0xbe842b04, 0xbe862b06,
+ 0xbe882b08, 0xbe8a2b0a,
+ 0xbe8c2b0c, 0xbe8e2b0e,
+ 0xc06a003c, 0x00000000,
+ 0xc06a013c, 0x00000010,
+ 0xc06a023c, 0x00000020,
+ 0xc06a033c, 0x00000030,
+ 0x8078c078, 0x82798079,
+ 0x807c907c, 0xbf0a757c,
+ 0xbf85ffeb, 0xbef80176,
+ 0xbeee0080, 0xbefe00c1,
+ 0xbeff00c1, 0xbefa00ff,
+ 0x01000000, 0xe0724000,
+ 0x6e1e0000, 0xe0724100,
+ 0x6e1e0100, 0xe0724200,
+ 0x6e1e0200, 0xe0724300,
+ 0x6e1e0300, 0xbefe00c1,
+ 0xbeff00c1, 0xb8f54306,
+ 0x8675c175, 0xbf84002c,
+ 0xbf8a0000, 0x867aff73,
+ 0x04000000, 0xbf840028,
+ 0x8e758675, 0x8e758275,
+ 0xbefa0075, 0xb8ee2a05,
+ 0x806e816e, 0x8e6e8a6e,
+ 0xb8fa1605, 0x807a817a,
+ 0x8e7a867a, 0x806e7a6e,
+ 0x806eff6e, 0x00000080,
+ 0xbefa00ff, 0x01000000,
+ 0xbefc0080, 0xd28c0002,
+ 0x000100c1, 0xd28d0003,
+ 0x000204c1, 0xd1060002,
+ 0x00011103, 0x7e0602ff,
+ 0x00000200, 0xbefc00ff,
+ 0x00010000, 0xbe80007b,
+ 0x867bff7b, 0xff7fffff,
+ 0x877bff7b, 0x00058000,
+ 0xd8ec0000, 0x00000002,
+ 0xbf8c007f, 0xe0765000,
+ 0x6e1e0002, 0x32040702,
+ 0xd0c9006a, 0x0000eb02,
+ 0xbf87fff7, 0xbefb0000,
+ 0xbeee00ff, 0x00000400,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xb8f52a05, 0x80758175,
+ 0x8e758275, 0x8e7a8875,
+ 0xbefa00ff, 0x01000000,
+ 0xbefc0084, 0xbf0a757c,
+ 0xbf840015, 0xbf11017c,
+ 0x8075ff75, 0x00001000,
+ 0x7e000300, 0x7e020301,
+ 0x7e040302, 0x7e060303,
+ 0xe0724000, 0x6e1e0000,
+ 0xe0724100, 0x6e1e0100,
+ 0xe0724200, 0x6e1e0200,
+ 0xe0724300, 0x6e1e0300,
+ 0x807c847c, 0x806eff6e,
+ 0x00000400, 0xbf0a757c,
+ 0xbf85ffef, 0xbf9c0000,
+ 0xbf8200d1, 0xbef8007e,
+ 0x8679ff7f, 0x0000ffff,
+ 0x8779ff79, 0x00040000,
+ 0xbefa0080, 0xbefb00ff,
+ 0x00807fac, 0x8676ff7f,
+ 0x08000000, 0x8f768376,
+ 0x877b767b, 0x8676ff7f,
+ 0x70000000, 0x8f768176,
+ 0x877b767b, 0x8676ff7f,
+ 0x04000000, 0xbf84001e,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xb8f34306, 0x8673c173,
+ 0xbf840019, 0x8e738673,
+ 0x8e738273, 0xbefa0073,
+ 0xb8f22a05, 0x80728172,
+ 0x8e728a72, 0xb8f61605,
+ 0x80768176, 0x8e768676,
+ 0x80727672, 0x8072ff72,
+ 0x00000080, 0xbefa00ff,
+ 0x01000000, 0xbefc0080,
+ 0xe0510000, 0x721e0000,
+ 0xe0510100, 0x721e0000,
+ 0x807cff7c, 0x00000200,
+ 0x8072ff72, 0x00000200,
+ 0xbf0a737c, 0xbf85fff6,
+ 0xbef20080, 0xbefe00c1,
+ 0xbeff00c1, 0xb8f32a05,
+ 0x80738173, 0x8e738273,
+ 0x8e7a8873, 0xbefa00ff,
+ 0x01000000, 0xbef60072,
+ 0x8072ff72, 0x00000400,
+ 0xbefc0084, 0xbf11087c,
+ 0x8073ff73, 0x00008000,
+ 0xe0524000, 0x721e0000,
+ 0xe0524100, 0x721e0100,
+ 0xe0524200, 0x721e0200,
+ 0xe0524300, 0x721e0300,
+ 0xbf8c0f70, 0x7e000300,
+ 0x7e020301, 0x7e040302,
+ 0x7e060303, 0x807c847c,
+ 0x8072ff72, 0x00000400,
+ 0xbf0a737c, 0xbf85ffee,
+ 0xbf9c0000, 0xe0524000,
+ 0x761e0000, 0xe0524100,
+ 0x761e0100, 0xe0524200,
+ 0x761e0200, 0xe0524300,
+ 0x761e0300, 0xb8f22a05,
+ 0x80728172, 0x8e728a72,
+ 0xb8f61605, 0x80768176,
+ 0x8e768676, 0x80727672,
+ 0x80f2c072, 0xb8f31605,
+ 0x80738173, 0x8e738473,
+ 0x8e7a8273, 0xbefa00ff,
+ 0x01000000, 0xbefc0073,
+ 0xc031003c, 0x00000072,
+ 0x80f2c072, 0xbf8c007f,
+ 0x80fc907c, 0xbe802d00,
+ 0xbe822d02, 0xbe842d04,
+ 0xbe862d06, 0xbe882d08,
+ 0xbe8a2d0a, 0xbe8c2d0c,
+ 0xbe8e2d0e, 0xbf06807c,
+ 0xbf84fff1, 0xb8f22a05,
+ 0x80728172, 0x8e728a72,
+ 0xb8f61605, 0x80768176,
+ 0x8e768676, 0x80727672,
+ 0xbefa0084, 0xbefa00ff,
+ 0x01000000, 0xc0211cfc,
+ 0x00000072, 0x80728472,
+ 0xc0211c3c, 0x00000072,
+ 0x80728472, 0xc0211c7c,
+ 0x00000072, 0x80728472,
+ 0xc0211bbc, 0x00000072,
+ 0x80728472, 0xc0211bfc,
+ 0x00000072, 0x80728472,
+ 0xc0211d3c, 0x00000072,
+ 0x80728472, 0xc0211d7c,
+ 0x00000072, 0x80728472,
+ 0xc0211a3c, 0x00000072,
+ 0x80728472, 0xc0211a7c,
+ 0x00000072, 0x80728472,
+ 0xc0211dfc, 0x00000072,
+ 0x80728472, 0xc0211b3c,
+ 0x00000072, 0x80728472,
+ 0xc0211b7c, 0x00000072,
+ 0x80728472, 0xbf8c007f,
+ 0x8671ff71, 0x0000ffff,
+ 0xbefc0073, 0xbefe006e,
+ 0xbeff006f, 0xc0211bbc,
+ 0x00000072, 0x80728472,
+ 0xc0211bfc, 0x00000072,
+ 0x80728472, 0xbf8c007f,
+ 0x867375ff, 0x000003ff,
+ 0xb9734803, 0x867375ff,
+ 0xfffff800, 0x8f738b73,
+ 0xb973a2c3, 0xb977f801,
+ 0x8673ff71, 0xf0000000,
+ 0x8f739c73, 0x8e739073,
+ 0xbef60080, 0x87767376,
+ 0x8673ff71, 0x08000000,
+ 0x8f739b73, 0x8e738f73,
+ 0x87767376, 0x8673ff74,
+ 0x00800000, 0x8f739773,
+ 0xb976f807, 0x86fe7e7e,
+ 0x86ea6a6a, 0xb974f802,
+ 0xbf8a0000, 0x95807370,
+ 0xbf810000, 0x00000000,
+};
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index ee3e04e10dae..0fe1161a2182 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -31,16 +31,23 @@
#include <uapi/linux/kfd_ioctl.h>
#include <linux/time.h>
#include <linux/mm.h>
-#include <linux/mman.h>
+#include <uapi/asm-generic/mman-common.h>
#include <asm/processor.h>
+
#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
#include "kfd_dbgmgr.h"
+#include "cik_regs.h"
static long kfd_ioctl(struct file *, unsigned int, unsigned long);
static int kfd_open(struct inode *, struct file *);
static int kfd_mmap(struct file *, struct vm_area_struct *);
+static uint32_t kfd_convert_user_mem_alloction_flags(
+ struct kfd_dev *dev,
+ uint32_t userspace_flags);
+static bool kfd_is_large_bar(struct kfd_dev *dev);
+static int kfd_evict(struct file *filep, struct kfd_process *p, void *data);
static const char kfd_dev_name[] = "kfd";
static const struct file_operations kfd_fops = {
@@ -117,7 +124,7 @@ static int kfd_open(struct inode *inode, struct file *filep)
return -EPERM;
}
- process = kfd_create_process(current);
+ process = kfd_create_process(filep);
if (IS_ERR(process))
return PTR_ERR(process);
@@ -206,6 +213,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
q_properties->ctx_save_restore_area_address =
args->ctx_save_restore_address;
q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
+ q_properties->ctl_stack_size = args->ctl_stack_size;
if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
@@ -270,7 +278,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
return -EINVAL;
}
- mutex_lock(&p->mutex);
+ down_write(&p->lock);
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
@@ -282,8 +290,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
p->pasid,
dev->id);
- err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
- 0, q_properties.type, &queue_id);
+ err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id);
if (err != 0)
goto err_create_queue;
@@ -291,10 +298,10 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
/* Return gpu_id as doorbell offset for mmap usage */
- args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id);
+ args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL | args->gpu_id);
args->doorbell_offset <<= PAGE_SHIFT;
- mutex_unlock(&p->mutex);
+ up_write(&p->lock);
pr_debug("kfd: queue id %d was created successfully\n", args->queue_id);
@@ -311,7 +318,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
err_create_queue:
err_bind_process:
- mutex_unlock(&p->mutex);
+ up_write(&p->lock);
return err;
}
@@ -325,11 +332,11 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
args->queue_id,
p->pasid);
- mutex_lock(&p->mutex);
+ down_write(&p->lock);
retval = pqm_destroy_queue(&p->pqm, args->queue_id);
- mutex_unlock(&p->mutex);
+ up_write(&p->lock);
return retval;
}
@@ -371,11 +378,33 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
pr_debug("kfd: updating queue id %d for PASID %d\n",
args->queue_id, p->pasid);
- mutex_lock(&p->mutex);
+ down_write(&p->lock);
retval = pqm_update_queue(&p->pqm, args->queue_id, &properties);
- mutex_unlock(&p->mutex);
+ up_write(&p->lock);
+
+ return retval;
+}
+
+/*
+ * Set-CU-mask ioctl handler.
+ *
+ * Reads a single 32-bit mask from the user pointer args->cu_mask_ptr and
+ * hands it to pqm_set_cu_mask() for queue args->queue_id while holding
+ * p->lock for writing.
+ *
+ * Returns -EFAULT if the mask cannot be read from user space, 0 without
+ * touching the queue when the mask is zero, otherwise the result of
+ * pqm_set_cu_mask().
+ */
+static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
+				void *data)
+{
+	struct kfd_ioctl_set_cu_mask_args *args = data;
+	uint32_t __user *user_mask = (uint32_t __user *)args->cu_mask_ptr;
+	struct queue_properties props;
+	int retval;
+
+	/* Only the cu_mask member of props is filled in by this handler. */
+	if (get_user(props.cu_mask, user_mask))
+		return -EFAULT;
+
+	/* A zero mask is accepted but is a no-op. */
+	if (!props.cu_mask)
+		return 0;
+
+	down_write(&p->lock);
+	retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &props);
+	up_write(&p->lock);
+
return retval;
}
@@ -403,7 +432,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
if (dev == NULL)
return -EINVAL;
- mutex_lock(&p->mutex);
+ down_write(&p->lock);
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
@@ -427,46 +456,80 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
err = -EINVAL;
out:
- mutex_unlock(&p->mutex);
+ up_write(&p->lock);
return err;
}
-static int kfd_ioctl_dbg_register(struct file *filep,
- struct kfd_process *p, void *data)
+static int kfd_ioctl_set_trap_handler(struct file *filep,
+ struct kfd_process *p, void *data)
{
- struct kfd_ioctl_dbg_register_args *args = data;
+ struct kfd_ioctl_set_trap_handler_args *args = data;
struct kfd_dev *dev;
- struct kfd_dbgmgr *dbgmgr_ptr;
+ int err = 0;
struct kfd_process_device *pdd;
- bool create_ok;
- long status = 0;
dev = kfd_device_by_id(args->gpu_id);
if (dev == NULL)
return -EINVAL;
- if (dev->device_info->asic_family == CHIP_CARRIZO) {
- pr_debug("kfd_ioctl_dbg_register not supported on CZ\n");
- return -EINVAL;
+ down_write(&p->lock);
+
+ pdd = kfd_bind_process_to_device(dev, p);
+ if (IS_ERR(pdd)) {
+ err = -ESRCH;
+ goto out;
+ }
+ if (!dev->cwsr_enabled || !pdd->qpd.cwsr_kaddr) {
+ pr_err("kfd: CWSR is not enabled, can't set trap handler.\n");
+ err = -EINVAL;
+ goto out;
}
- mutex_lock(kfd_get_dbgmgr_mutex());
- mutex_lock(&p->mutex);
+ if (dev->dqm->ops.set_trap_handler(dev->dqm,
+ &pdd->qpd,
+ args->tba_addr,
+ args->tma_addr))
+ err = -EINVAL;
- /*
- * make sure that we have pdd, if this the first queue created for
- * this process
- */
+out:
+ up_write(&p->lock);
+
+ return err;
+}
+
+static int
+kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data)
+{
+ long status = -EFAULT;
+ struct kfd_ioctl_dbg_register_args *args = data;
+ struct kfd_dev *dev;
+ struct kfd_dbgmgr *dbgmgr_ptr;
+ struct kfd_process_device *pdd;
+ bool create_ok = false;
+
+ pr_debug("kfd:dbg: %s\n", __func__);
+
+ dev = kfd_device_by_id(args->gpu_id);
+ if (!dev) {
+ dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__);
+ return status;
+ }
+
+ down_write(&p->lock);
+ mutex_lock(get_dbgmgr_mutex());
+
+ /* make sure that we have pdd, if this the first queue created for this process */
pdd = kfd_bind_process_to_device(dev, p);
- if (IS_ERR(pdd)) {
- mutex_unlock(&p->mutex);
- mutex_unlock(kfd_get_dbgmgr_mutex());
+ if (IS_ERR(pdd) < 0) {
+ mutex_unlock(get_dbgmgr_mutex());
+ up_write(&p->lock);
return PTR_ERR(pdd);
}
if (dev->dbgmgr == NULL) {
/* In case of a legal call, we have no dbgmgr yet */
+
create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev);
if (create_ok) {
status = kfd_dbgmgr_register(dbgmgr_ptr, p);
@@ -475,34 +538,32 @@ static int kfd_ioctl_dbg_register(struct file *filep,
else
dev->dbgmgr = dbgmgr_ptr;
}
- } else {
- pr_debug("debugger already registered\n");
- status = -EINVAL;
}
- mutex_unlock(&p->mutex);
- mutex_unlock(kfd_get_dbgmgr_mutex());
+ mutex_unlock(get_dbgmgr_mutex());
+ up_write(&p->lock);
return status;
}
-static int kfd_ioctl_dbg_unrgesiter(struct file *filep,
- struct kfd_process *p, void *data)
+/*
+ * Unregister dbg IOCTL
+ */
+
+static int
+kfd_ioctl_dbg_unrgesiter(struct file *filep, struct kfd_process *p, void *data)
{
+ long status = -EFAULT;
struct kfd_ioctl_dbg_unregister_args *args = data;
struct kfd_dev *dev;
- long status;
dev = kfd_device_by_id(args->gpu_id);
- if (dev == NULL)
- return -EINVAL;
-
- if (dev->device_info->asic_family == CHIP_CARRIZO) {
- pr_debug("kfd_ioctl_dbg_unrgesiter not supported on CZ\n");
- return -EINVAL;
+ if (!dev) {
+ dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__);
+ return status;
}
- mutex_lock(kfd_get_dbgmgr_mutex());
+ mutex_lock(get_dbgmgr_mutex());
status = kfd_dbgmgr_unregister(dev->dbgmgr, p);
if (status == 0) {
@@ -510,7 +571,7 @@ static int kfd_ioctl_dbg_unrgesiter(struct file *filep,
dev->dbgmgr = NULL;
}
- mutex_unlock(kfd_get_dbgmgr_mutex());
+ mutex_unlock(get_dbgmgr_mutex());
return status;
}
@@ -519,125 +580,144 @@ static int kfd_ioctl_dbg_unrgesiter(struct file *filep,
* Parse and generate variable size data structure for address watch.
* Total size of the buffer and # watch points is limited in order
* to prevent kernel abuse. (no bearing to the much smaller HW limitation
- * which is enforced by dbgdev module)
+ * which is enforced by dbgdev module.
* please also note that the watch address itself are not "copied from user",
* since it be set into the HW in user mode values.
*
*/
-static int kfd_ioctl_dbg_address_watch(struct file *filep,
- struct kfd_process *p, void *data)
+
+static int
+kfd_ioctl_dbg_address_watch(struct file *filep,
+ struct kfd_process *p,
+ void *data)
{
+ long status = -EFAULT;
struct kfd_ioctl_dbg_address_watch_args *args = data;
struct kfd_dev *dev;
struct dbg_address_watch_info aw_info;
- unsigned char *args_buff;
- long status;
- void __user *cmd_from_user;
- uint64_t watch_mask_value = 0;
+ unsigned char *args_buff = NULL;
unsigned int args_idx = 0;
+ uint64_t watch_mask_value = 0;
memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info));
- dev = kfd_device_by_id(args->gpu_id);
- if (dev == NULL)
- return -EINVAL;
+ do {
+ dev = kfd_device_by_id(args->gpu_id);
+ if (!dev) {
+ dev_info(NULL,
+ "Error! kfd: In func %s >> get device by id failed\n",
+ __func__);
+ break;
+ }
- if (dev->device_info->asic_family == CHIP_CARRIZO) {
- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n");
- return -EINVAL;
- }
+ if (args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) {
+ status = -EINVAL;
+ break;
+ }
- cmd_from_user = (void __user *) args->content_ptr;
+ if (args->buf_size_in_bytes <= sizeof(*args)) {
+ status = -EINVAL;
+ break;
+ }
- /* Validate arguments */
+ /* this is the actual buffer to work with */
- if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) ||
- (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) ||
- (cmd_from_user == NULL))
- return -EINVAL;
+ args_buff = kzalloc(args->buf_size_in_bytes -
+ sizeof(*args), GFP_KERNEL);
+ if (args_buff == NULL) {
+ status = -ENOMEM;
+ break;
+ }
- /* this is the actual buffer to work with */
- args_buff = memdup_user(cmd_from_user,
- args->buf_size_in_bytes - sizeof(*args));
- if (IS_ERR(args_buff))
- return PTR_ERR(args_buff);
+ /* this is the actual buffer to work with */
+ args_buff = memdup_user(cmd_from_user,
+ args->buf_size_in_bytes - sizeof(*args));
+ if (IS_ERR(args_buff))
+ return PTR_ERR(args_buff);
- aw_info.process = p;
+ aw_info.process = p;
- aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx]));
- args_idx += sizeof(aw_info.num_watch_points);
+ aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx]));
+ args_idx += sizeof(aw_info.num_watch_points);
- aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx];
- args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points;
+ aw_info.watch_mode = (HSA_DBG_WATCH_MODE *) &args_buff[args_idx];
+ args_idx += sizeof(HSA_DBG_WATCH_MODE) * aw_info.num_watch_points;
- /*
- * set watch address base pointer to point on the array base
- * within args_buff
- */
- aw_info.watch_address = (uint64_t *) &args_buff[args_idx];
+ /* set watch address base pointer to point on the array base within args_buff */
- /* skip over the addresses buffer */
- args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points;
+ aw_info.watch_address = (uint64_t *) &args_buff[args_idx];
- if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) {
- kfree(args_buff);
- return -EINVAL;
- }
+ /*skip over the addresses buffer */
+ args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points;
- watch_mask_value = (uint64_t) args_buff[args_idx];
+ if (args_idx >= args->buf_size_in_bytes) {
+ status = -EINVAL;
+ break;
+ }
- if (watch_mask_value > 0) {
- /*
- * There is an array of masks.
- * set watch mask base pointer to point on the array base
- * within args_buff
- */
- aw_info.watch_mask = (uint64_t *) &args_buff[args_idx];
+ watch_mask_value = (uint64_t) args_buff[args_idx];
- /* skip over the masks buffer */
- args_idx += sizeof(aw_info.watch_mask) *
- aw_info.num_watch_points;
- } else {
- /* just the NULL mask, set to NULL and skip over it */
- aw_info.watch_mask = NULL;
- args_idx += sizeof(aw_info.watch_mask);
- }
+ if (watch_mask_value > 0) {
+ /* there is an array of masks */
- if (args_idx >= args->buf_size_in_bytes - sizeof(args)) {
- kfree(args_buff);
- return -EINVAL;
- }
+ /* set watch mask base pointer to point on the array base within args_buff */
+ aw_info.watch_mask = (uint64_t *) &args_buff[args_idx];
- /* Currently HSA Event is not supported for DBG */
- aw_info.watch_event = NULL;
+ /*skip over the masks buffer */
+ args_idx += sizeof(aw_info.watch_mask) * aw_info.num_watch_points;
+ }
- mutex_lock(kfd_get_dbgmgr_mutex());
+ else
+ /* just the NULL mask, set to NULL and skip over it */
+ {
+ aw_info.watch_mask = NULL;
+ args_idx += sizeof(aw_info.watch_mask);
+ }
+
+ if (args_idx > args->buf_size_in_bytes) {
+ status = -EINVAL;
+ break;
+ }
- status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info);
+ aw_info.watch_event = NULL; /* Currently HSA Event is not supported for DBG */
+ status = 0;
- mutex_unlock(kfd_get_dbgmgr_mutex());
+ } while (0);
+
+ if (status == 0) {
+ mutex_lock(get_dbgmgr_mutex());
+
+ status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info);
+
+ mutex_unlock(get_dbgmgr_mutex());
+
+ }
kfree(args_buff);
return status;
}
-/* Parse and generate fixed size data structure for wave control */
-static int kfd_ioctl_dbg_wave_control(struct file *filep,
- struct kfd_process *p, void *data)
+/*
+ * Parse and generate fixed size data structure for wave control.
+ * Buffer is generated in a "packed" form, for avoiding structure packing/pending dependencies.
+ */
+
+static int
+kfd_ioctl_dbg_wave_control(struct file *filep, struct kfd_process *p, void *data)
{
+ long status = -EFAULT;
struct kfd_ioctl_dbg_wave_control_args *args = data;
struct kfd_dev *dev;
struct dbg_wave_control_info wac_info;
- unsigned char *args_buff;
- uint32_t computed_buff_size;
- long status;
- void __user *cmd_from_user;
+ unsigned char *args_buff = NULL;
unsigned int args_idx = 0;
+ uint32_t computed_buff_size;
memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info));
/* we use compact form, independent of the packing attribute value */
+
computed_buff_size = sizeof(*args) +
sizeof(wac_info.mode) +
sizeof(wac_info.operand) +
@@ -645,26 +725,25 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep,
sizeof(wac_info.dbgWave_msg.MemoryVA) +
sizeof(wac_info.trapId);
- dev = kfd_device_by_id(args->gpu_id);
- if (dev == NULL)
- return -EINVAL;
- if (dev->device_info->asic_family == CHIP_CARRIZO) {
- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n");
- return -EINVAL;
- }
+ dev_info(NULL, "kfd: In func %s - start\n", __func__);
- /* input size must match the computed "compact" size */
- if (args->buf_size_in_bytes != computed_buff_size) {
- pr_debug("size mismatch, computed : actual %u : %u\n",
- args->buf_size_in_bytes, computed_buff_size);
- return -EINVAL;
- }
+ do {
+ dev = kfd_device_by_id(args->gpu_id);
+ if (!dev) {
+ dev_info(NULL, "Error! kfd: In func %s >> getting device by id failed\n", __func__);
+ break;
+ }
- cmd_from_user = (void __user *) args->content_ptr;
+ /* input size must match the computed "compact" size */
- if (cmd_from_user == NULL)
- return -EINVAL;
+ if (args->buf_size_in_bytes != computed_buff_size) {
+ dev_info(NULL,
+ "Error! kfd: In func %s >> size mismatch, computed : actual %u : %u\n",
+ __func__, args->buf_size_in_bytes, computed_buff_size);
+ status = -EINVAL;
+ break;
+ }
/* copy the entire buffer from user */
@@ -673,34 +752,51 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep,
if (IS_ERR(args_buff))
return PTR_ERR(args_buff);
- /* move ptr to the start of the "pay-load" area */
- wac_info.process = p;
+ if (copy_from_user(args_buff,
+ (void __user *) args->content_ptr,
+ args->buf_size_in_bytes - sizeof(*args))) {
+ dev_info(NULL,
+ "Error! kfd: In func %s >> copy_from_user failed\n",
+ __func__);
+ break;
+ }
+
+ /* move ptr to the start of the "pay-load" area */
+
- wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx]));
- args_idx += sizeof(wac_info.operand);
+ wac_info.process = p;
- wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx]));
- args_idx += sizeof(wac_info.mode);
+ wac_info.operand = (HSA_DBG_WAVEOP) *((HSA_DBG_WAVEOP *)(&args_buff[args_idx]));
+ args_idx += sizeof(wac_info.operand);
- wac_info.trapId = *((uint32_t *)(&args_buff[args_idx]));
- args_idx += sizeof(wac_info.trapId);
+ wac_info.mode = (HSA_DBG_WAVEMODE) *((HSA_DBG_WAVEMODE *)(&args_buff[args_idx]));
+ args_idx += sizeof(wac_info.mode);
- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value =
- *((uint32_t *)(&args_buff[args_idx]));
- wac_info.dbgWave_msg.MemoryVA = NULL;
+ wac_info.trapId = (uint32_t) *((uint32_t *)(&args_buff[args_idx]));
+ args_idx += sizeof(wac_info.trapId);
- mutex_lock(kfd_get_dbgmgr_mutex());
+ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = *((uint32_t *)(&args_buff[args_idx]));
+ wac_info.dbgWave_msg.MemoryVA = NULL;
- pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n",
- wac_info.process, wac_info.operand,
- wac_info.mode, wac_info.trapId,
- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
- status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info);
+ status = 0;
+
+ } while (0);
+ if (status == 0) {
+ mutex_lock(get_dbgmgr_mutex());
+
+ dev_info(NULL,
+ "kfd: In func %s >> calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n",
+ __func__, wac_info.process, wac_info.operand, wac_info.mode, wac_info.trapId,
+ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
- pr_debug("Returned status of dbg manager is %ld\n", status);
+ status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info);
- mutex_unlock(kfd_get_dbgmgr_mutex());
+ dev_info(NULL, "kfd: In func %s >> returned status of dbg manager is %ld\n", __func__, status);
+
+ mutex_unlock(get_dbgmgr_mutex());
+
+ }
kfree(args_buff);
@@ -715,12 +811,13 @@ static int kfd_ioctl_get_clock_counters(struct file *filep,
struct timespec64 time;
dev = kfd_device_by_id(args->gpu_id);
- if (dev == NULL)
- return -EINVAL;
-
- /* Reading GPU clock counter from KGD */
- args->gpu_clock_counter =
- dev->kfd2kgd->get_gpu_clock_counter(dev->kgd);
+ if (dev)
+ /* Reading GPU clock counter from KGD */
+ args->gpu_clock_counter =
+ dev->kfd2kgd->get_gpu_clock_counter(dev->kgd);
+ else
+ /* Node without GPU resource */
+ args->gpu_clock_counter = 0;
/* No access to rdtsc. Using raw monotonic time */
getrawmonotonic64(&time);
@@ -747,7 +844,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp,
args->num_of_nodes = 0;
- mutex_lock(&p->mutex);
+ down_write(&p->lock);
/*if the process-device list isn't empty*/
if (kfd_has_process_device_data(p)) {
@@ -786,52 +883,180 @@ static int kfd_ioctl_get_process_apertures(struct file *filp,
(args->num_of_nodes < NUM_OF_SUPPORTED_GPUS));
}
- mutex_unlock(&p->mutex);
+ up_write(&p->lock);
return 0;
}
-static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
- void *data)
+/*
+ * Get-process-apertures (new) ioctl handler.
+ *
+ * Works as a two-step query keyed on args->num_of_nodes:
+ *
+ *  - num_of_nodes == 0 on entry: count the process-device entries of @p
+ *    and return the count in args->num_of_nodes, so that user space can
+ *    size its buffer.
+ *  - num_of_nodes != 0 on entry: collect at most that many aperture
+ *    records and copy them to the user buffer at
+ *    args->kfd_process_device_apertures_ptr; args->num_of_nodes is
+ *    updated to the number of records actually written.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary array cannot be
+ * allocated, or -EFAULT if the copy to user space fails.
+ */
+static int kfd_ioctl_get_process_apertures_new(struct file *filp,
+				struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_get_process_apertures_new_args *args = data;
+	struct kfd_process_device_apertures *pa;
+	struct kfd_process_device *pdd;
+	uint32_t nodes = 0;
+	int ret;
+
+	dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid);
+
+	if (args->num_of_nodes == 0) {
+		/*
+		 * Return number of nodes, so that user space can allocate
+		 * sufficient memory
+		 */
+		down_write(&p->lock);
+
+		if (!kfd_has_process_device_data(p)) {
+			up_write(&p->lock);
+			return 0;
+		}
+
+		/* Run over all pdd of the process */
+		pdd = kfd_get_first_process_device_data(p);
+		do {
+			args->num_of_nodes++;
+		} while ((pdd =
+			kfd_get_next_process_device_data(p, pdd)) != NULL);
+
+		up_write(&p->lock);
+		return 0;
+	}
+
+	/*
+	 * Fill in process-aperture information for all available
+	 * nodes, but not more than args->num_of_nodes as that is
+	 * the amount of memory allocated by user.
+	 *
+	 * num_of_nodes comes straight from user space, so let kcalloc()
+	 * do the overflow-checked count * size multiplication instead of
+	 * open-coding it.
+	 */
+	pa = kcalloc(args->num_of_nodes, sizeof(*pa), GFP_KERNEL);
+	if (!pa)
+		return -ENOMEM;
+
+	down_write(&p->lock);
+
+	if (!kfd_has_process_device_data(p)) {
+		up_write(&p->lock);
+		args->num_of_nodes = 0;
+		kfree(pa);
+		return 0;
+	}
+
+	/* Run over all pdd of the process */
+	pdd = kfd_get_first_process_device_data(p);
+	do {
+		pa[nodes].gpu_id = pdd->dev->id;
+		pa[nodes].lds_base = pdd->lds_base;
+		pa[nodes].lds_limit = pdd->lds_limit;
+		pa[nodes].gpuvm_base = pdd->gpuvm_base;
+		pa[nodes].gpuvm_limit = pdd->gpuvm_limit;
+		pa[nodes].scratch_base = pdd->scratch_base;
+		pa[nodes].scratch_limit = pdd->scratch_limit;
+
+		dev_dbg(kfd_device,
+			"gpu id %u\n", pdd->dev->id);
+		dev_dbg(kfd_device,
+			"lds_base %llX\n", pdd->lds_base);
+		dev_dbg(kfd_device,
+			"lds_limit %llX\n", pdd->lds_limit);
+		dev_dbg(kfd_device,
+			"gpuvm_base %llX\n", pdd->gpuvm_base);
+		dev_dbg(kfd_device,
+			"gpuvm_limit %llX\n", pdd->gpuvm_limit);
+		dev_dbg(kfd_device,
+			"scratch_base %llX\n", pdd->scratch_base);
+		dev_dbg(kfd_device,
+			"scratch_limit %llX\n", pdd->scratch_limit);
+		nodes++;
+	} while (
+		(pdd = kfd_get_next_process_device_data(p, pdd)) != NULL &&
+		(nodes < args->num_of_nodes));
+	up_write(&p->lock);
+
+	/* Report how many records were filled, then copy exactly those. */
+	args->num_of_nodes = nodes;
+	ret = copy_to_user(
+			(void __user *)args->kfd_process_device_apertures_ptr,
+			pa,
+			(nodes * sizeof(struct kfd_process_device_apertures)));
+	kfree(pa);
+	return ret ? -EFAULT : 0;
+}
+
+/*
+ * Create-event ioctl handler.
+ *
+ * When args->event_page_offset is non-zero it encodes a GPU id and a
+ * buffer handle (GET_GPU_ID / GET_IDR_HANDLE). On a dGPU the handle is
+ * translated to its buffer object under p->lock and that buffer is mapped
+ * into the kernel; the resulting kernel address is passed on to
+ * kfd_event_create(). Otherwise kern_addr stays NULL.
+ *
+ * Returns the result of kfd_event_create(), -EFAULT if the device or the
+ * buffer handle cannot be found, or the error from
+ * kfd_bind_process_to_device() if binding fails.
+ */
+static int
+kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, void *data)
{
struct kfd_ioctl_create_event_args *args = data;
- int err;
+	struct kfd_dev *kfd;
+	struct kfd_process_device *pdd;
+	int err = -EINVAL;
+	void *mem, *kern_addr = NULL;
- err = kfd_event_create(filp, p, args->event_type,
- args->auto_reset != 0, args->node_id,
- &args->event_id, &args->event_trigger_data,
- &args->event_page_offset,
- &args->event_slot_index);
+	pr_debug("amdkfd: Event page offset 0x%llx\n", args->event_page_offset);
+
+	if (args->event_page_offset) {
+		kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset));
+		if (!kfd) {
+			pr_err("amdkfd: can't find kfd device\n");
+			return -EFAULT;
+		}
+		if (KFD_IS_DGPU(kfd->device_info->asic_family)) {
+			down_write(&p->lock);
+			pdd = kfd_bind_process_to_device(kfd, p);
+			/*
+			 * IS_ERR() yields a boolean, so it must be tested
+			 * directly; comparing it with "< 0" can never be
+			 * true and would let an error pointer through.
+			 */
+			if (IS_ERR(pdd)) {
+				err = PTR_ERR(pdd);
+				up_write(&p->lock);
+				return err;
+			}
+			mem = kfd_process_device_translate_handle(pdd,
+				GET_IDR_HANDLE(args->event_page_offset));
+			if (!mem) {
+				pr_err("amdkfd: can't find BO offset is 0x%llx\n",
+						args->event_page_offset);
+				up_write(&p->lock);
+				return -EFAULT;
+			}
+			up_write(&p->lock);
+
+			/*
+			 * Map dGPU gtt BO to kernel.
+			 * NOTE(review): the result of the mapping call is not
+			 * checked here; confirm whether it can fail and leave
+			 * kern_addr NULL.
+			 */
+			kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd,
+					mem, &kern_addr);
+		}
+	}
+
+	err = kfd_event_create(filp, p,
+			args->event_type,
+			args->auto_reset != 0,
+			args->node_id,
+			&args->event_id,
+			&args->event_trigger_data,
+			&args->event_page_offset,
+			&args->event_slot_index,
+			kern_addr);
return err;
}
-static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p,
- void *data)
+static int
+kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p, void *data)
{
struct kfd_ioctl_destroy_event_args *args = data;
return kfd_event_destroy(p, args->event_id);
}
-static int kfd_ioctl_set_event(struct file *filp, struct kfd_process *p,
- void *data)
+static int
+kfd_ioctl_set_event(struct file *filp, struct kfd_process *p, void *data)
{
struct kfd_ioctl_set_event_args *args = data;
return kfd_set_event(p, args->event_id);
}
-static int kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p,
- void *data)
+static int
+kfd_ioctl_reset_event(struct file *filp, struct kfd_process *p, void *data)
{
struct kfd_ioctl_reset_event_args *args = data;
return kfd_reset_event(p, args->event_id);
}
-static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p,
- void *data)
+static int
+kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data)
{
struct kfd_ioctl_wait_events_args *args = data;
enum kfd_event_wait_result wait_result;
@@ -846,6 +1071,711 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p,
return err;
}
+/*
+ * AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH handler.
+ *
+ * Stores args->va_addr as the hidden private base of the process/device
+ * pair. When sched_policy is KFD_SCHED_POLICY_NO_HWS and the pair already
+ * has a non-zero VMID, KGD's alloc_memory_of_scratch() is called as well.
+ *
+ * Returns 0 on success, -EINVAL for a zero size or an unknown gpu_id, the
+ * error reported by kfd_bind_process_to_device() if binding fails, or
+ * -EFAULT if alloc_memory_of_scratch() fails.
+ */
+static int kfd_ioctl_alloc_scratch_memory(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_alloc_memory_of_gpu_args *args = data;
+	struct kfd_process_device *pdd;
+	struct kfd_dev *dev;
+	long err;
+
+	if (args->size == 0)
+		return -EINVAL;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (dev == NULL)
+		return -EINVAL;
+
+	down_write(&p->lock);
+
+	pdd = kfd_bind_process_to_device(dev, p);
+	/* IS_ERR() is a boolean test; comparing it with "< 0" never fires. */
+	if (IS_ERR(pdd)) {
+		err = PTR_ERR(pdd);
+		up_write(&p->lock);
+		return err;
+	}
+
+	pdd->sh_hidden_private_base_vmid = args->va_addr;
+	pdd->qpd.sh_hidden_private_base = args->va_addr;
+
+	up_write(&p->lock);
+
+	if (sched_policy == KFD_SCHED_POLICY_NO_HWS && pdd->qpd.vmid != 0) {
+		err = dev->kfd2kgd->alloc_memory_of_scratch(
+			dev->kgd, args->va_addr, pdd->qpd.vmid);
+		if (err != 0)
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * AMDKFD_IOC_ALLOC_MEMORY_OF_GPU handler.
+ *
+ * Allocates args->size bytes at args->va_addr through KGD (with allocation
+ * flags 0), registers the buffer in the per-device handle table and returns
+ * the combined gpu_id/idr handle in args->handle.
+ *
+ * Returns 0 on success, -EINVAL for a zero size or an unknown gpu_id, the
+ * bind or KGD allocation error, or -EFAULT if no handle could be created
+ * (the buffer is freed again in that case).
+ */
+static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_alloc_memory_of_gpu_args *args = data;
+	struct kfd_process_device *pdd;
+	void *mem;
+	struct kfd_dev *dev;
+	int idr_handle;
+	long err;
+
+	if (args->size == 0)
+		return -EINVAL;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (dev == NULL)
+		return -EINVAL;
+
+	down_write(&p->lock);
+	pdd = kfd_bind_process_to_device(dev, p);
+	up_write(&p->lock);
+	/* IS_ERR() is a boolean test; comparing it with "< 0" never fires. */
+	if (IS_ERR(pdd))
+		return PTR_ERR(pdd);
+
+	err = dev->kfd2kgd->alloc_memory_of_gpu(
+		dev->kgd, args->va_addr, args->size,
+		pdd->vm, (struct kgd_mem **) &mem, NULL, NULL, pdd, 0);
+	if (err != 0)
+		return err;
+
+	down_write(&p->lock);
+	idr_handle = kfd_process_device_create_obj_handle(pdd, mem,
+			args->va_addr, args->size);
+	up_write(&p->lock);
+	if (idr_handle < 0) {
+		/* Without a handle user space could never free the buffer. */
+		dev->kfd2kgd->free_memory_of_gpu(dev->kgd,
+				(struct kgd_mem *) mem);
+		return -EFAULT;
+	}
+
+	args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
+
+	return 0;
+}
+
+/*
+ * Report whether @dev is to be treated as a large-BAR device.
+ *
+ * True when the debug_largebar override is set, or when @dev is a dGPU whose
+ * local memory info shows no private memory and a non-zero public size.
+ */
+bool kfd_is_large_bar(struct kfd_dev *dev)
+{
+	struct kfd_local_mem_info local_mem;
+
+	if (debug_largebar) {
+		pr_debug("amdkfd: simulate large-bar allocation on non large-bar machine\n");
+		return true;
+	}
+
+	if (!KFD_IS_DGPU(dev->device_info->asic_family))
+		return false;
+
+	dev->kfd2kgd->get_local_mem_info(dev->kgd, &local_mem);
+
+	return local_mem.local_mem_size_private == 0 &&
+		local_mem.local_mem_size_public > 0;
+}
+
+/*
+ * Translate user-space KFD_IOC_ALLOC_MEM_FLAGS_* bits into the kernel's
+ * ALLOC_MEM_FLAGS_* bits.
+ *
+ * Exactly one memory domain is chosen, in this priority order: device
+ * (VRAM), scratch (VRAM), host (GTT), userptr. NONPAGED, EXECUTE_ACCESS and
+ * NO_SUBSTITUTE are always added; AQL_QUEUE_MEM is added when requested.
+ */
+static uint32_t kfd_convert_user_mem_alloction_flags(
+		struct kfd_dev *dev,
+		uint32_t userspace_flags)
+{
+	const uint32_t device_mask = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE |
+				     KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE;
+	const uint32_t scratch_mask = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_SCRATCH |
+				      KFD_IOC_ALLOC_MEM_FLAGS_APU_SCRATCH;
+	uint32_t kflags = 0;
+
+	if (userspace_flags & device_mask) {
+		/* Allocate VRAM bo; dGPU requests on large-BAR become public */
+		kflags = ALLOC_MEM_FLAGS_VRAM;
+		if ((userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) &&
+		    kfd_is_large_bar(dev))
+			kflags |= ALLOC_MEM_FLAGS_PUBLIC;
+	} else if (userspace_flags & scratch_mask) {
+		/*
+		 * The user space library does not currently use the scratch
+		 * allocation flag, so it is routed to VRAM.
+		 */
+		kflags = ALLOC_MEM_FLAGS_VRAM;
+	} else if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST) {
+		/*
+		 * *_HOST allocation flags are currently used for GTT memory.
+		 * Need to verify if we're node zero or we want to allocate
+		 * the bo on the public domain for P2P buffers.
+		 */
+		kflags = ALLOC_MEM_FLAGS_GTT;
+	} else if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+		/* Allocate userptr BO */
+		kflags = ALLOC_MEM_FLAGS_USERPTR;
+	}
+
+	if (userspace_flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_AQL_QUEUE_MEM)
+		kflags |= ALLOC_MEM_FLAGS_AQL_QUEUE_MEM;
+
+	/*
+	 * Current HW doesn't support non paged memory. Execute access is set
+	 * by default as the buffer might be allocated for CP's ring buffer.
+	 */
+	kflags |= ALLOC_MEM_FLAGS_NONPAGED;
+	kflags |= ALLOC_MEM_FLAGS_EXECUTE_ACCESS;
+	kflags |= ALLOC_MEM_FLAGS_NO_SUBSTITUTE;
+
+	pr_debug("amdkfd: user allocation flags 0x%x kernel allocation flags: 0x%x\n",
+			userspace_flags, kflags);
+
+	return kflags;
+}
+
+/*
+ * AMDKFD_IOC_ALLOC_MEMORY_OF_GPU_NEW handler.
+ *
+ * Like the legacy allocation ioctl, but honours args->flags (translated by
+ * kfd_convert_user_mem_alloction_flags()) and reports an mmap offset.
+ * args->mmap_offset is used as input to KGD and, on return, is 0 for dGPU
+ * device memory on a non-large-BAR device; otherwise it encodes
+ * KFD_MMAP_TYPE_MAP_BO, the gpu_id and the offset returned by KGD.
+ *
+ * Returns 0 on success, -EINVAL for a zero size or an unknown gpu_id, the
+ * bind or KGD allocation error, or -EFAULT if no handle could be created.
+ */
+static int kfd_ioctl_alloc_memory_of_gpu_new(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_alloc_memory_of_gpu_new_args *args = data;
+	struct kfd_process_device *pdd;
+	void *mem;
+	struct kfd_dev *dev;
+	int idr_handle;
+	long err;
+	uint64_t offset;
+
+	if (args->size == 0)
+		return -EINVAL;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (dev == NULL)
+		return -EINVAL;
+
+	down_write(&p->lock);
+	pdd = kfd_bind_process_to_device(dev, p);
+	up_write(&p->lock);
+	/* IS_ERR() is a boolean test; comparing it with "< 0" never fires. */
+	if (IS_ERR(pdd))
+		return PTR_ERR(pdd);
+
+	offset = args->mmap_offset;
+	err = dev->kfd2kgd->alloc_memory_of_gpu(
+		dev->kgd, args->va_addr, args->size,
+		pdd->vm, (struct kgd_mem **) &mem, &offset,
+		NULL, pdd,
+		kfd_convert_user_mem_alloction_flags(dev, args->flags));
+	if (err != 0)
+		return err;
+
+	down_write(&p->lock);
+	idr_handle = kfd_process_device_create_obj_handle(pdd, mem,
+			args->va_addr, args->size);
+	up_write(&p->lock);
+	if (idr_handle < 0) {
+		/* Without a handle user space could never free the buffer. */
+		dev->kfd2kgd->free_memory_of_gpu(dev->kgd,
+				(struct kgd_mem *) mem);
+		return -EFAULT;
+	}
+
+	args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
+	if ((args->flags & KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE) != 0 &&
+	    !kfd_is_large_bar(dev)) {
+		args->mmap_offset = 0;
+	} else {
+		args->mmap_offset = KFD_MMAP_TYPE_MAP_BO;
+		args->mmap_offset |= KFD_MMAP_GPU_ID(args->gpu_id);
+		args->mmap_offset <<= PAGE_SHIFT;
+		args->mmap_offset |= offset;
+	}
+
+	return 0;
+}
+
+/*
+ * AMDKFD_IOC_FREE_MEMORY_OF_GPU handler.
+ *
+ * Looks up the buffer object behind args->handle, runs its RDMA free
+ * callback, frees it through KGD and, only if that succeeded, drops the
+ * handle from the per-device table.
+ *
+ * Returns 0 on success, -EINVAL if the device, the process-device data or
+ * the buffer object cannot be found, or the KGD free error.
+ */
+static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_free_memory_of_gpu_args *args = data;
+	struct kfd_process_device *pdd;
+	struct kfd_bo *bo;
+	struct kfd_dev *dev;
+	int ret;
+
+	dev = kfd_device_by_id(GET_GPU_ID(args->handle));
+	if (!dev)
+		return -EINVAL;
+
+	down_write(&p->lock);
+
+	pdd = kfd_get_process_device_data(dev, p);
+	if (!pdd) {
+		pr_err("Process device data doesn't exist\n");
+		up_write(&p->lock);
+		return -EINVAL;
+	}
+
+	bo = kfd_process_device_find_bo(pdd, GET_IDR_HANDLE(args->handle));
+	if (!bo) {
+		up_write(&p->lock);
+		return -EINVAL;
+	}
+	run_rdma_free_callback(bo);
+
+	up_write(&p->lock);
+
+	ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, bo->mem);
+	if (ret != 0) {
+		/*
+		 * Freeing failed: leave the handle in place for clean-up
+		 * during process tear-down.
+		 */
+		return ret;
+	}
+
+	down_write(&p->lock);
+	kfd_process_device_remove_obj_handle(pdd,
+			GET_IDR_HANDLE(args->handle));
+	up_write(&p->lock);
+
+	return ret;
+}
+
+/*
+ * Map @mem into the VM of @pdd on @dev, flush the TLB for the process and
+ * refresh the page directory base through the device queue manager.
+ *
+ * If updating the page directory base fails, the mapping is undone.
+ * Returns 0 on success or the first error encountered.
+ */
+int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem,
+		struct kfd_process *p, struct kfd_process_device *pdd)
+{
+	struct kgd_mem *kmem = mem;
+	int ret;
+
+	BUG_ON(!dev);
+	BUG_ON(!pdd);
+
+	ret = dev->kfd2kgd->map_memory_to_gpu(dev->kgd, kmem, pdd->vm);
+	if (ret)
+		return ret;
+
+	radeon_flush_tlb(dev, p->pasid);
+
+	ret = dev->dqm->ops.set_page_directory_base(dev->dqm, &pdd->qpd);
+	if (ret)
+		dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, kmem, pdd->vm);
+
+	return ret;
+}
+
+/*
+ * AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW handler (also reached through the legacy
+ * wrapper with an empty device list).
+ *
+ * Maps the buffer behind args->handle either on every GPU listed in the
+ * user-supplied device_ids_array or, when that array is empty, on the GPU
+ * that owns the handle. The buffer size is then added to pdd->mapped_size.
+ *
+ * Returns 0 on success, -EINVAL for an unknown device or handle, -ENOMEM if
+ * the id array cannot be allocated, -EFAULT for a bad id array or an
+ * unusable peer device, the bind error, or the error of the last map
+ * attempt.
+ */
+static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_map_memory_to_gpu_new_args *args = data;
+	struct kfd_process_device *pdd, *peer_pdd;
+	void *mem;
+	struct kfd_dev *dev, *peer;
+	long err = 0;
+	int i, num_dev;
+	uint32_t *devices_arr = NULL;
+	int bo_size;
+
+	dev = kfd_device_by_id(GET_GPU_ID(args->handle));
+	if (dev == NULL)
+		return -EINVAL;
+
+	if (args->device_ids_array_size > 0 &&
+	    (args->device_ids_array_size < sizeof(uint32_t))) {
+		pr_err("amdkfd: err node IDs array size %u\n",
+				args->device_ids_array_size);
+		return -EFAULT;
+	}
+
+	if (args->device_ids_array_size > 0) {
+		devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL);
+		if (!devices_arr)
+			return -ENOMEM;
+
+		if (copy_from_user(devices_arr,
+				(void __user *)args->device_ids_array,
+				args->device_ids_array_size) != 0) {
+			err = -EFAULT;
+			goto out_free;
+		}
+	}
+
+	down_write(&p->lock);
+
+	pdd = kfd_bind_process_to_device(dev, p);
+	/* IS_ERR() is a boolean test; comparing it with "< 0" never fires. */
+	if (IS_ERR(pdd)) {
+		err = PTR_ERR(pdd);
+		up_write(&p->lock);
+		goto out_free;
+	}
+
+	mem = kfd_process_device_translate_handle(pdd,
+			GET_IDR_HANDLE(args->handle));
+	up_write(&p->lock);
+
+	if (mem == NULL) {
+		/* PTR_ERR(NULL) is 0 and would report success to the caller. */
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	if (args->device_ids_array_size > 0) {
+		num_dev = args->device_ids_array_size / sizeof(uint32_t);
+		for (i = 0 ; i < num_dev; i++) {
+			peer = kfd_device_by_id(devices_arr[i]);
+			if (!peer) {
+				pr_err("amdkfd: didn't found kfd-dev for 0x%x\n",
+						devices_arr[i]);
+				err = -EFAULT;
+				goto out_free;
+			}
+			down_write(&p->lock);
+			peer_pdd = kfd_bind_process_to_device(peer, p);
+			up_write(&p->lock);
+			/* The bind helper reports failure via ERR_PTR(). */
+			if (IS_ERR_OR_NULL(peer_pdd)) {
+				err = -EFAULT;
+				goto out_free;
+			}
+			err = kfd_map_memory_to_gpu(peer, mem, p, peer_pdd);
+			if (err != 0)
+				pr_err("amdkfd: failed to map\n");
+		}
+	} else {
+		err = kfd_map_memory_to_gpu(dev, mem, p, pdd);
+		if (err != 0)
+			pr_err("amdkfd: failed to map\n");
+	}
+
+	/*
+	 * NOTE(review): the size is accounted even when a map attempt failed,
+	 * as in the original code; confirm whether that is intended.
+	 */
+	bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem);
+	down_write(&p->lock);
+	pdd->mapped_size += bo_size;
+	up_write(&p->lock);
+
+out_free:
+	/* kfree(NULL) is a no-op, so one exit path serves every case. */
+	kfree(devices_arr);
+	return err;
+}
+
+/*
+ * Legacy AMDKFD_IOC_MAP_MEMORY_TO_GPU entry point: repackages the handle
+ * into the new argument layout with an empty device list and delegates to
+ * kfd_ioctl_map_memory_to_gpu().
+ */
+static int kfd_ioctl_map_memory_to_gpu_wrapper(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_map_memory_to_gpu_args *legacy = data;
+	struct kfd_ioctl_map_memory_to_gpu_new_args fwd;
+
+	fwd.device_ids_array_size = 0;
+	fwd.device_ids_array = NULL;
+	fwd.handle = legacy->handle;
+
+	return kfd_ioctl_map_memory_to_gpu(filep, p, &fwd);
+}
+
+/*
+ * AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW handler (also reached through the
+ * legacy wrapper with an empty device list).
+ *
+ * Unmaps the buffer behind args->handle either from every GPU listed in the
+ * user-supplied device_ids_array or, when that array is empty, from the GPU
+ * that owns the handle, flushing the TLB each time. The buffer size is then
+ * subtracted from pdd->mapped_size.
+ *
+ * Returns 0 on success, -EINVAL for an unknown device, missing
+ * process-device data or an unknown handle, -ENOMEM if the id array cannot
+ * be allocated, or -EFAULT for a bad id array or an unusable peer device.
+ */
+static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_unmap_memory_from_gpu_new_args *args = data;
+	struct kfd_process_device *pdd, *peer_pdd;
+	void *mem;
+	struct kfd_dev *dev, *peer;
+	long err = 0;
+	uint32_t *devices_arr = NULL;
+	uint32_t num_dev, i;
+	int bo_size;
+
+	dev = kfd_device_by_id(GET_GPU_ID(args->handle));
+	if (dev == NULL)
+		return -EINVAL;
+
+	if (args->device_ids_array_size > 0 &&
+	    (args->device_ids_array_size < sizeof(uint32_t))) {
+		pr_err("amdkfd: err node IDs array size %u\n",
+				args->device_ids_array_size);
+		return -EFAULT;
+	}
+
+	if (args->device_ids_array_size > 0) {
+		devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL);
+		if (!devices_arr)
+			return -ENOMEM;
+
+		if (copy_from_user(devices_arr,
+				(void __user *)args->device_ids_array,
+				args->device_ids_array_size) != 0) {
+			err = -EFAULT;
+			goto out_free;
+		}
+	}
+
+	down_write(&p->lock);
+
+	pdd = kfd_get_process_device_data(dev, p);
+	if (!pdd) {
+		pr_err("Process device data doesn't exist\n");
+		/* PTR_ERR(NULL) is 0 and would report success to the caller. */
+		err = -EINVAL;
+		up_write(&p->lock);
+		goto out_free;
+	}
+
+	mem = kfd_process_device_translate_handle(pdd,
+			GET_IDR_HANDLE(args->handle));
+	up_write(&p->lock);
+
+	if (mem == NULL) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	if (args->device_ids_array_size > 0) {
+		num_dev = args->device_ids_array_size / sizeof(uint32_t);
+		for (i = 0 ; i < num_dev; i++) {
+			peer = kfd_device_by_id(devices_arr[i]);
+			if (!peer) {
+				err = -EFAULT;
+				goto out_free;
+			}
+			down_write(&p->lock);
+			peer_pdd = kfd_get_process_device_data(peer, p);
+			up_write(&p->lock);
+			if (!peer_pdd) {
+				err = -EFAULT;
+				goto out_free;
+			}
+			peer->kfd2kgd->unmap_memory_to_gpu(peer->kgd,
+					mem, peer_pdd->vm);
+			radeon_flush_tlb(peer, p->pasid);
+		}
+	} else {
+		dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, mem, pdd->vm);
+		radeon_flush_tlb(dev, p->pasid);
+	}
+
+	bo_size = dev->kfd2kgd->return_bo_size(dev->kgd, mem);
+	down_write(&p->lock);
+	pdd->mapped_size -= bo_size;
+	up_write(&p->lock);
+
+out_free:
+	/* Also reached on success so the id array is never leaked. */
+	kfree(devices_arr);
+	return err;
+}
+
+/*
+ * Legacy AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU entry point: repackages the handle
+ * into the new argument layout with an empty device list and delegates to
+ * kfd_ioctl_unmap_memory_from_gpu().
+ */
+static int kfd_ioctl_unmap_memory_from_gpu_wrapper(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_unmap_memory_from_gpu_args *legacy = data;
+	struct kfd_ioctl_unmap_memory_from_gpu_new_args fwd;
+
+	fwd.device_ids_array_size = 0;
+	fwd.device_ids_array = NULL;
+	fwd.handle = legacy->handle;
+
+	return kfd_ioctl_unmap_memory_from_gpu(filep, p, &fwd);
+}
+
+/*
+ * AMDKFD_IOC_OPEN_GRAPHIC_HANDLE handler (CHIP_KAVERI only).
+ *
+ * Asks KGD to open the graphics buffer identified by
+ * args->graphic_device_fd / args->graphic_handle at args->va_addr, then
+ * registers it in the per-device handle table and returns the combined
+ * gpu_id/idr handle in args->handle.
+ *
+ * Returns 0 on success, -EINVAL for an unknown gpu_id or a non-Kaveri
+ * device, the bind or KGD error, or -EFAULT if no handle could be created.
+ */
+static int kfd_ioctl_open_graphic_handle(struct file *filep,
+		struct kfd_process *p,
+		void *data)
+{
+	struct kfd_ioctl_open_graphic_handle_args *args = data;
+	struct kfd_dev *dev;
+	struct kfd_process_device *pdd;
+	void *mem;
+	int idr_handle;
+	long err;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (dev == NULL)
+		return -EINVAL;
+
+	if (dev->device_info->asic_family != CHIP_KAVERI) {
+		pr_debug("kfd_ioctl_open_graphic_handle only supported on KV\n");
+		return -EINVAL;
+	}
+
+	down_write(&p->lock);
+	pdd = kfd_bind_process_to_device(dev, p);
+	up_write(&p->lock);
+	/* IS_ERR() is a boolean test; comparing it with "< 0" never fires. */
+	if (IS_ERR(pdd))
+		return PTR_ERR(pdd);
+
+	err = dev->kfd2kgd->open_graphic_handle(dev->kgd,
+			args->va_addr,
+			(struct kgd_vm *) pdd->vm,
+			args->graphic_device_fd,
+			args->graphic_handle,
+			(struct kgd_mem **) &mem);
+	if (err != 0)
+		return err;
+
+	down_write(&p->lock);
+	/*
+	 * TODO: When open_graphic_handle is implemented, we need to create
+	 * the corresponding interval tree. We need to know the size of
+	 * the buffer through open_graphic_handle(). We use 1 for now.
+	 */
+	idr_handle = kfd_process_device_create_obj_handle(pdd, mem,
+			args->va_addr, 1);
+	up_write(&p->lock);
+	if (idr_handle < 0) {
+		/*
+		 * FIXME: destroy_process_gpumem doesn't seem to be
+		 * implemented anywhere
+		 */
+		dev->kfd2kgd->destroy_process_gpumem(dev->kgd, mem);
+		return -EFAULT;
+	}
+
+	args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
+
+	return 0;
+}
+
+/*
+ * AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE handler.
+ *
+ * Binds @p to the device named by args->gpu_id and passes args->dgpu_base
+ * and args->dgpu_limit to kfd_set_process_dgpu_aperture(), all under
+ * p->lock.
+ *
+ * Returns -EINVAL for an unknown gpu_id, the bind error, or whatever
+ * kfd_set_process_dgpu_aperture() returns.
+ */
+static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_set_process_dgpu_aperture_args *args = data;
+	struct kfd_dev *dev;
+	struct kfd_process_device *pdd;
+	long err;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (dev == NULL)
+		return -EINVAL;
+
+	down_write(&p->lock);
+
+	pdd = kfd_bind_process_to_device(dev, p);
+	/* IS_ERR() is a boolean test; comparing it with "< 0" never fires. */
+	if (IS_ERR(pdd)) {
+		err = PTR_ERR(pdd);
+		goto exit;
+	}
+
+	err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base,
+			args->dgpu_limit);
+
+exit:
+	up_write(&p->lock);
+	return err;
+}
+
+/*
+ * AMDKFD_IOC_GET_DMABUF_INFO handler.
+ *
+ * Queries KGD about the dma-buf behind args->dmabuf_fd and fills in
+ * args->size, args->metadata_size, args->gpu_id and args->flags. When
+ * args->metadata_ptr is set, the metadata is staged in a kernel buffer and
+ * copied out to that user pointer.
+ *
+ * Returns 0 on success, -EINVAL if no suitable device is found or the
+ * owning KGD device is not a KFD device, -ENOMEM if the staging buffer
+ * cannot be allocated, -EFAULT if the copy to user space fails, or the
+ * error from get_dmabuf_info().
+ */
+static int kfd_ioctl_get_dmabuf_info(struct file *filep,
+ struct kfd_process *p, void *data)
+{
+ struct kfd_ioctl_get_dmabuf_info_args *args = data;
+ struct kfd_dev *dev = NULL;
+ struct kgd_dev *dma_buf_kgd;
+ void *metadata_buffer = NULL;
+ uint32_t flags;
+ unsigned i;
+ int r;
+
+ /* Find a KFD GPU device that supports the get_dmabuf_info query */
+ /* NOTE(review): relies on kfd_topology_enum_kfd_devices() leaving dev
+ * NULL once the index runs past the last device - confirm. */
+ for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++)
+ if (dev && dev->kfd2kgd->get_dmabuf_info)
+ break;
+ if (!dev)
+ return -EINVAL;
+
+ /* NOTE(review): metadata_size comes straight from the ioctl arguments
+ * and is not bounded before sizing this allocation - confirm. */
+ if (args->metadata_ptr) {
+ metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL);
+ if (!metadata_buffer)
+ return -ENOMEM;
+ }
+
+ /* Get dmabuf info from KGD */
+ /* args->metadata_size is passed by value as the buffer size and by
+ * address as an output, so it may be overwritten by this call. */
+ r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd,
+ &dma_buf_kgd, &args->size,
+ metadata_buffer, args->metadata_size,
+ &args->metadata_size, &flags);
+ if (r)
+ goto exit;
+
+ /* Reverse-lookup gpu_id from kgd pointer */
+ dev = kfd_device_by_kgd(dma_buf_kgd);
+ if (!dev) {
+ r = -EINVAL;
+ goto exit;
+ }
+ args->gpu_id = kfd_get_gpu_id(dev);
+
+ /* Translate flags */
+ if (flags & ALLOC_MEM_FLAGS_VRAM) {
+ args->flags = KFD_IS_DGPU(dev->device_info->asic_family) ?
+ KFD_IOC_ALLOC_MEM_FLAGS_DGPU_DEVICE :
+ KFD_IOC_ALLOC_MEM_FLAGS_APU_DEVICE;
+ } else
+ args->flags = KFD_IOC_ALLOC_MEM_FLAGS_DGPU_HOST;
+
+ /* Copy metadata buffer to user mode */
+ /* NOTE(review): the length used here is the value written back by
+ * get_dmabuf_info(); presumably it never exceeds the allocated buffer
+ * - verify against the KGD implementation. */
+ if (metadata_buffer) {
+ r = copy_to_user((void __user *)args->metadata_ptr,
+ metadata_buffer, args->metadata_size);
+ if (r != 0)
+ r = -EFAULT;
+ }
+
+exit:
+ kfree(metadata_buffer);
+
+ return r;
+}
+
+/*
+ * AMDKFD_IOC_IMPORT_DMABUF handler.
+ *
+ * Imports the dma-buf behind args->dmabuf_fd at args->va_addr through KGD,
+ * registers the resulting buffer in the per-device handle table and returns
+ * the combined gpu_id/idr handle in args->handle.
+ *
+ * Returns 0 on success, -EINVAL if the device is unknown or lacks
+ * import_dmabuf, the bind or KGD import error, or -EFAULT if no handle
+ * could be created (the imported buffer is freed again in that case).
+ */
+static int kfd_ioctl_import_dmabuf(struct file *filep,
+		struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_import_dmabuf_args *args = data;
+	struct kfd_dev *dev;
+	struct kfd_process_device *pdd;
+	void *mem;
+	uint64_t size;
+	int idr_handle;
+	int r;
+
+	dev = kfd_device_by_id(args->gpu_id);
+	if (!dev || !dev->kfd2kgd->import_dmabuf)
+		return -EINVAL;
+
+	down_write(&p->lock);
+	pdd = kfd_bind_process_to_device(dev, p);
+	up_write(&p->lock);
+	/* IS_ERR() is a boolean test; comparing it with "< 0" never fires. */
+	if (IS_ERR(pdd))
+		return PTR_ERR(pdd);
+
+	r = dev->kfd2kgd->import_dmabuf(dev->kgd, args->dmabuf_fd,
+			args->va_addr, pdd->vm,
+			(struct kgd_mem **)&mem, &size);
+	if (r)
+		return r;
+
+	down_write(&p->lock);
+	idr_handle = kfd_process_device_create_obj_handle(pdd, mem,
+			args->va_addr, size);
+	up_write(&p->lock);
+	if (idr_handle < 0) {
+		/* Without a handle user space could never free the buffer. */
+		dev->kfd2kgd->free_memory_of_gpu(dev->kgd,
+				(struct kgd_mem *)mem);
+		return -EFAULT;
+	}
+
+	args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
+
+	return 0;
+}
+/*
+ * Build one ioctl table entry, placed at index _IOC_NR(ioctl) via a
+ * designated initializer. .name is the stringified ioctl macro name and
+ * .cmd_drv is always 0.
+ */
#define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0, .name = #ioctl}
@@ -899,10 +1829,65 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL,
kfd_ioctl_dbg_wave_control, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU,
+ kfd_ioctl_alloc_memory_of_gpu, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU,
+ kfd_ioctl_free_memory_of_gpu, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU,
+ kfd_ioctl_map_memory_to_gpu_wrapper, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU,
+ kfd_ioctl_unmap_memory_from_gpu_wrapper, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_OPEN_GRAPHIC_HANDLE,
+ kfd_ioctl_open_graphic_handle, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH,
+ kfd_ioctl_alloc_scratch_memory, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK,
+ kfd_ioctl_set_cu_mask, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE,
+ kfd_ioctl_set_process_dgpu_aperture, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER,
+ kfd_ioctl_set_trap_handler, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU_NEW,
+ kfd_ioctl_alloc_memory_of_gpu_new, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU_NEW,
+ kfd_ioctl_map_memory_to_gpu, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU_NEW,
+ kfd_ioctl_unmap_memory_from_gpu, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
+ kfd_ioctl_get_process_apertures_new, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_EVICT_MEMORY,
+ kfd_evict, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO,
+ kfd_ioctl_get_dmabuf_info, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF,
+ kfd_ioctl_import_dmabuf, 0)
};
#define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
+/* AMDKFD_IOC_EVICT_MEMORY handler: hands the request to evict_size(). */
+static int kfd_evict(struct file *filep, struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_eviction_args *evict_args = data;
+
+	return evict_size(p, evict_args->size, evict_args->type);
+}
static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
{
struct kfd_process *process;
@@ -994,20 +1979,37 @@ err_i1:
+/*
+ * mmap entry point: dispatches on the mapping type encoded in
+ * vma->vm_pgoff (doorbell, events, BO or reserved memory). Returns the
+ * error from kfd_get_process() if the current process has no KFD process,
+ * and -EFAULT for an unknown mapping type or an unknown GPU id.
+ */
static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct kfd_process *process;
+ struct kfd_dev *kfd;
+ unsigned long vm_pgoff;
+ int retval;
process = kfd_get_process(current);
if (IS_ERR(process))
return PTR_ERR(process);
- if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) ==
- KFD_MMAP_DOORBELL_MASK) {
- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK;
+ /* Keep the full encoded offset for type/GPU-id decoding; the VMA itself
+ * is handed on with only KFD_MMAP_OFFSET_VALUE_GET() of it. */
+ vm_pgoff = vma->vm_pgoff;
+ vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff);
+
+ switch (vm_pgoff & KFD_MMAP_TYPE_MASK) {
+ case KFD_MMAP_TYPE_DOORBELL:
return kfd_doorbell_mmap(process, vma);
- } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) ==
- KFD_MMAP_EVENTS_MASK) {
- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK;
+
+ case KFD_MMAP_TYPE_EVENTS:
return kfd_event_mmap(process, vma);
+
+ case KFD_MMAP_TYPE_MAP_BO:
+ kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff));
+ if (!kfd)
+ return -EFAULT;
+ retval = kfd->kfd2kgd->mmap_bo(kfd->kgd, vma);
+ return retval;
+
+ case KFD_MMAP_TYPE_RESERVED_MEM:
+ return kfd_reserved_mem_mmap(process, vma);
+
}
return -EFAULT;
}
+
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
new file mode 100644
index 000000000000..b3d4a506b0e6
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -0,0 +1,1163 @@
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/mm.h>
+#include <linux/amd-iommu.h>
+#include <linux/pci.h>
+#include "kfd_crat.h"
+#include "kfd_priv.h"
+#include "kfd_topology.h"
+
+/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
+ * GPU processor ID are expressed with Bit[31]=1.
+ * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs
+ * used in the CRAT. */
+static uint32_t gpu_processor_id_low = 0x80001000;
+
+/*
+ * Return the next available gpu_processor_id and advance the allocator past
+ * the compute units of this GPU.
+ * @total_cu_count - Total CUs present in the GPU including ones masked off
+ */
+static inline unsigned int get_and_inc_gpu_processor_id(
+		unsigned int total_cu_count)
+{
+	/* IDs carry bit 31, so they must not pass through a signed int. */
+	unsigned int current_id = gpu_processor_id_low;
+
+	gpu_processor_id_low += total_cu_count;
+	return current_id;
+}
+
+/* One entry of a static, per-ASIC table describing a GPU cache. */
+struct kfd_gpu_cache_info {
+ /* Cache size; units are not stated here - presumably KB, TODO confirm */
+ uint32_t cache_size;
+ /* Cache level (the tables in this file only use level 1) */
+ uint32_t cache_level;
+ /* Bitwise OR of CRAT_CACHE_FLAGS_* values */
+ uint32_t flags;
+ /* Indicates how many Compute Units share this cache.
+ * Value = 1 indicates the cache is not shared. */
+ uint32_t num_cu_shared;
+};
+
+/* GPU cache description table for Kaveri (L1 caches only so far). */
+static struct kfd_gpu_cache_info kaveri_cache_info[] = {
+	{
+		/* TCP L1 Cache per CU */
+		.cache_size = 16,
+		.cache_level = 1,
+		.flags = CRAT_CACHE_FLAGS_ENABLED |
+			 CRAT_CACHE_FLAGS_DATA_CACHE |
+			 CRAT_CACHE_FLAGS_SIMD_CACHE,
+		.num_cu_shared = 1,
+	},
+	{
+		/* Scalar L1 Instruction Cache (in SQC module) per bank */
+		.cache_size = 16,
+		.cache_level = 1,
+		.flags = CRAT_CACHE_FLAGS_ENABLED |
+			 CRAT_CACHE_FLAGS_INST_CACHE |
+			 CRAT_CACHE_FLAGS_SIMD_CACHE,
+		.num_cu_shared = 2,
+	},
+	{
+		/* Scalar L1 Data Cache (in SQC module) per bank */
+		.cache_size = 8,
+		.cache_level = 1,
+		.flags = CRAT_CACHE_FLAGS_ENABLED |
+			 CRAT_CACHE_FLAGS_DATA_CACHE |
+			 CRAT_CACHE_FLAGS_SIMD_CACHE,
+		.num_cu_shared = 2,
+	},
+
+	/* TODO: Add L2 Cache information */
+};
+
+
+/* GPU cache description table for Carrizo (L1 caches only so far). */
+static struct kfd_gpu_cache_info carrizo_cache_info[] = {
+	{
+		/* TCP L1 Cache per CU */
+		.cache_size = 16,
+		.cache_level = 1,
+		.flags = CRAT_CACHE_FLAGS_ENABLED |
+			 CRAT_CACHE_FLAGS_DATA_CACHE |
+			 CRAT_CACHE_FLAGS_SIMD_CACHE,
+		.num_cu_shared = 1,
+	},
+	{
+		/* Scalar L1 Instruction Cache (in SQC module) per bank */
+		.cache_size = 8,
+		.cache_level = 1,
+		.flags = CRAT_CACHE_FLAGS_ENABLED |
+			 CRAT_CACHE_FLAGS_INST_CACHE |
+			 CRAT_CACHE_FLAGS_SIMD_CACHE,
+		.num_cu_shared = 4,
+	},
+	{
+		/* Scalar L1 Data Cache (in SQC module) per bank. */
+		.cache_size = 4,
+		.cache_level = 1,
+		.flags = CRAT_CACHE_FLAGS_ENABLED |
+			 CRAT_CACHE_FLAGS_DATA_CACHE |
+			 CRAT_CACHE_FLAGS_SIMD_CACHE,
+		.num_cu_shared = 4,
+	},
+
+	/* TODO: Add L2 Cache information */
+};
+
+/* NOTE: In future if more information is added to struct kfd_gpu_cache_info
+ * the following ASICs may need a separate table. */
+#define tonga_cache_info carrizo_cache_info
+#define fiji_cache_info carrizo_cache_info
+
+/*
+ * Copy the CPU-related fields of a CRAT compute-unit entry into the node
+ * properties of @dev, and flag ATS support when the entry reports an IOMMU.
+ */
+static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
+		struct crat_subtype_computeunit *cu)
+{
+	BUG_ON(!dev);
+	BUG_ON(!cu);
+
+	if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
+		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
+
+	dev->node_props.cpu_core_id_base = cu->processor_id_low;
+	dev->node_props.cpu_cores_count = cu->num_cpu_cores;
+
+	pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
+			cu->processor_id_low);
+}
+
+/*
+ * Copy the GPU-related fields of a CRAT compute-unit entry into the node
+ * properties of @dev, and flag hot-plug support when the entry reports it.
+ */
+static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
+		struct crat_subtype_computeunit *cu)
+{
+	BUG_ON(!dev);
+	BUG_ON(!cu);
+
+	if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
+		dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
+
+	/* SIMD identification */
+	dev->node_props.simd_id_base = cu->processor_id_low;
+	dev->node_props.simd_count = cu->num_simd_cores;
+	dev->node_props.simd_per_cu = cu->num_simd_per_cu;
+
+	/* Array layout */
+	dev->node_props.array_count = cu->array_count;
+	dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
+
+	/* Per-CU / per-SIMD resources */
+	dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
+	dev->node_props.max_waves_per_simd = cu->max_waves_simd;
+	dev->node_props.wave_front_size = cu->wave_front_size;
+	dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
+
+	pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);
+}
+
+/* kfd_parse_subtype_cu - apply a CRAT compute-unit entry to the topology
+ * device in device_list whose proximity domain matches the entry.
+ * Entries without a matching device are ignored. Always returns 0.
+ */
+static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,
+		struct list_head *device_list)
+{
+	struct kfd_topology_device *dev;
+
+	BUG_ON(!cu);
+
+	pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
+			cu->proximity_domain, cu->hsa_capability);
+
+	list_for_each_entry(dev, device_list, list) {
+		if (cu->proximity_domain != dev->proximity_domain)
+			continue;
+
+		/* One entry may describe both a CPU and a GPU part. */
+		if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
+			kfd_populated_cu_info_cpu(dev, cu);
+		if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
+			kfd_populated_cu_info_gpu(dev, cu);
+
+		return 0;
+	}
+
+	return 0;
+}
+
+/* kfd_parse_subtype_mem - turn a CRAT memory entry into a memory bank on
+ * the topology device in device_list whose proximity domain matches.
+ * Returns 0 (also when no device matches) or -ENOMEM.
+ */
+static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,
+		struct list_head *device_list)
+{
+	struct kfd_mem_properties *props;
+	struct kfd_topology_device *dev;
+
+	BUG_ON(!mem);
+
+	pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
+			mem->proximity_domain);
+
+	list_for_each_entry(dev, device_list, list) {
+		if (mem->proximity_domain != dev->proximity_domain)
+			continue;
+
+		props = kfd_alloc_struct(props);
+		if (props == NULL)
+			return -ENOMEM;
+
+		if (dev->node_props.cpu_cores_count != 0) {
+			/* Node has CPU cores: system memory */
+			props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;
+		} else if (mem->visibility_type == 0) {
+			/* GPU node, APU */
+			props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE;
+		} else {
+			/* GPU node, dGPU */
+			props->heap_type = mem->visibility_type;
+		}
+
+		if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
+			props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
+		if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
+			props->flags |= HSA_MEM_FLAGS_NON_VOLATILE;
+
+		props->width = mem->width;
+		props->size_in_bytes =
+			((uint64_t)mem->length_high << 32) + mem->length_low;
+
+		dev->node_props.mem_banks_count++;
+		list_add_tail(&props->list, &dev->mem_props);
+
+		return 0;
+	}
+
+	return 0;
+}
+
+/* kfd_parse_subtype_cache - turn a CRAT cache entry into a cache on the
+ * first topology device in device_list whose CPU core id range or SIMD id
+ * range contains the entry's processor_id_low.
+ * Returns 0 (also when no device matches) or -ENOMEM.
+ */
+static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
+		struct list_head *device_list)
+{
+	struct kfd_cache_properties *props;
+	struct kfd_topology_device *dev;
+	uint32_t id;
+	uint32_t total_num_of_cu;
+	bool in_cpu_range, in_simd_range;
+
+	BUG_ON(!cache);
+
+	id = cache->processor_id_low;
+
+	list_for_each_entry(dev, device_list, list) {
+		total_num_of_cu = (dev->node_props.array_count *
+				   dev->node_props.cu_per_simd_array);
+
+		/* Cache information in CRAT doesn't have proximity_domain
+		 * information as it is associated with a CPU core or GPU
+		 * Compute Unit. So map the cache using CPU core Id or SIMD
+		 * (GPU) ID.
+		 * TODO: This works because currently we can safely assume that
+		 * Compute Units are parsed before caches are parsed. In future
+		 * remove this dependency
+		 *
+		 * NOTE(review): the CPU range test is inclusive at the upper
+		 * end ("<=") while the SIMD one is exclusive ("<"); confirm
+		 * this asymmetry is intended.
+		 */
+		in_cpu_range = id >= dev->node_props.cpu_core_id_base &&
+			       id <= dev->node_props.cpu_core_id_base +
+				     dev->node_props.cpu_cores_count;
+		in_simd_range = id >= dev->node_props.simd_id_base &&
+				id < dev->node_props.simd_id_base +
+				     total_num_of_cu;
+		if (!in_cpu_range && !in_simd_range)
+			continue;
+
+		props = kfd_alloc_struct(props);
+		if (props == NULL)
+			return -ENOMEM;
+
+		props->processor_id_low = id;
+		props->cache_level = cache->cache_level;
+		props->cache_size = cache->cache_size;
+		props->cacheline_size = cache->cache_line_size;
+		props->cachelines_per_tag = cache->lines_per_tag;
+		props->cache_assoc = cache->associativity;
+		props->cache_latency = cache->cache_latency;
+		memcpy(props->sibling_map, cache->sibling_map,
+				sizeof(props->sibling_map));
+
+		/* Translate CRAT cache flags into HSA cache type bits. */
+		if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+			props->cache_type |= HSA_CACHE_TYPE_DATA;
+		if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
+			props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
+		if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
+			props->cache_type |= HSA_CACHE_TYPE_CPU;
+		if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
+			props->cache_type |= HSA_CACHE_TYPE_HSACU;
+
+		dev->cache_count++;
+		dev->node_props.caches_count++;
+		list_add_tail(&props->list, &dev->cache_props);
+
+		return 0;
+	}
+
+	return 0;
+}
+
+/* kfd_parse_subtype_iolink - turn a CRAT IO link entry into an io link on
+ * the topology device in device_list whose proximity domain equals the
+ * entry's proximity_domain_from.
+ * Returns 0 (also when no device matches) or -ENOMEM.
+ */
+static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
+		struct list_head *device_list)
+{
+	struct kfd_iolink_properties *props;
+	struct kfd_topology_device *dev;
+	uint32_t id_from;
+	uint32_t id_to;
+
+	BUG_ON(!iolink);
+
+	id_from = iolink->proximity_domain_from;
+	id_to = iolink->proximity_domain_to;
+
+	pr_debug("Found IO link entry in CRAT table with id_from=%d\n", id_from);
+	list_for_each_entry(dev, device_list, list) {
+		if (id_from != dev->proximity_domain)
+			continue;
+
+		props = kfd_alloc_struct(props);
+		if (props == NULL)
+			return -ENOMEM;
+
+		props->node_from = id_from;
+		props->node_to = id_to;
+		props->ver_maj = iolink->version_major;
+		props->ver_min = iolink->version_minor;
+		props->iolink_type = iolink->io_interface_type;
+
+		/*
+		 * weight factor (derived from CDIR), currently always 1
+		 */
+		props->weight = 1;
+
+		props->min_latency = iolink->minimum_latency;
+		props->max_latency = iolink->maximum_latency;
+		props->min_bandwidth = iolink->minimum_bandwidth_mbs;
+		props->max_bandwidth = iolink->maximum_bandwidth_mbs;
+		props->rec_transfer_size =
+				iolink->recommended_transfer_size;
+
+		dev->io_link_count++;
+		dev->node_props.io_links_count++;
+		list_add_tail(&props->list, &dev->io_link_props);
+
+		break;
+	}
+
+	return 0;
+}
+
+/* kfd_parse_subtype - dispatch one CRAT subtype entry to its parser
+ * @sub_type_hdr - subtype section of crat_image
+ * @device_list - list of topology devices present in this crat_image
+ *
+ * TLB and CCOMPUTE entries are only logged; unknown types produce a
+ * warning. Returns the parser's result, or 0 for entries that are not
+ * processed.
+ */
+static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
+		struct list_head *device_list)
+{
+	BUG_ON(!sub_type_hdr);
+
+	switch (sub_type_hdr->type) {
+	case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
+		return kfd_parse_subtype_cu(
+			(struct crat_subtype_computeunit *)sub_type_hdr,
+			device_list);
+	case CRAT_SUBTYPE_MEMORY_AFFINITY:
+		return kfd_parse_subtype_mem(
+			(struct crat_subtype_memory *)sub_type_hdr,
+			device_list);
+	case CRAT_SUBTYPE_CACHE_AFFINITY:
+		return kfd_parse_subtype_cache(
+			(struct crat_subtype_cache *)sub_type_hdr,
+			device_list);
+	case CRAT_SUBTYPE_IOLINK_AFFINITY:
+		return kfd_parse_subtype_iolink(
+			(struct crat_subtype_iolink *)sub_type_hdr,
+			device_list);
+	case CRAT_SUBTYPE_TLB_AFFINITY:
+		/* For now, nothing to do here */
+		pr_debug("Found TLB entry in CRAT table (not processing)\n");
+		return 0;
+	case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
+		/* For now, nothing to do here */
+		pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
+		return 0;
+	default:
+		pr_warn("Unknown subtype (%d) in CRAT\n",
+				sub_type_hdr->type);
+		return 0;
+	}
+}
+
+/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT
+ *	create a kfd_topology_device and add it to device_list. Also parse
+ *	CRAT subtypes and attach them to the appropriate kfd_topology_device
+ * @crat_image - input image containing CRAT
+ * @device_list - [OUT] list of kfd_topology_device generated after parsing
+ *		crat_image
+ * @proximity_domain - Proximity domain of the first device in the table;
+ *		each further node gets the next consecutive value
+ *
+ * The OEM id, OEM table id and OEM revision of the CRAT header are copied
+ * into the last topology device that was created.
+ *
+ * Return - 0 if successful else -ve value: -EINVAL for a NULL image,
+ *	-ENOMEM if no topology device is available after the creation loop
+ *	(allocation failure or a table with zero domains), or the error
+ *	reported by kfd_parse_subtype()
+ */
+int kfd_parse_crat_table(void *crat_image,
+			struct list_head *device_list,
+			uint32_t proximity_domain)
+{
+	struct kfd_topology_device *top_dev = NULL;
+	struct crat_subtype_generic *sub_type_hdr;
+	uint16_t node_id;
+	int ret;
+	struct crat_header *crat_table = (struct crat_header *)crat_image;
+	uint16_t num_nodes;
+	uint32_t image_len;
+	uint32_t last_header_type, last_header_length;
+
+	if (!crat_image)
+		return -EINVAL;
+
+	if (!list_empty(device_list))
+		pr_warn("Error device list should be empty\n");
+
+	num_nodes = crat_table->num_domains;
+	image_len = crat_table->length;
+
+	pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
+
+	for (node_id = 0; node_id < num_nodes; node_id++) {
+		top_dev = kfd_create_topology_device(device_list);
+		if (!top_dev)
+			break;
+		top_dev->proximity_domain = proximity_domain++;
+	}
+
+	if (!top_dev)
+		return -ENOMEM;
+
+	memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
+	memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
+			CRAT_OEMTABLEID_LENGTH);
+	top_dev->oem_revision = crat_table->oem_revision;
+
+	last_header_type = last_header_length = 0;
+	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
+	while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
+			((char *)crat_image) + image_len) {
+		pr_debug("kfd parsing crat sub type header %p enabled: %s type: 0x%x length %d\n",
+				sub_type_hdr,
+				(sub_type_hdr->flags &
+					CRAT_SUBTYPE_FLAGS_ENABLED)
+					? "true" : "false",
+				sub_type_hdr->type,
+				sub_type_hdr->length);
+
+		/* A zero length would never advance the cursor; stop parsing
+		 * and report the previous entry to help locate the corruption.
+		 */
+		if (sub_type_hdr->length == 0) {
+			pr_err("amdkfd: Parsing wrong CRAT's sub header last header type: %d last header len %d\n",
+				last_header_type, last_header_length);
+			pr_err("amdkfd: Current header type %d length %d\n",
+				sub_type_hdr->type, sub_type_hdr->length);
+			break;
+		}
+
+		/* Disabled entries are skipped but still stepped over */
+		if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
+			ret = kfd_parse_subtype(sub_type_hdr, device_list);
+			if (ret != 0)
+				return ret;
+		}
+
+		last_header_type = sub_type_hdr->type;
+		last_header_length = sub_type_hdr->length;
+		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+				sub_type_hdr->length);
+	}
+
+	return 0;
+}
+
+/* Helper function. See kfd_fill_gpu_cache_info for parameter description.
+ *
+ * Fills one cache entry for the block of CUs that starts at bit @cu_block
+ * of @cu_bitmask and shares the cache described by pcache_info[cache_type].
+ *
+ * Return 0 if an entry was written to @pcache, 1 if the block has no active
+ * CU and nothing was written, -ENOMEM if @mem_available cannot hold one
+ * struct crat_subtype_cache.
+ */
+static int fill_in_pcache(struct crat_subtype_cache *pcache,
+				struct kfd_gpu_cache_info *pcache_info,
+				struct kfd_cu_info *cu_info,
+				int mem_available,
+				int cu_bitmask,
+				int cache_type, unsigned int cu_processor_id,
+				int cu_block)
+{
+	unsigned int cu_sibling_map_mask;
+	int first_active_cu;
+
+	/* First check if enough memory is available. The comparison must be
+	 * done on signed values: subtracting sizeof() from an int is
+	 * evaluated as unsigned and can never be negative.
+	 */
+	if (mem_available < (int)sizeof(struct crat_subtype_cache))
+		return -ENOMEM;
+
+	cu_sibling_map_mask = cu_bitmask;
+	cu_sibling_map_mask >>= cu_block;
+	cu_sibling_map_mask &=
+		((1 << pcache_info[cache_type].num_cu_shared) - 1);
+	first_active_cu = ffs(cu_sibling_map_mask);
+
+	/* CU could be inactive. In case of a shared cache find the first
+	 * active CU; in case of a non-shared cache check if the CU is
+	 * inactive. If it is inactive, skip it.
+	 */
+	if (!first_active_cu)
+		return 1;
+
+	memset(pcache, 0, sizeof(struct crat_subtype_cache));
+	pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
+	pcache->length = sizeof(struct crat_subtype_cache);
+	pcache->flags = pcache_info[cache_type].flags;
+	pcache->processor_id_low = cu_processor_id
+				+ (first_active_cu - 1);
+	pcache->cache_level = pcache_info[cache_type].cache_level;
+	pcache->cache_size = pcache_info[cache_type].cache_size;
+
+	/* Sibling map is w.r.t processor_id_low, so shift out
+	 * inactive CU
+	 */
+	cu_sibling_map_mask =
+		cu_sibling_map_mask >> (first_active_cu - 1);
+
+	pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+	pcache->sibling_map[1] =
+			(uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+	pcache->sibling_map[2] =
+			(uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+	pcache->sibling_map[3] =
+			(uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+	return 0;
+}
+
+/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info tables
+ *	@kdev - [IN] GPU device
+ *	@gpu_processor_id - [IN] GPU processor ID to which these caches associate
+ *	@available_size - [IN] Amount of memory available in pcache
+ *	@cu_info - [IN] Compute Unit info obtained from KGD
+ *	@pcache - [OUT] memory into which cache data is to be filled in.
+ *	@size_filled - [OUT] amount of data used up in pcache.
+ *	@num_of_entries - [OUT] number of caches added
+ *
+ *	Return 0 if successful, -EINVAL if the ASIC family has no cache info
+ *	table, or the negative error reported by fill_in_pcache(). On error
+ *	@size_filled and @num_of_entries describe the entries written so far.
+ */
+static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
+			int gpu_processor_id,
+			int available_size,
+			struct kfd_cu_info *cu_info,
+			struct crat_subtype_cache *pcache,
+			int *size_filled,
+			int *num_of_entries)
+{
+	struct kfd_gpu_cache_info *pcache_info;
+	int num_of_cache_types = 0;
+	int i, j, k;
+	int ct = 0;
+	int mem_available = available_size;
+	unsigned int cu_processor_id;
+	int ret;
+
+	switch (kdev->device_info->asic_family) {
+	case CHIP_KAVERI:
+		pcache_info = kaveri_cache_info;
+		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
+		break;
+	case CHIP_CARRIZO:
+		pcache_info = carrizo_cache_info;
+		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
+		break;
+	case CHIP_TONGA:
+		pcache_info = tonga_cache_info;
+		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
+		break;
+	case CHIP_FIJI:
+		pcache_info = fiji_cache_info;
+		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*size_filled = 0;
+	*num_of_entries = 0;
+
+	/* For each type of cache listed in the kfd_gpu_cache_info table,
+	 * go through all available Compute Units.
+	 * The [i,j,k] loop will
+	 *		if kfd_gpu_cache_info.num_cu_shared = 1
+	 *			will parse through all available CU
+	 *		If (kfd_gpu_cache_info.num_cu_shared != 1)
+	 *			then it will consider only one CU from
+	 *			the shared unit
+	 */
+
+	for (ct = 0; ct < num_of_cache_types; ct++) {
+		cu_processor_id = gpu_processor_id;
+		for (i = 0; i < cu_info->num_shader_engines; i++) {
+			for (j = 0; j < cu_info->num_shader_arrays_per_engine;
+				j++) {
+				for (k = 0; k < cu_info->num_cu_per_sh;
+					k += pcache_info[ct].num_cu_shared) {
+
+					ret = fill_in_pcache(pcache,
+						pcache_info,
+						cu_info,
+						mem_available,
+						cu_info->cu_bitmap[i][j],
+						ct,
+						cu_processor_id,
+						k);
+
+					/* Propagate the failure. A break
+					 * would only leave the innermost
+					 * loop and the error would be lost.
+					 */
+					if (ret < 0)
+						return ret;
+
+					/* ret > 0 means the CU block is
+					 * inactive and no entry was written
+					 */
+					if (!ret) {
+						pcache++;
+						(*num_of_entries)++;
+						mem_available -=
+							sizeof(*pcache);
+						(*size_filled) +=
+							sizeof(*pcache);
+					}
+
+					/* Move to next CU block */
+					cu_processor_id +=
+						pcache_info[ct].num_cu_shared;
+				}
+			}
+		}
+	}
+
+	pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
+
+	return 0;
+}
+
+/*
+ * kfd_create_crat_image_acpi - Look up the CRAT table in ACPI and return a
+ * kmalloc'ed copy of it.
+ *
+ *	NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+ *
+ *	@crat_image: [OUT] copy of the CRAT read from ACPI. *crat_image is
+ *		     NULL whenever no copy could be produced
+ *	@size: [OUT] size of crat_image
+ *
+ *	Return 0 if successful else return -ve value (-ENODATA when ACPI
+ *	has no CRAT table)
+ */
+int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
+{
+	struct acpi_table_header *acpi_crat;
+	acpi_status status;
+	void *image_copy;
+
+	if (!crat_image)
+		return -EINVAL;
+
+	*crat_image = NULL;
+
+	/* Fetch the CRAT table from ACPI */
+	status = acpi_get_table(CRAT_SIGNATURE, 0, &acpi_crat);
+	if (status == AE_NOT_FOUND) {
+		pr_warn("CRAT table not found\n");
+		return -ENODATA;
+	}
+	if (ACPI_FAILURE(status)) {
+		pr_err("CRAT table error: %s\n",
+			acpi_format_exception(status));
+		return -EINVAL;
+	}
+
+	/* Hand out a private copy so the caller owns (and frees) the image */
+	image_copy = kmalloc(acpi_crat->length, GFP_KERNEL);
+	if (!image_copy) {
+		pr_err("No memory for allocating CRAT image\n");
+		return -ENOMEM;
+	}
+	memcpy(image_copy, acpi_crat, acpi_crat->length);
+
+	*crat_image = image_copy;
+	*size = acpi_crat->length;
+
+	return 0;
+}
+
+/* Memory required to create Virtual CRAT.
+ * Since there is no easy way to predict the amount of memory required, the
+ * following amounts are allocated for the CPU and GPU Virtual CRAT. They are
+ * expected to cover all known configurations. To be safe, additional checks
+ * in the code ensure that we never write beyond the allocated size.
+ */
+#define VCRAT_SIZE_FOR_CPU PAGE_SIZE
+#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE)
+
+/* kfd_fill_cu_for_cpu - Write the compute unit entry of one CPU NUMA node
+ *
+ *	@numa_node_id: CPU NUMA node id
+ *	@avail_size: [IN/OUT] space left in the image; reduced by the size
+ *		     of the entry
+ *	@proximity_domain: proximity domain stored in the entry
+ *	@sub_type_hdr: Memory into which compute info will be filled in
+ *
+ *	Return 0 if successful else return -ve value
+ */
+static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,
+				int proximity_domain,
+				struct crat_subtype_computeunit *sub_type_hdr)
+{
+	const struct cpumask *node_cpus;
+
+	/* Reserve room for the entry before touching the buffer */
+	*avail_size -= sizeof(*sub_type_hdr);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	memset(sub_type_hdr, 0, sizeof(*sub_type_hdr));
+
+	/* Subtype header, with the entry marked as an enabled CPU entry */
+	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
+	sub_type_hdr->length = sizeof(*sub_type_hdr);
+	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED |
+				CRAT_CU_FLAGS_CPU_PRESENT;
+
+	node_cpus = cpumask_of_node(numa_node_id);
+
+	/* CU data */
+	sub_type_hdr->proximity_domain = proximity_domain;
+	sub_type_hdr->processor_id_low =
+			kfd_numa_node_to_apic_id(numa_node_id);
+	if (sub_type_hdr->processor_id_low == -1)
+		return -EINVAL;
+
+	sub_type_hdr->num_cpu_cores = cpumask_weight(node_cpus);
+
+	return 0;
+}
+
+/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node
+ *
+ *	@numa_node_id: CPU NUMA node id
+ *	@avail_size: [IN/OUT] Available size in the memory; reduced by the
+ *		     size of the memory entry
+ *	@proximity_domain: proximity domain stored in the entry
+ *	@sub_type_hdr: Memory into which memory info will be filled in
+ *
+ *	Return 0 if successful else return -ve value
+ */
+static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
+			int proximity_domain,
+			struct crat_subtype_memory *sub_type_hdr)
+{
+	uint64_t mem_in_bytes = 0;
+	pg_data_t *pgdat;
+	int zone_type;
+
+	/* Account for the structure that is actually written below */
+	*avail_size -= sizeof(struct crat_subtype_memory);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
+
+	/* Fill in subtype header data */
+	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
+	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
+	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+
+	/* Fill in Memory Subunit data */
+
+	/* Unlike si_meminfo, si_meminfo_node is not exported. So
+	 * the following lines are duplicated from si_meminfo_node
+	 * function
+	 */
+	pgdat = NODE_DATA(numa_node_id);
+	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+		mem_in_bytes += pgdat->node_zones[zone_type].managed_pages;
+	/* The sum above is in pages; convert it to bytes */
+	mem_in_bytes <<= PAGE_SHIFT;
+
+	sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);
+	sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);
+	sub_type_hdr->proximity_domain = proximity_domain;
+
+	return 0;
+}
+
+/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU
+ *
+ *	@pcrat_image: Fill in VCRAT for CPU
+ *	@size:	[IN] allocated size of crat_image.
+ *		[OUT] actual size of data filled in crat_image
+ *
+ *	The image holds a CRAT header followed by one compute unit entry and
+ *	one memory entry per online NUMA node. OEM information is taken from
+ *	the ACPI DSDT when that table can be obtained.
+ *
+ *	Return 0 if successful else return -ve value
+ */
+static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
+{
+	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+	struct acpi_table_header *acpi_table;
+	acpi_status status;
+	struct crat_subtype_generic *sub_type_hdr;
+	int avail_size = *size;
+	int numa_node_id;
+	int ret = 0;
+
+	if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_CPU)
+		return -EINVAL;
+
+	/* Fill in CRAT Header.
+	 * Modify length and total_entries as subunits are added.
+	 */
+	avail_size -= sizeof(struct crat_header);
+	if (avail_size < 0)
+		return -ENOMEM;
+
+	memset(crat_table, 0, sizeof(struct crat_header));
+	memcpy(&crat_table->signature, CRAT_SIGNATURE,
+			sizeof(crat_table->signature));
+	crat_table->length = sizeof(struct crat_header);
+
+	/* acpi_table is only valid when the lookup succeeded, so every
+	 * failure status (not just AE_NOT_FOUND) must skip the OEM copy.
+	 */
+	status = acpi_get_table("DSDT", 0, &acpi_table);
+	if (ACPI_FAILURE(status)) {
+		pr_warn("DSDT table not found for OEM information\n");
+	} else {
+		crat_table->oem_revision = acpi_table->revision;
+		memcpy(crat_table->oem_id, acpi_table->oem_id,
+				CRAT_OEMID_LENGTH);
+		memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
+				CRAT_OEMTABLEID_LENGTH);
+	}
+	crat_table->total_entries = 0;
+	crat_table->num_domains = 0;
+
+	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
+
+	for_each_online_node(numa_node_id) {
+		/* Fill in Subtype: Compute Unit */
+		ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
+			crat_table->num_domains,
+			(struct crat_subtype_computeunit *)sub_type_hdr);
+		if (ret < 0)
+			return ret;
+		crat_table->length += sub_type_hdr->length;
+		crat_table->total_entries++;
+
+		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+			sub_type_hdr->length);
+
+		/* Fill in Subtype: Memory */
+		ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
+			crat_table->num_domains,
+			(struct crat_subtype_memory *)sub_type_hdr);
+		if (ret < 0)
+			return ret;
+		crat_table->length += sub_type_hdr->length;
+		crat_table->total_entries++;
+
+		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+			sub_type_hdr->length);
+
+		/* num_domains doubles as the proximity domain of the next
+		 * node
+		 */
+		crat_table->num_domains++;
+	}
+
+	/* TODO: Add cache Subtype for CPU.
+	 * Currently, CPU cache information is available in function
+	 * detect_cache_attributes(cpu) defined in the file
+	 * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not exported
+	 * and to get the same information the code needs to be duplicated.
+	 */
+
+	*size = crat_table->length;
+	pr_info("Virtual CRAT table created for CPU\n");
+
+	return 0;
+}
+
+/* kfd_fill_gpu_memory_affinity - Write one GPU memory affinity entry
+ *
+ *	@avail_size: [IN/OUT] space left in the image; reduced by the size
+ *		     of the entry
+ *	@kdev: GPU device (not read by this helper)
+ *	@type: value stored as visibility_type of the entry
+ *	@size: memory size, stored split into length_low / length_high
+ *	@sub_type_hdr: Memory into which the entry will be filled in
+ *	@proximity_domain: proximity domain stored in the entry
+ *	@local_mem_info: source of the vram_width stored as width
+ *
+ *	Return 0 if successful, -ENOMEM if the entry does not fit
+ */
+static int kfd_fill_gpu_memory_affinity(int *avail_size,
+		struct kfd_dev *kdev, uint8_t type, uint64_t size,
+		struct crat_subtype_memory *sub_type_hdr,
+		uint32_t proximity_domain,
+		const struct kfd_local_mem_info *local_mem_info)
+{
+	*avail_size -= sizeof(*sub_type_hdr);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	pr_debug("amdkfd: fill gpu memory affinity - type 0x%x size 0x%llx\n",
+			type, size);
+
+	memset(sub_type_hdr, 0, sizeof(*sub_type_hdr));
+
+	/* Subtype header; flags start out zeroed by the memset above */
+	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
+	sub_type_hdr->length = sizeof(*sub_type_hdr);
+	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+
+	/* Memory data */
+	sub_type_hdr->proximity_domain = proximity_domain;
+	sub_type_hdr->length_low = lower_32_bits(size);
+	sub_type_hdr->length_high = upper_32_bits(size);
+	sub_type_hdr->width = local_mem_info->vram_width;
+	sub_type_hdr->visibility_type = type;
+
+	return 0;
+}
+
+/* kfd_fill_gpu_direct_io_link - Write the IO link entry that connects a GPU
+ *	to the proximity domain reported for its PCI bus
+ *
+ *	@avail_size: [IN/OUT] space left in the image; reduced by the size
+ *		     of the entry
+ *	@kdev - [IN] GPU device
+ *	@sub_type_hdr: Memory into which io link info will be filled in
+ *	@proximity_domain - proximity domain of the GPU node
+ *
+ *	Return 0 if successful else return -ve value
+ */
+static int kfd_fill_gpu_direct_io_link(int *avail_size,
+			struct kfd_dev *kdev,
+			struct crat_subtype_iolink *sub_type_hdr,
+			uint32_t proximity_domain)
+{
+	int numa_domain;
+
+	*avail_size -= sizeof(*sub_type_hdr);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	memset(sub_type_hdr, 0, sizeof(*sub_type_hdr));
+
+	/* Subtype header; flags start out zeroed by the memset above */
+	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+	sub_type_hdr->length = sizeof(*sub_type_hdr);
+	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+
+	/* IOLINK data.
+	 * TODO: Fill-in other fields of iolink subtype
+	 */
+	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
+	sub_type_hdr->proximity_domain_from = proximity_domain;
+
+	numa_domain = kfd_get_proximity_domain(kdev->pdev->bus);
+	if (numa_domain == -1)
+		return -EINVAL;
+	sub_type_hdr->proximity_domain_to = numa_domain;
+
+	return 0;
+}
+
+/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
+ *
+ * @pcrat_image: Fill in VCRAT for GPU
+ * @size: [IN] allocated size of crat_image.
+ * [OUT] actual size of data filled in crat_image
+ * @kdev: GPU device whose CU, memory, cache and IO link info is reported
+ * @proximity_domain: proximity domain assigned to this GPU node
+ *
+ * The image consists of a CRAT header followed by one compute unit entry,
+ * one memory entry, the cache entries and one direct IO link entry.
+ *
+ * Return 0 if successful else return -ve value
+ */
+static int kfd_create_vcrat_image_gpu(void *pcrat_image,
+ size_t *size, struct kfd_dev *kdev,
+ uint32_t proximity_domain)
+{
+ struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+ struct crat_subtype_generic *sub_type_hdr;
+ struct crat_subtype_computeunit *cu;
+ struct kfd_cu_info cu_info;
+ struct amd_iommu_device_info iommu_info;
+ int avail_size = *size;
+ uint32_t total_num_of_cu;
+ int num_of_cache_entries = 0;
+ int cache_mem_filled = 0;
+ int ret = 0;
+ const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
+ AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
+ AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
+ struct kfd_local_mem_info local_mem_info;
+
+ if (pcrat_image == NULL || avail_size < VCRAT_SIZE_FOR_GPU)
+ return -EINVAL;
+
+ /* Fill the CRAT Header.
+ * Modify length and total_entries as subunits are added.
+ */
+ avail_size -= sizeof(struct crat_header);
+ if (avail_size < 0)
+ return -ENOMEM;
+
+ memset(crat_table, 0, sizeof(struct crat_header));
+
+ memcpy(&crat_table->signature, CRAT_SIGNATURE, sizeof(crat_table->signature));
+ crat_table->length = sizeof(struct crat_header); /* Change length as we add more subtypes*/
+ crat_table->num_domains = 1;
+ crat_table->total_entries = 0;
+
+ /* Fill in Subtype: Compute Unit
+ * First fill in the sub type header and then sub type data
+ */
+ avail_size -= sizeof(struct crat_subtype_computeunit);
+ if (avail_size < 0)
+ return -ENOMEM;
+
+ sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
+ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));
+
+ sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
+ sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
+ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+
+ /* Fill CU subtype data */
+ cu = (struct crat_subtype_computeunit *)sub_type_hdr;
+ cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
+ cu->proximity_domain = proximity_domain;
+
+ kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info);
+ cu->num_simd_per_cu = cu_info.simd_per_cu;
+ cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number;
+ cu->max_waves_simd = cu_info.max_waves_per_simd;
+
+ cu->wave_front_size = cu_info.wave_front_size;
+ cu->array_count = cu_info.num_shader_arrays_per_engine *
+ cu_info.num_shader_engines;
+ /* Total CU slots (arrays * CUs per array) passed to the processor id allocator */
+ total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
+ cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
+ cu->num_cu_per_array = cu_info.num_cu_per_sh;
+ cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
+ cu->num_banks = cu_info.num_shader_engines;
+ cu->lds_size_in_kb = cu_info.lds_size;
+
+ cu->hsa_capability = 0;
+
+ /* Check if this node supports IOMMU. During parsing this flag will
+ * translate to HSA_CAP_ATS_PRESENT */
+ iommu_info.flags = 0;
+ if (0 == amd_iommu_device_info(kdev->pdev, &iommu_info)) {
+ if ((iommu_info.flags & required_iommu_flags) == required_iommu_flags)
+ cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT;
+ }
+
+ crat_table->length += sub_type_hdr->length;
+ crat_table->total_entries++;
+
+ /* Fill in Subtype: Memory. Only on systems with large BAR (no
+ * private FB), report memory as public. On other systems
+ * report the total FB size (public+private) as a single
+ * private heap. */
+ kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info);
+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+ sub_type_hdr->length);
+
+ if (local_mem_info.local_mem_size_private == 0)
+ ret = kfd_fill_gpu_memory_affinity(&avail_size,
+ kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
+ local_mem_info.local_mem_size_public,
+ (struct crat_subtype_memory *)sub_type_hdr,
+ proximity_domain,
+ &local_mem_info);
+ else
+ ret = kfd_fill_gpu_memory_affinity(&avail_size,
+ kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
+ local_mem_info.local_mem_size_public +
+ local_mem_info.local_mem_size_private,
+ (struct crat_subtype_memory *)sub_type_hdr,
+ proximity_domain,
+ &local_mem_info);
+ if (ret < 0)
+ return ret;
+
+ crat_table->length += sizeof(struct crat_subtype_memory);
+ crat_table->total_entries++;
+
+ /* Fill in Subtype: Cache. The entries are generated by
+ * kfd_fill_gpu_cache_info(), which reports how many bytes and
+ * entries it wrote; avail_size is adjusted here, not in the helper. */
+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+ sub_type_hdr->length);
+ ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
+ avail_size,
+ &cu_info,
+ (struct crat_subtype_cache *)sub_type_hdr,
+ &cache_mem_filled,
+ &num_of_cache_entries);
+
+ if (ret < 0)
+ return ret;
+
+ crat_table->length += cache_mem_filled;
+ crat_table->total_entries += num_of_cache_entries;
+ avail_size -= cache_mem_filled;
+
+ /* Fill in Subtype: IO_LINKS
+ * Only direct links are added here, i.e. the link from the GPU
+ * to its NUMA node. Indirect links are added by userspace.
+ */
+ /* Step over all cache entries at once (may be zero bytes) */
+ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+ cache_mem_filled);
+ ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev,
+ (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
+
+ if (ret < 0)
+ return ret;
+
+ crat_table->length += sub_type_hdr->length;
+ crat_table->total_entries++;
+
+ *size = crat_table->length;
+ pr_info("Virtual CRAT table created for GPU\n");
+
+ return ret;
+}
+
+/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
+ *		creates a Virtual CRAT (VCRAT) image
+ *
+ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+ *
+ *	@crat_image: [OUT] VCRAT image created because ACPI does not have a
+ *			CRAT for this device. Left NULL on failure.
+ *	@size: [OUT] size of virtual crat_image
+ *	@flags:	COMPUTE_UNIT_CPU - Create VCRAT for CPU device
+ *		COMPUTE_UNIT_GPU - Create VCRAT for GPU
+ *		(COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
+ *			-- this option is not currently implemented. The
+ *			assumption is that all AMD APUs will have CRAT
+ *	@kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU
+ *	@proximity_domain: proximity domain of the GPU node; only used for
+ *		COMPUTE_UNIT_GPU
+ *
+ *	Return 0 if successful else return -ve value
+ */
+int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+		int flags, struct kfd_dev *kdev, uint32_t proximity_domain)
+{
+	void *pcrat_image = NULL;
+	int ret = 0;
+
+	if (!crat_image)
+		return -EINVAL;
+
+	*crat_image = NULL;
+
+	/* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and
+	 * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover
+	 * all the current conditions. A check is put not to overwrite beyond
+	 * allocated size
+	 */
+	switch (flags) {
+	case COMPUTE_UNIT_CPU:
+		pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL);
+		if (!pcrat_image)
+			return -ENOMEM;
+		*size = VCRAT_SIZE_FOR_CPU;
+		ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
+		break;
+	case COMPUTE_UNIT_GPU:
+		if (!kdev)
+			return -EINVAL;
+		pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
+		if (!pcrat_image)
+			return -ENOMEM;
+		*size = VCRAT_SIZE_FOR_GPU;
+		ret = kfd_create_vcrat_image_gpu(pcrat_image, size,
+				kdev, proximity_domain);
+		break;
+	case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
+		/* TODO: */
+		ret = -EINVAL;
+		pr_err("VCRAT not implemented for APU\n");
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	/* Ownership passes to the caller only on success. On failure the
+	 * buffer (if any) is released here, since *crat_image stays NULL
+	 * and nobody else could free it.
+	 */
+	if (ret == 0)
+		*crat_image = pcrat_image;
+	else
+		kfree(pcrat_image);
+
+	return ret;
+}
+
+
+/* kfd_destroy_crat_image - release a CRAT image
+ *
+ *	@crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..);
+ *			    NULL is accepted and ignored
+ *
+ */
+void kfd_destroy_crat_image(void *crat_image)
+{
+	/* kfree() is a no-op for NULL, so no separate check is needed */
+	kfree(crat_image);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index a374fa3d3ee6..9af3745646df 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -24,6 +24,7 @@
#define KFD_CRAT_H_INCLUDED
#include <linux/types.h>
+#include "kfd_priv.h"
#pragma pack(1)
@@ -44,6 +45,10 @@
#define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1)
+/* Compute Unit flags */
+#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */
+#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */
+
struct crat_header {
uint32_t signature;
uint32_t length;
@@ -105,7 +110,7 @@ struct crat_subtype_computeunit {
uint8_t wave_front_size;
uint8_t num_banks;
uint16_t micro_engine_id;
- uint8_t num_arrays;
+ uint8_t array_count;
uint8_t num_cu_per_array;
uint8_t num_simd_per_cu;
uint8_t max_slots_scatch_cu;
@@ -127,13 +132,14 @@ struct crat_subtype_memory {
uint8_t length;
uint16_t reserved;
uint32_t flags;
- uint32_t promixity_domain;
+ uint32_t proximity_domain;
uint32_t base_addr_low;
uint32_t base_addr_high;
uint32_t length_low;
uint32_t length_high;
uint32_t width;
- uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH];
+ uint8_t visibility_type; /* for virtual (dGPU) CRAT */
+ uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1];
};
/*
@@ -222,9 +228,12 @@ struct crat_subtype_ccompute {
/*
* HSA IO Link Affinity structure and definitions
*/
-#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001
-#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002
-#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc
+#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0)
+#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1)
+#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2)
+#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3)
+#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4)
+#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0
/*
* IO interface types
@@ -232,8 +241,16 @@ struct crat_subtype_ccompute {
#define CRAT_IOLINK_TYPE_UNDEFINED 0
#define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1
#define CRAT_IOLINK_TYPE_PCIEXPRESS 2
-#define CRAT_IOLINK_TYPE_OTHER 3
-#define CRAT_IOLINK_TYPE_MAX 255
+#define CRAT_IOLINK_TYPE_AMBA 3
+#define CRAT_IOLINK_TYPE_MIPI 4
+#define CRAT_IOLINK_TYPE_QPI_1_1 5
+#define CRAT_IOLINK_TYPE_RESERVED1 6
+#define CRAT_IOLINK_TYPE_RESERVED2 7
+#define CRAT_IOLINK_TYPE_RAPID_IO 8
+#define CRAT_IOLINK_TYPE_INFINIBAND 9
+#define CRAT_IOLINK_TYPE_RESERVED3 10
+#define CRAT_IOLINK_TYPE_OTHER 11
+#define CRAT_IOLINK_TYPE_MAX 255
#define CRAT_IOLINK_RESERVED_LENGTH 24
@@ -291,4 +308,11 @@ struct cdit_header {
#pragma pack()
+int kfd_create_crat_image_acpi(void **crat_image, size_t *size);
+void kfd_destroy_crat_image(void *crat_image);
+int kfd_parse_crat_table(void *crat_image,
+ struct list_head *device_list,
+ uint32_t proximity_domain);
+int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+ int flags, struct kfd_dev *kdev, uint32_t proximity_domain);
#endif /* KFD_CRAT_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
index d5e19b5fbbfb..4f2311e703c5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
@@ -42,8 +42,6 @@
static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev)
{
- BUG_ON(!dev || !dev->kfd2kgd);
-
dev->kfd2kgd->address_watch_disable(dev->kgd);
}
@@ -51,129 +49,118 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
unsigned int pasid, uint64_t vmid0_address,
uint32_t *packet_buff, size_t size_in_bytes)
{
+ int status = 0;
+ unsigned int *ib_packet_buff = NULL;
struct pm4__release_mem *rm_packet;
struct pm4__indirect_buffer_pasid *ib_packet;
+ struct kernel_queue *kq = dbgdev->kq;
+ size_t pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + sizeof(struct pm4__indirect_buffer_pasid);
struct kfd_mem_obj *mem_obj;
- size_t pq_packets_size_in_bytes;
+
+ uint64_t *rm_state = NULL;
+
union ULARGE_INTEGER *largep;
union ULARGE_INTEGER addr;
- struct kernel_queue *kq;
- uint64_t *rm_state;
- unsigned int *ib_packet_buff;
- int status;
-
- BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes);
-
- kq = dbgdev->kq;
-
- pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) +
- sizeof(struct pm4__indirect_buffer_pasid);
-
- /*
- * We acquire a buffer from DIQ
- * The receive packet buff will be sitting on the Indirect Buffer
- * and in the PQ we put the IB packet + sync packet(s).
- */
- status = kq->ops.acquire_packet_buffer(kq,
- pq_packets_size_in_bytes / sizeof(uint32_t),
- &ib_packet_buff);
- if (status != 0) {
- pr_err("amdkfd: acquire_packet_buffer failed\n");
- return status;
- }
- memset(ib_packet_buff, 0, pq_packets_size_in_bytes);
+ do {
+ if ((kq == NULL) || (packet_buff == NULL) || (size_in_bytes == 0)) {
+ pr_debug("Error! kfd: In func %s >> Illegal packet parameters\n", __func__);
+ status = -EINVAL;
+ break;
+ }
+ /* todo - enter proper locking to be multithreaded safe */
- ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff);
+ /* We acquire a buffer from DIQ
+ * The receive packet buff will be sitting on the Indirect Buffer
+ * and in the PQ we put the IB packet + sync packet(s).
+ */
+ status = kq->ops.acquire_packet_buffer(kq, pq_packets_size_in_bytes / sizeof(uint32_t), &ib_packet_buff);
+ if (status != 0) {
+ pr_debug("Error! kfd: In func %s >> acquire_packet_buffer failed\n", __func__);
+ break;
+ }
- ib_packet->header.count = 3;
- ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID;
- ib_packet->header.type = PM4_TYPE_3;
+ memset(ib_packet_buff, 0, pq_packets_size_in_bytes);
- largep = (union ULARGE_INTEGER *) &vmid0_address;
+ ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff);
- ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2;
- ib_packet->bitfields3.ib_base_hi = largep->u.high_part;
+ ib_packet->header.count = 3;
+ ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID;
+ ib_packet->header.type = PM4_TYPE_3;
- ib_packet->control = (1 << 23) | (1 << 31) |
- ((size_in_bytes / sizeof(uint32_t)) & 0xfffff);
+ largep = (union ULARGE_INTEGER *) &vmid0_address;
- ib_packet->bitfields5.pasid = pasid;
+ ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2;
+ ib_packet->bitfields3.ib_base_hi = largep->u.high_part;
- /*
- * for now we use release mem for GPU-CPU synchronization
- * Consider WaitRegMem + WriteData as a better alternative
- * we get a GART allocations ( gpu/cpu mapping),
- * for the sync variable, and wait until:
- * (a) Sync with HW
- * (b) Sync var is written by CP to mem.
- */
- rm_packet = (struct pm4__release_mem *) (ib_packet_buff +
- (sizeof(struct pm4__indirect_buffer_pasid) /
- sizeof(unsigned int)));
+ ib_packet->control = (1 << 23) | (1 << 31) |
+ ((size_in_bytes / sizeof(uint32_t)) & 0xfffff);
- status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t),
- &mem_obj);
+ ib_packet->bitfields5.pasid = pasid;
- if (status != 0) {
- pr_err("amdkfd: Failed to allocate GART memory\n");
- kq->ops.rollback_packet(kq);
- return status;
- }
+ /*
+ * for now we use release mem for GPU-CPU synchronization
+ * Consider WaitRegMem + WriteData as a better alternative
+ * we get a GART allocations ( gpu/cpu mapping),
+ * for the sync variable, and wait until:
+ * (a) Sync with HW
+ * (b) Sync var is written by CP to mem.
+ */
+ rm_packet = (struct pm4__release_mem *) (ib_packet_buff +
+ (sizeof(struct pm4__indirect_buffer_pasid) / sizeof(unsigned int)));
- rm_state = (uint64_t *) mem_obj->cpu_ptr;
+ status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t),
+ &mem_obj);
- *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING;
+ if (status == 0) {
- rm_packet->header.opcode = IT_RELEASE_MEM;
- rm_packet->header.type = PM4_TYPE_3;
- rm_packet->header.count = sizeof(struct pm4__release_mem) /
- sizeof(unsigned int) - 2;
+ rm_state = (uint64_t *) mem_obj->cpu_ptr;
- rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
- rm_packet->bitfields2.event_index =
- event_index___release_mem__end_of_pipe;
+ *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING;
- rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
- rm_packet->bitfields2.atc = 0;
- rm_packet->bitfields2.tc_wb_action_ena = 1;
+ rm_packet->header.opcode = IT_RELEASE_MEM;
+ rm_packet->header.type = PM4_TYPE_3;
+ rm_packet->header.count = sizeof(struct pm4__release_mem) / sizeof(unsigned int) - 2;
- addr.quad_part = mem_obj->gpu_addr;
+ rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+ rm_packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
+ rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
+ rm_packet->bitfields2.atc = 0;
+ rm_packet->bitfields2.tc_wb_action_ena = 1;
- rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2;
- rm_packet->address_hi = addr.u.high_part;
+ addr.quad_part = mem_obj->gpu_addr;
- rm_packet->bitfields3.data_sel =
- data_sel___release_mem__send_64_bit_data;
+ rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2;
+ rm_packet->address_hi = addr.u.high_part;
- rm_packet->bitfields3.int_sel =
- int_sel___release_mem__send_data_after_write_confirm;
+ rm_packet->bitfields3.data_sel = data_sel___release_mem__send_64_bit_data;
+ rm_packet->bitfields3.int_sel = int_sel___release_mem__send_data_after_write_confirm;
+ rm_packet->bitfields3.dst_sel = dst_sel___release_mem__memory_controller;
- rm_packet->bitfields3.dst_sel =
- dst_sel___release_mem__memory_controller;
+ rm_packet->data_lo = QUEUESTATE__ACTIVE;
- rm_packet->data_lo = QUEUESTATE__ACTIVE;
+ kq->ops.submit_packet(kq);
- kq->ops.submit_packet(kq);
+ /* Wait till CP writes sync code: */
- /* Wait till CP writes sync code: */
- status = amdkfd_fence_wait_timeout(
- (unsigned int *) rm_state,
- QUEUESTATE__ACTIVE, 1500);
+ status = amdkfd_fence_wait_timeout(
+ (unsigned int *) rm_state,
+ QUEUESTATE__ACTIVE, 1500);
+
+ } else {
+ pr_debug("Error! kfd: In func %s >> failed to allocate GART memory\n", __func__);
+ }
+ } while (false);
- kfd_gtt_sa_free(dbgdev->dev, mem_obj);
+ if (rm_state != NULL)
+ kfd_gtt_sa_free(dbgdev->dev, mem_obj);
return status;
}
static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev)
{
- BUG_ON(!dbgdev);
-
- /*
- * no action is needed in this case,
- * just make sure diq will not be used
- */
+ /* no action is needed in this case, just make sure diq will not be used */
dbgdev->kq = NULL;
@@ -182,57 +169,68 @@ static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev)
static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev)
{
+
+ int status = 0;
+ struct kernel_queue *kq = NULL;
struct queue_properties properties;
unsigned int qid;
- struct kernel_queue *kq = NULL;
- int status;
+ struct process_queue_manager *pqm = dbgdev->pqm;
- BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->dev);
+ do {
- status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL,
- &properties, 0, KFD_QUEUE_TYPE_DIQ,
- &qid);
+ if (!pqm) {
+ pr_debug("Error! kfd: In func %s >> No PQM\n", __func__);
+ status = -EFAULT;
+ break;
+ }
- if (status) {
- pr_err("amdkfd: Failed to create DIQ\n");
- return status;
- }
+ properties.type = KFD_QUEUE_TYPE_DIQ;
- pr_debug("DIQ Created with queue id: %d\n", qid);
+ status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL,
+ &properties, &qid);
- kq = pqm_get_kernel_queue(dbgdev->pqm, qid);
+ if (status != 0) {
+ pr_debug("Error! kfd: In func %s >> Create Queue failed\n", __func__);
+ break;
+ }
- if (kq == NULL) {
- pr_err("amdkfd: Error getting DIQ\n");
- pqm_destroy_queue(dbgdev->pqm, qid);
- return -EFAULT;
- }
+ pr_debug("kfd: DIQ Created with queue id: %d\n", qid);
+
+ kq = pqm_get_kernel_queue(dbgdev->pqm, qid);
+
+ if (kq == NULL) {
+ pr_debug("Error! kfd: In func %s >> Error getting Kernel Queue\n", __func__);
+ status = -ENOMEM;
+ break;
+ }
+
+ dbgdev->kq = kq;
- dbgdev->kq = kq;
+ } while (false);
return status;
}
static int dbgdev_unregister_nodiq(struct kfd_dbgdev *dbgdev)
{
- BUG_ON(!dbgdev || !dbgdev->dev);
-
/* disable watch address */
+
dbgdev_address_watch_disable_nodiq(dbgdev->dev);
return 0;
}
static int dbgdev_unregister_diq(struct kfd_dbgdev *dbgdev)
{
- /* todo - disable address watch */
- int status;
-
- BUG_ON(!dbgdev || !dbgdev->pqm || !dbgdev->kq);
-
- status = pqm_destroy_queue(dbgdev->pqm,
- dbgdev->kq->queue->properties.queue_id);
- dbgdev->kq = NULL;
-
+ /* todo - if needed, kill wavefronts and disable watch */
+ int status = 0;
+ if ((dbgdev == NULL) || (dbgdev->pqm == NULL) || (dbgdev->kq == NULL)) {
+ pr_debug("kfd Err:In func %s >> can't destroy diq\n", __func__);
+ status = -EFAULT;
+ } else {
+ pqm_destroy_queue(dbgdev->pqm,
+ dbgdev->kq->queue->properties.queue_id);
+ dbgdev->kq = NULL;
+ }
return status;
}
@@ -241,341 +239,350 @@ static void dbgdev_address_watch_set_registers(
union TCP_WATCH_ADDR_H_BITS *addrHi,
union TCP_WATCH_ADDR_L_BITS *addrLo,
union TCP_WATCH_CNTL_BITS *cntl,
- unsigned int index, unsigned int vmid)
+ unsigned int index, unsigned int vmid,
+ unsigned int asic_family)
{
union ULARGE_INTEGER addr;
- BUG_ON(!adw_info || !addrHi || !addrLo || !cntl);
-
addr.quad_part = 0;
addrHi->u32All = 0;
addrLo->u32All = 0;
cntl->u32All = 0;
if (adw_info->watch_mask != NULL)
- cntl->bitfields.mask =
- (uint32_t) (adw_info->watch_mask[index] &
- ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK);
+ cntl->bitfields.mask = (uint32_t) (adw_info->watch_mask[index] & ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK);
else
cntl->bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK;
addr.quad_part = (unsigned long long) adw_info->watch_address[index];
- addrHi->bitfields.addr = addr.u.high_part &
- ADDRESS_WATCH_REG_ADDHIGH_MASK;
+ addrHi->bitfields.addr = addr.u.high_part & ADDRESS_WATCH_REG_ADDHIGH_MASK;
addrLo->bitfields.addr =
(addr.u.low_part >> ADDRESS_WATCH_REG_ADDLOW_SHIFT);
cntl->bitfields.mode = adw_info->watch_mode[index];
cntl->bitfields.vmid = (uint32_t) vmid;
- /* for now assume it is an ATC address */
- cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT;
-
+ /* for APU assume it is an ATC address. */
+ if (KFD_IS_DGPU(asic_family) == false)
+ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT;
pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask);
- pr_debug("\t\t%20s %08x\n", "set reg add high :",
- addrHi->bitfields.addr);
- pr_debug("\t\t%20s %08x\n", "set reg add low :",
- addrLo->bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "set reg add high :", addrHi->bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "set reg add low :", addrLo->bitfields.addr);
+
}
static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev,
struct dbg_address_watch_info *adw_info)
{
+
+ int status = 0;
+
union TCP_WATCH_ADDR_H_BITS addrHi;
union TCP_WATCH_ADDR_L_BITS addrLo;
union TCP_WATCH_CNTL_BITS cntl;
- struct kfd_process_device *pdd;
+
+ unsigned int vmid;
unsigned int i;
- BUG_ON(!dbgdev || !dbgdev->dev || !adw_info);
+ struct kfd_process_device *pdd;
- /* taking the vmid for that process on the safe way using pdd */
- pdd = kfd_get_process_device_data(dbgdev->dev,
- adw_info->process);
- if (!pdd) {
- pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n");
- return -EFAULT;
- }
+ do {
+ /* taking the vmid for that process on the safe way using pdd */
+ pdd = kfd_get_process_device_data(dbgdev->dev,
+ adw_info->process);
+ if (!pdd) {
+ pr_debug("Error! kfd: In func %s >> no PDD available\n", __func__);
+ status = -EFAULT;
+ break;
+ }
- addrHi.u32All = 0;
- addrLo.u32All = 0;
- cntl.u32All = 0;
+ addrHi.u32All = 0;
+ addrLo.u32All = 0;
+ cntl.u32All = 0;
- if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) ||
- (adw_info->num_watch_points == 0)) {
- pr_err("amdkfd: num_watch_points is invalid\n");
- return -EINVAL;
- }
+ vmid = pdd->qpd.vmid;
- if ((adw_info->watch_mode == NULL) ||
- (adw_info->watch_address == NULL)) {
- pr_err("amdkfd: adw_info fields are not valid\n");
- return -EINVAL;
- }
+ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES)
+ || (adw_info->num_watch_points == 0)) {
+ status = -EINVAL;
+ break;
+ }
- for (i = 0 ; i < adw_info->num_watch_points ; i++) {
- dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo,
- &cntl, i, pdd->qpd.vmid);
-
- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
- pr_debug("\t\t%20s %08x\n", "register index :", i);
- pr_debug("\t\t%20s %08x\n", "vmid is :", pdd->qpd.vmid);
- pr_debug("\t\t%20s %08x\n", "Address Low is :",
- addrLo.bitfields.addr);
- pr_debug("\t\t%20s %08x\n", "Address high is :",
- addrHi.bitfields.addr);
- pr_debug("\t\t%20s %08x\n", "Address high is :",
- addrHi.bitfields.addr);
- pr_debug("\t\t%20s %08x\n", "Control Mask is :",
- cntl.bitfields.mask);
- pr_debug("\t\t%20s %08x\n", "Control Mode is :",
- cntl.bitfields.mode);
- pr_debug("\t\t%20s %08x\n", "Control Vmid is :",
- cntl.bitfields.vmid);
- pr_debug("\t\t%20s %08x\n", "Control atc is :",
- cntl.bitfields.atc);
- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
-
- pdd->dev->kfd2kgd->address_watch_execute(
- dbgdev->dev->kgd,
- i,
- cntl.u32All,
- addrHi.u32All,
- addrLo.u32All);
- }
+ if ((adw_info->watch_mode == NULL) || (adw_info->watch_address == NULL)) {
+ status = -EINVAL;
+ break;
+ }
- return 0;
+ for (i = 0; i < adw_info->num_watch_points; i++) {
+
+ dbgdev_address_watch_set_registers(
+ adw_info,
+ &addrHi,
+ &addrLo,
+ &cntl,
+ i,
+ vmid,
+ dbgdev->dev->device_info->asic_family
+ );
+
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+ pr_debug("\t\t%20s %08x\n", "register index :", i);
+ pr_debug("\t\t%20s %08x\n", "vmid is :", vmid);
+ pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask);
+ pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode);
+ pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid);
+ pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc);
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+
+ pdd->dev->kfd2kgd->address_watch_execute(
+ dbgdev->dev->kgd,
+ i,
+ cntl.u32All,
+ addrHi.u32All,
+ addrLo.u32All);
+ }
+
+ } while (false);
+
+ return status;
}
static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev,
struct dbg_address_watch_info *adw_info)
{
- struct pm4__set_config_reg *packets_vec;
+
+ int status = 0;
+ unsigned int i = 0;
union TCP_WATCH_ADDR_H_BITS addrHi;
union TCP_WATCH_ADDR_L_BITS addrLo;
union TCP_WATCH_CNTL_BITS cntl;
- struct kfd_mem_obj *mem_obj;
- unsigned int aw_reg_add_dword;
- uint32_t *packet_buff_uint;
- unsigned int i;
- int status;
- size_t ib_size = sizeof(struct pm4__set_config_reg) * 4;
+
/* we do not control the vmid in DIQ mode, just a place holder */
unsigned int vmid = 0;
- BUG_ON(!dbgdev || !dbgdev->dev || !adw_info);
+ struct kfd_mem_obj *mem_obj;
+ uint32_t *packet_buff_uint = NULL;
+
+ struct pm4__set_config_reg *packets_vec = NULL;
+
+ size_t ib_size = sizeof(struct pm4__set_config_reg) * 4;
+
+ unsigned int aw_reg_add_dword;
addrHi.u32All = 0;
addrLo.u32All = 0;
cntl.u32All = 0;
- if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) ||
- (adw_info->num_watch_points == 0)) {
- pr_err("amdkfd: num_watch_points is invalid\n");
- return -EINVAL;
- }
+ do {
- if ((NULL == adw_info->watch_mode) ||
- (NULL == adw_info->watch_address)) {
- pr_err("amdkfd: adw_info fields are not valid\n");
- return -EINVAL;
- }
+ if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) || (adw_info->num_watch_points == 0)) {
+ status = -EINVAL;
+ break;
+ }
+
+ if ((NULL == adw_info->watch_mode) || (NULL == adw_info->watch_address)) {
+ status = -EINVAL;
+ break;
+ }
- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);
+ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);
- if (status != 0) {
- pr_err("amdkfd: Failed to allocate GART memory\n");
- return status;
- }
+ if (status != 0)
+ break;
- packet_buff_uint = mem_obj->cpu_ptr;
-
- memset(packet_buff_uint, 0, ib_size);
-
- packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint);
-
- packets_vec[0].header.count = 1;
- packets_vec[0].header.opcode = IT_SET_CONFIG_REG;
- packets_vec[0].header.type = PM4_TYPE_3;
- packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET;
- packets_vec[0].bitfields2.insert_vmid = 1;
- packets_vec[1].ordinal1 = packets_vec[0].ordinal1;
- packets_vec[1].bitfields2.insert_vmid = 0;
- packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
- packets_vec[2].bitfields2.insert_vmid = 0;
- packets_vec[3].ordinal1 = packets_vec[0].ordinal1;
- packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET;
- packets_vec[3].bitfields2.insert_vmid = 1;
-
- for (i = 0; i < adw_info->num_watch_points; i++) {
- dbgdev_address_watch_set_registers(adw_info,
- &addrHi,
- &addrLo,
- &cntl,
- i,
- vmid);
-
- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
- pr_debug("\t\t%20s %08x\n", "register index :", i);
- pr_debug("\t\t%20s %08x\n", "vmid is :", vmid);
- pr_debug("\t\t%20s %p\n", "Add ptr is :",
- adw_info->watch_address);
- pr_debug("\t\t%20s %08llx\n", "Add is :",
- adw_info->watch_address[i]);
- pr_debug("\t\t%20s %08x\n", "Address Low is :",
- addrLo.bitfields.addr);
- pr_debug("\t\t%20s %08x\n", "Address high is :",
- addrHi.bitfields.addr);
- pr_debug("\t\t%20s %08x\n", "Control Mask is :",
- cntl.bitfields.mask);
- pr_debug("\t\t%20s %08x\n", "Control Mode is :",
- cntl.bitfields.mode);
- pr_debug("\t\t%20s %08x\n", "Control Vmid is :",
- cntl.bitfields.vmid);
- pr_debug("\t\t%20s %08x\n", "Control atc is :",
- cntl.bitfields.atc);
- pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
-
- aw_reg_add_dword =
- dbgdev->dev->kfd2kgd->address_watch_get_offset(
- dbgdev->dev->kgd,
- i,
- ADDRESS_WATCH_REG_CNTL);
+ packet_buff_uint = mem_obj->cpu_ptr;
+
+ memset(packet_buff_uint, 0, ib_size);
- aw_reg_add_dword /= sizeof(uint32_t);
+ packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint);
- packets_vec[0].bitfields2.reg_offset =
- aw_reg_add_dword - AMD_CONFIG_REG_BASE;
+ packets_vec[0].header.count = 1;
+ packets_vec[0].header.opcode = IT_SET_CONFIG_REG;
+ packets_vec[0].header.type = PM4_TYPE_3;
+ packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET;
+ packets_vec[0].bitfields2.insert_vmid = 1;
+ packets_vec[1].ordinal1 = packets_vec[0].ordinal1;
+ packets_vec[1].bitfields2.insert_vmid = 0;
+ packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
+ packets_vec[2].bitfields2.insert_vmid = 0;
+ packets_vec[3].ordinal1 = packets_vec[0].ordinal1;
+ packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET;
+ packets_vec[3].bitfields2.insert_vmid = 1;
- packets_vec[0].reg_data[0] = cntl.u32All;
+ for (i = 0; i < adw_info->num_watch_points; i++) {
- aw_reg_add_dword =
- dbgdev->dev->kfd2kgd->address_watch_get_offset(
- dbgdev->dev->kgd,
+ dbgdev_address_watch_set_registers(
+ adw_info,
+ &addrHi,
+ &addrLo,
+ &cntl,
i,
- ADDRESS_WATCH_REG_ADDR_HI);
+ vmid,
+ dbgdev->dev->device_info->asic_family
+ );
+
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+ pr_debug("\t\t%20s %08x\n", "register index :", i);
+ pr_debug("\t\t%20s %08x\n", "vmid is :", vmid);
+ pr_debug("\t\t%20s %p\n", "Add ptr is :", adw_info->watch_address);
+ pr_debug("\t\t%20s %08llx\n", "Add is :", adw_info->watch_address[i]);
+ pr_debug("\t\t%20s %08x\n", "Address Low is :", addrLo.bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "Address high is :", addrHi.bitfields.addr);
+ pr_debug("\t\t%20s %08x\n", "Control Mask is :", cntl.bitfields.mask);
+ pr_debug("\t\t%20s %08x\n", "Control Mode is :", cntl.bitfields.mode);
+ pr_debug("\t\t%20s %08x\n", "Control Vmid is :", cntl.bitfields.vmid);
+ pr_debug("\t\t%20s %08x\n", "Control atc is :", cntl.bitfields.atc);
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+
+ aw_reg_add_dword =
+ dbgdev->dev->kfd2kgd
+ ->address_watch_get_offset(
+ dbgdev->dev->kgd,
+ i,
+ ADDRESS_WATCH_REG_CNTL);
+
+ packets_vec[0].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE;
+ packets_vec[0].reg_data[0] = cntl.u32All;
- aw_reg_add_dword /= sizeof(uint32_t);
+ aw_reg_add_dword =
+ dbgdev->dev->kfd2kgd
+ ->address_watch_get_offset(
+ dbgdev->dev->kgd,
+ i,
+ ADDRESS_WATCH_REG_ADDR_HI);
- packets_vec[1].bitfields2.reg_offset =
- aw_reg_add_dword - AMD_CONFIG_REG_BASE;
- packets_vec[1].reg_data[0] = addrHi.u32All;
- aw_reg_add_dword =
- dbgdev->dev->kfd2kgd->address_watch_get_offset(
- dbgdev->dev->kgd,
- i,
- ADDRESS_WATCH_REG_ADDR_LO);
+ packets_vec[1].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE;
+ packets_vec[1].reg_data[0] = addrHi.u32All;
- aw_reg_add_dword /= sizeof(uint32_t);
+ aw_reg_add_dword =
+ dbgdev->dev->kfd2kgd
+ ->address_watch_get_offset(
+ dbgdev->dev->kgd,
+ i,
+ ADDRESS_WATCH_REG_ADDR_LO);
- packets_vec[2].bitfields2.reg_offset =
- aw_reg_add_dword - AMD_CONFIG_REG_BASE;
- packets_vec[2].reg_data[0] = addrLo.u32All;
- /* enable watch flag if address is not zero*/
- if (adw_info->watch_address[i] > 0)
- cntl.bitfields.valid = 1;
- else
- cntl.bitfields.valid = 0;
+ packets_vec[2].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE;
+ packets_vec[2].reg_data[0] = addrLo.u32All;
- aw_reg_add_dword =
- dbgdev->dev->kfd2kgd->address_watch_get_offset(
- dbgdev->dev->kgd,
- i,
- ADDRESS_WATCH_REG_CNTL);
+ /* enable watch flag if address is not zero*/
+ if (adw_info->watch_address[i] > 0)
+ cntl.bitfields.valid = 1;
+ else
+ cntl.bitfields.valid = 0;
- aw_reg_add_dword /= sizeof(uint32_t);
+ aw_reg_add_dword =
+ dbgdev->dev->kfd2kgd
+ ->address_watch_get_offset(
+ dbgdev->dev->kgd,
+ i,
+ ADDRESS_WATCH_REG_CNTL);
- packets_vec[3].bitfields2.reg_offset =
- aw_reg_add_dword - AMD_CONFIG_REG_BASE;
- packets_vec[3].reg_data[0] = cntl.u32All;
- status = dbgdev_diq_submit_ib(
- dbgdev,
- adw_info->process->pasid,
- mem_obj->gpu_addr,
- packet_buff_uint,
- ib_size);
+ packets_vec[3].bitfields2.reg_offset = aw_reg_add_dword - CONFIG_REG_BASE;
+ packets_vec[3].reg_data[0] = cntl.u32All;
+
+ status = dbgdev_diq_submit_ib(
+ dbgdev,
+ adw_info->process->pasid,
+ mem_obj->gpu_addr,
+ packet_buff_uint,
+ ib_size);
+
+ if (status != 0) {
+ pr_debug("Error! kfd: In func %s >> failed to submit DIQ packet\n", __func__);
+ break;
+ }
- if (status != 0) {
- pr_err("amdkfd: Failed to submit IB to DIQ\n");
- break;
}
- }
- kfd_gtt_sa_free(dbgdev->dev, mem_obj);
+ } while (false);
+ if (packet_buff_uint != NULL)
+ kfd_gtt_sa_free(dbgdev->dev, mem_obj);
+
return status;
+
}
static int dbgdev_wave_control_set_registers(
struct dbg_wave_control_info *wac_info,
union SQ_CMD_BITS *in_reg_sq_cmd,
- union GRBM_GFX_INDEX_BITS *in_reg_gfx_index)
+ union GRBM_GFX_INDEX_BITS *in_reg_gfx_index,
+ unsigned int asic_family)
{
int status = 0;
union SQ_CMD_BITS reg_sq_cmd;
union GRBM_GFX_INDEX_BITS reg_gfx_index;
- struct HsaDbgWaveMsgAMDGen2 *pMsg;
-
- BUG_ON(!wac_info || !in_reg_sq_cmd || !in_reg_gfx_index);
reg_sq_cmd.u32All = 0;
+
reg_gfx_index.u32All = 0;
- pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2;
switch (wac_info->mode) {
- /* Send command to single wave */
- case HSA_DBG_WAVEMODE_SINGLE:
- /*
- * Limit access to the process waves only,
- * by setting vmid check
- */
+ case HSA_DBG_WAVEMODE_SINGLE: /* Send command to single wave */
+ /*limit access to the process waves only,by setting vmid check */
reg_sq_cmd.bits.check_vmid = 1;
- reg_sq_cmd.bits.simd_id = pMsg->ui32.SIMD;
- reg_sq_cmd.bits.wave_id = pMsg->ui32.WaveId;
+ reg_sq_cmd.bits.simd_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.SIMD;
+ reg_sq_cmd.bits.wave_id = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.WaveId;
reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_SINGLE;
- reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray;
- reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine;
- reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU;
+ reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray;
+ reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine;
+ reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU;
break;
- /* Send command to all waves with matching VMID */
- case HSA_DBG_WAVEMODE_BROADCAST_PROCESS:
+ case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: /* Send command to all waves with matching VMID */
+
reg_gfx_index.bits.sh_broadcast_writes = 1;
reg_gfx_index.bits.se_broadcast_writes = 1;
reg_gfx_index.bits.instance_broadcast_writes = 1;
reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST;
-
break;
- /* Send command to all CU waves with matching VMID */
- case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU:
+ case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: /* Send command to all CU waves with matching VMID */
reg_sq_cmd.bits.check_vmid = 1;
reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST;
- reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray;
- reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine;
- reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU;
+ reg_gfx_index.bits.sh_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderArray;
+ reg_gfx_index.bits.se_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.ShaderEngine;
+ reg_gfx_index.bits.instance_index = wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.ui32.HSACU;
break;
default:
- return -EINVAL;
+ status = -EINVAL;
+ break;
}
switch (wac_info->operand) {
case HSA_DBG_WAVEOP_HALT:
- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT;
+ if (asic_family == CHIP_KAVERI) {
+ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT;
+ pr_debug("kfd:dbgdev: halting KV\n");
+ } else {
+ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT;
+ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT;
+ pr_debug("kfd:dbgdev: halting CZ\n");
+ }
break;
case HSA_DBG_WAVEOP_RESUME:
- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME;
+ if (asic_family == CHIP_KAVERI) {
+ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME;
+ pr_debug("kfd:dbgdev: resuming KV\n");
+ } else {
+ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT;
+ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME;
+ pr_debug("kfd:dbgdev: resuming CZ\n");
+ }
break;
case HSA_DBG_WAVEOP_KILL:
@@ -601,128 +608,114 @@ static int dbgdev_wave_control_set_registers(
}
if (status == 0) {
- *in_reg_sq_cmd = reg_sq_cmd;
+ *in_reg_sq_cmd = reg_sq_cmd;
*in_reg_gfx_index = reg_gfx_index;
}
-
return status;
+
}
static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
struct dbg_wave_control_info *wac_info)
{
- int status;
+ int status = 0;
union SQ_CMD_BITS reg_sq_cmd;
union GRBM_GFX_INDEX_BITS reg_gfx_index;
struct kfd_mem_obj *mem_obj;
- uint32_t *packet_buff_uint;
- struct pm4__set_config_reg *packets_vec;
+ uint32_t *packet_buff_uint = NULL;
+ struct pm4__set_config_reg *packets_vec = NULL;
size_t ib_size = sizeof(struct pm4__set_config_reg) * 3;
- BUG_ON(!dbgdev || !wac_info);
-
reg_sq_cmd.u32All = 0;
+ do {
- status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd,
- &reg_gfx_index);
- if (status) {
- pr_err("amdkfd: Failed to set wave control registers\n");
- return status;
- }
-
- /* we do not control the VMID in DIQ,so reset it to a known value */
- reg_sq_cmd.bits.vm_id = 0;
-
- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
-
- pr_debug("\t\t mode is: %u\n", wac_info->mode);
- pr_debug("\t\t operand is: %u\n", wac_info->operand);
- pr_debug("\t\t trap id is: %u\n", wac_info->trapId);
- pr_debug("\t\t msg value is: %u\n",
- wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
- pr_debug("\t\t vmid is: N/A\n");
-
- pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid);
- pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd);
- pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id);
- pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id);
- pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode);
- pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id);
- pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id);
-
- pr_debug("\t\t ibw is : %u\n",
- reg_gfx_index.bitfields.instance_broadcast_writes);
- pr_debug("\t\t ii is : %u\n",
- reg_gfx_index.bitfields.instance_index);
- pr_debug("\t\t sebw is : %u\n",
- reg_gfx_index.bitfields.se_broadcast_writes);
- pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index);
- pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index);
- pr_debug("\t\t sbw is : %u\n",
- reg_gfx_index.bitfields.sh_broadcast_writes);
-
- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
-
- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);
-
- if (status != 0) {
- pr_err("amdkfd: Failed to allocate GART memory\n");
- return status;
- }
-
- packet_buff_uint = mem_obj->cpu_ptr;
+ status = dbgdev_wave_control_set_registers(wac_info,
+ &reg_sq_cmd,
+ &reg_gfx_index,
+ dbgdev->dev->device_info->asic_family);
- memset(packet_buff_uint, 0, ib_size);
+ /* we do not control the VMID in DIQ,so reset it to a known value */
+ reg_sq_cmd.bits.vm_id = 0;
+ if (status != 0)
+ break;
+ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
+
+ pr_debug("\t\t mode is: %u\n", wac_info->mode);
+ pr_debug("\t\t operand is: %u\n", wac_info->operand);
+ pr_debug("\t\t trap id is: %u\n", wac_info->trapId);
+ pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
+ pr_debug("\t\t vmid is: N/A\n");
+
+ pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid);
+ pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd);
+ pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id);
+ pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id);
+ pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode);
+ pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id);
+ pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id);
+
+ pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes);
+ pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index);
+ pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes);
+ pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index);
+ pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index);
+ pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes);
+
+ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
+
+ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);
+
+ if (status != 0)
+ break;
- packets_vec = (struct pm4__set_config_reg *) packet_buff_uint;
- packets_vec[0].header.count = 1;
- packets_vec[0].header.opcode = IT_SET_UCONFIG_REG;
- packets_vec[0].header.type = PM4_TYPE_3;
- packets_vec[0].bitfields2.reg_offset =
- GRBM_GFX_INDEX / (sizeof(uint32_t)) -
- USERCONFIG_REG_BASE;
+ packet_buff_uint = mem_obj->cpu_ptr;
- packets_vec[0].bitfields2.insert_vmid = 0;
- packets_vec[0].reg_data[0] = reg_gfx_index.u32All;
+ memset(packet_buff_uint, 0, ib_size);
- packets_vec[1].header.count = 1;
- packets_vec[1].header.opcode = IT_SET_CONFIG_REG;
- packets_vec[1].header.type = PM4_TYPE_3;
- packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) -
- AMD_CONFIG_REG_BASE;
+ packets_vec = (struct pm4__set_config_reg *) packet_buff_uint;
+ packets_vec[0].header.count = 1;
+ packets_vec[0].header.opcode = IT_SET_UCONFIG_REG;
+ packets_vec[0].header.type = PM4_TYPE_3;
+ packets_vec[0].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE;
+ packets_vec[0].bitfields2.insert_vmid = 0;
+ packets_vec[0].reg_data[0] = reg_gfx_index.u32All;
- packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET;
- packets_vec[1].bitfields2.insert_vmid = 1;
- packets_vec[1].reg_data[0] = reg_sq_cmd.u32All;
+ packets_vec[1].header.count = 1;
+ packets_vec[1].header.opcode = IT_SET_CONFIG_REG;
+ packets_vec[1].header.type = PM4_TYPE_3;
+ packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - CONFIG_REG_BASE;
+ packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET;
+ packets_vec[1].bitfields2.insert_vmid = 1;
+ packets_vec[1].reg_data[0] = reg_sq_cmd.u32All;
- /* Restore the GRBM_GFX_INDEX register */
+ /* Restore the GRBM_GFX_INDEX register */
- reg_gfx_index.u32All = 0;
- reg_gfx_index.bits.sh_broadcast_writes = 1;
- reg_gfx_index.bits.instance_broadcast_writes = 1;
- reg_gfx_index.bits.se_broadcast_writes = 1;
+ reg_gfx_index.u32All = 0;
+ reg_gfx_index.bits.sh_broadcast_writes = 1;
+ reg_gfx_index.bits.instance_broadcast_writes = 1;
+ reg_gfx_index.bits.se_broadcast_writes = 1;
- packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
- packets_vec[2].bitfields2.reg_offset =
- GRBM_GFX_INDEX / (sizeof(uint32_t)) -
- USERCONFIG_REG_BASE;
+ packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
+ packets_vec[2].bitfields2.reg_offset = GRBM_GFX_INDEX / (sizeof(uint32_t)) - USERCONFIG_REG_BASE;
+ packets_vec[2].bitfields2.insert_vmid = 0;
+ packets_vec[2].reg_data[0] = reg_gfx_index.u32All;
- packets_vec[2].bitfields2.insert_vmid = 0;
- packets_vec[2].reg_data[0] = reg_gfx_index.u32All;
+ status = dbgdev_diq_submit_ib(
+ dbgdev,
+ wac_info->process->pasid,
+ mem_obj->gpu_addr,
+ packet_buff_uint,
+ ib_size);
- status = dbgdev_diq_submit_ib(
- dbgdev,
- wac_info->process->pasid,
- mem_obj->gpu_addr,
- packet_buff_uint,
- ib_size);
+ if (status != 0)
+ pr_debug("%s\n", " Critical Error ! Submit diq packet failed ");
- if (status != 0)
- pr_err("amdkfd: Failed to submit IB to DIQ\n");
+ } while (false);
- kfd_gtt_sa_free(dbgdev->dev, mem_obj);
+ if (packet_buff_uint != NULL)
+ kfd_gtt_sa_free(dbgdev->dev, mem_obj);
return status;
}
@@ -730,66 +723,69 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev,
struct dbg_wave_control_info *wac_info)
{
- int status;
+ int status = 0;
+ unsigned int vmid = 0xffff;
union SQ_CMD_BITS reg_sq_cmd;
union GRBM_GFX_INDEX_BITS reg_gfx_index;
- struct kfd_process_device *pdd;
- BUG_ON(!dbgdev || !dbgdev->dev || !wac_info);
+ struct kfd_process_device *pdd = NULL;
reg_sq_cmd.u32All = 0;
+ status = 0;
/* taking the VMID for that process on the safe way using PDD */
pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process);
- if (!pdd) {
- pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n");
- return -EFAULT;
- }
- status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd,
- &reg_gfx_index);
- if (status) {
- pr_err("amdkfd: Failed to set wave control registers\n");
- return status;
+ if (pdd) {
+ status = dbgdev_wave_control_set_registers(wac_info,
+ &reg_sq_cmd,
+ &reg_gfx_index,
+ dbgdev->dev->device_info->asic_family);
+ if (status == 0) {
+
+ /* for non DIQ we need to patch the VMID: */
+
+ vmid = pdd->qpd.vmid;
+ reg_sq_cmd.bits.vm_id = vmid;
+
+ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
+
+ pr_debug("\t\t mode is: %u\n", wac_info->mode);
+ pr_debug("\t\t operand is: %u\n", wac_info->operand);
+ pr_debug("\t\t trap id is: %u\n", wac_info->trapId);
+ pr_debug("\t\t msg value is: %u\n", wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
+ pr_debug("\t\t vmid is: %u\n", vmid);
+
+ pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid);
+ pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd);
+ pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id);
+ pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id);
+ pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode);
+ pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id);
+ pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id);
+
+ pr_debug("\t\t ibw is : %u\n", reg_gfx_index.bitfields.instance_broadcast_writes);
+ pr_debug("\t\t ii is : %u\n", reg_gfx_index.bitfields.instance_index);
+ pr_debug("\t\t sebw is : %u\n", reg_gfx_index.bitfields.se_broadcast_writes);
+ pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index);
+ pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index);
+ pr_debug("\t\t sbw is : %u\n", reg_gfx_index.bitfields.sh_broadcast_writes);
+
+ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
+
+ dbgdev->dev->kfd2kgd
+ ->wave_control_execute(dbgdev->dev->kgd,
+ reg_gfx_index.u32All,
+ reg_sq_cmd.u32All);
+ } else {
+ status = -EINVAL;
+ }
+ } else {
+ status = -EFAULT;
}
- /* for non DIQ we need to patch the VMID: */
+ return status;
- reg_sq_cmd.bits.vm_id = pdd->qpd.vmid;
-
- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
-
- pr_debug("\t\t mode is: %u\n", wac_info->mode);
- pr_debug("\t\t operand is: %u\n", wac_info->operand);
- pr_debug("\t\t trap id is: %u\n", wac_info->trapId);
- pr_debug("\t\t msg value is: %u\n",
- wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
- pr_debug("\t\t vmid is: %u\n", pdd->qpd.vmid);
-
- pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid);
- pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd);
- pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id);
- pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id);
- pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode);
- pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id);
- pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id);
-
- pr_debug("\t\t ibw is : %u\n",
- reg_gfx_index.bitfields.instance_broadcast_writes);
- pr_debug("\t\t ii is : %u\n",
- reg_gfx_index.bitfields.instance_index);
- pr_debug("\t\t sebw is : %u\n",
- reg_gfx_index.bitfields.se_broadcast_writes);
- pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index);
- pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index);
- pr_debug("\t\t sbw is : %u\n",
- reg_gfx_index.bitfields.sh_broadcast_writes);
-
- pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
-
- return dbgdev->dev->kfd2kgd->wave_control_execute(dbgdev->dev->kgd,
- reg_gfx_index.u32All,
- reg_sq_cmd.u32All);
}
int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
@@ -800,13 +796,8 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
union GRBM_GFX_INDEX_BITS reg_gfx_index;
struct kfd_process_device *pdd;
struct dbg_wave_control_info wac_info;
- int temp;
- int first_vmid_to_scan = 8;
- int last_vmid_to_scan = 15;
-
- first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1;
- temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan;
- last_vmid_to_scan = first_vmid_to_scan + ffz(temp);
+ int first_vmid_to_scan = dev->vm_info.first_vmid_kfd;
+ int last_vmid_to_scan = dev->vm_info.last_vmid_kfd;
reg_sq_cmd.u32All = 0;
status = 0;
@@ -823,7 +814,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) {
if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid
(dev->kgd, vmid)) {
- if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid
+ if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid
(dev->kgd, vmid) == p->pasid) {
pr_debug("Killing wave fronts of vmid %d and pasid %d\n",
vmid, p->pasid);
@@ -833,7 +824,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
}
if (vmid > last_vmid_to_scan) {
- pr_err("amdkfd: didn't found vmid for pasid (%d)\n", p->pasid);
+ pr_err("amdkfd: didn't find vmid for pasid (%d)\n", p->pasid);
return -EFAULT;
}
@@ -843,7 +834,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
return -EFAULT;
status = dbgdev_wave_control_set_registers(&wac_info, &reg_sq_cmd,
- &reg_gfx_index);
+ &reg_gfx_index, dev->device_info->asic_family);
if (status != 0)
return -EINVAL;
@@ -858,15 +849,12 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
}
void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev,
- enum DBGDEV_TYPE type)
+ DBGDEV_TYPE type)
{
- BUG_ON(!pdbgdev || !pdev);
-
pdbgdev->dev = pdev;
pdbgdev->kq = NULL;
pdbgdev->type = type;
pdbgdev->pqm = NULL;
-
switch (type) {
case DBGDEV_TYPE_NODIQ:
pdbgdev->dbgdev_register = dbgdev_register_nodiq;
@@ -876,10 +864,12 @@ void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev,
break;
case DBGDEV_TYPE_DIQ:
default:
+
pdbgdev->dbgdev_register = dbgdev_register_diq;
pdbgdev->dbgdev_unregister = dbgdev_unregister_diq;
pdbgdev->dbgdev_wave_control = dbgdev_wave_control_diq;
pdbgdev->dbgdev_address_watch = dbgdev_address_watch_diq;
+
break;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
index 03424c20920c..82f48ff3bf9a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
@@ -23,6 +23,10 @@
#ifndef KFD_DBGDEV_H_
#define KFD_DBGDEV_H_
+/*
+ * SQ_IND_CMD_CMD enum
+ */
+
enum {
SQ_CMD_VMID_OFFSET = 28,
ADDRESS_WATCH_CNTL_OFFSET = 24
@@ -48,9 +52,9 @@ enum {
/* CONFIG reg space definition */
enum {
- AMD_CONFIG_REG_BASE = 0x2000, /* in dwords */
- AMD_CONFIG_REG_END = 0x2B00,
- AMD_CONFIG_REG_SIZE = AMD_CONFIG_REG_END - AMD_CONFIG_REG_BASE
+ CONFIG_REG_BASE = 0x2000, /* in dwords */
+ CONFIG_REG_END = 0x2B00,
+ CONFIG_REG_SIZE = CONFIG_REG_END - CONFIG_REG_BASE
};
/* SH reg space definition */
@@ -60,22 +64,43 @@ enum {
SH_REG_SIZE = SH_REG_END - SH_REG_BASE
};
+/* SQ_CMD definitions */
+
+enum {
+ SQ_IND_CMD_DATA_RESUME = 0,
+ SQ_IND_CMD_DATA_HALT = 1
+};
+
+enum SQ_IND_CMD_NEW {
+ SQ_IND_CMD_NEW_NULL = 0x00000000,
+ SQ_IND_CMD_NEW_SETHALT = 0x00000001,
+ SQ_IND_CMD_NEW_SAVECTX = 0x00000002,
+ SQ_IND_CMD_NEW_KILL = 0x00000003,
+ SQ_IND_CMD_NEW_DEBUG = 0x00000004,
+ SQ_IND_CMD_NEW_TRAP = 0x00000005,
+ SQ_IND_CMD_NEW_SET_PRIO = 0x00000006
+
+};
+
enum SQ_IND_CMD_CMD {
SQ_IND_CMD_CMD_NULL = 0x00000000,
SQ_IND_CMD_CMD_HALT = 0x00000001,
SQ_IND_CMD_CMD_RESUME = 0x00000002,
SQ_IND_CMD_CMD_KILL = 0x00000003,
SQ_IND_CMD_CMD_DEBUG = 0x00000004,
- SQ_IND_CMD_CMD_TRAP = 0x00000005,
+ SQ_IND_CMD_CMD_TRAP = 0x00000005
};
+/*
+ * SQ_IND_CMD_MODE enum
+ */
-enum SQ_IND_CMD_MODE {
+typedef enum SQ_IND_CMD_MODE {
SQ_IND_CMD_MODE_SINGLE = 0x00000000,
SQ_IND_CMD_MODE_BROADCAST = 0x00000001,
SQ_IND_CMD_MODE_BROADCAST_QUEUE = 0x00000002,
SQ_IND_CMD_MODE_BROADCAST_PIPE = 0x00000003,
SQ_IND_CMD_MODE_BROADCAST_ME = 0x00000004,
-};
+} SQ_IND_CMD_MODE;
union SQ_IND_INDEX_BITS {
struct {
@@ -106,18 +131,32 @@ union SQ_IND_CMD_BITS {
union SQ_CMD_BITS {
struct {
uint32_t cmd:3;
- uint32_t:1;
+ uint32_t:1;
uint32_t mode:3;
uint32_t check_vmid:1;
uint32_t trap_id:3;
- uint32_t:5;
+ uint32_t:5;
uint32_t wave_id:4;
uint32_t simd_id:2;
- uint32_t:2;
+ uint32_t:2;
uint32_t queue_id:3;
- uint32_t:1;
+ uint32_t:1;
uint32_t vm_id:4;
} bitfields, bits;
+ struct {
+ uint32_t cmd:3;
+ uint32_t:1;
+ uint32_t mode:3;
+ uint32_t check_vmid:1;
+ uint32_t data:3;
+ uint32_t:5;
+ uint32_t wave_id:4;
+ uint32_t simd_id:2;
+ uint32_t:2;
+ uint32_t queue_id:3;
+ uint32_t:1;
+ uint32_t vm_id:4;
+ } bitfields_sethalt, bits_sethalt;
uint32_t u32All;
signed int i32All;
float f32All;
@@ -169,7 +208,7 @@ union TCP_WATCH_ADDR_L_BITS {
};
enum {
- QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */
+ QUEUESTATE__INVALID = 0, /* so by default we'll get invalid state */
QUEUESTATE__ACTIVE_COMPLETION_PENDING,
QUEUESTATE__ACTIVE
};
@@ -187,7 +226,6 @@ union ULARGE_INTEGER {
#define KFD_CIK_VMID_END_OFFSET (KFD_CIK_VMID_START_OFFSET + (8))
-void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev,
- enum DBGDEV_TYPE type);
+void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, DBGDEV_TYPE type);
-#endif /* KFD_DBGDEV_H_ */
+#endif /* KFD_DBGDEV_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
index 56d676396342..5d269ea94957 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
@@ -36,42 +36,50 @@
static DEFINE_MUTEX(kfd_dbgmgr_mutex);
-struct mutex *kfd_get_dbgmgr_mutex(void)
+struct mutex *
+get_dbgmgr_mutex(void)
{
return &kfd_dbgmgr_mutex;
}
+/*===========================================================================*/
-static void kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr)
+static void
+kfd_dbgmgr_uninitialize(struct kfd_dbgmgr *pmgr)
{
- BUG_ON(!pmgr);
-
kfree(pmgr->dbgdev);
-
pmgr->dbgdev = NULL;
pmgr->pasid = 0;
pmgr->dev = NULL;
}
-void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr)
+/*===========================================================================*/
+
+void
+kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr)
{
if (pmgr != NULL) {
kfd_dbgmgr_uninitialize(pmgr);
kfree(pmgr);
+ pmgr = NULL;
}
}
-bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
+/*===========================================================================*/
+
+bool
+kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
{
- enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ;
+ DBGDEV_TYPE type = DBGDEV_TYPE_DIQ;
struct kfd_dbgmgr *new_buff;
BUG_ON(pdev == NULL);
BUG_ON(!pdev->init_complete);
new_buff = kfd_alloc_struct(new_buff);
- if (!new_buff) {
- pr_err("amdkfd: Failed to allocate dbgmgr instance\n");
+ if (!new_buff)
+ {
+ dev_err(NULL, "Error! kfd: In func %s >> failed to allocate dbgmgr instance\n", __func__);
return false;
}
@@ -79,7 +87,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
new_buff->dev = pdev;
new_buff->dbgdev = kfd_alloc_struct(new_buff->dbgdev);
if (!new_buff->dbgdev) {
- pr_err("amdkfd: Failed to allocate dbgdev instance\n");
+ dev_err(NULL, "Error! kfd: In func %s >> failed to allocate dbgdev\n", __func__);
kfree(new_buff);
return false;
}
@@ -94,75 +102,200 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
return true;
}
-long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
+/*===========================================================================*/
+
+long
+kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
{
- BUG_ON(!p || !pmgr || !pmgr->dbgdev);
+ long status = 0;
- if (pmgr->pasid != 0) {
- pr_debug("H/W debugger is already active using pasid %d\n",
- pmgr->pasid);
- return -EBUSY;
- }
+ do {
+
+ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) {
+ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__);
+ /* Invalid Pointer. */
+ status = -EINVAL;
+ break;
+ }
+ if (pmgr->pasid != 0) {
+ /* HW debugger is already active. */
+ status = -EBUSY;
+ break;
+ }
+
+ /* remember pasid */
+
+ pmgr->pasid = p->pasid;
+
+ /* provide the pqm for diq generation */
- /* remember pasid */
- pmgr->pasid = p->pasid;
+ pmgr->dbgdev->pqm = &p->pqm;
- /* provide the pqm for diq generation */
- pmgr->dbgdev->pqm = &p->pqm;
+ /* activate the actual registering */
+ /* todo: you should lock with the process mutex here */
+ pmgr->dbgdev->dbgdev_register(pmgr->dbgdev);
+ /* todo: you should unlock with the process mutex here */
- /* activate the actual registering */
- pmgr->dbgdev->dbgdev_register(pmgr->dbgdev);
+ } while (false);
- return 0;
+ return status;
}
-long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
+/* ========================================================================== */
+
+long
+kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
{
- BUG_ON(!p || !pmgr || !pmgr->dbgdev);
- /* Is the requests coming from the already registered process? */
- if (pmgr->pasid != p->pasid) {
- pr_debug("H/W debugger is not registered by calling pasid %d\n",
- p->pasid);
- return -EINVAL;
- }
+ long status = 0;
- pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev);
+ do {
- pmgr->pasid = 0;
+ if ((pmgr == NULL) || (pmgr->dev == NULL)
+ || (pmgr->dbgdev == NULL) || (p == NULL)) {
+ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__);
+ /* Invalid Pointer */
+ status = -EINVAL;
+ break;
+ }
+ if (pmgr->pasid != p->pasid) {
+ /* Is the request coming from the already registered process? */
+ status = -EINVAL;
+ break;
+ }
+
+ /* todo: you should lock with the process mutex here */
+
+ pmgr->dbgdev->dbgdev_unregister(pmgr->dbgdev);
- return 0;
+ /* todo: you should unlock with the process mutex here */
+
+ pmgr->pasid = 0;
+
+ } while (false);
+
+ return status;
}
-long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr,
- struct dbg_wave_control_info *wac_info)
+/* =========================================================================== */
+
+long
+kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info)
{
- BUG_ON(!pmgr || !pmgr->dbgdev || !wac_info);
+ long status = 0;
- /* Is the requests coming from the already registered process? */
- if (pmgr->pasid != wac_info->process->pasid) {
- pr_debug("H/W debugger support was not registered for requester pasid %d\n",
- wac_info->process->pasid);
- return -EINVAL;
- }
+ dev_info(NULL, "kfd: In func %s\n", __func__);
+
+ do {
+
+ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (wac_info == NULL)
+ || (wac_info->process == NULL)) {
+ /* Invalid Pointer */
+ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__);
+ status = -EINVAL;
+ break;
+ }
+ /* Is the request coming from the already registered process? */
+ if (pmgr->pasid != wac_info->process->pasid) {
+ /* HW debugger support was not registered for requester process */
+ status = -EINVAL;
+ break;
+ }
+
+ status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info);
+
+ } while (false);
+
+ return status;
- return (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info);
}
-long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr,
- struct dbg_address_watch_info *adw_info)
+/* =========================================================================== */
+
+long
+kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info)
{
- BUG_ON(!pmgr || !pmgr->dbgdev || !adw_info);
+ long status = 0;
+ dev_info(NULL, "kfd: In func %s\n", __func__);
- /* Is the requests coming from the already registered process? */
- if (pmgr->pasid != adw_info->process->pasid) {
- pr_debug("H/W debugger support was not registered for requester pasid %d\n",
- adw_info->process->pasid);
- return -EINVAL;
- }
+ do {
+
+ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL) || (adw_info == NULL)
+ || (adw_info->process == NULL)) {
+ /* Invalid Pointer */
+ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__);
+ status = -EINVAL;
+ break;
+ }
+ /* Is the request coming from the already registered process? */
+ if (pmgr->pasid != adw_info->process->pasid) {
+ /* HW debugger support was not registered for requester process */
+ status = -EINVAL;
+ break;
+ }
+
+ status = (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev, adw_info);
+
+ } while (false);
+
+ return status;
- return (long) pmgr->dbgdev->dbgdev_address_watch(pmgr->dbgdev,
- adw_info);
}
+
+/* =========================================================================== */
+/*
+ * Handle abnormal process termination
+ * if we are in the midst of a debug session, we should kill all pending waves
+ * of the debugged process and unregister the process from the Debugger.
+ */
+long
+kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process)
+{
+ long status = 0;
+ struct dbg_wave_control_info wac_info;
+
+ dev_info(NULL, "kfd: In func %s\n", __func__);
+
+ do {
+
+ if ((pmgr == NULL) || (pmgr->dev == NULL) || (pmgr->dbgdev == NULL)) {
+ /* Invalid Pointer */
+ dev_info(NULL, "Error! kfd: In func %s >> Illegal pointers\n", __func__);
+ status = -EINVAL;
+ break;
+ }
+ /* first, we kill all the wavefronts of this process */
+
+ wac_info.process = process;
+ wac_info.mode = HSA_DBG_WAVEMODE_BROADCAST_PROCESS;
+ wac_info.operand = HSA_DBG_WAVEOP_KILL;
+ wac_info.trapId = 0x0; /* not used for the KILL */
+ wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = 0; /* not used for kill */
+ wac_info.dbgWave_msg.MemoryVA = NULL; /* not used for kill */
+
+ status = (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, &wac_info);
+
+ if (status != 0) {
+ dev_info(NULL, "Error! kfd: In func %s: wave control failed, status is: %ld\n", __func__, status);
+ break;
+ }
+ if (pmgr->pasid == wac_info.process->pasid) {
+ /* if terminated process was registered for debug, then unregister it */
+ status = kfd_dbgmgr_unregister(pmgr, process);
+ pmgr->pasid = 0;
+ }
+ if (status != 0)
+ dev_info(NULL,
+ "Error! kfd: In func %s: unregister failed, status is: %ld debugger can not be reused\n",
+ __func__, status);
+
+ } while (false);
+
+ return status;
+
+}
+
+
+/*///////////////////////////////////////////////////////////////////////////////////////// */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h
index 257a745ad0b5..2b6484ee8d16 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h
@@ -26,252 +26,242 @@
#include "kfd_priv.h"
-/* must align with hsakmttypes definition */
+/*
+ * SQ_IND_CMD_CMD enum
+ */
+
+
+/* must align with hsakmttypes definition. */
#pragma pack(push, 4)
-enum HSA_DBG_WAVEOP {
- HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */
- HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */
- HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */
- HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter
- debug mode */
- HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take
- a trap */
+typedef enum _HSA_DBG_WAVEOP {
+ HSA_DBG_WAVEOP_HALT = 1, /* Halts a wavefront */
+ HSA_DBG_WAVEOP_RESUME = 2, /* Resumes a wavefront */
+ HSA_DBG_WAVEOP_KILL = 3, /* Kills a wavefront */
+ HSA_DBG_WAVEOP_DEBUG = 4, /* Causes wavefront to enter debug mode */
+ HSA_DBG_WAVEOP_TRAP = 5, /* Causes wavefront to take a trap */
HSA_DBG_NUM_WAVEOP = 5,
HSA_DBG_MAX_WAVEOP = 0xFFFFFFFF
-};
+} HSA_DBG_WAVEOP;
-enum HSA_DBG_WAVEMODE {
- /* send command to a single wave */
- HSA_DBG_WAVEMODE_SINGLE = 0,
- /*
- * Broadcast to all wavefronts of all processes is not
- * supported for HSA user mode
- */
-
- /* send to waves within current process */
- HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2,
- /* send to waves within current process on CU */
- HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3,
+typedef enum _HSA_DBG_WAVEMODE {
+ HSA_DBG_WAVEMODE_SINGLE = 0, /* send command to a single wave */
+ /* Broadcast to all wavefronts of all processes is not supported for HSA user mode */
+ HSA_DBG_WAVEMODE_BROADCAST_PROCESS = 2, /* send to waves within current process */
+ HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU = 3, /* send to waves within current process on CU */
HSA_DBG_NUM_WAVEMODE = 3,
HSA_DBG_MAX_WAVEMODE = 0xFFFFFFFF
-};
+} HSA_DBG_WAVEMODE;
-enum HSA_DBG_WAVEMSG_TYPE {
+typedef enum _HSA_DBG_WAVEMSG_TYPE {
HSA_DBG_WAVEMSG_AUTO = 0,
HSA_DBG_WAVEMSG_USER = 1,
HSA_DBG_WAVEMSG_ERROR = 2,
HSA_DBG_NUM_WAVEMSG,
HSA_DBG_MAX_WAVEMSG = 0xFFFFFFFF
-};
+} HSA_DBG_WAVEMSG_TYPE;
-enum HSA_DBG_WATCH_MODE {
- HSA_DBG_WATCH_READ = 0, /* Read operations only */
- HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */
- HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */
- HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */
+typedef enum _HSA_DBG_WATCH_MODE {
+ HSA_DBG_WATCH_READ = 0, /* Read operations only */
+ HSA_DBG_WATCH_NONREAD = 1, /* Write or Atomic operations only */
+ HSA_DBG_WATCH_ATOMIC = 2, /* Atomic Operations only */
+ HSA_DBG_WATCH_ALL = 3, /* Read, Write or Atomic operations */
HSA_DBG_WATCH_NUM,
HSA_DBG_WATCH_SIZE = 0xFFFFFFFF
-};
+} HSA_DBG_WATCH_MODE;
/* This structure is hardware specific and may change in the future */
-struct HsaDbgWaveMsgAMDGen2 {
+typedef struct _HsaDbgWaveMsgAMDGen2 {
union {
- struct ui32 {
- uint32_t UserData:8; /* user data */
- uint32_t ShaderArray:1; /* Shader array */
- uint32_t Priv:1; /* Privileged */
- uint32_t Reserved0:4; /* This field is reserved,
- should be 0 */
- uint32_t WaveId:4; /* wave id */
- uint32_t SIMD:2; /* SIMD id */
- uint32_t HSACU:4; /* Compute unit */
- uint32_t ShaderEngine:2;/* Shader engine */
- uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */
- uint32_t Reserved1:4; /* This field is reserved,
- should be 0 */
+ struct {
+ uint32_t UserData:8; /* user data */
+ uint32_t ShaderArray:1; /* Shader array */
+ uint32_t Priv:1; /* Privileged */
+ uint32_t Reserved0:4; /* This field is reserved, should be 0 */
+ uint32_t WaveId:4; /* wave id */
+ uint32_t SIMD:2; /* SIMD id */
+ uint32_t HSACU:4; /* Compute unit */
+ uint32_t ShaderEngine:2; /* Shader engine */
+ uint32_t MessageType:2; /* see HSA_DBG_WAVEMSG_TYPE */
+ uint32_t Reserved1:4; /* This field is reserved, should be 0 */
} ui32;
uint32_t Value;
};
- uint32_t Reserved2;
-};
-union HsaDbgWaveMessageAMD {
- struct HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2;
- /* for future HsaDbgWaveMsgAMDGen3; */
-};
-
-struct HsaDbgWaveMessage {
- void *MemoryVA; /* ptr to associated host-accessible data */
- union HsaDbgWaveMessageAMD DbgWaveMsg;
-};
+ uint32_t Reserved2;
-/*
- * TODO: This definitions to be MOVED to kfd_event, once it is implemented.
- *
- * HSA sync primitive, Event and HW Exception notification API definitions.
- * The API functions allow the runtime to define a so-called sync-primitive,
- * a SW object combining a user-mode provided "syncvar" and a scheduler event
- * that can be signaled through a defined GPU interrupt. A syncvar is
- * a process virtual memory location of a certain size that can be accessed
- * by CPU and GPU shader code within the process to set and query the content
- * within that memory. The definition of the content is determined by the HSA
- * runtime and potentially GPU shader code interfacing with the HSA runtime.
- * The syncvar values may be commonly written through an PM4 WRITE_DATA packet
- * in the user mode instruction stream. The OS scheduler event is typically
- * associated and signaled by an interrupt issued by the GPU, but other HSA
- * system interrupt conditions from other HW (e.g. IOMMUv2) may be surfaced
- * by the KFD by this mechanism, too. */
-
-/* these are the new definitions for events */
-enum HSA_EVENTTYPE {
- HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */
- HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */
- HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change
- (start/stop) */
- HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */
- HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */
- HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */
- HSA_EVENTTYPE_PROFILE_EVENT = 6,/* GPU signal for profiling */
- HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state
- (EOP pm4) */
+} HsaDbgWaveMsgAMDGen2;
+
+typedef union _HsaDbgWaveMessageAMD {
+ HsaDbgWaveMsgAMDGen2 WaveMsgInfoGen2;
+ /* for future HsaDbgWaveMsgAMDGen3; */
+} HsaDbgWaveMessageAMD;
+
+typedef struct _HsaDbgWaveMessage {
+ void *MemoryVA; /* ptr to associated host-accessible data */
+ HsaDbgWaveMessageAMD DbgWaveMsg;
+} HsaDbgWaveMessage;
+
+/* TODO: These definitions are to be moved to kfd_event once it is implemented.
+
+ HSA sync primitive, Event and HW Exception notification API definitions
+ The API functions allow the runtime to define a so-called sync-primitive, a SW object
+ combining a user-mode provided "syncvar" and a scheduler event that can be signaled
+ through a defined GPU interrupt. A syncvar is a process virtual memory location of
+ a certain size that can be accessed by CPU and GPU shader code within the process to set
+ and query the content within that memory. The definition of the content is determined by
+ the HSA runtime and potentially GPU shader code interfacing with the HSA runtime.
+ The syncvar values may be commonly written through an PM4 WRITE_DATA packet in the
+ user mode instruction stream. The OS scheduler event is typically associated and
+ signaled by an interrupt issued by the GPU, but other HSA system interrupt conditions
+ from other HW (e.g. IOMMUv2) may be surfaced by the KFD by this mechanism, too. */
+
+/* these are the new definitions for events */
+
+typedef enum _HSA_EVENTTYPE {
+ HSA_EVENTTYPE_SIGNAL = 0, /* user-mode generated GPU signal */
+ HSA_EVENTTYPE_NODECHANGE = 1, /* HSA node change (attach/detach) */
+ HSA_EVENTTYPE_DEVICESTATECHANGE = 2, /* HSA device state change( start/stop ) */
+ HSA_EVENTTYPE_HW_EXCEPTION = 3, /* GPU shader exception event */
+ HSA_EVENTTYPE_SYSTEM_EVENT = 4, /* GPU SYSCALL with parameter info */
+ HSA_EVENTTYPE_DEBUG_EVENT = 5, /* GPU signal for debugging */
+ HSA_EVENTTYPE_PROFILE_EVENT = 6, /* GPU signal for profiling */
+ HSA_EVENTTYPE_QUEUE_EVENT = 7, /* GPU signal queue idle state (EOP pm4) */
/* ... */
HSA_EVENTTYPE_MAXID,
HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF
-};
+} HSA_EVENTTYPE;
+
+typedef uint32_t HSA_EVENTID;
-/* Sub-definitions for various event types: Syncvar */
-struct HsaSyncVar {
- union SyncVar {
- void *UserData; /* pointer to user mode data */
- uint64_t UserDataPtrValue; /* 64bit compatibility of value */
+/* Subdefinitions for various event types: Syncvar */
+
+typedef struct _HsaSyncVar {
+ union {
+ void *UserData; /* pointer to user mode data */
+ uint64_t UserDataPtrValue; /* 64bit compatibility of value */
} SyncVar;
uint64_t SyncVarSize;
-};
+} HsaSyncVar;
-/* Sub-definitions for various event types: NodeChange */
+/*
+ Subdefinitions for various event types: NodeChange
+*/
-enum HSA_EVENTTYPE_NODECHANGE_FLAGS {
+typedef enum _HSA_EVENTTYPE_NODECHANGE_FLAGS {
HSA_EVENTTYPE_NODECHANGE_ADD = 0,
HSA_EVENTTYPE_NODECHANGE_REMOVE = 1,
HSA_EVENTTYPE_NODECHANGE_SIZE = 0xFFFFFFFF
-};
+} HSA_EVENTTYPE_NODECHANGE_FLAGS;
-struct HsaNodeChange {
- /* HSA node added/removed on the platform */
- enum HSA_EVENTTYPE_NODECHANGE_FLAGS Flags;
-};
+typedef struct _HsaNodeChange {
+ HSA_EVENTTYPE_NODECHANGE_FLAGS Flags; /* HSA node added/removed on the platform */
+} HsaNodeChange;
+
+/*
+ Sub-definitions for various event types: DeviceStateChange
+*/
-/* Sub-definitions for various event types: DeviceStateChange */
-enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS {
- /* device started (and available) */
- HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0,
- /* device stopped (i.e. unavailable) */
- HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1,
+typedef enum _HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS {
+ HSA_EVENTTYPE_DEVICESTATUSCHANGE_START = 0, /* device started (and available) */
+ HSA_EVENTTYPE_DEVICESTATUSCHANGE_STOP = 1, /* device stopped (i.e. unavailable) */
HSA_EVENTTYPE_DEVICESTATUSCHANGE_SIZE = 0xFFFFFFFF
-};
+} HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS;
-enum HSA_DEVICE {
+typedef enum _HSA_DEVICE {
HSA_DEVICE_CPU = 0,
HSA_DEVICE_GPU = 1,
MAX_HSA_DEVICE = 2
-};
+} HSA_DEVICE;
-struct HsaDeviceStateChange {
+typedef struct _HsaDeviceStateChange {
uint32_t NodeId; /* F-NUMA node that contains the device */
- enum HSA_DEVICE Device; /* device type: GPU or CPU */
- enum HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */
-};
+ HSA_DEVICE Device; /* device type: GPU or CPU */
+ HSA_EVENTTYPE_DEVICESTATECHANGE_FLAGS Flags; /* event flags */
+} HsaDeviceStateChange;
-struct HsaEventData {
- enum HSA_EVENTTYPE EventType; /* event type */
- union EventData {
- /*
- * return data associated with HSA_EVENTTYPE_SIGNAL
- * and other events
- */
- struct HsaSyncVar SyncVar;
+typedef struct _HsaEventData {
+ HSA_EVENTTYPE EventType; /* event type */
+ union {
+ /* return data associated with HSA_EVENTTYPE_SIGNAL and other events */
+ HsaSyncVar SyncVar;
/* data associated with HSA_EVENTTYPE_NODE_CHANGE */
- struct HsaNodeChange NodeChangeState;
+ HsaNodeChange NodeChangeState;
/* data associated with HSA_EVENTTYPE_DEVICE_STATE_CHANGE */
- struct HsaDeviceStateChange DeviceState;
+ HsaDeviceStateChange DeviceState;
} EventData;
- /* the following data entries are internal to the KFD & thunk itself */
+ /* the following data entries are internal to the KFD & thunk itself. */
- /* internal thunk store for Event data (OsEventHandle) */
- uint64_t HWData1;
- /* internal thunk store for Event data (HWAddress) */
- uint64_t HWData2;
- /* internal thunk store for Event data (HWData) */
- uint32_t HWData3;
-};
+ uint64_t HWData1; /* internal thunk store for Event data (OsEventHandle) */
+ uint64_t HWData2; /* internal thunk store for Event data (HWAddress) */
+ uint32_t HWData3; /* internal thunk store for Event data (HWData) */
+} HsaEventData;
-struct HsaEventDescriptor {
- /* event type to allocate */
- enum HSA_EVENTTYPE EventType;
- /* H-NUMA node containing GPU device that is event source */
- uint32_t NodeId;
- /* pointer to user mode syncvar data, syncvar->UserDataPtrValue
- * may be NULL
- */
- struct HsaSyncVar SyncVar;
-};
+typedef struct _HsaEventDescriptor {
+ HSA_EVENTTYPE EventType; /* event type to allocate */
+ uint32_t NodeId; /* H-NUMA node containing GPU device that is event source */
+ HsaSyncVar SyncVar; /* pointer to user mode syncvar data, syncvar->UserDataPtrValue may be NULL */
+} HsaEventDescriptor;
+
+typedef struct _HsaEvent {
+ HSA_EVENTID EventId;
+ HsaEventData EventData;
+} HsaEvent;
-struct HsaEvent {
- uint32_t EventId;
- struct HsaEventData EventData;
-};
#pragma pack(pop)
-enum DBGDEV_TYPE {
+typedef enum _DBGDEV_TYPE {
DBGDEV_TYPE_ILLEGAL = 0,
DBGDEV_TYPE_NODIQ = 1,
DBGDEV_TYPE_DIQ = 2,
DBGDEV_TYPE_TEST = 3
-};
+} DBGDEV_TYPE;
struct dbg_address_watch_info {
struct kfd_process *process;
- enum HSA_DBG_WATCH_MODE *watch_mode;
+ HSA_DBG_WATCH_MODE *watch_mode;
uint64_t *watch_address;
uint64_t *watch_mask;
- struct HsaEvent *watch_event;
+ HsaEvent *watch_event;
uint32_t num_watch_points;
};
struct dbg_wave_control_info {
struct kfd_process *process;
uint32_t trapId;
- enum HSA_DBG_WAVEOP operand;
- enum HSA_DBG_WAVEMODE mode;
- struct HsaDbgWaveMessage dbgWave_msg;
+ HSA_DBG_WAVEOP operand;
+ HSA_DBG_WAVEMODE mode;
+ HsaDbgWaveMessage dbgWave_msg;
};
struct kfd_dbgdev {
/* The device that owns this data. */
+
struct kfd_dev *dev;
/* kernel queue for DIQ */
+
struct kernel_queue *kq;
/* a pointer to the pqm of the calling process */
+
struct process_queue_manager *pqm;
/* type of debug device ( DIQ, non DIQ, etc. ) */
- enum DBGDEV_TYPE type;
+
+ DBGDEV_TYPE type;
/* virtualized function pointers to device dbg */
+
int (*dbgdev_register)(struct kfd_dbgdev *dbgdev);
int (*dbgdev_unregister)(struct kfd_dbgdev *dbgdev);
- int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev,
- struct dbg_address_watch_info *adw_info);
- int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev,
- struct dbg_wave_control_info *wac_info);
+ int (*dbgdev_address_watch)(struct kfd_dbgdev *dbgdev, struct dbg_address_watch_info *adw_info);
+ int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, struct dbg_wave_control_info *wac_info);
};
@@ -282,13 +272,12 @@ struct kfd_dbgmgr {
};
/* prototypes for debug manager functions */
-struct mutex *kfd_get_dbgmgr_mutex(void);
+struct mutex *get_dbgmgr_mutex(void);
void kfd_dbgmgr_destroy(struct kfd_dbgmgr *pmgr);
bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev);
long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p);
long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p);
-long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr,
- struct dbg_wave_control_info *wac_info);
-long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr,
- struct dbg_address_watch_info *adw_info);
-#endif /* KFD_DBGMGR_H_ */
+long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, struct dbg_wave_control_info *wac_info);
+long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, struct dbg_address_watch_info *adw_info);
+long kfd_dbgmgr_abnormal_termination(struct kfd_dbgmgr *pmgr, struct kfd_process *process);
+#endif /* KFD_DBGMGR_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 3f95f7cb4019..20592baeaf95 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -24,9 +24,11 @@
#include <linux/bsearch.h>
#include <linux/pci.h>
#include <linux/slab.h>
+#include <linux/highmem.h>
#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
#include "kfd_pm4_headers.h"
+#include "cwsr_trap_handler_carrizo.h"
#define MQD_SIZE_ALIGNED 768
@@ -38,7 +40,8 @@ static const struct kfd_device_info kaveri_device_info = {
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
- .mqd_size_aligned = MQD_SIZE_ALIGNED
+ .mqd_size_aligned = MQD_SIZE_ALIGNED,
+ .is_need_iommu_device = true
};
static const struct kfd_device_info carrizo_device_info = {
@@ -49,14 +52,50 @@ static const struct kfd_device_info carrizo_device_info = {
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
- .mqd_size_aligned = MQD_SIZE_ALIGNED
+ .mqd_size_aligned = MQD_SIZE_ALIGNED,
+ .is_need_iommu_device = true
};
+static const struct kfd_device_info tonga_device_info = {
+ .asic_family = CHIP_TONGA,
+ .max_pasid_bits = 16,
+ .max_no_of_hqd = 24,
+ .ih_ring_entry_size = 4 * sizeof(uint32_t),
+ .event_interrupt_class = &event_interrupt_class_cik,
+ .num_of_watch_points = 4,
+ .mqd_size_aligned = MQD_SIZE_ALIGNED,
+ .is_need_iommu_device = false
+};
+
+static const struct kfd_device_info fiji_device_info = {
+ .asic_family = CHIP_FIJI,
+ .max_pasid_bits = 16,
+ .max_no_of_hqd = 24,
+ .ih_ring_entry_size = 4 * sizeof(uint32_t),
+ .event_interrupt_class = &event_interrupt_class_cik,
+ .num_of_watch_points = 4,
+ .mqd_size_aligned = MQD_SIZE_ALIGNED,
+ .is_need_iommu_device = false
+}
+;
struct kfd_deviceid {
unsigned short did;
const struct kfd_device_info *device_info;
};
+/*
+ * //
+// TONGA/AMETHYST device IDs (performance segment)
+//
+#define DEVICE_ID_VI_TONGA_P_6920 0x6920 // unfused
+#define DEVICE_ID_VI_TONGA_P_6921 0x6921 // Amethyst XT
+#define DEVICE_ID_VI_TONGA_P_6928 0x6928 // Tonga GL XT
+#define DEVICE_ID_VI_TONGA_P_692B 0x692B // Tonga GL PRO
+#define DEVICE_ID_VI_TONGA_P_692F 0x692F // Tonga GL PRO VF
+#define DEVICE_ID_VI_TONGA_P_6938 0x6938 // Tonga XT
+#define DEVICE_ID_VI_TONGA_P_6939 0x6939 // Tonga PRO
+ *
+ */
/* Please keep this sorted by increasing device id. */
static const struct kfd_deviceid supported_devices[] = {
{ 0x1304, &kaveri_device_info }, /* Kaveri */
@@ -85,13 +124,23 @@ static const struct kfd_deviceid supported_devices[] = {
{ 0x9874, &carrizo_device_info }, /* Carrizo */
{ 0x9875, &carrizo_device_info }, /* Carrizo */
{ 0x9876, &carrizo_device_info }, /* Carrizo */
- { 0x9877, &carrizo_device_info } /* Carrizo */
+ { 0x9877, &carrizo_device_info }, /* Carrizo */
+ { 0x6920, &tonga_device_info }, /* Tonga */
+ { 0x6921, &tonga_device_info }, /* Tonga */
+ { 0x6928, &tonga_device_info }, /* Tonga */
+ { 0x692B, &tonga_device_info }, /* Tonga */
+ { 0x692F, &tonga_device_info }, /* Tonga */
+ { 0x6938, &tonga_device_info }, /* Tonga */
+ { 0x6939, &tonga_device_info }, /* Tonga */
+ { 0x7300, &fiji_device_info } /* Fiji */
};
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
unsigned int chunk_size);
static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
+static int kfd_resume(struct kfd_dev *kfd);
+
static const struct kfd_device_info *lookup_device_info(unsigned short did)
{
size_t i;
@@ -117,6 +166,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
if (!device_info)
return NULL;
+ BUG_ON(!f2g);
+
kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
if (!kfd)
return NULL;
@@ -170,15 +221,8 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd)
pasid_limit,
kfd->doorbell_process_limit - 1);
- err = amd_iommu_init_device(kfd->pdev, pasid_limit);
- if (err < 0) {
- dev_err(kfd_device, "error initializing iommu device\n");
- return false;
- }
-
if (!kfd_set_pasid_limit(pasid_limit)) {
dev_err(kfd_device, "error setting pasid limit\n");
- amd_iommu_free_device(kfd->pdev);
return false;
}
@@ -219,13 +263,81 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
return AMD_IOMMU_INV_PRI_RSP_INVALID;
}
+/*
+ * Set up the memory that holds the CWSR trap handler code (TBA) and the TMA.
+ *
+ * Nothing is allocated, and 0 is returned, unless the cwsr_enable flag is set
+ * and the MEC firmware version is at least KFD_CWSR_CZ_FW_VER. Otherwise the
+ * cwsr_trap_carrizo_hex image is copied into the first page of a fresh page
+ * allocation and the TMA offset is set to the next page boundary after it.
+ *
+ * Returns 0 on success or when CWSR is not used, -EINVAL if the trap handler
+ * image does not fit in one page, -ENOMEM if the pages cannot be allocated.
+ */
+static int kfd_cwsr_init(struct kfd_dev *kfd)
+{
+	unsigned int isa_size = sizeof(cwsr_trap_carrizo_hex);
+	void *isa_kaddr;
+
+	if (!cwsr_enable || kfd->mec_fw_version < KFD_CWSR_CZ_FW_VER)
+		return 0;
+
+	if (isa_size > PAGE_SIZE) {
+		pr_err("amdkfd: wrong CWSR ISA size.\n");
+		return -EINVAL;
+	}
+
+	/* One extra page is reserved after the page-aligned ISA image. */
+	kfd->cwsr_size = ALIGN(isa_size, PAGE_SIZE) + PAGE_SIZE;
+	kfd->cwsr_pages = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM,
+				      get_order(kfd->cwsr_size));
+	if (!kfd->cwsr_pages) {
+		pr_err("amdkfd: error alloc CWSR isa memory.\n");
+		return -ENOMEM;
+	}
+
+	/* Only the first page is mapped and filled with the ISA code. */
+	isa_kaddr = kmap(kfd->cwsr_pages);
+	memset(isa_kaddr, 0, PAGE_SIZE);
+	memcpy(isa_kaddr, cwsr_trap_carrizo_hex, isa_size);
+	kunmap(kfd->cwsr_pages);
+
+	kfd->tma_offset = ALIGN(isa_size, PAGE_SIZE);
+	kfd->cwsr_enabled = true;
+	dev_info(kfd_device,
+		 "Reserved %d pages for cwsr.\n",
+		 (kfd->cwsr_size >> PAGE_SHIFT));
+
+	return 0;
+}
+
+/*
+ * Release the pages allocated by kfd_cwsr_init(), if any were allocated.
+ */
+static void kfd_cwsr_fini(struct kfd_dev *kfd)
+{
+	if (!kfd->cwsr_pages)
+		return;
+
+	__free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size));
+}
+
bool kgd2kfd_device_init(struct kfd_dev *kfd,
const struct kgd2kfd_shared_resources *gpu_resources)
{
unsigned int size;
+ unsigned int vmid_bitmap_kfd, vmid_num_kfd;
+
+ kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+ KGD_ENGINE_MEC1);
kfd->shared_resources = *gpu_resources;
+ vmid_bitmap_kfd = kfd->shared_resources.compute_vmid_bitmap;
+ kfd->vm_info.first_vmid_kfd = ffs(vmid_bitmap_kfd) - 1;
+ kfd->vm_info.last_vmid_kfd = fls(vmid_bitmap_kfd) - 1;
+ vmid_num_kfd = kfd->vm_info.last_vmid_kfd
+ - kfd->vm_info.first_vmid_kfd + 1;
+ kfd->vm_info.vmid_num_kfd = vmid_num_kfd;
+
+ /* If MEC firmware is too old, turn off hws multiple process mapping */
+ if (kfd->mec_fw_version < KFD_MULTI_PROC_MAPPING_HWS_SUPPORT)
+ kfd->max_proc_per_quantum = 0;
+ /* Verify module parameters regarding mapped process number*/
+ else if ((hws_max_conc_proc < 0)
+ || (hws_max_conc_proc > vmid_num_kfd)) {
+ dev_err(kfd_device,
+ "hws_max_conc_proc (%d) must be between 0 and %d, use %d instead\n",
+ hws_max_conc_proc, vmid_num_kfd, vmid_num_kfd);
+ kfd->max_proc_per_quantum = vmid_num_kfd;
+ } else
+ kfd->max_proc_per_quantum = hws_max_conc_proc;
+
/* calculate max size of mqds needed for queues */
size = max_num_of_queues_per_device *
kfd->device_info->mqd_size_aligned;
@@ -280,16 +392,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
goto kfd_interrupt_error;
}
- if (!device_iommu_pasid_init(kfd)) {
- dev_err(kfd_device,
- "Error initializing iommuv2 for device (%x:%x)\n",
- kfd->pdev->vendor, kfd->pdev->device);
- goto device_iommu_pasid_error;
- }
- amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
- iommu_pasid_shutdown_callback);
- amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
-
kfd->dqm = device_queue_manager_init(kfd);
if (!kfd->dqm) {
dev_err(kfd_device,
@@ -298,13 +400,21 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
goto device_queue_manager_error;
}
- if (kfd->dqm->ops.start(kfd->dqm) != 0) {
- dev_err(kfd_device,
- "Error starting queuen manager for device (%x:%x)\n",
- kfd->pdev->vendor, kfd->pdev->device);
- goto dqm_start_error;
+ if (kfd->device_info->is_need_iommu_device) {
+ if (!device_iommu_pasid_init(kfd)) {
+ dev_err(kfd_device,
+ "Error initializing iommuv2 for device (%x:%x)\n",
+ kfd->pdev->vendor, kfd->pdev->device);
+ goto device_iommu_pasid_error;
+ }
}
+ if (kfd_cwsr_init(kfd))
+ goto device_iommu_pasid_error;
+
+ if (kfd_resume(kfd))
+ goto kfd_resume_error;
+
kfd->dbgmgr = NULL;
kfd->init_complete = true;
@@ -316,11 +426,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
goto out;
-dqm_start_error:
+kfd_resume_error:
+ kfd_cwsr_fini(kfd);
+device_iommu_pasid_error:
device_queue_manager_uninit(kfd->dqm);
device_queue_manager_error:
- amd_iommu_free_device(kfd->pdev);
-device_iommu_pasid_error:
kfd_interrupt_exit(kfd);
kfd_interrupt_error:
kfd_topology_remove_device(kfd);
@@ -338,8 +448,9 @@ out:
void kgd2kfd_device_exit(struct kfd_dev *kfd)
{
if (kfd->init_complete) {
+ kgd2kfd_suspend(kfd);
+ kfd_cwsr_fini(kfd);
device_queue_manager_uninit(kfd->dqm);
- amd_iommu_free_device(kfd->pdev);
kfd_interrupt_exit(kfd);
kfd_topology_remove_device(kfd);
kfd_gtt_sa_fini(kfd);
@@ -355,32 +466,68 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
if (kfd->init_complete) {
kfd->dqm->ops.stop(kfd->dqm);
- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
- amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
- amd_iommu_free_device(kfd->pdev);
+ if (kfd->device_info->is_need_iommu_device) {
+ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
+ amd_iommu_free_device(kfd->pdev);
+ }
}
}
-int kgd2kfd_resume(struct kfd_dev *kfd)
+int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem)
{
- unsigned int pasid_limit;
- int err;
+ return evict_bo(dev, mem);
+}
+/* Forwards to restore() for this device and returns its result unchanged. */
+int kgd2kfd_restore(struct kfd_dev *kfd)
+{
+ return restore(kfd);
+}
+
+int kgd2kfd_resume(struct kfd_dev *kfd)
+{
BUG_ON(kfd == NULL);
- pasid_limit = kfd_get_pasid_limit();
+ if (!kfd->init_complete)
+ return 0;
+
+ return kfd_resume(kfd);
+
+}
+
+static int kfd_resume(struct kfd_dev *kfd)
+{
+ int err = 0;
+
+ if (kfd->device_info->is_need_iommu_device) {
+ unsigned int pasid_limit = kfd_get_pasid_limit();
- if (kfd->init_complete) {
err = amd_iommu_init_device(kfd->pdev, pasid_limit);
- if (err < 0)
+ if (err)
return -ENXIO;
amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
- iommu_pasid_shutdown_callback);
- amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
- kfd->dqm->ops.start(kfd->dqm);
+ iommu_pasid_shutdown_callback);
+ amd_iommu_set_invalid_ppr_cb(kfd->pdev,
+ iommu_invalid_ppr_cb);
}
- return 0;
+ err = kfd->dqm->ops.start(kfd->dqm);
+ if (err) {
+ dev_err(kfd_device,
+ "Error starting queue manager for device (%x:%x)\n",
+ kfd->pdev->vendor, kfd->pdev->device);
+ goto dqm_start_error;
+ }
+
+ kfd->kfd2kgd->write_config_static_mem(kfd->kgd, true, 1, 3, 0);
+
+ return err;
+
+dqm_start_error:
+ if (kfd->device_info->is_need_iommu_device)
+ amd_iommu_free_device(kfd->pdev);
+
+ return err;
}
/* This is called directly from KGD at ISR. */
@@ -399,6 +546,58 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
spin_unlock(&kfd->interrupt_lock);
}
+/*
+ * Evict the queues that the process owning @mm has on @kfd.
+ *
+ * Returns 0 if the device has not completed initialization, -ENODEV if no
+ * KFD process or no per-device data is found for @mm, otherwise the result
+ * of process_evict_queues().
+ */
+int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *proc;
+	int ret = -ENODEV;
+
+	BUG_ON(kfd == NULL);
+	if (!kfd->init_complete)
+		return 0;
+
+	/*
+	 * This may run from a workqueue rather than from process context, so
+	 * the kfd_process could try to exit concurrently. The lookup returns
+	 * the process read-locked; the lock is dropped below.
+	 */
+	proc = kfd_lookup_process_by_mm(mm);
+	if (!proc)
+		return ret;
+
+	pdd = kfd_get_process_device_data(kfd, proc);
+	if (pdd)
+		ret = process_evict_queues(kfd->dqm, &pdd->qpd);
+
+	up_read(&proc->lock);
+	return ret;
+}
+
+/*
+ * Restore the queues that the process owning @mm has on @kfd.
+ *
+ * Returns 0 if the device has not completed initialization, -ENODEV if no
+ * KFD process or no per-device data is found for @mm, otherwise the result
+ * of process_restore_queues().
+ */
+int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *proc;
+	int ret = -ENODEV;
+
+	BUG_ON(kfd == NULL);
+	if (!kfd->init_complete)
+		return 0;
+
+	/*
+	 * This may run from a workqueue rather than from process context, so
+	 * the kfd_process could try to exit concurrently. The lookup returns
+	 * the process read-locked; the lock is dropped below.
+	 */
+	proc = kfd_lookup_process_by_mm(mm);
+	if (!proc)
+		return ret;
+
+	pdd = kfd_get_process_device_data(kfd, proc);
+	if (pdd)
+		ret = process_restore_queues(kfd->dqm, &pdd->qpd);
+
+	up_read(&proc->lock);
+	return ret;
+}
+
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
unsigned int chunk_size)
{
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f49c551195b3..78033c13d2ed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -44,9 +44,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd);
-static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock);
-static int destroy_queues_cpsch(struct device_queue_manager *dqm,
- bool preempt_static_queues, bool lock);
+static int execute_queues_cpsch(struct device_queue_manager *dqm);
+static int unmap_queues_cpsch(struct device_queue_manager *dqm,
+ enum kfd_unmap_queues_filter filter,
+ uint32_t filter_param, bool reset);
static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
struct queue *q,
@@ -100,11 +101,11 @@ static int allocate_vmid(struct device_queue_manager *dqm,
if (dqm->vmid_bitmap == 0)
return -ENOMEM;
- bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM);
+ bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap,
+ dqm->dev->vm_info.vmid_num_kfd);
clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap);
- /* Kaveri kfd vmid's starts from vmid 8 */
- allocated_vmid = bit + KFD_VMID_START_OFFSET;
+ allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd;
pr_debug("kfd: vmid allocation %d\n", allocated_vmid);
qpd->vmid = allocated_vmid;
q->properties.vmid = allocated_vmid;
@@ -112,6 +113,11 @@ static int allocate_vmid(struct device_queue_manager *dqm,
set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid);
program_sh_mem_settings(dqm, qpd);
+ dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd,
+ allocated_vmid,
+ qpd->page_table_base);
+ /*invalidate the VM context after pasid and vmid mapping is set up*/
+ radeon_flush_tlb(dqm->dev, qpd->pqm->process->pasid);
return 0;
}
@@ -119,7 +125,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
{
- int bit = qpd->vmid - KFD_VMID_START_OFFSET;
+ int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
/* Release the vmid mapping */
set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
@@ -159,6 +165,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
}
*allocated_vmid = qpd->vmid;
q->properties.vmid = qpd->vmid;
+ /*
+ * Eviction state logic: we only mark active queues as evicted
+ * to avoid the overhead of restoring inactive queues later
+ */
+ if (qpd->evicted)
+ q->properties.is_evicted = (q->properties.queue_size > 0 &&
+ q->properties.queue_percent > 0 &&
+ q->properties.queue_address != 0);
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
retval = create_compute_queue_nocpsch(dqm, q, qpd);
@@ -261,8 +275,12 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
q->pipe,
q->queue);
+ dqm->dev->kfd2kgd->alloc_memory_of_scratch(
+ dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
+
retval = mqd->load_mqd(mqd, q->mqd, q->pipe,
- q->queue, (uint32_t __user *) q->properties.write_ptr);
+ q->queue, (uint32_t __user *) q->properties.write_ptr,
+ qpd->page_table_base);
if (retval != 0) {
deallocate_hqd(dqm, q);
mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
@@ -342,34 +360,56 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
{
int retval;
struct mqd_manager *mqd;
+ struct kfd_process_device *pdd;
+
bool prev_active = false;
BUG_ON(!dqm || !q || !q->mqd);
mutex_lock(&dqm->lock);
+
+ pdd = kfd_get_process_device_data(q->device, q->process);
+ if (!pdd) {
+ mutex_unlock(&dqm->lock);
+ return -ENODEV;
+ }
mqd = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
if (mqd == NULL) {
mutex_unlock(&dqm->lock);
return -ENOMEM;
}
+ /*
+ * Eviction state logic: we only mark active queues as evicted
+ * to avoid the overhead of restoring inactive queues later
+ */
+ if (pdd->qpd.evicted > 0)
+ q->properties.is_evicted = (q->properties.queue_size > 0 &&
+ q->properties.queue_percent > 0 &&
+ q->properties.queue_address != 0);
+ /* save previous activity state for counters */
if (q->properties.is_active)
prev_active = true;
- /*
- *
- * check active state vs. the previous state
- * and modify counter accordingly
- */
+
retval = mqd->update_mqd(mqd, q->mqd, &q->properties);
+ if (sched_policy == KFD_SCHED_POLICY_NO_HWS &&
+ q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+ retval = mqd->load_mqd(mqd, q->mqd, q->pipe,
+ q->queue,
+ (uint32_t __user *)q->properties.write_ptr, 0);
+ /*
+ * check active state vs. the previous state
+ * and modify counter accordingly
+ */
if ((q->properties.is_active) && (!prev_active))
dqm->queue_count++;
else if ((!q->properties.is_active) && (prev_active))
dqm->queue_count--;
if (sched_policy != KFD_SCHED_POLICY_NO_HWS)
- retval = execute_queues_cpsch(dqm, false);
+ retval = execute_queues_cpsch(dqm);
mutex_unlock(&dqm->lock);
return retval;
@@ -395,15 +435,115 @@ static struct mqd_manager *get_mqd_manager_nocpsch(
return mqd;
}
+/*
+ * Evict all queues of one process on this device.
+ *
+ * qpd->evicted is used as a reference count: only the first eviction touches
+ * the queues, nested calls just increment the count and return 0. On the
+ * first eviction every active queue is flagged is_evicted, its MQD is updated
+ * (and reloaded for compute queues when no HW scheduler is used) and
+ * dqm->queue_count is decremented for it. With a HW scheduler the runlist is
+ * then re-executed.
+ *
+ * All queues are processed even if one of them fails; the first error seen
+ * (from update_mqd, load_mqd or execute_queues_cpsch) is returned instead of
+ * being overwritten by a later successful call. Returns 0 on success.
+ *
+ * Takes and releases dqm->lock.
+ */
+int process_evict_queues(struct device_queue_manager *dqm,
+		struct qcm_process_device *qpd)
+{
+	struct queue *q, *next;
+	struct mqd_manager *mqd;
+	int retval = 0;
+	int err;
+
+	BUG_ON(!dqm || !qpd);
+
+	mutex_lock(&dqm->lock);
+	if (qpd->evicted++ > 0) { /* already evicted, do nothing */
+		mutex_unlock(&dqm->lock);
+		return 0;
+	}
+	/* deactivate all active queues on the qpd */
+	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+		mqd = dqm->ops.get_mqd_manager(dqm,
+			get_mqd_type_from_queue_type(q->properties.type));
+		if (!mqd) { /* should not be here */
+			BUG();
+			continue;
+		}
+		/* if the queue is not active anyway, it is not evicted */
+		if (q->properties.is_active == true)
+			q->properties.is_evicted = true;
+
+		err = mqd->update_mqd(mqd, q->mqd, &q->properties);
+		if (err && !retval)
+			retval = err;
+
+		if (sched_policy == KFD_SCHED_POLICY_NO_HWS &&
+		    q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
+			err = mqd->load_mqd(mqd, q->mqd, q->pipe,
+				q->queue,
+				(uint32_t __user *)q->properties.write_ptr, 0);
+			if (err && !retval)
+				retval = err;
+		}
+
+		if (q->properties.is_evicted)
+			dqm->queue_count--;
+	}
+	if (sched_policy != KFD_SCHED_POLICY_NO_HWS) {
+		err = execute_queues_cpsch(dqm);
+		if (err && !retval)
+			retval = err;
+	}
+
+	mutex_unlock(&dqm->lock);
+	return retval;
+}
+
+/*
+ * Undo one process_evict_queues() call for a process on this device.
+ *
+ * qpd->evicted is a reference count. If it is 0 nothing is done; if it is
+ * greater than 1 it is only decremented. When it is exactly 1 every queue
+ * flagged is_evicted is un-flagged, its MQD is updated (and reloaded for
+ * compute queues when no HW scheduler is used) and dqm->queue_count is
+ * incremented. With a HW scheduler the runlist is then re-executed. The
+ * count is reset to 0 only if the last operation reported success.
+ *
+ * Takes and releases dqm->lock. Returns 0 or the last error recorded.
+ */
+int process_restore_queues(struct device_queue_manager *dqm,
+		struct qcm_process_device *qpd)
+{
+	struct queue *cur, *tmp;
+	struct mqd_manager *mgr;
+	int retval = 0;
+
+	BUG_ON(!dqm || !qpd);
+
+	mutex_lock(&dqm->lock);
+
+	/* already restored, do nothing */
+	if (qpd->evicted == 0)
+		goto out_unlock;
+
+	/* ref count still > 0 after this call, decrement & quit */
+	if (qpd->evicted > 1) {
+		qpd->evicted--;
+		goto out_unlock;
+	}
+
+	/* reactivate every queue that was flagged as evicted */
+	list_for_each_entry_safe(cur, tmp, &qpd->queues_list, list) {
+		mgr = dqm->ops.get_mqd_manager(dqm,
+			get_mqd_type_from_queue_type(cur->properties.type));
+		if (!mgr) { /* should not be here */
+			BUG();
+			continue;
+		}
+
+		if (!cur->properties.is_evicted)
+			continue;
+
+		cur->properties.is_evicted = false;
+		retval = mgr->update_mqd(mgr, cur->mqd, &cur->properties);
+		if (sched_policy == KFD_SCHED_POLICY_NO_HWS &&
+		    cur->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+			retval = mgr->load_mqd(mgr, cur->mqd, cur->pipe,
+				cur->queue,
+				(uint32_t __user *)cur->properties.write_ptr,
+				0);
+		dqm->queue_count++;
+	}
+
+	if (sched_policy != KFD_SCHED_POLICY_NO_HWS)
+		retval = execute_queues_cpsch(dqm);
+
+	if (retval == 0)
+		qpd->evicted = 0;
+
+out_unlock:
+	mutex_unlock(&dqm->lock);
+	return retval;
+}
+
static int register_process_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
+ struct kfd_process_device *pdd;
struct device_process_node *n;
int retval;
BUG_ON(!dqm || !qpd);
- pr_debug("kfd: In func %s\n", __func__);
+ pr_debug("In func %s\n", __func__);
n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL);
if (!n)
@@ -414,6 +554,11 @@ static int register_process_nocpsch(struct device_queue_manager *dqm,
mutex_lock(&dqm->lock);
list_add(&n->list, &dqm->queues);
+ pdd = qpd_to_pdd(qpd);
+ qpd->page_table_base =
+ dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
+ pr_debug("Retrieved PD address == 0x%08u\n", qpd->page_table_base);
+
retval = dqm->ops_asic_specific.register_process(dqm, qpd);
dqm->processes_count++;
@@ -531,10 +676,8 @@ static void init_interrupts(struct device_queue_manager *dqm)
BUG_ON(dqm == NULL);
for (i = 0 ; i < get_pipes_num(dqm) ; i++)
- dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd,
- i + get_first_pipe(dqm));
+ dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i);
}
-
static int init_scheduler(struct device_queue_manager *dqm)
{
int retval;
@@ -570,7 +713,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
for (i = 0; i < get_pipes_num(dqm); i++)
dqm->allocated_queues[i] = (1 << QUEUES_PER_PIPE) - 1;
- dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1;
+ dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
init_scheduler(dqm);
@@ -643,8 +786,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
if (retval != 0)
return retval;
- q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
- q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM;
+ q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE;
+ q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
pr_debug("kfd: sdma id is: %d\n", q->sdma_id);
pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id);
@@ -659,7 +802,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
}
retval = mqd->load_mqd(mqd, q->mqd, 0,
- 0, NULL);
+ 0, NULL, 0);
if (retval != 0) {
deallocate_sdma_queue(dqm, q->sdma_id);
mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
@@ -684,8 +827,7 @@ static int set_sched_resources(struct device_queue_manager *dqm)
queue_num = get_pipes_num_cpsch() * QUEUES_PER_PIPE;
queue_mask = (1 << queue_num) - 1;
- res.vmid_mask = (1 << VMID_PER_DEVICE) - 1;
- res.vmid_mask <<= KFD_VMID_START_OFFSET;
+ res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
res.queue_mask = queue_mask << (get_first_pipe(dqm) * QUEUES_PER_PIPE);
res.gws_mask = res.oac_mask = res.gds_heap_base =
res.gds_heap_size = 0;
@@ -712,6 +854,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
dqm->queue_count = dqm->processes_count = 0;
dqm->sdma_queue_count = 0;
dqm->active_runlist = false;
+ dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
retval = dqm->ops_asic_specific.initialize(dqm);
if (retval != 0)
goto fail_init_pipelines;
@@ -732,7 +875,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
retval = 0;
- retval = pm_init(&dqm->packets, dqm);
+ retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version);
if (retval != 0)
goto fail_packet_manager_init;
@@ -759,7 +902,9 @@ static int start_cpsch(struct device_queue_manager *dqm)
kfd_bind_process_to_device(dqm->dev,
node->qpd->pqm->process);
- execute_queues_cpsch(dqm, true);
+ mutex_lock(&dqm->lock);
+ execute_queues_cpsch(dqm);
+ mutex_unlock(&dqm->lock);
return 0;
fail_allocate_vidmem:
@@ -776,7 +921,11 @@ static int stop_cpsch(struct device_queue_manager *dqm)
BUG_ON(!dqm);
- destroy_queues_cpsch(dqm, true, true);
+ mutex_lock(&dqm->lock);
+
+ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
+
+ mutex_unlock(&dqm->lock);
list_for_each_entry(node, &dqm->queues, list) {
pdd = qpd_to_pdd(node->qpd);
@@ -815,7 +964,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
list_add(&kq->list, &qpd->priv_queue_list);
dqm->queue_count++;
qpd->is_debug = true;
- execute_queues_cpsch(dqm, false);
+ execute_queues_cpsch(dqm);
mutex_unlock(&dqm->lock);
return 0;
@@ -831,11 +980,11 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
mutex_lock(&dqm->lock);
/* here we actually preempt the DIQ */
- destroy_queues_cpsch(dqm, true, false);
+ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
list_del(&kq->list);
dqm->queue_count--;
qpd->is_debug = false;
- execute_queues_cpsch(dqm, false);
+ execute_queues_cpsch(dqm);
/*
* Unconditionally decrement this counter, regardless of the queue's
* type.
@@ -846,14 +995,6 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
mutex_unlock(&dqm->lock);
}
-static void select_sdma_engine_id(struct queue *q)
-{
- static int sdma_id;
-
- q->sdma_id = sdma_id;
- sdma_id = (sdma_id + 1) % 2;
-}
-
static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd, int *allocate_vmid)
{
@@ -876,9 +1017,15 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
goto out;
}
- if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
- select_sdma_engine_id(q);
-
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+ retval = allocate_sdma_queue(dqm, &q->sdma_id);
+ if (retval != 0)
+ goto out;
+ q->properties.sdma_queue_id =
+ q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE;
+ q->properties.sdma_engine_id =
+ q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
+ }
mqd = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
@@ -886,8 +1033,19 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
mutex_unlock(&dqm->lock);
return -ENOMEM;
}
+ /*
+ * Eviction state logic: we only mark active queues as evicted
+ * to avoid the overhead of restoring inactive queues later
+ */
+ if (qpd->evicted)
+ q->properties.is_evicted = (q->properties.queue_size > 0 &&
+ q->properties.queue_percent > 0 &&
+ q->properties.queue_address != 0);
dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd);
+
+ q->properties.tba_addr = qpd->tba_addr;
+ q->properties.tma_addr = qpd->tma_addr;
retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
if (retval != 0)
@@ -896,7 +1054,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
list_add(&q->list, &qpd->queues_list);
if (q->properties.is_active) {
dqm->queue_count++;
- retval = execute_queues_cpsch(dqm, false);
+ retval = execute_queues_cpsch(dqm);
}
if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
@@ -933,20 +1091,20 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
return 0;
}
-static int destroy_sdma_queues(struct device_queue_manager *dqm,
+static int unmap_sdma_queues(struct device_queue_manager *dqm,
unsigned int sdma_engine)
{
return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false,
+ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false,
sdma_engine);
}
-static int destroy_queues_cpsch(struct device_queue_manager *dqm,
- bool preempt_static_queues, bool lock)
+/* dqm->lock mutex has to be locked before calling this function */
+static int unmap_queues_cpsch(struct device_queue_manager *dqm,
+ enum kfd_unmap_queues_filter filter,
+ uint32_t filter_param, bool reset)
{
int retval;
- enum kfd_preempt_type_filter preempt_type;
- struct kfd_process_device *pdd;
BUG_ON(!dqm);
@@ -956,23 +1114,21 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm,
mutex_lock(&dqm->lock);
if (!dqm->active_runlist)
goto out;
+ if (dqm->active_runlist == false)
+ return retval;
pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n",
dqm->sdma_queue_count);
if (dqm->sdma_queue_count > 0) {
- destroy_sdma_queues(dqm, 0);
- destroy_sdma_queues(dqm, 1);
+ unmap_sdma_queues(dqm, 0);
+ unmap_sdma_queues(dqm, 1);
}
- preempt_type = preempt_static_queues ?
- KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES :
- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES;
-
retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE,
- preempt_type, 0, false, 0);
+ filter, filter_param, reset, 0);
if (retval != 0)
- goto out;
+ return retval;
*dqm->fence_addr = KFD_FENCE_INIT;
pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr,
@@ -981,55 +1137,47 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm,
retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS);
if (retval != 0) {
- pdd = kfd_get_process_device_data(dqm->dev,
- kfd_get_process(current));
- pdd->reset_wavefronts = true;
- goto out;
+ pr_err("kfd: unmapping queues failed.");
+ return retval;
}
+
pm_release_ib(&dqm->packets);
dqm->active_runlist = false;
-out:
- if (lock)
- mutex_unlock(&dqm->lock);
return retval;
}
-static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock)
+/* dqm->lock mutex has to be locked before calling this function */
+static int execute_queues_cpsch(struct device_queue_manager *dqm)
{
int retval;
BUG_ON(!dqm);
- if (lock)
- mutex_lock(&dqm->lock);
-
- retval = destroy_queues_cpsch(dqm, false, false);
+ retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
+ 0, false);
if (retval != 0) {
pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption");
- goto out;
+ return retval;
}
if (dqm->queue_count <= 0 || dqm->processes_count <= 0) {
retval = 0;
- goto out;
+ return retval;
}
if (dqm->active_runlist) {
retval = 0;
- goto out;
+ return retval;
}
retval = pm_send_runlist(&dqm->packets, &dqm->queues);
if (retval != 0) {
pr_err("kfd: failed to execute runlist");
- goto out;
+ return retval;
}
dqm->active_runlist = true;
-out:
- if (lock)
- mutex_unlock(&dqm->lock);
return retval;
}
@@ -1067,14 +1215,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
goto failed;
}
- if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
+ deallocate_sdma_queue(dqm, q->sdma_id);
+ }
list_del(&q->list);
if (q->properties.is_active)
dqm->queue_count--;
- execute_queues_cpsch(dqm, false);
+ retval = execute_queues_cpsch(dqm);
mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
@@ -1088,7 +1238,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
mutex_unlock(&dqm->lock);
- return 0;
+ return retval;
failed:
failed_try_destroy_debugged_queue:
@@ -1172,6 +1322,172 @@ out:
return false;
}
+/*
+ * Record the trap handler addresses for a process.
+ *
+ * The two values are written as the first two 64-bit words at offset
+ * dqm->dev->tma_offset inside the area addressed by qpd->cwsr_kaddr:
+ * word 0 receives @tba_addr, word 1 receives @tma_addr.
+ *
+ * Returns 0 on success, -EINVAL if the process has no CWSR area to write to.
+ */
+static int set_trap_handler(struct device_queue_manager *dqm,
+				struct qcm_process_device *qpd,
+				uint64_t tba_addr,
+				uint64_t tma_addr)
+{
+	uint64_t *tma;
+
+	/*
+	 * Without a CWSR area the store below would go through an invalid
+	 * address. NOTE(review): assumes cwsr_kaddr stays unset (NULL/0) when
+	 * CWSR is not in use for this process - confirm against the code that
+	 * sets it up.
+	 */
+	if (!qpd->cwsr_kaddr)
+		return -EINVAL;
+
+	tma = (uint64_t *)(qpd->cwsr_kaddr + dqm->dev->tma_offset);
+	tma[0] = tba_addr;
+	tma[1] = tma_addr;
+	return 0;
+}
+
+
+/*
+ * Refresh the page directory base cached in @qpd.
+ *
+ * The current base is read through kfd2kgd->get_process_page_dir(). If it
+ * equals the cached value nothing else happens. Otherwise the cached value
+ * is replaced and, when a HW scheduler is in use, the runlist is re-executed
+ * so that queues pick up the new base.
+ *
+ * Takes and releases dqm->lock. Returns 0, or the result of
+ * execute_queues_cpsch() when the runlist had to be rebuilt.
+ */
+static int set_page_directory_base(struct device_queue_manager *dqm,
+					struct qcm_process_device *qpd)
+{
+	struct kfd_process_device *pdd;
+	uint32_t pd_base;
+	int retval = 0;
+
+	BUG_ON(!dqm || !qpd);
+
+	mutex_lock(&dqm->lock);
+
+	pdd = qpd_to_pdd(qpd);
+
+	/* Retrieve PD base */
+	pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
+
+	/* If it has not changed, just get out */
+	if (qpd->page_table_base == pd_base)
+		goto out;
+
+	/* Update PD Base in QPD */
+	qpd->page_table_base = pd_base;
+	/* Hex conversion to match the "0x" prefix (was %08u, i.e. decimal) */
+	pr_debug("Updated PD address == 0x%08x\n", pd_base);
+
+	/*
+	 * Preempt queues, destroy runlist and create new runlist. Queues
+	 * will have the updated PD base address
+	 */
+	if (sched_policy != KFD_SCHED_POLICY_NO_HWS)
+		retval = execute_queues_cpsch(dqm);
+
+out:
+	mutex_unlock(&dqm->lock);
+
+	return retval;
+}
+
+/*
+ * Tear down everything a process owns on this device when no HW scheduler
+ * is used.
+ *
+ * Under dqm->lock, every user mode queue of @qpd is unlinked, destroyed with
+ * a wavefront reset and has its MQD released; the dqm counters are adjusted
+ * along the way. Afterwards the device_process_node that refers to @qpd is
+ * removed from dqm->queues and freed.
+ *
+ * Returns 0 on success, or -ENOMEM as soon as no MQD manager is found for a
+ * queue's type; in that case the remaining queues and the process node are
+ * left untouched.
+ */
+static int process_termination_nocpsch(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
+ struct queue *q, *next;
+ struct mqd_manager *mqd;
+ struct device_process_node *cur, *next_dpn;
+
+ mutex_lock(&dqm->lock);
+
+ /* Clear all user mode queues */
+ list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+ if (!mqd) {
+ mutex_unlock(&dqm->lock);
+ return -ENOMEM;
+ }
+
+ /* SDMA queues also give their sdma_id back */
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+ dqm->sdma_queue_count--;
+ deallocate_sdma_queue(dqm, q->sdma_id);
+ }
+
+ list_del(&q->list);
+ if (q->properties.is_active)
+ dqm->queue_count--;
+
+ dqm->total_queue_count--;
+ /* NOTE(review): the result of destroy_mqd is not checked here */
+ mqd->destroy_mqd(mqd, q->mqd,
+ KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
+ QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
+ q->pipe, q->queue);
+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ /* The vmid is released once the last queue has been removed */
+ if (list_empty(&qpd->queues_list))
+ deallocate_vmid(dqm, qpd, q);
+ }
+
+ /* Unregister process */
+ list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
+ if (qpd == cur->qpd) {
+ list_del(&cur->list);
+ kfree(cur);
+ dqm->processes_count--;
+ break;
+ }
+ }
+
+ mutex_unlock(&dqm->lock);
+
+ return 0;
+}
+
+
+/*
+ * Tear down everything a process owns on this device when the HW scheduler
+ * is used.
+ *
+ * Under dqm->lock this runs in four steps: kernel queues on
+ * qpd->priv_queue_list are unlinked and the counters adjusted; the counters
+ * for all user mode queues are adjusted (the queues stay on the list); the
+ * device_process_node that refers to @qpd is removed and freed; the runlist
+ * is re-executed. Only after that are the user mode queues unlinked and
+ * their MQDs released.
+ *
+ * Returns the result of execute_queues_cpsch(), or -ENOMEM if no MQD manager
+ * is found for a queue's type while releasing MQDs (the execute result is
+ * discarded in that case).
+ */
+static int process_termination_cpsch(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
+ int retval;
+ struct queue *q, *next;
+ struct kernel_queue *kq, *kq_next;
+ struct mqd_manager *mqd;
+ struct device_process_node *cur, *next_dpn;
+
+ retval = 0;
+
+ mutex_lock(&dqm->lock);
+
+ /* Clean all kernel queues */
+ list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
+ list_del(&kq->list);
+ dqm->queue_count--;
+ qpd->is_debug = false;
+ dqm->total_queue_count--;
+ }
+
+ /* Clear all user mode queues */
+ /* Only the accounting is updated here; entries are unlinked further down */
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+ dqm->sdma_queue_count--;
+ deallocate_sdma_queue(dqm, q->sdma_id);
+ }
+
+ if (q->properties.is_active)
+ dqm->queue_count--;
+
+ dqm->total_queue_count--;
+ }
+
+ /* Unregister process */
+ list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
+ if (qpd == cur->qpd) {
+ list_del(&cur->list);
+ kfree(cur);
+ dqm->processes_count--;
+ break;
+ }
+ }
+
+ /* Runs after the process node was removed from dqm->queues above */
+ retval = execute_queues_cpsch(dqm);
+
+ /* lastly, free mqd resources */
+ list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+ if (!mqd) {
+ mutex_unlock(&dqm->lock);
+ return -ENOMEM;
+ }
+ list_del(&q->list);
+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ }
+
+ mutex_unlock(&dqm->lock);
+ return retval;
+}
+
struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
{
struct device_queue_manager *dqm;
@@ -1202,6 +1518,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+ dqm->ops.set_trap_handler = set_trap_handler;
+ dqm->ops.set_page_directory_base = set_page_directory_base;
+ dqm->ops.process_termination = process_termination_cpsch;
break;
case KFD_SCHED_POLICY_NO_HWS:
/* initialize dqm for no cp scheduling */
@@ -1216,6 +1535,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
dqm->ops.initialize = initialize_nocpsch;
dqm->ops.uninitialize = uninitialize_nocpsch;
dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+ dqm->ops.set_trap_handler = set_trap_handler;
+ dqm->ops.set_page_directory_base = set_page_directory_base;
+ dqm->ops.process_termination = process_termination_nocpsch;
break;
default:
BUG();
@@ -1230,6 +1552,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
case CHIP_KAVERI:
device_queue_manager_init_cik(&dqm->ops_asic_specific);
break;
+
+ case CHIP_TONGA:
+ case CHIP_FIJI:
+ device_queue_manager_init_vi_tonga(&dqm->ops_asic_specific);
+ break;
}
if (dqm->ops.initialize(dqm) != 0) {
@@ -1247,3 +1574,20 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
dqm->ops.uninitialize(dqm);
kfree(dqm);
}
+
+/*
+ * kfd_process_vm_fault - evict the queues of the process identified by @pasid
+ * on the device behind @dqm.
+ *
+ * Returns -EINVAL when no process matches @pasid, 0 when the process has no
+ * per-device data for dqm->dev, otherwise the result of
+ * process_evict_queues(). A successful lookup is balanced here with
+ * up_read() on the process lock.
+ */
+int kfd_process_vm_fault(struct device_queue_manager *dqm,
+		unsigned int pasid)
+{
+	struct kfd_process *proc;
+	struct kfd_process_device *pdd;
+	int ret;
+
+	proc = kfd_lookup_process_by_pasid(pasid);
+	if (proc == NULL)
+		return -EINVAL;
+
+	pdd = kfd_get_process_device_data(dqm->dev, proc);
+	ret = pdd ? process_evict_queues(dqm, &pdd->qpd) : 0;
+
+	up_read(&proc->lock);
+	return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a625b9137da2..19132d980cce 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -29,12 +29,9 @@
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
-#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500)
+#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (9000)
#define QUEUES_PER_PIPE (8)
#define PIPE_PER_ME_CP_SCHEDULING (3)
-#define CIK_VMID_NUM (8)
-#define KFD_VMID_START_OFFSET (8)
-#define VMID_PER_DEVICE CIK_VMID_NUM
#define KFD_DQM_FIRST_PIPE (0)
#define CIK_SDMA_QUEUES (4)
#define CIK_SDMA_QUEUES_PER_ENGINE (2)
@@ -81,6 +78,12 @@ struct device_process_node {
* @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the
* memory apertures.
*
+ * @set_page_directory_base: Sets the PD base address (GPU local memory)
+ * in all the queues of the relevant process running on the specified device.
+ * It preempts the queues, updates the value and executes the runlist again.
+ *
+ * @process_termination: Clears all process queues belonging to that device.
+ *
*/
struct device_queue_manager_ops {
@@ -124,6 +127,16 @@ struct device_queue_manager_ops {
enum cache_policy alternate_policy,
void __user *alternate_aperture_base,
uint64_t alternate_aperture_size);
+
+ int (*set_trap_handler)(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ uint64_t tba_addr,
+ uint64_t tma_addr);
+
+ int (*set_page_directory_base)(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+ int (*process_termination)(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
};
struct device_queue_manager_asic_ops {
@@ -180,6 +193,8 @@ struct device_queue_manager {
void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops);
void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops);
+void device_queue_manager_init_vi_tonga(
+ struct device_queue_manager_asic_ops *ops);
void program_sh_mem_settings(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
int init_pipelines(struct device_queue_manager *dqm,
@@ -187,6 +202,12 @@ int init_pipelines(struct device_queue_manager *dqm,
unsigned int get_first_pipe(struct device_queue_manager *dqm);
unsigned int get_pipes_num(struct device_queue_manager *dqm);
+int process_evict_queues(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+int process_restore_queues(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+
+
static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
{
return (pdd->lds_base >> 16) & 0xFF;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
index c6f435aa803f..fdcd5178a862 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
@@ -24,6 +24,7 @@
#include "kfd_device_queue_manager.h"
#include "cik_regs.h"
#include "oss/oss_2_4_sh_mask.h"
+#include "gca/gfx_7_2_sh_mask.h"
static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
@@ -125,6 +126,7 @@ static int register_process_cik(struct device_queue_manager *dqm,
} else {
temp = get_sh_mem_bases_nybble_64(pdd);
qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp);
+ qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__PRIVATE_ATC__SHIFT;
}
pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n",
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
index 7e9cae9d349b..c023e50fe027 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
@@ -39,6 +39,31 @@ static int initialize_cpsch_vi(struct device_queue_manager *dqm);
static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd);
+/*
+ * Tonga device queue manager functions
+ */
+static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size);
+static int register_process_vi_tonga(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+static void init_sdma_vm_tonga(struct device_queue_manager *dqm,
+ struct queue *q,
+ struct qcm_process_device *qpd);
+
+/*
+ * Fill @ops with the Tonga callbacks. The Tonga-specific handlers are used
+ * for cache policy, process registration and SDMA VM setup; the initialize
+ * hook is the shared initialize_cpsch_vi().
+ */
+void device_queue_manager_init_vi_tonga(
+		struct device_queue_manager_asic_ops *ops)
+{
+	ops->initialize = initialize_cpsch_vi;
+	ops->register_process = register_process_vi_tonga;
+	ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga;
+	ops->init_sdma_vm = init_sdma_vm_tonga;
+}
+
+
void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops)
{
ops->set_cache_memory_policy = set_cache_memory_policy_vi;
@@ -104,6 +129,33 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm,
return true;
}
+/*
+ * Build qpd->sh_mem_config for Tonga from the requested cache policies.
+ *
+ * A coherent policy selects MTYPE_UC, any other policy selects MTYPE_NC_NV;
+ * this is done independently for the default and the alternate (APE1)
+ * aperture. The alignment mode is always SH_MEM_ALIGNMENT_MODE_UNALIGNED.
+ * @dqm, @alternate_aperture_base and @alternate_aperture_size are not used
+ * here. Always returns true.
+ */
+static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm,
+		struct qcm_process_device *qpd,
+		enum cache_policy default_policy,
+		enum cache_policy alternate_policy,
+		void __user *alternate_aperture_base,
+		uint64_t alternate_aperture_size)
+{
+	uint32_t default_mtype = MTYPE_NC_NV;
+	uint32_t ape1_mtype = MTYPE_NC_NV;
+	uint32_t config;
+
+	if (default_policy == cache_policy_coherent)
+		default_mtype = MTYPE_UC;
+
+	if (alternate_policy == cache_policy_coherent)
+		ape1_mtype = MTYPE_UC;
+
+	config = SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+			SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
+	config |= default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT;
+	config |= ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT;
+
+	qpd->sh_mem_config = config;
+
+	return true;
+}
+
static int register_process_vi(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
@@ -137,6 +189,8 @@ static int register_process_vi(struct device_queue_manager *dqm,
qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp);
qpd->sh_mem_config |= SH_MEM_ADDRESS_MODE_HSA64 <<
SH_MEM_CONFIG__ADDRESS_MODE__SHIFT;
+ qpd->sh_mem_config |= 1 <<
+ SH_MEM_CONFIG__PRIVATE_ATC__SHIFT;
}
pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n",
@@ -145,6 +199,41 @@ static int register_process_vi(struct device_queue_manager *dqm,
return 0;
}
+/*
+ * Per-process register setup for Tonga.
+ *
+ * If sh_mem_config is still zero it is given defaults (unaligned access,
+ * MTYPE_UC for both the default and APE1 apertures) and the APE1 base/limit
+ * are cleared. sh_mem_bases is always recomputed from the 64-bit nybble.
+ * Always returns 0.
+ */
+static int register_process_vi_tonga(struct device_queue_manager *dqm,
+		struct qcm_process_device *qpd)
+{
+	struct kfd_process_device *pdd;
+	unsigned int nybble;
+
+	BUG_ON(!dqm || !qpd);
+
+	pdd = qpd_to_pdd(qpd);
+
+	/* A zero sh_mem_config means nobody has configured it yet. */
+	if (!qpd->sh_mem_config) {
+		qpd->sh_mem_ape1_base = 0;
+		qpd->sh_mem_ape1_limit = 0;
+
+		qpd->sh_mem_config =
+			SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+				SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT |
+			MTYPE_UC << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT |
+			MTYPE_UC << SH_MEM_CONFIG__APE1_MTYPE__SHIFT;
+	}
+
+	/*
+	 * On dGPU we're always in GPUVM64 addressing mode with 64-bit
+	 * aperture addresses.
+	 */
+	nybble = get_sh_mem_bases_nybble_64(pdd);
+	qpd->sh_mem_bases = compute_sh_mem_bases_64bit(nybble);
+
+	pr_debug("kfd: sh_mem_bases nybble: 0x%X and register 0x%X\n",
+		nybble, qpd->sh_mem_bases);
+
+	return 0;
+}
+
static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd)
{
@@ -161,6 +250,23 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
q->properties.sdma_vm_addr = value;
}
+/*
+ * Compute q->properties.sdma_vm_addr for a Tonga SDMA queue.
+ *
+ * 32-bit user processes get the PTR32 bit OR-ed with the 32-bit shared-memory
+ * base; all other processes get the 64-bit base nybble placed in the
+ * SHARED_BASE field. @dqm is not used here.
+ */
+static void init_sdma_vm_tonga(struct device_queue_manager *dqm,
+		struct queue *q,
+		struct qcm_process_device *qpd)
+{
+	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+	uint32_t vm_addr;
+
+	if (q->process->is_32bit_user_mode) {
+		vm_addr = (1 << SDMA0_RLC0_VIRTUAL_ADDR__PTR32__SHIFT) |
+				get_sh_mem_bases_32(pdd);
+	} else {
+		vm_addr = (get_sh_mem_bases_nybble_64(pdd) <<
+				SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) &
+				SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK;
+	}
+
+	q->properties.sdma_vm_addr = vm_addr;
+}
+
+
static int initialize_cpsch_vi(struct device_queue_manager *dqm)
{
return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
index a7d3cb3fead0..d6a7e2af30f5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
@@ -142,13 +142,14 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
- pr_debug("mapping doorbell page:\n");
- pr_debug(" target user address == 0x%08llX\n",
- (unsigned long long) vma->vm_start);
- pr_debug(" physical address == 0x%08llX\n", address);
- pr_debug(" vm_flags == 0x%04lX\n", vma->vm_flags);
- pr_debug(" size == 0x%04lX\n",
- doorbell_process_allocation());
+ pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n"
+ " target user address == 0x%08llX\n"
+ " physical address == 0x%08llX\n"
+ " vm_flags == 0x%04lX\n"
+ " size == 0x%04lX\n",
+ (unsigned long long) vma->vm_start, address, vma->vm_flags,
+ doorbell_process_allocation());
+
return io_remap_pfn_range(vma,
vma->vm_start,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index a6a4b2b1c0d9..335f81e09327 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -32,11 +32,10 @@
#include "kfd_events.h"
#include <linux/device.h>
-/*
- * A task can only be on a single wait_queue at a time, but we need to support
+/* A task can only be on a single wait_queue at a time, but we need to support
* waiting on multiple events (any/all).
- * Instead of each event simply having a wait_queue with sleeping tasks, it
- * has a singly-linked list of tasks.
+ * Instead of each event simply having a wait_queue with sleeping tasks, it has a
+ * singly-linked list of tasks.
* A thread that wants to sleep creates an array of these, one for each event
* and adds one to each event's waiter chain.
*/
@@ -52,12 +51,11 @@ struct kfd_event_waiter {
uint32_t input_index;
};
-/*
- * Over-complicated pooled allocator for event notification slots.
+/* Over-complicated pooled allocator for event notification slots.
*
- * Each signal event needs a 64-bit signal slot where the signaler will write
- * a 1 before sending an interrupt.l (This is needed because some interrupts
- * do not contain enough spare data bits to identify an event.)
+ * Each signal event needs a 64-bit signal slot where the signaler will write a 1
+ * before sending an interrupt. (This is needed because some interrupts do not
+ * contain enough spare data bits to identify an event.)
* We get whole pages from vmalloc and map them to the process VA.
* Individual signal events are then allocated a slot in a page.
*/
@@ -65,6 +63,7 @@ struct kfd_event_waiter {
struct signal_page {
struct list_head event_pages; /* kfd_process.signal_event_pages */
uint64_t *kernel_address;
+ uint64_t handle;
uint64_t __user *user_address;
uint32_t page_index; /* Index into the mmap aperture. */
unsigned int free_slots;
@@ -74,8 +73,7 @@ struct signal_page {
#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT
#define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE)
#define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1)
-#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \
- SLOT_BITMAP_SIZE * sizeof(long))
+#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + SLOT_BITMAP_SIZE * sizeof(long))
/*
* For signal events, the event ID is used as the interrupt user data.
@@ -85,23 +83,27 @@ struct signal_page {
#define INTERRUPT_DATA_BITS 8
#define SIGNAL_EVENT_ID_SLOT_SHIFT 0
+/* We can only create 8 debug events */
+
+#define KFD_DEBUG_EVENT_LIMIT 8
+#define KFD_DEBUG_EVENT_MASK 0x1F
+#define KFD_DEBUG_EVENT_SHIFT 5
+
static uint64_t *page_slots(struct signal_page *page)
{
return page->kernel_address;
}
-static bool allocate_free_slot(struct kfd_process *process,
- struct signal_page **out_page,
- unsigned int *out_slot_index)
+static bool
+allocate_free_slot(struct kfd_process *process,
+ struct signal_page **out_page,
+ unsigned int *out_slot_index)
{
struct signal_page *page;
list_for_each_entry(page, &process->signal_event_pages, event_pages) {
if (page->free_slots > 0) {
- unsigned int slot =
- find_first_zero_bit(page->used_slot_bitmap,
- SLOTS_PER_PAGE);
-
+ unsigned int slot = find_first_zero_bit(page->used_slot_bitmap, SLOTS_PER_PAGE);
__set_bit(slot, page->used_slot_bitmap);
page->free_slots--;
@@ -130,6 +132,8 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p)
{
void *backing_store;
struct signal_page *page;
+ unsigned int slot;
+ int i;
page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL);
if (!page)
@@ -137,17 +141,23 @@ static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p)
page->free_slots = SLOTS_PER_PAGE;
- backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, \
get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
if (!backing_store)
goto fail_alloc_signal_store;
/* prevent user-mode info leaks */
- memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT,
- KFD_SIGNAL_EVENT_LIMIT * 8);
-
+ memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, KFD_SIGNAL_EVENT_LIMIT * 8);
page->kernel_address = backing_store;
+ /* Set bits of debug events to prevent allocation */
+ for (i = 0 ; i < KFD_DEBUG_EVENT_LIMIT ; i++) {
+ slot = (i << KFD_DEBUG_EVENT_SHIFT) |
+ KFD_DEBUG_EVENT_MASK;
+ __set_bit(slot, page->used_slot_bitmap);
+ page->free_slots--;
+ }
+
if (list_empty(&p->signal_event_pages))
page->page_index = 0;
else
@@ -169,10 +179,10 @@ fail_alloc_signal_page:
return false;
}
-static bool allocate_event_notification_slot(struct file *devkfd,
- struct kfd_process *p,
- struct signal_page **page,
- unsigned int *signal_slot_index)
+static bool
+allocate_event_notification_slot(struct file *devkfd, struct kfd_process *p,
+ struct signal_page **page,
+ unsigned int *signal_slot_index)
{
bool ret;
@@ -186,6 +196,88 @@ static bool allocate_event_notification_slot(struct file *devkfd,
return ret;
}
+/*
+ * Register a caller-provided buffer as a signal page of process @p.
+ *
+ * Only the bookkeeping struct is allocated here; @kernel_address is used as
+ * the slot storage and is filled with the unsignaled pattern first. @handle
+ * is stored so kfd_free_signal_page_dgpu() can find the page again.
+ * user_address is left NULL, which shutdown_signal_pages() uses to skip
+ * free_pages() on this backing store.
+ *
+ * NOTE(review): unlike allocate_signal_page(), the debug-event slots are not
+ * pre-marked in used_slot_bitmap here - confirm this is intended.
+ *
+ * Returns false if the bookkeeping allocation fails, true otherwise.
+ */
+static bool
+allocate_signal_page_dgpu(struct kfd_process *p,
+		uint64_t *kernel_address, uint64_t handle)
+{
+	struct signal_page *new_page;
+	uint32_t index = 0;
+
+	new_page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL);
+	if (new_page == NULL)
+		return false;
+
+	/* prevent user-mode info leaks */
+	memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT,
+	       KFD_SIGNAL_EVENT_LIMIT * 8);
+
+	/* The new index is one past the index of the current list tail. */
+	if (!list_empty(&p->signal_event_pages))
+		index = list_tail_entry(&p->signal_event_pages,
+					struct signal_page,
+					event_pages)->page_index + 1;
+
+	new_page->kernel_address = kernel_address;
+	new_page->handle = handle;
+	new_page->user_address = NULL;
+	new_page->free_slots = SLOTS_PER_PAGE;
+	new_page->page_index = index;
+
+	pr_debug("allocated new event signal page at %p, for process %p\n",
+			new_page, p);
+	pr_debug("page index is %d\n", new_page->page_index);
+
+	list_add(&new_page->event_pages, &p->signal_event_pages);
+
+	return true;
+}
+
+/*
+ * Remove and free the bookkeeping of the first signal page of @p whose handle
+ * equals @handle. The backing store itself is not released here. Does nothing
+ * when no page matches.
+ */
+void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle)
+{
+	struct signal_page *cur, *cur_next;
+
+	list_for_each_entry_safe(cur, cur_next, &p->signal_event_pages,
+				 event_pages) {
+		if (cur->handle != handle)
+			continue;
+
+		list_del(&cur->event_pages);
+		kfree(cur);
+		return;
+	}
+}
+
+/*
+ * Pick the signal slot for a new debug event.
+ *
+ * Debug events always live in the first page on p->signal_event_pages; a page
+ * is allocated through allocate_signal_page() if the list is empty. The slot
+ * index is built from p->debug_event_count shifted by KFD_DEBUG_EVENT_SHIFT
+ * with the KFD_DEBUG_EVENT_MASK bits set, and the slot is reset to the
+ * unsignaled value. used_slot_bitmap is not modified here.
+ *
+ * NOTE(review): the index depends only on the current debug event count, so
+ * destroying debug events out of creation order may hand out a slot that is
+ * still in use - confirm against callers.
+ *
+ * Returns false only if a signal page had to be allocated and that failed.
+ */
+static bool
+allocate_debug_event_notification_slot(struct file *devkfd,
+		struct kfd_process *p,
+		struct signal_page **out_page,
+		unsigned int *out_slot_index)
+{
+	struct signal_page *first_page;
+	unsigned int slot_index;
+
+	if (list_empty(&p->signal_event_pages) &&
+	    !allocate_signal_page(devkfd, p))
+		return false;
+
+	first_page = list_entry(p->signal_event_pages.next,
+				struct signal_page, event_pages);
+	slot_index = KFD_DEBUG_EVENT_MASK |
+			(p->debug_event_count << KFD_DEBUG_EVENT_SHIFT);
+
+	pr_debug("page == %p\n", first_page);
+	pr_debug("slot == %d\n", slot_index);
+
+	page_slots(first_page)[slot_index] = UNSIGNALED_EVENT_SLOT;
+
+	*out_page = first_page;
+	*out_slot_index = slot_index;
+
+	pr_debug("allocated debug event signal slot in page %p, slot %d\n",
+			first_page, slot_index);
+
+	return true;
+}
+
/* Assumes that the process's event_mutex is locked. */
static void release_event_notification_slot(struct signal_page *page,
size_t slot_index)
@@ -202,10 +294,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p,
{
struct signal_page *page;
- /*
- * This is safe because we don't delete signal pages until the
- * process exits.
- */
+ /* This is safe because we don't delete signal pages until the process exits. */
list_for_each_entry(page, &p->signal_event_pages, event_pages)
if (page->page_index == page_index)
return page;
@@ -213,10 +302,7 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p,
return NULL;
}
-/*
- * Assumes that p->event_mutex is held and of course that p is not going
- * away (current or locked).
- */
+/* Assumes that p->event_mutex is held and of course that p is not going away (current or locked). */
static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
{
struct kfd_event *ev;
@@ -231,32 +317,27 @@ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
static u32 make_signal_event_id(struct signal_page *page,
unsigned int signal_slot_index)
{
- return page->page_index |
- (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT);
+ return page->page_index | (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT);
}
-/*
- * Produce a kfd event id for a nonsignal event.
- * These are arbitrary numbers, so we do a sequential search through
- * the hash table for an unused number.
+/* Produce a kfd event id for a nonsignal event.
+ * These are arbitrary numbers, so we do a sequential search through the hash table
+ * for an unused number.
*/
static u32 make_nonsignal_event_id(struct kfd_process *p)
{
u32 id;
for (id = p->next_nonsignal_event_id;
- id < KFD_LAST_NONSIGNAL_EVENT_ID &&
- lookup_event_by_id(p, id) != NULL;
- id++)
+ id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL;
+ id++)
;
if (id < KFD_LAST_NONSIGNAL_EVENT_ID) {
- /*
- * What if id == LAST_NONSIGNAL_EVENT_ID - 1?
- * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so
- * the first loop fails immediately and we proceed with the
- * wraparound loop below.
+ /* What if id == LAST_NONSIGNAL_EVENT_ID - 1?
+ * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so the first loop
+ * fails immediately and we proceed with the wraparound loop below.
*/
p->next_nonsignal_event_id = id + 1;
@@ -264,54 +345,68 @@ static u32 make_nonsignal_event_id(struct kfd_process *p)
}
for (id = KFD_FIRST_NONSIGNAL_EVENT_ID;
- id < KFD_LAST_NONSIGNAL_EVENT_ID &&
- lookup_event_by_id(p, id) != NULL;
- id++)
+ id < KFD_LAST_NONSIGNAL_EVENT_ID && lookup_event_by_id(p, id) != NULL;
+ id++)
;
if (id < KFD_LAST_NONSIGNAL_EVENT_ID) {
p->next_nonsignal_event_id = id + 1;
return id;
+ } else {
+ p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+ return 0;
}
-
- p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID;
- return 0;
}
-static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p,
- struct signal_page *page,
- unsigned int signal_slot)
+static struct kfd_event *
+lookup_event_by_page_slot(struct kfd_process *p,
+ struct signal_page *page, unsigned int signal_slot)
{
return lookup_event_by_id(p, make_signal_event_id(page, signal_slot));
}
-static int create_signal_event(struct file *devkfd,
- struct kfd_process *p,
- struct kfd_event *ev)
+static int
+create_signal_event(struct file *devkfd, struct kfd_process *p, struct kfd_event *ev)
{
- if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) {
+ if ((ev->type == KFD_EVENT_TYPE_SIGNAL) &&
+ (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT)) {
pr_warn("amdkfd: Signal event wasn't created because limit was reached\n");
return -ENOMEM;
+ } else if ((ev->type == KFD_EVENT_TYPE_DEBUG) &&
+ (p->debug_event_count == KFD_DEBUG_EVENT_LIMIT)) {
+ pr_warn("amdkfd: Debug event wasn't created because limit was reached\n");
+ return -ENOMEM;
}
- if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page,
+ if (ev->type == KFD_EVENT_TYPE_SIGNAL) {
+ if (!allocate_event_notification_slot(devkfd, p,
+ &ev->signal_page,
&ev->signal_slot_index)) {
- pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n");
- return -ENOMEM;
- }
+ pr_warn("amdkfd: Signal event wasn't created because out of kernel memory\n");
+ return -ENOMEM;
+ }
- p->signal_event_count++;
+ p->signal_event_count++;
- ev->user_signal_address =
- &ev->signal_page->user_address[ev->signal_slot_index];
+ if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) ==
+ KFD_DEBUG_EVENT_MASK)
+ p->signal_event_count++;
- ev->event_id = make_signal_event_id(ev->signal_page,
- ev->signal_slot_index);
+ } else if (ev->type == KFD_EVENT_TYPE_DEBUG) {
+ if (!allocate_debug_event_notification_slot(devkfd, p,
+ &ev->signal_page,
+ &ev->signal_slot_index)) {
+ pr_warn("amdkfd: Debug event wasn't created because out of kernel memory\n");
+ return -ENOMEM;
+ }
- pr_debug("signal event number %zu created with id %d, address %p\n",
- p->signal_event_count, ev->event_id,
- ev->user_signal_address);
+ p->debug_event_count++;
+ }
+
+ ev->user_signal_address = &ev->signal_page->user_address[ev->signal_slot_index];
+
+ ev->event_id = make_signal_event_id(ev->signal_page, ev->signal_slot_index);
pr_debug("signal event number %zu created with id %d, address %p\n",
p->signal_event_count, ev->event_id,
@@ -320,12 +415,10 @@ static int create_signal_event(struct file *devkfd,
return 0;
}
-/*
- * No non-signal events are supported yet.
- * We create them as events that never signal.
- * Set event calls from user-mode are failed.
- */
-static int create_other_event(struct kfd_process *p, struct kfd_event *ev)
+/* No non-signal events are supported yet.
+ * We create them as events that never signal. Set event calls from user-mode are failed. */
+static int
+create_other_event(struct kfd_process *p, struct kfd_event *ev)
{
ev->event_id = make_nonsignal_event_id(p);
if (ev->event_id == 0)
@@ -341,20 +434,25 @@ void kfd_event_init_process(struct kfd_process *p)
INIT_LIST_HEAD(&p->signal_event_pages);
p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID;
p->signal_event_count = 0;
+ p->debug_event_count = 0;
}
static void destroy_event(struct kfd_process *p, struct kfd_event *ev)
{
if (ev->signal_page != NULL) {
- release_event_notification_slot(ev->signal_page,
- ev->signal_slot_index);
- p->signal_event_count--;
+ if (ev->type == KFD_EVENT_TYPE_SIGNAL) {
+ release_event_notification_slot(ev->signal_page,
+ ev->signal_slot_index);
+ p->signal_event_count--;
+ if ((p->signal_event_count & KFD_DEBUG_EVENT_MASK) ==
+ KFD_DEBUG_EVENT_MASK)
+ p->signal_event_count--;
+ } else if (ev->type == KFD_EVENT_TYPE_DEBUG) {
+ p->debug_event_count--;
+ }
}
- /*
- * Abandon the list of waiters. Individual waiting threads will
- * clean up their own data.
- */
+ /* Abandon the list of waiters. Individual waiting threads will clean up their own data.*/
list_del(&ev->waiters);
hash_del(&ev->events);
@@ -371,18 +469,17 @@ static void destroy_events(struct kfd_process *p)
destroy_event(p, ev);
}
-/*
- * We assume that the process is being destroyed and there is no need to
- * unmap the pages or keep bookkeeping data in order.
- */
+/* We assume that the process is being destroyed and there is no need to unmap the pages
+ * or keep bookkeeping data in order. */
static void shutdown_signal_pages(struct kfd_process *p)
{
struct signal_page *page, *tmp;
- list_for_each_entry_safe(page, tmp, &p->signal_event_pages,
- event_pages) {
- free_pages((unsigned long)page->kernel_address,
- get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
+ list_for_each_entry_safe(page, tmp, &p->signal_event_pages, event_pages) {
+ if (page->user_address) {
+ free_pages((unsigned long)page->kernel_address,
+ get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
+ }
kfree(page);
}
}
@@ -395,8 +492,7 @@ void kfd_event_free_process(struct kfd_process *p)
static bool event_can_be_gpu_signaled(const struct kfd_event *ev)
{
- return ev->type == KFD_EVENT_TYPE_SIGNAL ||
- ev->type == KFD_EVENT_TYPE_DEBUG;
+ return ev->type == KFD_EVENT_TYPE_SIGNAL || ev->type == KFD_EVENT_TYPE_DEBUG;
}
static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
@@ -407,11 +503,12 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
int kfd_event_create(struct file *devkfd, struct kfd_process *p,
uint32_t event_type, bool auto_reset, uint32_t node_id,
uint32_t *event_id, uint32_t *event_trigger_data,
- uint64_t *event_page_offset, uint32_t *event_slot_index)
+ uint64_t *event_page_offset, uint32_t *event_slot_index,
+ void *kern_addr)
{
int ret = 0;
- struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
if (!ev)
return -ENOMEM;
@@ -421,17 +518,20 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
INIT_LIST_HEAD(&ev->waiters);
- *event_page_offset = 0;
-
mutex_lock(&p->event_mutex);
+ if (kern_addr && list_empty(&p->signal_event_pages))
+ allocate_signal_page_dgpu(p, kern_addr, *event_page_offset);
+
+ *event_page_offset = 0;
+
switch (event_type) {
case KFD_EVENT_TYPE_SIGNAL:
case KFD_EVENT_TYPE_DEBUG:
ret = create_signal_event(devkfd, p, ev);
if (!ret) {
*event_page_offset = (ev->signal_page->page_index |
- KFD_MMAP_EVENTS_MASK);
+ KFD_MMAP_TYPE_EVENTS);
*event_page_offset <<= PAGE_SHIFT;
*event_slot_index = ev->signal_slot_index;
}
@@ -538,8 +638,7 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id)
static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev)
{
- page_slots(ev->signal_page)[ev->signal_slot_index] =
- UNSIGNALED_EVENT_SLOT;
+ page_slots(ev->signal_page)[ev->signal_slot_index] = UNSIGNALED_EVENT_SLOT;
}
static bool is_slot_signaled(struct signal_page *page, unsigned int index)
@@ -547,8 +646,7 @@ static bool is_slot_signaled(struct signal_page *page, unsigned int index)
return page_slots(page)[index] != UNSIGNALED_EVENT_SLOT;
}
-static void set_event_from_interrupt(struct kfd_process *p,
- struct kfd_event *ev)
+static void set_event_from_interrupt(struct kfd_process *p, struct kfd_event *ev)
{
if (ev && event_can_be_gpu_signaled(ev)) {
acknowledge_signal(p, ev);
@@ -561,42 +659,39 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
{
struct kfd_event *ev;
- /*
- * Because we are called from arbitrary context (workqueue) as opposed
+ /* Because we are called from arbitrary context (workqueue) as opposed
* to process context, kfd_process could attempt to exit while we are
- * running so the lookup function returns a locked process.
- */
+ * running so the lookup function returns a read-locked process. */
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
-
if (!p)
return; /* Presumably process exited. */
mutex_lock(&p->event_mutex);
- if (valid_id_bits >= INTERRUPT_DATA_BITS) {
+ if ((valid_id_bits >= INTERRUPT_DATA_BITS) &&
+ ((partial_id & KFD_DEBUG_EVENT_MASK) ==
+ KFD_DEBUG_EVENT_MASK)) {
/* Partial ID is a full ID. */
ev = lookup_event_by_id(p, partial_id);
set_event_from_interrupt(p, ev);
} else {
- /*
- * Partial ID is in fact partial. For now we completely
- * ignore it, but we could use any bits we did receive to
- * search faster.
- */
+ /* Partial ID is in fact partial. For now we completely ignore it,
+ * but we could use any bits we did receive to search faster. */
struct signal_page *page;
unsigned i;
- list_for_each_entry(page, &p->signal_event_pages, event_pages)
- for (i = 0; i < SLOTS_PER_PAGE; i++)
+ list_for_each_entry(page, &p->signal_event_pages, event_pages) {
+ for (i = 0; i < SLOTS_PER_PAGE; i++) {
if (is_slot_signaled(page, i)) {
- ev = lookup_event_by_page_slot(p,
- page, i);
+ ev = lookup_event_by_page_slot(p, page, i);
set_event_from_interrupt(p, ev);
}
+ }
+ }
}
mutex_unlock(&p->event_mutex);
- mutex_unlock(&p->mutex);
+ up_read(&p->lock);
}
static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
@@ -604,20 +699,20 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
struct kfd_event_waiter *event_waiters;
uint32_t i;
- event_waiters = kmalloc_array(num_events,
- sizeof(struct kfd_event_waiter),
- GFP_KERNEL);
+ event_waiters = kmalloc(num_events * sizeof(struct kfd_event_waiter), GFP_KERNEL);
- for (i = 0; (event_waiters) && (i < num_events) ; i++) {
- INIT_LIST_HEAD(&event_waiters[i].waiters);
- event_waiters[i].sleeping_task = current;
- event_waiters[i].activated = false;
+ if (event_waiters) {
+ for (i = 0; i < num_events; i++) {
+ INIT_LIST_HEAD(&event_waiters[i].waiters);
+ event_waiters[i].sleeping_task = current;
+ event_waiters[i].activated = false;
+ }
}
return event_waiters;
}
-static int init_event_waiter(struct kfd_process *p,
+static int init_event_waiter_get_status(struct kfd_process *p,
struct kfd_event_waiter *waiter,
uint32_t event_id,
uint32_t input_index)
@@ -632,13 +727,21 @@ static int init_event_waiter(struct kfd_process *p,
waiter->activated = ev->signaled;
ev->signaled = ev->signaled && !ev->auto_reset;
- list_add(&waiter->waiters, &ev->waiters);
-
return 0;
}
+/*
+ * Link @waiter onto the waiter list of its event, unless the waiter is
+ * already marked activated, in which case there is nothing to wait for.
+ */
+static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter)
+{
+	if (waiter->activated)
+		return;
+
+	list_add(&waiter->waiters, &waiter->event->waiters);
+}
+
static bool test_event_condition(bool all, uint32_t num_events,
- struct kfd_event_waiter *event_waiters)
+ struct kfd_event_waiter *event_waiters)
{
uint32_t i;
uint32_t activated_count = 0;
@@ -663,23 +766,15 @@ static bool copy_signaled_event_data(uint32_t num_events,
struct kfd_event_waiter *event_waiters,
struct kfd_event_data __user *data)
{
- struct kfd_hsa_memory_exception_data *src;
- struct kfd_hsa_memory_exception_data __user *dst;
- struct kfd_event_waiter *waiter;
- struct kfd_event *event;
uint32_t i;
- for (i = 0; i < num_events; i++) {
- waiter = &event_waiters[i];
- event = waiter->event;
- if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) {
- dst = &data[waiter->input_index].memory_exception_data;
- src = &event->memory_exception_data;
- if (copy_to_user(dst, src,
- sizeof(struct kfd_hsa_memory_exception_data)))
+ for (i = 0; i < num_events; i++)
+ if (event_waiters[i].activated &&
+ event_waiters[i].event->type == KFD_EVENT_TYPE_MEMORY)
+ if (copy_to_user(&data[event_waiters[i].input_index].memory_exception_data,
+ &event_waiters[i].event->memory_exception_data,
+ sizeof(struct kfd_hsa_memory_exception_data)))
return false;
- }
- }
return true;
@@ -695,11 +790,9 @@ static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
if (user_timeout_ms == KFD_EVENT_TIMEOUT_INFINITE)
return MAX_SCHEDULE_TIMEOUT;
- /*
- * msecs_to_jiffies interprets all values above 2^31-1 as infinite,
+ /* msecs_to_jiffies interprets all values above 2^31-1 as infinite,
* but we consider them finite.
- * This hack is wrong, but nobody is likely to notice.
- */
+ * This hack is wrong, but nobody is likely to notice. */
user_timeout_ms = min_t(uint32_t, user_timeout_ms, 0x7FFFFFFF);
return msecs_to_jiffies(user_timeout_ms) + 1;
@@ -724,11 +817,16 @@ int kfd_wait_on_events(struct kfd_process *p,
(struct kfd_event_data __user *) data;
uint32_t i;
int ret = 0;
+
struct kfd_event_waiter *event_waiters = NULL;
long timeout = user_timeout_to_jiffies(user_timeout_ms);
mutex_lock(&p->event_mutex);
+ /* Set to something unreasonable - this is really
+ * just a bool for now. */
+ *wait_result = KFD_WAIT_TIMEOUT;
+
event_waiters = alloc_event_waiters(num_events);
if (!event_waiters) {
ret = -ENOMEM;
@@ -742,14 +840,34 @@ int kfd_wait_on_events(struct kfd_process *p,
sizeof(struct kfd_event_data)))
goto fail;
- ret = init_event_waiter(p, &event_waiters[i],
+ ret = init_event_waiter_get_status(p, &event_waiters[i],
event_data.event_id, i);
if (ret)
goto fail;
}
+ /* Check condition once. */
+ if (test_event_condition(all, num_events, event_waiters)) {
+ if (copy_signaled_event_data(num_events,
+ event_waiters, events))
+ *wait_result = KFD_WAIT_COMPLETE;
+ else
+ *wait_result = KFD_WAIT_ERROR;
+ free_waiters(num_events, event_waiters);
+ } else {
+ /* Add to wait lists if we need to wait. */
+ for (i = 0; i < num_events; i++)
+ init_event_waiter_add_to_waitlist(&event_waiters[i]);
+ }
+
mutex_unlock(&p->event_mutex);
+ /* Return if all waits were already satisfied. */
+ if (*wait_result != KFD_WAIT_TIMEOUT) {
+ __set_current_state(TASK_RUNNING);
+ return ret;
+ }
+
while (true) {
if (fatal_signal_pending(current)) {
ret = -EINTR;
@@ -758,17 +876,17 @@ int kfd_wait_on_events(struct kfd_process *p,
if (signal_pending(current)) {
/*
- * This is wrong when a nonzero, non-infinite timeout
- * is specified. We need to use
- * ERESTARTSYS_RESTARTBLOCK, but struct restart_block
- * contains a union with data for each user and it's
- * in generic kernel code that I don't want to
- * touch yet.
+ * This is wrong when a nonzero, non-infinite timeout is specified.
+ * We need to use ERESTARTSYS_RESTARTBLOCK, but struct restart_block
+ * contains a union with data for each user and it's in generic
+ * kernel code that I don't want to touch yet.
*/
ret = -ERESTARTSYS;
break;
}
+ set_current_state(TASK_INTERRUPTIBLE);
+
if (test_event_condition(all, num_events, event_waiters)) {
if (copy_signaled_event_data(num_events,
event_waiters, events))
@@ -783,7 +901,7 @@ int kfd_wait_on_events(struct kfd_process *p,
break;
}
- timeout = schedule_timeout_interruptible(timeout);
+ timeout = schedule_timeout(timeout);
}
__set_current_state(TASK_RUNNING);
@@ -823,8 +941,7 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
page = lookup_signal_page_by_index(p, page_index);
if (!page) {
/* Probably KFD bug, but mmap is user-accessible. */
- pr_debug("signal page could not be found for page_index %u\n",
- page_index);
+ pr_debug("signal page could not be found for page_index %u\n", page_index);
return -EINVAL;
}
@@ -856,23 +973,29 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
static void lookup_events_by_type_and_signal(struct kfd_process *p,
int type, void *event_data)
{
- struct kfd_hsa_memory_exception_data *ev_data;
struct kfd_event *ev;
int bkt;
bool send_signal = true;
- ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
-
- hash_for_each(p->events, bkt, ev, events)
+ hash_for_each(p->events, bkt, ev, events) {
if (ev->type == type) {
send_signal = false;
dev_dbg(kfd_device,
"Event found: id %X type %d",
ev->event_id, ev->type);
set_event(ev);
- if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
- ev->memory_exception_data = *ev_data;
+ if (ev->type == KFD_EVENT_TYPE_MEMORY && event_data)
+ ev->memory_exception_data =
+ *(struct kfd_hsa_memory_exception_data *)event_data;
}
+ }
+
+ if (type == KFD_EVENT_TYPE_MEMORY) {
+ dev_warn(kfd_device,
+ "Sending SIGSEGV to HSA Process with PID %d ",
+ p->lead_thread->pid);
+ send_sig(SIGSEGV, p->lead_thread, 0);
+ }
/* Send SIGTERM no event of type "type" has been found*/
if (send_signal) {
@@ -899,7 +1022,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
/*
* Because we are called from arbitrary context (workqueue) as opposed
* to process context, kfd_process could attempt to exit while we are
- * running so the lookup function returns a locked process.
+ * running so the lookup function returns a read-locked process.
*/
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -914,24 +1037,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
memory_exception_data.gpu_id = dev->id;
memory_exception_data.va = address;
/* Set failure reason */
- memory_exception_data.failure.NotPresent = 1;
- memory_exception_data.failure.NoExecute = 0;
- memory_exception_data.failure.ReadOnly = 0;
+ memory_exception_data.failure.NotPresent = true;
+ memory_exception_data.failure.NoExecute = false;
+ memory_exception_data.failure.ReadOnly = false;
if (vma) {
if (vma->vm_start > address) {
- memory_exception_data.failure.NotPresent = 1;
- memory_exception_data.failure.NoExecute = 0;
- memory_exception_data.failure.ReadOnly = 0;
+ memory_exception_data.failure.NotPresent = true;
+ memory_exception_data.failure.NoExecute = false;
+ memory_exception_data.failure.ReadOnly = false;
} else {
- memory_exception_data.failure.NotPresent = 0;
+ memory_exception_data.failure.NotPresent = false;
if (is_write_requested && !(vma->vm_flags & VM_WRITE))
- memory_exception_data.failure.ReadOnly = 1;
+ memory_exception_data.failure.ReadOnly = true;
else
- memory_exception_data.failure.ReadOnly = 0;
+ memory_exception_data.failure.ReadOnly = false;
if (is_execute_requested && !(vma->vm_flags & VM_EXEC))
- memory_exception_data.failure.NoExecute = 1;
+ memory_exception_data.failure.NoExecute = true;
else
- memory_exception_data.failure.NoExecute = 0;
+ memory_exception_data.failure.NoExecute = false;
}
}
@@ -944,7 +1067,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
&memory_exception_data);
mutex_unlock(&p->event_mutex);
- mutex_unlock(&p->mutex);
+ up_read(&p->lock);
}
void kfd_signal_hw_exception_event(unsigned int pasid)
@@ -952,7 +1075,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
/*
* Because we are called from arbitrary context (workqueue) as opposed
* to process context, kfd_process could attempt to exit while we are
- * running so the lookup function returns a locked process.
+ * running so the lookup function returns a read-locked process.
*/
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -965,5 +1088,42 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL);
mutex_unlock(&p->event_mutex);
- mutex_unlock(&p->mutex);
+ up_read(&p->lock);
+}
+
+/*
+ * Signal every KFD_EVENT_TYPE_MEMORY event of the process bound to @pasid
+ * and attach a description of the GPU VM fault to each of them.
+ *
+ * @dev:   device reporting the fault; its id is stored as gpu_id
+ * @pasid: PASID used to look up the faulting process
+ * @info:  fault details; may be NULL, in which case only gpu_id is filled
+ *         in and the rest of the exception data stays zeroed
+ *
+ * Does nothing when no process is found for @pasid.
+ */
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+				struct kfd_vm_fault_info *info)
+{
+	struct kfd_hsa_memory_exception_data fault_data;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	struct kfd_event *ev;
+	int bkt;
+
+	if (!p)
+		return; /* Presumably process exited. */
+
+	memset(&fault_data, 0, sizeof(fault_data));
+	fault_data.gpu_id = dev->id;
+
+	/* Set failure reason */
+	if (info) {
+		fault_data.va = (info->page_addr) << PAGE_SHIFT;
+		fault_data.failure.NotPresent = !!info->prot_valid;
+		fault_data.failure.NoExecute = !!info->prot_exec;
+		fault_data.failure.ReadOnly = !!info->prot_write;
+	}
+
+	mutex_lock(&p->event_mutex);
+
+	hash_for_each(p->events, bkt, ev, events) {
+		if (ev->type != KFD_EVENT_TYPE_MEMORY)
+			continue;
+
+		/* Publish the fault data before the event is set. */
+		ev->memory_exception_data = fault_data;
+		set_event(ev);
+	}
+
+	mutex_unlock(&p->event_mutex);
+
+	/*
+	 * Drop the read lock on the process; the sibling signal helpers in
+	 * this file note that the PASID lookup returns it read-locked.
+	 */
+	up_read(&p->lock);
}
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
index 28f6838b1f4c..d7987eb80970 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
@@ -34,8 +34,7 @@
#define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK
#define KFD_LAST_NONSIGNAL_EVENT_ID UINT_MAX
-/*
- * Written into kfd_signal_slot_t to indicate that the event is not signaled.
+/* Written into kfd_signal_slot_t to indicate that the event is not signaled.
* Since the event protocol may need to write the event ID into memory, this
* must not be a valid event ID.
* For the sake of easy memset-ing, this must be a byte pattern.
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 2b655103ba79..587f84714ca0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -33,7 +33,7 @@
#include <linux/time.h>
#include "kfd_priv.h"
#include <linux/mm.h>
-#include <linux/mman.h>
+#include <uapi/asm-generic/mman-common.h>
#include <asm/processor.h>
/*
@@ -278,21 +278,36 @@
#define MAKE_GPUVM_APP_BASE(gpu_num) \
(((uint64_t)(gpu_num) << 61) + 0x1000000000000L)
-#define MAKE_GPUVM_APP_LIMIT(base) \
- (((uint64_t)(base) & \
- 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL)
+#define MAKE_GPUVM_APP_LIMIT(base, size) \
+ (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1)
-#define MAKE_SCRATCH_APP_BASE(gpu_num) \
- (((uint64_t)(gpu_num) << 61) + 0x100000000L)
+#define MAKE_SCRATCH_APP_BASE() \
+ (((uint64_t)(0x1UL) << 61) + 0x100000000L)
#define MAKE_SCRATCH_APP_LIMIT(base) \
(((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
-#define MAKE_LDS_APP_BASE(gpu_num) \
- (((uint64_t)(gpu_num) << 61) + 0x0)
+#define MAKE_LDS_APP_BASE() \
+ (((uint64_t)(0x1UL) << 61) + 0x0)
+
#define MAKE_LDS_APP_LIMIT(base) \
(((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
+
+#define DGPU_VM_BASE_DEFAULT 0x100000
+
+/*
+ * Store the dGPU aperture bounds @base and @limit in @pdd.
+ *
+ * @base is rejected when it is lower than pdd->qpd.cwsr_base plus the
+ * device's cwsr_size; in that case an error is logged, @pdd is left
+ * untouched and -EINVAL is returned. Returns 0 on success.
+ */
+int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
+					uint64_t base, uint64_t limit)
+{
+	if (base >= (pdd->qpd.cwsr_base + pdd->dev->cwsr_size)) {
+		pdd->dgpu_base = base;
+		pdd->dgpu_limit = limit;
+		return 0;
+	}
+
+	pr_err("Set dgpu vm base 0x%llx failed.\n", base);
+	return -EINVAL;
+}
+
int kfd_init_apertures(struct kfd_process *process)
{
uint8_t id = 0;
@@ -300,13 +315,16 @@ int kfd_init_apertures(struct kfd_process *process)
struct kfd_process_device *pdd;
/*Iterating over all devices*/
- while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL &&
- id < NUM_OF_SUPPORTED_GPUS) {
+ while (kfd_topology_enum_kfd_devices(id, &dev) == 0) {
+ if (!dev) {
+ id++; /* Skip non GPU devices */
+ continue;
+ }
pdd = kfd_create_process_device_data(dev, process);
if (pdd == NULL) {
pr_err("Failed to create process device data\n");
- return -1;
+ goto err;
}
/*
* For 64 bit process aperture will be statically reserved in
@@ -322,19 +340,24 @@ int kfd_init_apertures(struct kfd_process *process)
* node id couldn't be 0 - the three MSB bits of
* aperture shoudn't be 0
*/
- pdd->lds_base = MAKE_LDS_APP_BASE(id + 1);
+ pdd->lds_base = MAKE_LDS_APP_BASE();
pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
- pdd->gpuvm_limit =
- MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base);
+ pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
+ pdd->gpuvm_base,
+ dev->shared_resources.gpuvm_size);
- pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1);
+ pdd->scratch_base = MAKE_SCRATCH_APP_BASE();
pdd->scratch_limit =
MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+
+ if (KFD_IS_DGPU(dev->device_info->asic_family))
+ pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT;
+
}
dev_dbg(kfd_device, "node id %u\n", id);
@@ -350,6 +373,32 @@ int kfd_init_apertures(struct kfd_process *process)
}
return 0;
+
+err:
+ return -1;
}
+/*
+ * Flush the GPU TLB of the VMID that @pasid is currently mapped to.
+ *
+ * Scans the range ATC_VMID8_PASID_MAPPING .. ATC_VMID15_PASID_MAPPING through
+ * the kfd2kgd interface and issues an invalidate request for the first valid
+ * mapping whose PASID equals @pasid. Nothing is flushed when no VMID in that
+ * range matches.
+ *
+ * @dev:   device whose VMID mappings are scanned
+ * @pasid: PASID of the process whose TLB should be flushed
+ */
+void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid)
+{
+	const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
+	const int first_vmid_to_scan = 8;
+	const int last_vmid_to_scan = 15;
+	uint8_t vmid;
+
+	for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) {
+		/* The PASID is only read from mappings reported as valid. */
+		if (!f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid))
+			continue;
+		if (f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid) !=
+		    pasid)
+			continue;
+
+		/* Terminate the message so it is not merged with later output. */
+		dev_dbg(kfd_device, "TLB of vmid %u\n", vmid);
+		f2g->write_vmid_invalidate_request(dev->kgd, vmid);
+		break;
+	}
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
index 7f134aa9bfd3..a8cdbc812d00 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
@@ -172,8 +172,7 @@ static void interrupt_wq(struct work_struct *work)
sizeof(uint32_t))];
while (dequeue_ih_ring_entry(dev, ih_ring_entry))
- dev->device_info->event_interrupt_class->interrupt_wq(dev,
- ih_ring_entry);
+ dev->device_info->event_interrupt_class->interrupt_wq(dev, ih_ring_entry);
}
bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry)
@@ -181,8 +180,7 @@ bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry)
/* integer and bitwise OR so there is no boolean short-circuiting */
unsigned wanted = 0;
- wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev,
- ih_ring_entry);
+ wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, ih_ring_entry);
return wanted != 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 9beae87aadd5..513cfe642c22 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -47,6 +47,9 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
pr_debug("amdkfd: In func %s initializing queue type %d size %d\n",
__func__, KFD_QUEUE_TYPE_HIQ, queue_size);
+ memset(&prop, 0, sizeof(prop));
+ memset(&nop, 0, sizeof(nop));
+
nop.opcode = IT_NOP;
nop.type = PM4_TYPE_3;
nop.u32all |= PM4_COUNT_ZERO;
@@ -121,7 +124,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
prop.eop_ring_buffer_address = kq->eop_gpu_addr;
prop.eop_ring_buffer_size = PAGE_SIZE;
- if (init_queue(&kq->queue, prop) != 0)
+ if (init_queue(&kq->queue, &prop) != 0)
goto err_init_queue;
kq->queue->device = dev;
@@ -140,7 +143,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
kq->queue->pipe = KFD_CIK_HIQ_PIPE;
kq->queue->queue = KFD_CIK_HIQ_QUEUE;
kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe,
- kq->queue->queue, NULL);
+ kq->queue->queue, NULL, 0);
} else {
/* allocate fence for DIQ */
@@ -210,20 +213,23 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
BUG_ON(!kq || !buffer_ptr);
+ /* When rptr == wptr, the buffer is empty.
+ * When rptr == wptr + 1, the buffer is full.
+ * It is always rptr that advances to the position of wptr, rather than
+ * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
+ */
rptr = *kq->rptr_kernel;
wptr = *kq->wptr_kernel;
queue_address = (unsigned int *)kq->pq_kernel_addr;
queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t);
- pr_debug("rptr: %d\n", rptr);
- pr_debug("wptr: %d\n", wptr);
- pr_debug("queue_address 0x%p\n", queue_address);
+ pr_debug("amdkfd: In func %s\n rptr: %d\n wptr: %d\n queue_address 0x%p\n",
+ __func__, rptr, wptr, queue_address);
- available_size = (rptr - 1 - wptr + queue_size_dwords) %
+ available_size = (rptr + queue_size_dwords - 1 - wptr) %
queue_size_dwords;
- if (packet_size_in_dwords >= queue_size_dwords ||
- packet_size_in_dwords >= available_size) {
+ if (packet_size_in_dwords > available_size) {
/*
* make sure calling functions know
* acquire_packet_buffer() failed
@@ -233,6 +239,13 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
}
if (wptr + packet_size_in_dwords >= queue_size_dwords) {
+ /* make sure after rolling back to position 0, there is
+ * still enough space. */
+ if (packet_size_in_dwords >= rptr) {
+ *buffer_ptr = NULL;
+ return -ENOMEM;
+ }
+ /* fill nops, roll back and start at position 0 */
while (wptr > 0) {
queue_address[wptr] = kq->nop_packet;
wptr = (wptr + 1) % queue_size_dwords;
@@ -292,6 +305,8 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
switch (dev->device_info->asic_family) {
case CHIP_CARRIZO:
+ case CHIP_TONGA:
+ case CHIP_FIJI:
kernel_queue_init_vi(&kq->ops_asic_specific);
break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 850a5623661f..e9b886d7a041 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -29,10 +29,11 @@
#define KFD_DRIVER_AUTHOR "AMD Inc. and others"
#define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs"
-#define KFD_DRIVER_DATE "20150421"
-#define KFD_DRIVER_MAJOR 0
-#define KFD_DRIVER_MINOR 7
-#define KFD_DRIVER_PATCHLEVEL 2
+#define KFD_DRIVER_DATE "20160129"
+#define KFD_DRIVER_MAJOR 1
+#define KFD_DRIVER_MINOR 8
+#define KFD_DRIVER_PATCHLEVEL 1
+#define KFD_DRIVER_RC_LEVEL ""
static const struct kgd2kfd_calls kgd2kfd = {
.exit = kgd2kfd_exit,
@@ -42,6 +43,10 @@ static const struct kgd2kfd_calls kgd2kfd = {
.interrupt = kgd2kfd_interrupt,
.suspend = kgd2kfd_suspend,
.resume = kgd2kfd_resume,
+ .evict_bo = kgd2kfd_evict_bo,
+ .restore = kgd2kfd_restore,
+ .quiesce_mm = kgd2kfd_quiesce_mm,
+ .resume_mm = kgd2kfd_resume_mm,
};
int sched_policy = KFD_SCHED_POLICY_HWS;
@@ -49,6 +54,15 @@ module_param(sched_policy, int, 0444);
MODULE_PARM_DESC(sched_policy,
"Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)");
+int hws_max_conc_proc = 0;
+module_param(hws_max_conc_proc, int, 0444);
+MODULE_PARM_DESC(hws_max_conc_proc,
+ "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency (Default), #VMIDs for KFD = Maximum)");
+
+int cwsr_enable = 1;
+module_param(cwsr_enable, int, 0444);
+MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
+
int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
module_param(max_num_of_queues_per_device, int, 0444);
MODULE_PARM_DESC(max_num_of_queues_per_device,
@@ -61,6 +75,11 @@ MODULE_PARM_DESC(send_sigterm,
static int amdkfd_init_completed;
+int debug_largebar = 0;
+module_param(debug_largebar, int, 0444);
+MODULE_PARM_DESC(debug_largebar,
+ "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)");
+
int kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f)
{
if (!amdkfd_init_completed)
@@ -149,4 +168,5 @@ MODULE_DESCRIPTION(KFD_DRIVER_DESC);
MODULE_LICENSE("GPL and additional rights");
MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "."
__stringify(KFD_DRIVER_MINOR) "."
- __stringify(KFD_DRIVER_PATCHLEVEL));
+ __stringify(KFD_DRIVER_PATCHLEVEL)
+ KFD_DRIVER_RC_LEVEL);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index b1ef1368c3bb..ef1dc9b4c20e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -31,6 +31,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
return mqd_manager_init_cik(type, dev);
case CHIP_CARRIZO:
return mqd_manager_init_vi(type, dev);
+ case CHIP_TONGA:
+ case CHIP_FIJI:
+ return mqd_manager_init_vi_tonga(type, dev);
}
return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
index 213a71e0b6c7..eb6019259da0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@@ -67,7 +67,8 @@ struct mqd_manager {
int (*load_mqd)(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
- uint32_t __user *wptr);
+ uint32_t __user *wptr,
+ uint32_t page_table_base);
int (*update_mqd)(struct mqd_manager *mm, void *mqd,
struct queue_properties *q);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index d83de985e88c..44dcd9cace4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -29,11 +29,71 @@
#include "cik_structs.h"
#include "oss/oss_2_4_sh_mask.h"
+#define AQL_ENABLE 1
+
static inline struct cik_mqd *get_mqd(void *mqd)
{
return (struct cik_mqd *)mqd;
}
+/* View an opaque MQD pointer as a CIK SDMA RLC register block. */
+static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
+{
+	struct cik_sdma_rlc_registers *sdma_mqd = mqd;
+
+	return sdma_mqd;
+}
+
+/*
+ * Program the per-shader-engine static thread management masks of a CIK MQD
+ * from the queue's CU mask.
+ *
+ * @mm:  MQD manager; supplies the device used to query the CU layout
+ * @mqd: MQD to update, interpreted as struct cik_mqd
+ * @q:   queue properties; q->cu_mask == 0 leaves the MQD untouched
+ *
+ * The bits of q->cu_mask are consumed from the least significant end. Each
+ * of the two shader arrays of an engine takes as many bits as there are bits
+ * set in cu_info.cu_bitmap[se][sh], and those bits are placed at bit offset
+ * sh * 16 of that engine's mask. Engines beyond the fourth have no MQD
+ * field and their bits are discarded.
+ */
+static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+			struct queue_properties *q)
+{
+	struct cik_mqd *m;
+	struct kfd_cu_info cu_info;
+	uint32_t mgmt_se_mask;
+	uint32_t cu_sh_bits, cu_sh_shift;
+	uint32_t cu_mask;
+	int se, sh;
+
+	if (q->cu_mask == 0)
+		return;
+
+	m = get_mqd(mqd);
+	m->compute_static_thread_mgmt_se0 = 0;
+	m->compute_static_thread_mgmt_se1 = 0;
+	m->compute_static_thread_mgmt_se2 = 0;
+	m->compute_static_thread_mgmt_se3 = 0;
+
+	mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info);
+	cu_mask = q->cu_mask;
+	for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) {
+		mgmt_se_mask = 0;
+		for (sh = 0; sh < 2 && cu_mask; sh++) {
+			cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]);
+			/*
+			 * hweight32() can return 32. Shifting a 32-bit value
+			 * by its full width is undefined, so that case takes
+			 * all remaining bits explicitly.
+			 */
+			if (cu_sh_shift >= 32) {
+				cu_sh_bits = cu_mask;
+				cu_mask = 0;
+			} else {
+				cu_sh_bits = cu_mask &
+					((1U << cu_sh_shift) - 1);
+				cu_mask >>= cu_sh_shift;
+			}
+			mgmt_se_mask |= cu_sh_bits << (sh * 16);
+		}
+		switch (se) {
+		case 0:
+			m->compute_static_thread_mgmt_se0 = mgmt_se_mask;
+			break;
+		case 1:
+			m->compute_static_thread_mgmt_se1 = mgmt_se_mask;
+			break;
+		case 2:
+			m->compute_static_thread_mgmt_se2 = mgmt_se_mask;
+			break;
+		case 3:
+			m->compute_static_thread_mgmt_se3 = mgmt_se_mask;
+			break;
+		default:
+			break;
+		}
+	}
+	pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n",
+		m->compute_static_thread_mgmt_se0,
+		m->compute_static_thread_mgmt_se1,
+		m->compute_static_thread_mgmt_se2,
+		m->compute_static_thread_mgmt_se3);
+}
+
static int init_mqd(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
@@ -150,15 +210,16 @@ static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
}
static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id,
- uint32_t queue_id, uint32_t __user *wptr)
+ uint32_t queue_id, uint32_t __user *wptr,
+ uint32_t page_table_base)
{
return mm->dev->kfd2kgd->hqd_load
- (mm->dev->kgd, mqd, pipe_id, queue_id, wptr);
+ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base);
}
static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
- uint32_t __user *wptr)
+ uint32_t __user *wptr, uint32_t page_table_base)
{
return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd);
}
@@ -195,11 +256,14 @@ static int update_mqd(struct mqd_manager *mm, void *mqd,
m->cp_hqd_pq_control |= NO_UPDATE_RPTR;
}
+ update_cu_mask(mm, mqd, q);
+
m->cp_hqd_active = 0;
q->is_active = false;
if (q->queue_size > 0 &&
q->queue_address != 0 &&
- q->queue_percent > 0) {
+ q->queue_percent > 0 &&
+ !q->is_evicted) {
m->cp_hqd_active = 1;
q->is_active = true;
}
@@ -215,8 +279,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
BUG_ON(!mm || !mqd || !q);
m = get_sdma_mqd(mqd);
- m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) <<
- SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
+ m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1)
+ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
@@ -237,7 +301,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
q->is_active = false;
if (q->queue_size > 0 &&
q->queue_address != 0 &&
- q->queue_percent > 0) {
+ q->queue_percent > 0 &&
+ !q->is_evicted) {
m->sdma_rlc_rb_cntl |=
1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT;
@@ -386,7 +451,8 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
q->is_active = false;
if (q->queue_size > 0 &&
q->queue_address != 0 &&
- q->queue_percent > 0) {
+ q->queue_percent > 0 &&
+ !q->is_evicted) {
m->cp_hqd_active = 1;
q->is_active = true;
}
@@ -394,16 +460,6 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
return 0;
}
-struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
-{
- struct cik_sdma_rlc_registers *m;
-
- BUG_ON(!mqd);
-
- m = (struct cik_sdma_rlc_registers *)mqd;
-
- return m;
-}
struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
struct kfd_dev *dev)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index fa32c32fa1c2..b5fb78379e88 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -26,9 +26,9 @@
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
#include "vi_structs.h"
-#include "gca/gfx_8_0_sh_mask.h"
-#include "gca/gfx_8_0_enum.h"
-
+#include "asic_reg/gca/gfx_8_0_sh_mask.h"
+#include "asic_reg/gca/gfx_8_0_enum.h"
+#include "oss/oss_3_0_sh_mask.h"
#define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8
static inline struct vi_mqd *get_mqd(void *mqd)
@@ -36,6 +36,64 @@ static inline struct vi_mqd *get_mqd(void *mqd)
return (struct vi_mqd *)mqd;
}
+/* View an opaque MQD pointer as a VI SDMA MQD. */
+static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd)
+{
+	struct vi_sdma_mqd *sdma_mqd = mqd;
+
+	return sdma_mqd;
+}
+
+/*
+ * Program the per-shader-engine static thread management masks of a VI MQD
+ * from the queue's CU mask.
+ *
+ * @mm:  MQD manager; supplies the device used to query the CU layout
+ * @mqd: MQD to update, interpreted as struct vi_mqd
+ * @q:   queue properties; q->cu_mask == 0 leaves the MQD untouched
+ *
+ * The bits of q->cu_mask are consumed from the least significant end. Each
+ * of the two shader arrays of an engine takes as many bits as there are bits
+ * set in cu_info.cu_bitmap[se][sh], and those bits are placed at bit offset
+ * sh * 16 of that engine's mask. Engines beyond the fourth have no MQD
+ * field and their bits are discarded.
+ */
+static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+			struct queue_properties *q)
+{
+	struct vi_mqd *m;
+	struct kfd_cu_info cu_info;
+	uint32_t mgmt_se_mask;
+	uint32_t cu_sh_bits, cu_sh_shift;
+	uint32_t cu_mask;
+	int se, sh;
+
+	if (q->cu_mask == 0)
+		return;
+
+	m = get_mqd(mqd);
+	m->compute_static_thread_mgmt_se0 = 0;
+	m->compute_static_thread_mgmt_se1 = 0;
+	m->compute_static_thread_mgmt_se2 = 0;
+	m->compute_static_thread_mgmt_se3 = 0;
+
+	mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info);
+	cu_mask = q->cu_mask;
+	for (se = 0; se < cu_info.num_shader_engines && cu_mask; se++) {
+		mgmt_se_mask = 0;
+		for (sh = 0; sh < 2 && cu_mask; sh++) {
+			cu_sh_shift = hweight32(cu_info.cu_bitmap[se][sh]);
+			/*
+			 * hweight32() can return 32. Shifting a 32-bit value
+			 * by its full width is undefined, so that case takes
+			 * all remaining bits explicitly.
+			 */
+			if (cu_sh_shift >= 32) {
+				cu_sh_bits = cu_mask;
+				cu_mask = 0;
+			} else {
+				cu_sh_bits = cu_mask &
+					((1U << cu_sh_shift) - 1);
+				cu_mask >>= cu_sh_shift;
+			}
+			mgmt_se_mask |= cu_sh_bits << (sh * 16);
+		}
+		switch (se) {
+		case 0:
+			m->compute_static_thread_mgmt_se0 = mgmt_se_mask;
+			break;
+		case 1:
+			m->compute_static_thread_mgmt_se1 = mgmt_se_mask;
+			break;
+		case 2:
+			m->compute_static_thread_mgmt_se2 = mgmt_se_mask;
+			break;
+		case 3:
+			m->compute_static_thread_mgmt_se3 = mgmt_se_mask;
+			break;
+		default:
+			break;
+		}
+	}
+	pr_debug("kfd: update cu mask to %#x %#x %#x %#x\n",
+		m->compute_static_thread_mgmt_se0,
+		m->compute_static_thread_mgmt_se1,
+		m->compute_static_thread_mgmt_se2,
+		m->compute_static_thread_mgmt_se3);
+}
+
static int init_mqd(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
@@ -82,6 +140,25 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
if (q->format == KFD_QUEUE_FORMAT_AQL)
m->cp_hqd_iq_rptr = 1;
+ if (q->tba_addr) {
+ m->cp_hqd_persistent_state |=
+ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+ m->compute_pgm_rsrc2 |=
+ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
+ m->cp_hqd_ctx_save_base_addr_lo =
+ lower_32_bits(q->ctx_save_restore_area_address);
+ m->cp_hqd_ctx_save_base_addr_hi =
+ upper_32_bits(q->ctx_save_restore_area_address);
+ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
+ m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
+ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
+ m->cp_hqd_wg_state_offset = q->ctl_stack_size;
+ m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8);
+ m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8);
+ m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8);
+ m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8);
+ }
+
*mqd = m;
if (gart_addr != NULL)
*gart_addr = addr;
@@ -92,10 +169,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
static int load_mqd(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
- uint32_t __user *wptr)
+ uint32_t __user *wptr, uint32_t page_table_base)
{
return mm->dev->kfd2kgd->hqd_load
- (mm->dev->kgd, mqd, pipe_id, queue_id, wptr);
+ (mm->dev->kgd, mqd, pipe_id, queue_id, wptr, page_table_base);
}
static int __update_mqd(struct mqd_manager *mm, void *mqd,
@@ -153,12 +230,19 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT;
}
+ if (q->tba_addr)
+ m->cp_hqd_ctx_save_control =
+ atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT |
+ mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT;
+
+ update_cu_mask(mm, mqd, q);
m->cp_hqd_active = 0;
q->is_active = false;
if (q->queue_size > 0 &&
q->queue_address != 0 &&
- q->queue_percent > 0) {
+ q->queue_percent > 0 &&
+ !q->is_evicted) {
m->cp_hqd_active = 1;
q->is_active = true;
}
@@ -173,6 +257,12 @@ static int update_mqd(struct mqd_manager *mm, void *mqd,
return __update_mqd(mm, mqd, q, MTYPE_CC, 1);
}
+/*
+ * update_mqd variant installed by mqd_manager_init_vi_tonga().
+ *
+ * Forwards to __update_mqd() with MTYPE_UC and 0 as the last two arguments,
+ * where the plain update_mqd() passes MTYPE_CC and 1. Returns whatever
+ * __update_mqd() returns.
+ *
+ * NOTE(review): the last two arguments look like the mtype and atc_bit
+ * values used inside __update_mqd() - confirm against its signature.
+ */
+static int update_mqd_tonga(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+{
+ return __update_mqd(mm, mqd, q, MTYPE_UC, 0);
+}
+
static int destroy_mqd(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type,
unsigned int timeout, uint32_t pipe_id,
@@ -231,6 +321,111 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
return retval;
}
+/*
+ * Allocate and initialise an SDMA MQD.
+ *
+ * @mm:          MQD manager owning the device and the update_mqd hook
+ * @mqd:         receives the CPU pointer of the new, zeroed MQD
+ * @mqd_mem_obj: receives the memory object backing the MQD
+ * @gart_addr:   if not NULL, receives the memory object's gpu_addr
+ * @q:           queue properties handed to mm->update_mqd()
+ *
+ * Returns -ENOMEM when kfd_gtt_sa_allocate() fails, otherwise the result of
+ * mm->update_mqd().
+ */
+static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+		struct queue_properties *q)
+{
+	struct vi_sdma_mqd *sdma_mqd;
+
+	BUG_ON(!mm || !mqd || !mqd_mem_obj);
+
+	if (kfd_gtt_sa_allocate(mm->dev, sizeof(*sdma_mqd), mqd_mem_obj) != 0)
+		return -ENOMEM;
+
+	sdma_mqd = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
+	memset(sdma_mqd, 0, sizeof(*sdma_mqd));
+
+	*mqd = sdma_mqd;
+	if (gart_addr != NULL)
+		*gart_addr = (*mqd_mem_obj)->gpu_addr;
+
+	/* Fill in the freshly zeroed MQD from the queue properties. */
+	return mm->update_mqd(mm, sdma_mqd, q);
+}
+
+/* Free the memory object backing an SDMA MQD via kfd_gtt_sa_free(). */
+static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		struct kfd_mem_obj *mqd_mem_obj)
+{
+	BUG_ON(!(mm && mqd));
+
+	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+}
+
+/*
+ * Load an SDMA MQD through the kfd2kgd hqd_sdma_load() hook.
+ *
+ * @pipe_id, @queue_id, @wptr and @page_table_base are not used here; they
+ * exist only to match the load_mqd callback signature.
+ */
+static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		uint32_t pipe_id, uint32_t queue_id,
+		uint32_t __user *wptr, uint32_t page_table_base)
+{
+	struct kfd_dev *dev = mm->dev;
+
+	return dev->kfd2kgd->hqd_sdma_load(dev->kgd, mqd);
+}
+
+/*
+ * Refresh the SDMA MQD @mqd from the queue properties @q.
+ *
+ * Rewrites the ring buffer control, base, read-pointer address, doorbell,
+ * virtual address and engine/queue id fields, then recomputes q->is_active.
+ * The queue counts as active only when it has a non-zero size, address and
+ * percentage and is not evicted; only then is the RB_ENABLE bit set.
+ *
+ * Always returns 0.
+ */
+static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		struct queue_properties *q)
+{
+	struct vi_sdma_mqd *sdma_mqd;
+	int rb_size_idx;
+	bool active;
+
+	BUG_ON(!mm || !mqd || !q);
+
+	sdma_mqd = get_sdma_mqd(mqd);
+
+	/*
+	 * ffs() is 1-based, so subtracting one yields the index of the lowest
+	 * set bit of the queue size expressed in dwords.
+	 */
+	rb_size_idx = ffs(q->queue_size / sizeof(unsigned int)) - 1;
+	sdma_mqd->sdmax_rlcx_rb_cntl =
+		rb_size_idx << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
+		q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
+		1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
+		6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+
+	sdma_mqd->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
+	sdma_mqd->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
+	sdma_mqd->sdmax_rlcx_rb_rptr_addr_lo =
+		lower_32_bits((uint64_t)q->read_ptr);
+	sdma_mqd->sdmax_rlcx_rb_rptr_addr_hi =
+		upper_32_bits((uint64_t)q->read_ptr);
+	sdma_mqd->sdmax_rlcx_doorbell =
+		q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT |
+		1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT;
+
+	sdma_mqd->sdmax_rlcx_virtual_addr = q->sdma_vm_addr;
+
+	sdma_mqd->sdma_engine_id = q->sdma_engine_id;
+	sdma_mqd->sdma_queue_id = q->sdma_queue_id;
+
+	active = q->queue_size > 0 &&
+		 q->queue_address != 0 &&
+		 q->queue_percent > 0 &&
+		 !q->is_evicted;
+	if (active)
+		sdma_mqd->sdmax_rlcx_rb_cntl |=
+			1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT;
+	q->is_active = active;
+
+	return 0;
+}
+
+/*
+ * Preempt an SDMA queue through the kfd2kgd hqd_sdma_destroy() hook.
+ *
+ * The preempt type is ignored because there is only one way to preempt an
+ * SDMA queue. @pipe_id and @queue_id are not used either.
+ */
+static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		enum kfd_preempt_type type,
+		unsigned int timeout, uint32_t pipe_id,
+		uint32_t queue_id)
+{
+	struct kfd_dev *dev = mm->dev;
+
+	return dev->kfd2kgd->hqd_sdma_destroy(dev->kgd, mqd, timeout);
+}
+
+/*
+ * Report the result of the kfd2kgd hqd_sdma_is_occupied() hook for @mqd.
+ *
+ * @queue_address, @pipe_id and @queue_id are not used here; they exist only
+ * to match the is_occupied callback signature.
+ */
+static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
+		uint64_t queue_address, uint32_t pipe_id,
+		uint32_t queue_id)
+{
+	struct kfd_dev *dev = mm->dev;
+
+	return dev->kfd2kgd->hqd_sdma_is_occupied(dev->kgd, mqd);
+}
+
+
+
struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
struct kfd_dev *dev)
{
@@ -266,6 +461,12 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
mqd->is_occupied = is_occupied;
break;
case KFD_MQD_TYPE_SDMA:
+ mqd->init_mqd = init_mqd_sdma;
+ mqd->uninit_mqd = uninit_mqd_sdma;
+ mqd->load_mqd = load_mqd_sdma;
+ mqd->update_mqd = update_mqd_sdma;
+ mqd->destroy_mqd = destroy_mqd_sdma;
+ mqd->is_occupied = is_occupied_sdma;
break;
default:
kfree(mqd);
@@ -274,3 +475,17 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
return mqd;
}
+
+/*
+ * Tonga variant of mqd_manager_init_vi(): builds the regular VI manager
+ * and, for the CP and COMPUTE MQD types, installs update_mqd_tonga as the
+ * update_mqd hook. Returns NULL when mqd_manager_init_vi() does.
+ */
+struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
+			struct kfd_dev *dev)
+{
+	struct mqd_manager *mgr = mqd_manager_init_vi(type, dev);
+
+	if (mgr == NULL)
+		return NULL;
+
+	switch (type) {
+	case KFD_MQD_TYPE_CP:
+	case KFD_MQD_TYPE_COMPUTE:
+		mgr->update_mqd = update_mqd_tonga;
+		break;
+	default:
+		/* every other type keeps the hook chosen by the VI init */
+		break;
+	}
+
+	return mgr;
+}
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index ca8c09326b31..c5356ebde005 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -57,26 +57,37 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
{
unsigned int process_count, queue_count;
unsigned int map_queue_size;
+ unsigned int max_proc_per_quantum = 1;
- BUG_ON(!pm || !rlib_size || !over_subscription);
+ struct kfd_dev *dev = pm->dqm->dev;
+
+ BUG_ON(!pm || !rlib_size || !over_subscription || !dev);
process_count = pm->dqm->processes_count;
queue_count = pm->dqm->queue_count;
- /* check if there is over subscription*/
+ /* check if there is over subscription
+ * Note: the arbitration between the number of VMIDs and
+ * hws_max_conc_proc has been done in
+ * kgd2kfd_device_init().
+ */
+
*over_subscription = false;
- if ((process_count > 1) ||
+
+ if (dev->max_proc_per_quantum > 1)
+ max_proc_per_quantum = dev->max_proc_per_quantum;
+
+ if ((process_count > max_proc_per_quantum) ||
queue_count > PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE) {
*over_subscription = true;
pr_debug("kfd: over subscribed runlist\n");
}
- map_queue_size =
- (pm->dqm->dev->device_info->asic_family == CHIP_CARRIZO) ?
+ map_queue_size = KFD_IS_VI(pm->dqm->dev->device_info->asic_family) ?
sizeof(struct pm4_mes_map_queues) :
sizeof(struct pm4_map_queues);
/* calculate run list ib allocation size */
- *rlib_size = process_count * sizeof(struct pm4_map_process) +
+ *rlib_size = process_count * pm->pmf->get_map_process_packet_size() +
queue_count * map_queue_size;
/*
@@ -103,11 +114,14 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
+ mutex_lock(&pm->lock);
+
retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size,
&pm->ib_buffer_obj);
if (retval != 0) {
pr_err("kfd: failed to allocate runlist IB\n");
+ mutex_unlock(&pm->lock);
return retval;
}
@@ -116,6 +130,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
memset(*rl_buffer, 0, *rl_buffer_size);
pm->allocated = true;
+
+ mutex_unlock(&pm->lock);
return retval;
}
@@ -123,9 +139,24 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
uint64_t ib, size_t ib_size_in_dwords, bool chain)
{
struct pm4_runlist *packet;
+ int concurrent_proc_cnt = 0;
+ struct kfd_dev *kfd = pm->dqm->dev;
BUG_ON(!pm || !buffer || !ib);
+ /* Determine the number of processes to map together to HW:
+ * it can not exceed the number of VMIDs available to the
+ * scheduler, and it is determined by the smaller of the number
+ * of processes in the runlist and kfd module parameter
+ * hws_max_conc_proc.
+ * Note: the arbitration between the number of VMIDs and
+ * hws_max_conc_proc has been done in
+ * kgd2kfd_device_init().
+ */
+ concurrent_proc_cnt = min(pm->dqm->processes_count,
+ kfd->max_proc_per_quantum);
+
+
packet = (struct pm4_runlist *)buffer;
memset(buffer, 0, sizeof(struct pm4_runlist));
@@ -136,6 +167,7 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
packet->bitfields4.chain = chain ? 1 : 0;
packet->bitfields4.offload_polling = 0;
packet->bitfields4.valid = 1;
+ packet->bitfields4.process_cnt = concurrent_proc_cnt;
packet->ordinal2 = lower_32_bits(ib);
packet->bitfields3.ib_base_hi = upper_32_bits(ib);
@@ -182,6 +214,90 @@ static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer,
return 0;
}
+/*
+ * Build a MAP_PROCESS packet in the struct pm4_map_process_scratch_kv
+ * layout at @buffer for the process behind @qpd.
+ *
+ * The packet is zeroed first and then filled from @qpd. num_queues is the
+ * length of qpd->queues_list, or 0 when qpd->is_debug is set. The
+ * completion-signal words are left at zero.
+ *
+ * Always returns 0.
+ */
+static int pm_create_map_process_scratch_kv(struct packet_manager *pm,
+		uint32_t *buffer, struct qcm_process_device *qpd)
+{
+	struct pm4_map_process_scratch_kv *pkt;
+	struct queue *entry;
+	uint32_t queue_total = 0;
+
+	BUG_ON(!pm || !buffer || !qpd);
+
+	pr_debug("kfd: In func %s\n", __func__);
+
+	pkt = (struct pm4_map_process_scratch_kv *)buffer;
+	memset(pkt, 0, sizeof(*pkt));
+
+	/* PM4 type-3 header */
+	pkt->header.u32all = build_pm4_header(IT_MAP_PROCESS, sizeof(*pkt));
+
+	/* Process identity and page table; fields not set here stay zero. */
+	if (qpd->is_debug)
+		pkt->bitfields2.diq_enable = 1;
+	pkt->bitfields2.process_quantum = 1;
+	pkt->bitfields2.pasid = qpd->pqm->process->pasid;
+	pkt->bitfields3.page_table_base = qpd->page_table_base;
+
+	/* GDS/GWS/OAC resources and the queue count */
+	pkt->bitfields14.gds_size = qpd->gds_size;
+	pkt->bitfields14.num_gws = qpd->num_gws;
+	pkt->bitfields14.num_oac = qpd->num_oac;
+	list_for_each_entry(entry, &qpd->queues_list, list)
+		queue_total++;
+	if (!qpd->is_debug)
+		pkt->bitfields14.num_queues = queue_total;
+
+	/* Shader memory configuration, copied verbatim from the qpd */
+	pkt->sh_mem_config = qpd->sh_mem_config;
+	pkt->sh_mem_bases = qpd->sh_mem_bases;
+	pkt->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
+	pkt->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
+
+	pkt->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base;
+
+	pkt->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+	pkt->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+
+	return 0;
+}
+
+/*
+ * Build a MAP_PROCESS packet in the struct pm4_map_process_scratch layout
+ * at @buffer for the process behind @qpd.
+ *
+ * The packet is zeroed first and then filled from @qpd. num_queues is the
+ * length of qpd->queues_list, or 0 when qpd->is_debug is set. The
+ * completion-signal words are left at zero.
+ *
+ * Always returns 0.
+ */
+static int pm_create_map_process_scratch(struct packet_manager *pm,
+		uint32_t *buffer, struct qcm_process_device *qpd)
+{
+	struct pm4_map_process_scratch *pkt;
+	struct queue *entry;
+	uint32_t queue_total = 0;
+
+	BUG_ON(!pm || !buffer || !qpd);
+
+	pr_debug("kfd: In func %s\n", __func__);
+
+	pkt = (struct pm4_map_process_scratch *)buffer;
+	memset(pkt, 0, sizeof(*pkt));
+
+	/* PM4 type-3 header */
+	pkt->header.u32all = build_pm4_header(IT_MAP_PROCESS, sizeof(*pkt));
+
+	/* Process identity and page table; fields not set here stay zero. */
+	if (qpd->is_debug)
+		pkt->bitfields2.diq_enable = 1;
+	pkt->bitfields2.process_quantum = 1;
+	pkt->bitfields2.pasid = qpd->pqm->process->pasid;
+	pkt->bitfields3.page_table_base = qpd->page_table_base;
+
+	/* GDS/GWS/OAC resources and the queue count */
+	pkt->bitfields10.gds_size = qpd->gds_size;
+	pkt->bitfields10.num_gws = qpd->num_gws;
+	pkt->bitfields10.num_oac = qpd->num_oac;
+	list_for_each_entry(entry, &qpd->queues_list, list)
+		queue_total++;
+	if (!qpd->is_debug)
+		pkt->bitfields10.num_queues = queue_total;
+
+	/* Shader memory configuration, copied verbatim from the qpd */
+	pkt->sh_mem_config = qpd->sh_mem_config;
+	pkt->sh_mem_bases = qpd->sh_mem_bases;
+	pkt->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
+	pkt->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
+
+	pkt->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base;
+
+	pkt->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+	pkt->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+
+	return 0;
+}
+
static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer,
struct queue *q, bool is_static)
{
@@ -219,7 +335,7 @@ static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer,
queue_type__mes_map_queues__debug_interface_queue_vi;
break;
case KFD_QUEUE_TYPE_SDMA:
- packet->bitfields2.engine_sel =
+ packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
engine_sel__mes_map_queues__sdma0_vi;
use_static = false; /* no static queues under SDMA */
break;
@@ -279,7 +395,7 @@ static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer,
engine_sel__mes_map_queues__compute;
break;
case KFD_QUEUE_TYPE_SDMA:
- packet->bitfields2.engine_sel =
+ packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
engine_sel__mes_map_queues__sdma0;
use_static = false; /* no static queues under SDMA */
break;
@@ -348,12 +464,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
return -ENOMEM;
}
- retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd);
+ retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd);
if (retval != 0)
return retval;
proccesses_mapped++;
- inc_wptr(&rl_wptr, sizeof(struct pm4_map_process),
+ inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(),
alloc_size_bytes);
list_for_each_entry(kq, &qpd->priv_queue_list, list) {
@@ -363,8 +479,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
pr_debug("kfd: static_queue, mapping kernel q %d, is debug status %d\n",
kq->queue->queue, qpd->is_debug);
- if (pm->dqm->dev->device_info->asic_family ==
- CHIP_CARRIZO)
+ if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family))
retval = pm_create_map_queue_vi(pm,
&rl_buffer[rl_wptr],
kq->queue,
@@ -389,8 +504,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
pr_debug("kfd: static_queue, mapping user queue %d, is debug status %d\n",
q->queue, qpd->is_debug);
- if (pm->dqm->dev->device_info->asic_family ==
- CHIP_CARRIZO)
+ if (KFD_IS_VI(pm->dqm->dev->device_info->asic_family))
retval = pm_create_map_queue_vi(pm,
&rl_buffer[rl_wptr],
q,
@@ -423,7 +537,23 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
return 0;
}
-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
+/* Size in bytes of the struct pm4_map_process packet. */
+static int get_map_process_packet_size(void)
+{
+	const int packet_bytes = sizeof(struct pm4_map_process);
+
+	return packet_bytes;
+}
+
+/* Size in bytes of the struct pm4_map_process_scratch_kv packet. */
+static int get_map_process_packet_size_scratch_kv(void)
+{
+	const int packet_bytes = sizeof(struct pm4_map_process_scratch_kv);
+
+	return packet_bytes;
+}
+
+/* Size in bytes of the struct pm4_map_process_scratch packet. */
+static int get_map_process_packet_size_scratch(void)
+{
+	const int packet_bytes = sizeof(struct pm4_map_process_scratch);
+
+	return packet_bytes;
+}
+
+int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
+ uint16_t fw_ver)
{
BUG_ON(!dqm);
@@ -434,8 +564,37 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
mutex_destroy(&pm->lock);
return -ENOMEM;
}
+ pm->pmf = kzalloc(sizeof(struct packet_manager_firmware), GFP_KERNEL);
pm->allocated = false;
+ switch (pm->dqm->dev->device_info->asic_family) {
+ case CHIP_KAVERI:
+ if (fw_ver >= KFD_SCRATCH_KV_FW_VER) {
+ pm->pmf->map_process = pm_create_map_process_scratch_kv;
+ pm->pmf->get_map_process_packet_size =
+ get_map_process_packet_size_scratch_kv;
+ } else {
+ pm->pmf->map_process = pm_create_map_process;
+ pm->pmf->get_map_process_packet_size =
+ get_map_process_packet_size;
+ }
+ break;
+ case CHIP_CARRIZO:
+ case CHIP_TONGA:
+ case CHIP_FIJI:
+ if (fw_ver >= KFD_SCRATCH_CZ_FW_VER) {
+ pm->pmf->map_process = pm_create_map_process_scratch;
+ pm->pmf->get_map_process_packet_size =
+ get_map_process_packet_size_scratch;
+ } else {
+ pm->pmf->map_process = pm_create_map_process;
+ pm->pmf->get_map_process_packet_size =
+ get_map_process_packet_size;
+ }
+ break;
+
+ }
+
return 0;
}
@@ -445,6 +604,7 @@ void pm_uninit(struct packet_manager *pm)
mutex_destroy(&pm->lock);
kernel_queue_uninit(pm->priv_queue);
+ kfree(pm->pmf);
}
int pm_send_set_resources(struct packet_manager *pm,
@@ -577,7 +737,7 @@ fail_acquire_packet_buffer:
}
int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
- enum kfd_preempt_type_filter mode,
+ enum kfd_unmap_queues_filter filter,
uint32_t filter_param, bool reset,
unsigned int sdma_engine)
{
@@ -597,8 +757,8 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
packet = (struct pm4_unmap_queues *)buffer;
memset(buffer, 0, sizeof(struct pm4_unmap_queues));
- pr_debug("kfd: static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n",
- mode, reset, type);
+ pr_debug("kfd: static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n",
+ filter, reset, type);
packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES,
sizeof(struct pm4_unmap_queues));
switch (type) {
@@ -623,26 +783,26 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
packet->bitfields2.action =
action__mes_unmap_queues__preempt_queues;
- switch (mode) {
- case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE:
+ switch (filter) {
+ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
packet->bitfields2.queue_sel =
queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
packet->bitfields2.num_queues = 1;
packet->bitfields3b.doorbell_offset0 = filter_param;
break;
- case KFD_PREEMPT_TYPE_FILTER_BY_PASID:
+ case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
packet->bitfields2.queue_sel =
queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
packet->bitfields3a.pasid = filter_param;
break;
- case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES:
+ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
packet->bitfields2.queue_sel =
queue_sel__mes_unmap_queues__perform_request_on_all_active_queues;
break;
- case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES:
+ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
/* in this case, we do not preempt static queues */
- packet->bitfields2.queue_sel =
- queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only;
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only;
break;
default:
BUG();
@@ -670,3 +830,4 @@ void pm_release_ib(struct packet_manager *pm)
}
mutex_unlock(&pm->lock);
}
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h
index 5b393f3e34a9..e7570ccdc5ad 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h
@@ -127,7 +127,8 @@ struct pm4_runlist {
uint32_t offload_polling:1;
uint32_t reserved3:1;
uint32_t valid:1;
- uint32_t reserved4:8;
+ uint32_t process_cnt:4;
+ uint32_t reserved4:4;
} bitfields4;
uint32_t ordinal4;
};
@@ -186,6 +187,123 @@ struct pm4_map_process {
};
#endif
+/*--------------------MES_MAP_PROCESS_SCRATCH-------------------- */
+
+#ifndef PM4_MES_MAP_PROCESS_SCRATCH_DEFINED
+#define PM4_MES_MAP_PROCESS_SCRATCH_DEFINED
+
+/*
+ * MAP_PROCESS packet layout that carries the hidden private (scratch) base.
+ *
+ * Filled in by pm_create_map_process_scratch(); pm_init() selects it for
+ * CHIP_CARRIZO, CHIP_TONGA and CHIP_FIJI when the firmware version is at
+ * least KFD_SCRATCH_CZ_FW_VER.
+ *
+ * Every member is one 32-bit word (assuming union PM4_MES_TYPE_3_HEADER is
+ * 32 bits wide - TODO confirm), 16 words in total. Each bitfield group adds
+ * up to 32 bits.
+ */
+struct pm4_map_process_scratch {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /* header */
+ uint32_t ordinal1;
+ };
+
+ union {
+ struct {
+ uint32_t pasid:16;
+ uint32_t reserved1:8;
+ uint32_t diq_enable:1;
+ uint32_t process_quantum:7;
+ } bitfields2;
+ uint32_t ordinal2;
+ };
+
+ union {
+ struct {
+ uint32_t page_table_base:28;
+ uint32_t reserved3:4;
+ } bitfields3;
+ uint32_t ordinal3;
+ };
+
+ uint32_t reserved;
+
+ /* Copied from the matching qcm_process_device fields by the builder. */
+ uint32_t sh_mem_bases;
+ uint32_t sh_mem_config;
+ uint32_t sh_mem_ape1_base;
+ uint32_t sh_mem_ape1_limit;
+
+ /* Written from qpd->sh_hidden_private_base by the builder. */
+ uint32_t sh_hidden_private_base_vmid;
+
+ uint32_t reserved2;
+ uint32_t reserved3;
+
+ /* Low/high halves of qpd->gds_context_area. */
+ uint32_t gds_addr_lo;
+ uint32_t gds_addr_hi;
+
+ /*
+ * NOTE(review): named bitfields10/ordinal10, but in declaration order
+ * this is the 14th word of the struct (the _kv variant calls the same
+ * position bitfields14).
+ */
+ union {
+ struct {
+ uint32_t num_gws:6;
+ uint32_t reserved4:2;
+ uint32_t num_oac:4;
+ uint32_t reserved5:4;
+ uint32_t gds_size:6;
+ uint32_t num_queues:10;
+ } bitfields10;
+ uint32_t ordinal10;
+ };
+
+ /* Not written by pm_create_map_process_scratch(); left zero by its memset. */
+ uint32_t completion_signal_lo;
+ uint32_t completion_signal_hi;
+
+};
+#endif
+
+#ifndef PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH
+#define PM4_MES_MAP_PROCESS_DEFINED_KV_SCRATCH
+
+/*
+ * Kaveri MAP_PROCESS packet layout that carries the hidden private (scratch)
+ * base.
+ *
+ * Filled in by pm_create_map_process_scratch_kv(); pm_init() selects it for
+ * CHIP_KAVERI when the firmware version is at least KFD_SCRATCH_KV_FW_VER.
+ *
+ * Every member is one 32-bit word (assuming union PM4_MES_TYPE_3_HEADER is
+ * 32 bits wide - TODO confirm), 16 words in total. Each bitfield group adds
+ * up to 32 bits.
+ */
+struct pm4_map_process_scratch_kv {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /* header */
+ uint32_t ordinal1;
+ };
+
+ union {
+ struct {
+ uint32_t pasid:16;
+ uint32_t reserved1:8;
+ uint32_t diq_enable:1;
+ uint32_t process_quantum:7;
+ } bitfields2;
+ uint32_t ordinal2;
+ };
+
+ union {
+ struct {
+ uint32_t page_table_base:28;
+ uint32_t reserved2:4;
+ } bitfields3;
+ uint32_t ordinal3;
+ };
+
+ uint32_t reserved3;
+ /* Copied from the matching qcm_process_device fields by the builder. */
+ uint32_t sh_mem_bases;
+ uint32_t sh_mem_config;
+ uint32_t sh_mem_ape1_base;
+ uint32_t sh_mem_ape1_limit;
+ /* Written from qpd->sh_hidden_private_base by the builder. */
+ uint32_t sh_hidden_private_base_vmid;
+ uint32_t reserved4;
+ uint32_t reserved5;
+ /* Low/high halves of qpd->gds_context_area. */
+ uint32_t gds_addr_lo;
+ uint32_t gds_addr_hi;
+
+ /* 14th word of the struct in declaration order. */
+ union {
+ struct {
+ uint32_t num_gws:6;
+ uint32_t reserved6:2;
+ uint32_t num_oac:4;
+ uint32_t reserved7:4;
+ uint32_t gds_size:6;
+ uint32_t num_queues:10;
+ } bitfields14;
+ uint32_t ordinal14;
+ };
+
+ /* Not written by pm_create_map_process_scratch_kv(); left zero by its memset. */
+ uint32_t completion_signal_lo32;
+uint32_t completion_signal_hi32;
+};
+#endif
+
/*--------------------MES_MAP_QUEUES--------------------*/
#ifndef PM4_MES_MAP_QUEUES_DEFINED
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 80113c335966..92bba461e1e0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -30,13 +30,45 @@
#include <linux/atomic.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
+#include <linux/idr.h>
#include <linux/kfd_ioctl.h>
+#include <linux/pid.h>
+#include <linux/interval_tree.h>
#include <kgd_kfd_interface.h>
+#include "amd_rdma.h"
+
#define KFD_SYSFS_FILE_MODE 0444
-#define KFD_MMAP_DOORBELL_MASK 0x8000000000000
-#define KFD_MMAP_EVENTS_MASK 0x4000000000000
+/* GPU ID hash width in bits */
+#define KFD_GPU_ID_HASH_WIDTH 16
+
+/* Use upper bits of mmap offset to store KFD driver specific information.
+ * BITS[63:62] - Encode MMAP type
+ * BITS[61:46] - Encode gpu_id. Identifies the GPU the offset belongs to
+ * BITS[45:40] - Reserved. Not Used.
+ * BITS[39:0]  - MMAP offset value. Used by TTM.
+ *
+ * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
+ * defines are reduced by PAGE_SHIFT, i.e. they apply to a page offset and
+ * not to a byte offset.
+ *
+ * Macro arguments are parenthesized so that compound expressions such as
+ * "a | b" can be passed without changing the meaning of the expansion.
+ */
+#define KFD_MMAP_TYPE_SHIFT	(62 - PAGE_SHIFT)
+#define KFD_MMAP_TYPE_MASK	(0x3ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_DOORBELL	(0x3ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_EVENTS	(0x2ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_MAP_BO	(0x1ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_RESERVED_MEM	(0x0ULL << KFD_MMAP_TYPE_SHIFT)
+
+#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)
+#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
+				<< KFD_MMAP_GPU_ID_SHIFT)
+/* Place a gpu_id into its field of a page offset. */
+#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)(gpu_id)) << KFD_MMAP_GPU_ID_SHIFT)\
+				& KFD_MMAP_GPU_ID_MASK)
+/* Extract the gpu_id field from a page offset. */
+#define KFD_MMAP_GPU_ID_GET(offset)    (((offset) & KFD_MMAP_GPU_ID_MASK) \
+				>> KFD_MMAP_GPU_ID_SHIFT)
+
+#define KFD_MMAP_OFFSET_VALUE_MASK	(0xFFFFFFFFFFULL >> PAGE_SHIFT)
+/* Keep only the offset-value bits of a page offset. */
+#define KFD_MMAP_OFFSET_VALUE_GET(offset) ((offset) & KFD_MMAP_OFFSET_VALUE_MASK)
/*
* When working with cp scheduler we should assign the HIQ manually or via
@@ -48,8 +80,6 @@
#define KFD_CIK_HIQ_PIPE 4
#define KFD_CIK_HIQ_QUEUE 0
-/* GPU ID hash width in bits */
-#define KFD_GPU_ID_HASH_WIDTH 16
/* Macro for allocating structures */
#define kfd_alloc_struct(ptr_to_struct) \
@@ -74,12 +104,26 @@ extern int max_num_of_queues_per_device;
/* Kernel module parameter to specify the scheduling policy */
extern int sched_policy;
+extern int cwsr_enable;
+
+/*
+ * Kernel module parameter to specify the maximum process
+ * number per HW scheduler
+ */
+extern int hws_max_conc_proc;
+
/*
* Kernel module parameter to specify whether to send sigterm to HSA process on
* unhandled exception
*/
extern int send_sigterm;
+/*
+ * Kernel module parameter used to simulate a large-BAR machine on machines
+ * that are not large-BAR enabled.
+ */
+extern int debug_largebar;
+
/**
* enum kfd_sched_policy
*
@@ -114,14 +158,17 @@ enum cache_policy {
enum asic_family_type {
CHIP_KAVERI = 0,
- CHIP_CARRIZO
+ CHIP_CARRIZO,
+ CHIP_TONGA,
+ CHIP_FIJI
};
+#define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_FIJI)
+#define KFD_IS_DGPU(chip) ((chip) >= CHIP_TONGA && (chip) <= CHIP_FIJI)
+
struct kfd_event_interrupt_class {
- bool (*interrupt_isr)(struct kfd_dev *dev,
- const uint32_t *ih_ring_entry);
- void (*interrupt_wq)(struct kfd_dev *dev,
- const uint32_t *ih_ring_entry);
+ bool (*interrupt_isr)(struct kfd_dev *dev, const uint32_t *ih_ring_entry);
+ void (*interrupt_wq)(struct kfd_dev *dev, const uint32_t *ih_ring_entry);
};
struct kfd_device_info {
@@ -132,6 +179,7 @@ struct kfd_device_info {
size_t ih_ring_entry_size;
uint8_t num_of_watch_points;
uint16_t mqd_size_aligned;
+ bool is_need_iommu_device;
};
struct kfd_mem_obj {
@@ -141,6 +189,12 @@ struct kfd_mem_obj {
uint32_t *cpu_ptr;
};
+/*
+ * VMID bookkeeping for one device; embedded in struct kfd_dev as vm_info.
+ * NOTE(review): the per-field notes are inferred from the field names only;
+ * confirm against the code that fills this structure.
+ */
+struct kfd_vmid_info {
+ uint32_t first_vmid_kfd; /* presumably the first VMID usable by KFD */
+ uint32_t last_vmid_kfd; /* presumably the last VMID usable by KFD */
+ uint32_t vmid_num_kfd; /* presumably the number of VMIDs usable by KFD */
+};
+
struct kfd_dev {
struct kgd_dev *kgd;
@@ -165,11 +219,12 @@ struct kfd_dev {
*/
struct kgd2kfd_shared_resources shared_resources;
+ struct kfd_vmid_info vm_info;
const struct kfd2kgd_calls *kfd2kgd;
struct mutex doorbell_mutex;
- DECLARE_BITMAP(doorbell_available_index,
- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
+ unsigned long doorbell_available_index[DIV_ROUND_UP(
+ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)];
void *gtt_mem;
uint64_t gtt_start_gpu_addr;
@@ -179,6 +234,11 @@ struct kfd_dev {
unsigned int gtt_sa_chunk_size;
unsigned int gtt_sa_num_of_chunks;
+ /* QCM Device instance */
+ struct device_queue_manager *dqm;
+
+ bool init_complete;
+
/* Interrupts */
void *interrupt_ring;
size_t interrupt_ring_size;
@@ -187,10 +247,6 @@ struct kfd_dev {
struct work_struct interrupt_work;
spinlock_t interrupt_lock;
- /* QCM Device instance */
- struct device_queue_manager *dqm;
-
- bool init_complete;
/*
* Interrupts of interest to KFD are copied
* from the HW ring into a SW ring.
@@ -198,7 +254,26 @@ struct kfd_dev {
bool interrupts_active;
/* Debug manager */
- struct kfd_dbgmgr *dbgmgr;
+ struct kfd_dbgmgr *dbgmgr;
+
+ /* MEC firmware version*/
+ uint16_t mec_fw_version;
+
+ /* Maximum process number mapped to HW scheduler */
+ unsigned int max_proc_per_quantum;
+
+ /* cwsr */
+ bool cwsr_enabled;
+ struct page *cwsr_pages;
+ uint32_t cwsr_size;
+ uint32_t tma_offset; /*Offset for TMA from the start of cwsr_mem*/
+};
+
+/*
+ * KFD-side record of a buffer object. This is the type returned by
+ * kfd_process_device_find_bo() and passed to run_rdma_free_callback().
+ * NOTE(review): the per-field notes are inferred from names and types;
+ * confirm against kfd_process.c.
+ */
+struct kfd_bo {
+ void *mem; /* opaque memory object; presumably owned by KGD */
+ struct interval_tree_node it; /* presumably linked into kfd_process.bo_interval_tree */
+ struct kfd_dev *dev; /* presumably the device the buffer belongs to */
+ struct list_head cb_data_head; /* presumably the list of RDMA free-callback entries */
 };
/* KGD2KFD callbacks */
@@ -221,22 +296,22 @@ void kfd_chardev_exit(void);
struct device *kfd_chardev(void);
/**
- * enum kfd_preempt_type_filter
+ * enum kfd_unmap_queues_filter
*
- * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue.
+ * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue.
*
- * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the
+ * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the
* running queues list.
*
- * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to
+ * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to
* specific process.
*
*/
-enum kfd_preempt_type_filter {
- KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE,
- KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES,
- KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES,
- KFD_PREEMPT_TYPE_FILTER_BY_PASID
+enum kfd_unmap_queues_filter {
+ KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE,
+ KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
+ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
+ KFD_UNMAP_QUEUES_FILTER_BY_PASID
};
enum kfd_preempt_type {
@@ -324,6 +399,7 @@ struct queue_properties {
uint32_t __iomem *doorbell_ptr;
uint32_t doorbell_off;
bool is_interop;
+ bool is_evicted; /* true -> queue is evicted */
bool is_active;
/* Not relevant for user mode queues in cp scheduling */
unsigned int vmid;
@@ -336,6 +412,11 @@ struct queue_properties {
uint32_t eop_ring_buffer_size;
uint64_t ctx_save_restore_area_address;
uint32_t ctx_save_restore_area_size;
+ uint32_t ctl_stack_size;
+ uint64_t tba_addr;
+ uint64_t tma_addr;
+ /* Relevant for CU */
+ uint32_t cu_mask;
};
/**
@@ -424,6 +505,7 @@ struct qcm_process_device {
unsigned int queue_count;
unsigned int vmid;
bool is_debug;
+ unsigned evicted; /* eviction counter, 0=active */
/*
* All the memory management data should be here too
*/
@@ -436,8 +518,22 @@ struct qcm_process_device {
uint32_t gds_size;
uint32_t num_gws;
uint32_t num_oac;
+ uint32_t sh_hidden_private_base;
+
+ /*cwsr memory*/
+ int cwsr_mem_handle;
+ uint64_t cwsr_base;
+ uint64_t tba_addr;
+ uint64_t tma_addr;
+ void *cwsr_kaddr;
};
+/*
+ * 8 byte handle containing GPU ID in the most significant 4 bytes and
+ * idr_handle in the least significant 4 bytes.
+ *
+ * Arguments are parenthesized so that compound expressions (e.g. "a | b"
+ * or a conditional expression) expand with the intended precedence.
+ */
+#define MAKE_HANDLE(gpu_id, idr_handle) \
+	(((uint64_t)(gpu_id) << 32) + (idr_handle))
+#define GET_GPU_ID(handle) ((handle) >> 32)
+#define GET_IDR_HANDLE(handle) ((handle) & 0xFFFFFFFF)
+
/* Data that is per-process-per device. */
struct kfd_process_device {
/*
@@ -449,6 +545,8 @@ struct kfd_process_device {
/* The device that owns this data. */
struct kfd_dev *dev;
+ /* The process that owns this kfd_process_device. */
+ struct kfd_process *process;
/* per-process-per device QCM data structure */
struct qcm_process_device qpd;
@@ -460,10 +558,23 @@ struct kfd_process_device {
uint64_t gpuvm_limit;
uint64_t scratch_base;
uint64_t scratch_limit;
+ uint64_t dgpu_base;
+ uint64_t dgpu_limit;
+ uint64_t mapped_size;
+ uint64_t last_eviction;
+ bool evicted;
+
+ uint64_t sh_hidden_private_base_vmid;
/* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
bool bound;
+ /* VM context for GPUVM allocations */
+ void *vm;
+
+ /* GPUVM allocations storage */
+ struct idr alloc_idr;
+
/* This flag tells if we should reset all
* wavefronts on process termination
*/
@@ -482,7 +593,7 @@ struct kfd_process {
struct mm_struct *mm;
- struct mutex mutex;
+ struct rw_semaphore lock;
/*
* In any process, the thread that started main() is the lead
@@ -513,6 +624,8 @@ struct kfd_process {
/* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */
struct kfd_queue **queues;
+ unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)];
+
/*Is the user space process 32 bit?*/
bool is_32bit_user_mode;
@@ -520,10 +633,12 @@ struct kfd_process {
struct mutex event_mutex;
/* All events in process hashed by ID, linked on kfd_event.events. */
DECLARE_HASHTABLE(events, 4);
- struct list_head signal_event_pages; /* struct slot_page_header.
- event_pages */
+ struct list_head signal_event_pages; /* struct slot_page_header.event_pages */
u32 next_nonsignal_event_id;
size_t signal_event_count;
+ size_t debug_event_count;
+
+ struct rb_root bo_interval_tree;
};
/**
@@ -546,9 +661,10 @@ struct amdkfd_ioctl_desc {
void kfd_process_create_wq(void);
void kfd_process_destroy_wq(void);
-struct kfd_process *kfd_create_process(const struct task_struct *);
+struct kfd_process *kfd_create_process(struct file *filep);
struct kfd_process *kfd_get_process(const struct task_struct *);
struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
+struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
struct kfd_process *p);
@@ -558,6 +674,29 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
struct kfd_process *p);
+int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma);
+
+/* KFD process API for creating and translating handles */
+int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
+ void *mem, uint64_t start,
+ uint64_t length);
+void *kfd_process_device_translate_handle(struct kfd_process_device *p,
+ int handle);
+struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd,
+ int handle);
+void *kfd_process_find_bo_from_interval(struct kfd_process *p,
+ uint64_t start_addr,
+ uint64_t last_addr);
+void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
+ int handle);
+
+void run_rdma_free_callback(struct kfd_bo *buf_obj);
+struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid);
+
+/* kfd dgpu memory */
+int kfd_map_memory_to_gpu(struct kfd_dev *dev, void *mem,
+ struct kfd_process *p, struct kfd_process_device *pdd);
+
/* Process device data iterator */
struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p);
struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p,
@@ -600,7 +739,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu);
int kfd_topology_remove_device(struct kfd_dev *gpu);
struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
-struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx);
+struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd);
+uint32_t kfd_get_gpu_id(struct kfd_dev *dev);
+int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev);
+int kfd_numa_node_to_apic_id(int numa_node_id);
+int kfd_get_proximity_domain(const struct pci_bus *bus);
/* Interrupts */
int kfd_interrupt_init(struct kfd_dev *dev);
@@ -615,11 +758,13 @@ int kgd2kfd_resume(struct kfd_dev *kfd);
/* amdkfd Apertures */
int kfd_init_apertures(struct kfd_process *process);
+int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
+ uint64_t base, uint64_t limit);
/* Queue Context Management */
struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd);
-int init_queue(struct queue **q, struct queue_properties properties);
+int init_queue(struct queue **q, const struct queue_properties *properties);
void uninit_queue(struct queue *q);
void print_queue_properties(struct queue_properties *q);
void print_queue(struct queue *q);
@@ -630,11 +775,15 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
+struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev);
struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type);
void kernel_queue_uninit(struct kernel_queue *kq);
+int kfd_process_vm_fault(struct device_queue_manager *dqm,
+ unsigned int pasid);
/* Process Queue Manager */
struct process_queue_node {
@@ -649,18 +798,16 @@ int pqm_create_queue(struct process_queue_manager *pqm,
struct kfd_dev *dev,
struct file *f,
struct queue_properties *properties,
- unsigned int flags,
- enum kfd_queue_type type,
unsigned int *qid);
int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid);
int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid,
struct queue_properties *p);
+int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
+ struct queue_properties *p);
struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
unsigned int qid);
-
-int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
- unsigned int fence_value,
- unsigned long timeout);
+int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm);
+int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm);
/* Packet Manager */
@@ -668,7 +815,9 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
#define KFD_FENCE_COMPLETED (100)
#define KFD_FENCE_INIT (10)
-#define KFD_UNMAP_LATENCY (150)
+#define KFD_UNMAP_LATENCY (40)
+
+struct packet_manager_firmware;
struct packet_manager {
struct device_queue_manager *dqm;
@@ -676,9 +825,19 @@ struct packet_manager {
struct mutex lock;
bool allocated;
struct kfd_mem_obj *ib_buffer_obj;
+
+ struct packet_manager_firmware *pmf;
};
-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
+/*
+ * Firmware-dependent hooks of the packet manager.
+ *
+ * pm_init() allocates one instance per packet_manager and fills both hooks
+ * according to the ASIC family and the firmware version it is given;
+ * pm_uninit() frees it.
+ */
+struct packet_manager_firmware {
+ /* Support different firmware versions for map process packet */
+ /* Writes one MAP_PROCESS packet for @qpd at @buffer. */
+ int (*map_process)(struct packet_manager *pm, uint32_t *buffer,
+ struct qcm_process_device *qpd);
+ /*
+ * Size in bytes of the packet written by map_process; used to size the
+ * runlist IB and to advance its write pointer.
+ */
+ int (*get_map_process_packet_size)(void);
+};
+
+int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
+ uint16_t fw_ver);
void pm_uninit(struct packet_manager *pm);
int pm_send_set_resources(struct packet_manager *pm,
struct scheduling_resources *res);
@@ -687,7 +846,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
uint32_t fence_value);
int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
- enum kfd_preempt_type_filter mode,
+ enum kfd_unmap_queues_filter mode,
uint32_t filter_param, bool reset,
unsigned int sdma_engine);
@@ -696,6 +855,9 @@ void pm_release_ib(struct packet_manager *pm);
uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
struct kfd_process *process);
+int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
+ unsigned int fence_value,
+ unsigned long timeout);
/* Events */
extern const struct kfd_event_interrupt_class event_interrupt_class_cik;
@@ -714,8 +876,7 @@ int kfd_wait_on_events(struct kfd_process *p,
uint32_t num_events, void __user *data,
bool all, uint32_t user_timeout_ms,
enum kfd_event_wait_result *wait_result);
-void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
- uint32_t valid_id_bits);
+void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits);
void kfd_signal_iommu_event(struct kfd_dev *dev,
unsigned int pasid, unsigned long address,
bool is_write_requested, bool is_execute_requested);
@@ -723,11 +884,28 @@ void kfd_signal_hw_exception_event(unsigned int pasid);
int kfd_set_event(struct kfd_process *p, uint32_t event_id);
int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
int kfd_event_create(struct file *devkfd, struct kfd_process *p,
- uint32_t event_type, bool auto_reset, uint32_t node_id,
- uint32_t *event_id, uint32_t *event_trigger_data,
- uint64_t *event_page_offset, uint32_t *event_slot_index);
+ uint32_t event_type, bool auto_reset, uint32_t node_id,
+ uint32_t *event_id, uint32_t *event_trigger_data,
+ uint64_t *event_page_offset, uint32_t *event_slot_index,
+ void *kern_addr);
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
+void kfd_free_signal_page_dgpu(struct kfd_process *p, uint64_t handle);
+
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+ struct kfd_vm_fault_info *info);
+
+void radeon_flush_tlb(struct kfd_dev *dev, uint32_t pasid);
int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
+int kgd2kfd_evict_bo(struct kfd_dev *dev, void *mem);
+int kgd2kfd_restore(struct kfd_dev *kfd);
+int evict_size(struct kfd_process *p, int size, int type);
+int evict_bo(struct kfd_dev *dev, void *mem);
+int restore(struct kfd_dev *kfd);
+
+#define KFD_SCRATCH_CZ_FW_VER 600
+#define KFD_SCRATCH_KV_FW_VER 413
+#define KFD_MULTI_PROC_MAPPING_HWS_SUPPORT 600
+#define KFD_CWSR_CZ_FW_VER 625
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 4f3849ac8c07..dfd2e0d4f544 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -27,6 +27,10 @@
#include <linux/amd-iommu.h>
#include <linux/notifier.h>
#include <linux/compat.h>
+#include <linux/mm.h>
+#include <asm/tlb.h>
+#include <linux/highmem.h>
+#include <uapi/asm-generic/mman-common.h>
struct mm_struct;
@@ -40,6 +44,7 @@ struct mm_struct;
*/
#define INITIAL_QUEUE_ARRAY_SIZE 16
+static int evict_pdd(struct kfd_process_device *pdd);
/*
* List of struct kfd_process (field kfd_process).
* Unique/indexed by mm_struct*
@@ -57,8 +62,14 @@ struct kfd_process_release_work {
struct kfd_process *p;
};
-static struct kfd_process *find_process(const struct task_struct *thread);
+#define MIN_IDR_ID 1
+#define MAX_IDR_ID 0 /*0 - for unlimited*/
+
+static struct kfd_process *find_process(const struct task_struct *thread,
+ bool lock);
static struct kfd_process *create_process(const struct task_struct *thread);
+static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep);
+
void kfd_process_create_wq(void)
{
@@ -74,10 +85,12 @@ void kfd_process_destroy_wq(void)
}
}
-struct kfd_process *kfd_create_process(const struct task_struct *thread)
+struct kfd_process *kfd_create_process(struct file *filep)
{
struct kfd_process *process;
+ struct task_struct *thread = current;
+
BUG_ON(!kfd_process_wq);
if (thread->mm == NULL)
@@ -98,7 +111,7 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread)
mutex_lock(&kfd_processes_mutex);
/* A prior open of /dev/kfd could have already created the process. */
- process = find_process(thread);
+ process = find_process(thread, false);
if (process)
pr_debug("kfd: process already found\n");
@@ -109,6 +122,8 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread)
up_write(&thread->mm->mmap_sem);
+ kfd_process_init_cwsr(process, filep);
+
return process;
}
@@ -123,7 +138,7 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread)
if (thread->group_leader->mm != thread->mm)
return ERR_PTR(-EINVAL);
- process = find_process(thread);
+ process = find_process(thread, false);
return process;
}
@@ -140,23 +155,164 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)
return NULL;
}
-static struct kfd_process *find_process(const struct task_struct *thread)
+static struct kfd_process *find_process(const struct task_struct *thread,
+ bool lock)
{
struct kfd_process *p;
int idx;
idx = srcu_read_lock(&kfd_processes_srcu);
p = find_process_by_mm(thread->mm);
+ if (p && lock)
+ down_read(&p->lock);
srcu_read_unlock(&kfd_processes_srcu, idx);
return p;
}
+/*
+ * kfd_lookup_process_by_pid - find the kfd_process that belongs to @pid
+ * @pid: pid to look up, or NULL to use the calling task
+ *
+ * Returns the process with process->lock read-locked, or NULL when the
+ * task does not exist or has no kfd_process.
+ *
+ * get_pid_task() hands back a referenced task_struct. A reference is
+ * taken on "current" as well so that both paths release it the same way
+ * once the lookup is done.
+ */
+struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid)
+{
+	struct task_struct *task;
+	struct kfd_process *p = NULL;
+
+	if (!pid) {
+		task = current;
+		get_task_struct(task);
+	} else {
+		task = get_pid_task(pid, PIDTYPE_PID);
+	}
+
+	if (task) {
+		p = find_process(task, true);
+		put_task_struct(task);
+	}
+
+	return p;
+}
+
+/*
+ * evict_size - evict the memory of one per-device data set (pdd)
+ * @process: process whose pdds are candidates for eviction
+ * @size:    mapped-size threshold
+ * @type:    EVICT_FIRST_PDD evicts the first pdd whose mapped_size is
+ *           >= @size; EVICT_BIGGEST_PDD evicts the pdd with the largest
+ *           mapped_size, but only if that is strictly larger than @size.
+ *           Any other value evicts nothing.
+ *
+ * Takes process->lock for writing and releases it on every path.
+ * Always returns 0, whether or not anything was evicted.
+ */
+int evict_size(struct kfd_process *process, int size, int type)
+{
+	struct kfd_process_device *pdd, *victim = NULL;
+	int biggest = 0;
+
+	down_write(&process->lock);
+
+	if (type == EVICT_FIRST_PDD) {
+		list_for_each_entry(pdd, &process->per_device_data,
+				per_device_list) {
+			pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n",
+				pdd->dev->id, process->pasid);
+			if (pdd->mapped_size >= size) {
+				victim = pdd;
+				break;
+			}
+		}
+	} else if (type == EVICT_BIGGEST_PDD) {
+		list_for_each_entry(pdd, &process->per_device_data,
+				per_device_list) {
+			pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n",
+				pdd->dev->id, process->pasid);
+			if (pdd->mapped_size >= biggest) {
+				biggest = pdd->mapped_size;
+				victim = pdd;
+			}
+		}
+
+		/* victim is still NULL when the process has no pdd at all */
+		if (victim && victim->mapped_size <= size)
+			victim = NULL;
+	}
+
+	if (victim)
+		evict_pdd(victim);
+
+	/* Single exit: the lock is dropped whether or not we evicted */
+	up_write(&process->lock);
+	return 0;
+}
+
+/*
+ * evict_bo - evict the per-device data that owns buffer object @mem
+ *
+ * Asks KGD which pdd the buffer belongs to and evicts that pdd.
+ * Returns 0 even when no owner is found.
+ */
+int evict_bo(struct kfd_dev *dev, void *mem)
+{
+	struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
+	struct kfd_process_device *owner;
+
+	owner = dev->kfd2kgd->get_pdd_from_buffer_object(dev->kgd, kgd_mem);
+	if (!owner)
+		return 0;
+
+	evict_pdd(owner);
+	return 0;
+}
+
+/*
+ * evict_pdd - unmap every buffer tracked by @pdd from the GPU
+ *
+ * Walks pdd->alloc_idr, unmaps each buffer from the pdd's VM, then
+ * records the eviction (time stamp, mapped_size reset, evicted flag).
+ * No locking is done here. Always returns 0.
+ */
+static int evict_pdd(struct kfd_process_device *pdd)
+{
+	struct kfd_bo *buf_obj;
+	int id;
+
+	/* TODO: evict the process queues first (process_evict_queues) */
+
+	/*
+	 * alloc_idr entries are struct kfd_bo wrappers (they are stored by
+	 * kfd_process_device_create_obj_handle()), so the KGD memory
+	 * object is buf_obj->mem, not the idr entry itself.
+	 */
+	idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) {
+		pdd->dev->kfd2kgd->unmap_memory_to_gpu(
+				pdd->dev->kgd, buf_obj->mem, pdd->vm);
+	}
+	pdd->last_eviction = jiffies;
+	pdd->mapped_size = 0;
+	pdd->evicted = true;
+
+	/* TODO: a TLB flush may be needed here (was: flush_tlb_all()) */
+
+	return 0;
+}
+
+/*
+ * restore - map the buffers of evicted per-device data back to the GPU
+ * @kfd: device the restore was requested for
+ *
+ * Unfinished: there is no way yet to find the process (or processes)
+ * that own evicted memory, so the process pointer is always NULL. The
+ * function returns -EINVAL in that case instead of dereferencing it.
+ *
+ * Returns 0 on success, -EINVAL when no process is available.
+ */
+int restore(struct kfd_dev *kfd)
+{
+	/* TODO still working on how to get the process */
+	struct kfd_process *p = NULL;
+	struct kfd_process_device *pdd;
+	struct kfd_bo *buf_obj;
+	int id;
+
+	if (!p)
+		return -EINVAL;
+
+	/* need to run on all processes*/
+	down_write(&p->lock);
+
+	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+		pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n",
+				pdd->dev->id, p->pasid);
+
+		if (!pdd->evicted)
+			continue;
+
+		/*
+		 * alloc_idr entries are struct kfd_bo wrappers; the KGD
+		 * memory object to map is buf_obj->mem.
+		 */
+		idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) {
+			pdd->dev->kfd2kgd->map_memory_to_gpu(
+					pdd->dev->kgd,
+					buf_obj->mem, pdd->vm);
+		}
+		pdd->last_eviction = 0;
+		pdd->mapped_size = 0;
+		/* Mark as restored so the next call does not remap again */
+		pdd->evicted = false;
+
+		/* TODO: restore the process queues (process_restore_queues) */
+	}
+	up_write(&p->lock);
+	return 0;
+}
+
+/* No process locking is needed in this function, because the process
+ * is not findable any more. We must assume that no other thread is
+ * using it any more, otherwise we couldn't safely free the process
+ * structure in the end. */
static void kfd_process_wq_release(struct work_struct *work)
{
struct kfd_process_release_work *my_work;
- struct kfd_process_device *pdd, *temp;
+ struct kfd_process_device *pdd, *temp, *peer_pdd;
struct kfd_process *p;
+ struct kfd_bo *buf_obj;
+ int id;
my_work = (struct kfd_process_release_work *) work;
@@ -165,19 +321,40 @@ static void kfd_process_wq_release(struct work_struct *work)
pr_debug("Releasing process (pasid %d) in workqueue\n",
p->pasid);
- mutex_lock(&p->mutex);
-
- list_for_each_entry_safe(pdd, temp, &p->per_device_data,
- per_device_list) {
+ list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n",
pdd->dev->id, p->pasid);
- if (pdd->reset_wavefronts)
- dbgdev_wave_reset_wavefronts(pdd->dev, p);
+ if (pdd->dev->device_info->is_need_iommu_device)
+ amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
+
+ /*
+ * Remove all handles from idr and release appropriate
+ * local memory object
+ */
+ idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) {
+ list_for_each_entry(peer_pdd,
+ &p->per_device_data, per_device_list) {
+ pdd->dev->kfd2kgd->unmap_memory_to_gpu(
+ peer_pdd->dev->kgd,
+ buf_obj->mem, peer_pdd->vm);
+ }
- amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
- list_del(&pdd->per_device_list);
+ run_rdma_free_callback(buf_obj);
+ pdd->dev->kfd2kgd->free_memory_of_gpu(
+ pdd->dev->kgd, buf_obj->mem);
+ kfd_process_device_remove_obj_handle(pdd, id);
+ }
+ }
+ list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+ per_device_list) {
+ radeon_flush_tlb(pdd->dev, p->pasid);
+ /* Destroy the GPUVM VM context */
+ if (pdd->vm)
+ pdd->dev->kfd2kgd->destroy_process_vm(
+ pdd->dev->kgd, pdd->vm);
+ list_del(&pdd->per_device_list);
kfree(pdd);
}
@@ -185,15 +362,11 @@ static void kfd_process_wq_release(struct work_struct *work)
kfd_pasid_free(p->pasid);
- mutex_unlock(&p->mutex);
-
- mutex_destroy(&p->mutex);
-
kfree(p->queues);
kfree(p);
- kfree(work);
+ kfree((void *)work);
}
static void kfd_process_destroy_delayed(struct rcu_head *rcu)
@@ -222,6 +395,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
{
struct kfd_process *p;
struct kfd_process_device *pdd = NULL;
+ struct kfd_dev *dev = NULL;
+ long status = -EFAULT;
/*
* The kfd_process structure can not be free because the
@@ -235,9 +410,31 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
mutex_unlock(&kfd_processes_mutex);
synchronize_srcu(&kfd_processes_srcu);
- mutex_lock(&p->mutex);
+ down_write(&p->lock);
+
+ /* Iterate over all process device data structures and, if the pdd is
+ * in debug mode, force unregistration first; only then are we able to
+ * destroy the queues. */
+ list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+ dev = pdd->dev;
+ mutex_lock(get_dbgmgr_mutex());
+
+ if ((dev != NULL) &&
+ (dev->dbgmgr) &&
+ (dev->dbgmgr->pasid == p->pasid)) {
+
+ status = kfd_dbgmgr_unregister(dev->dbgmgr, p);
+ if (status == 0) {
+ kfd_dbgmgr_destroy(dev->dbgmgr);
+ dev->dbgmgr = NULL;
+ }
+ }
+ mutex_unlock(get_dbgmgr_mutex());
+ }
+
+
+ /* now we can uninit the pqm: */
- /* In case our notifier is called before IOMMU notifier */
pqm_uninit(&p->pqm);
/* Iterate over all process device data structure and check
@@ -271,6 +468,94 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
.release = kfd_process_notifier_release,
};
+/*
+ * kfd_process_init_cwsr - set up the CWSR trap handler area for process @p
+ * @p:     process to initialise
+ * @filep: file used for the vm_mmap() of the reserved memory when
+ *         qpd->cwsr_base is not set
+ *
+ * For every per-device data set whose device has cwsr_enabled and that
+ * has no tba_addr yet:
+ *  - if qpd->cwsr_base is set, the buffer is allocated and mapped through
+ *    KGD (with the process lock dropped), a handle is created for it and
+ *    one page of dev->cwsr_pages is copied in;
+ *  - otherwise the reserved memory is mapped with vm_mmap().
+ * A vm_mmap() failure is logged and leaves tba_addr at 0; it is not
+ * reported to the caller.
+ *
+ * Returns 0 on success or the error of the failing KGD allocation,
+ * mapping or handle creation. The process lock is never held on return.
+ */
+static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep)
+{
+	int err = 0;
+	unsigned long offset;
+	struct kfd_process_device *temp, *pdd = NULL;
+	void *mem = NULL;
+	struct kfd_dev *dev = NULL;
+	struct qcm_process_device *qpd = NULL;
+
+	down_write(&p->lock);
+	list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+				per_device_list) {
+		dev = pdd->dev;
+		qpd = &pdd->qpd;
+		if (!dev->cwsr_enabled || qpd->tba_addr)
+			continue;
+		if (qpd->cwsr_base) {
+			/* cwsr_base is only set for DGPU */
+
+			/* can't hold the process lock while
+			 * allocating from KGD */
+			up_write(&p->lock);
+
+			err = dev->kfd2kgd->alloc_memory_of_gpu(
+				dev->kgd, qpd->cwsr_base, dev->cwsr_size,
+				pdd->vm, (struct kgd_mem **)&mem,
+				NULL, &qpd->cwsr_kaddr, pdd,
+				ALLOC_MEM_FLAGS_GTT |
+				ALLOC_MEM_FLAGS_NONPAGED |
+				ALLOC_MEM_FLAGS_EXECUTE_ACCESS |
+				ALLOC_MEM_FLAGS_NO_SUBSTITUTE);
+			if (err)
+				goto err_alloc_tba;
+			err = kfd_map_memory_to_gpu(dev, mem, p, pdd);
+			if (err)
+				goto err_map_tba;
+
+			down_write(&p->lock);
+			/* Check if someone else allocated the memory
+			 * while we weren't looking */
+			if (qpd->tba_addr) {
+				up_write(&p->lock);
+				dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd,
+					(struct kgd_mem *)mem, pdd->vm);
+				dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem);
+				down_write(&p->lock);
+			} else {
+				qpd->cwsr_mem_handle =
+					kfd_process_device_create_obj_handle(
+						pdd, mem, qpd->cwsr_base,
+						dev->cwsr_size);
+				if (qpd->cwsr_mem_handle < 0) {
+					/* Report the failure to the caller */
+					err = qpd->cwsr_mem_handle;
+					goto err_create_handle;
+				}
+
+				memcpy(qpd->cwsr_kaddr, kmap(dev->cwsr_pages),
+					PAGE_SIZE);
+				kunmap(dev->cwsr_pages);
+				qpd->tba_addr = qpd->cwsr_base;
+			}
+		} else {
+			offset = (kfd_get_gpu_id(dev) |
+				KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT;
+			qpd->tba_addr = (uint64_t)vm_mmap(filep, 0,
+				dev->cwsr_size, PROT_READ | PROT_EXEC,
+				MAP_SHARED, offset);
+			qpd->cwsr_kaddr = (void *)qpd->tba_addr;
+		}
+		if (IS_ERR_VALUE(qpd->tba_addr)) {
+			/* The value is already negative; no extra '-' needed */
+			pr_err("Failure to set tba address. error %d.\n",
+				(int)qpd->tba_addr);
+			qpd->tba_addr = 0;
+			qpd->cwsr_kaddr = NULL;
+		} else
+			qpd->tma_addr = qpd->tba_addr + dev->tma_offset;
+		pr_debug("set tba :0x%llx, tma:0x%llx for pqm.\n",
+			qpd->tba_addr, qpd->tma_addr);
+	}
+
+	up_write(&p->lock);
+	return err;
+
+err_create_handle:
+	/* The lock is held here; drop it before calling back into KGD */
+	up_write(&p->lock);
+	dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd,
+		(struct kgd_mem *)mem, pdd->vm);
+err_map_tba:
+	dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem);
+err_alloc_tba:
+	return err;
+}
+
static struct kfd_process *create_process(const struct task_struct *thread)
{
struct kfd_process *process;
@@ -281,6 +566,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
if (!process)
goto err_alloc_process;
+ process->bo_interval_tree = RB_ROOT;
+
process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE,
sizeof(process->queues[0]), GFP_KERNEL);
if (!process->queues)
@@ -290,7 +577,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
if (process->pasid == 0)
goto err_alloc_pasid;
- mutex_init(&process->mutex);
+ init_rwsem(&process->lock);
process->mm = thread->mm;
@@ -362,8 +649,22 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
INIT_LIST_HEAD(&pdd->qpd.queues_list);
INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
pdd->qpd.dqm = dev->dqm;
+ pdd->qpd.pqm = &p->pqm;
+ pdd->qpd.evicted = 0;
pdd->reset_wavefronts = false;
+ pdd->process = p;
list_add(&pdd->per_device_list, &p->per_device_data);
+
+ /* Init idr used for memory handle translation */
+ idr_init(&pdd->alloc_idr);
+
+ /* Create the GPUVM context for this specific device */
+ if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm)) {
+ pr_err("Failed to create process VM object\n");
+ list_del(&pdd->per_device_list);
+ kfree(pdd);
+ pdd = NULL;
+ }
}
return pdd;
@@ -391,9 +692,11 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
if (pdd->bound)
return pdd;
- err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
- if (err < 0)
- return ERR_PTR(err);
+ if (dev->device_info->is_need_iommu_device) {
+ err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
pdd->bound = true;
@@ -405,6 +708,7 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
struct kfd_process *p;
struct kfd_process_device *pdd;
int idx, i;
+ long status = -EFAULT;
BUG_ON(dev == NULL);
@@ -478,7 +782,116 @@ bool kfd_has_process_device_data(struct kfd_process *p)
return !(list_empty(&p->per_device_data));
}
-/* This returns with process->mutex locked. */
+/*
+ * kfd_process_device_create_obj_handle - track a buffer and give it a handle
+ * @pdd:    per-device data that owns the buffer
+ * @mem:    memory object to wrap
+ * @start:  first address covered by the buffer
+ * @length: length of the buffer
+ *
+ * Wraps @mem in a struct kfd_bo, inserts it into the process interval
+ * tree as [start, start + length - 1] and allocates an id for it in
+ * pdd->alloc_idr.
+ * Assumes that the process lock is held.
+ *
+ * Returns the new handle (>= MIN_IDR_ID), -ENOMEM if the wrapper cannot
+ * be allocated, or the negative error from idr_alloc(). On failure
+ * nothing stays allocated or linked.
+ */
+int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
+					void *mem, uint64_t start,
+					uint64_t length)
+{
+	int handle;
+	struct kfd_bo *buf_obj;
+	struct kfd_process *p;
+
+	BUG_ON(pdd == NULL);
+	BUG_ON(mem == NULL);
+
+	p = pdd->process;
+
+	buf_obj = kmalloc(sizeof(*buf_obj), GFP_KERNEL);
+
+	if (!buf_obj)
+		return -ENOMEM;
+
+	buf_obj->it.start = start;
+	buf_obj->it.last = start + length - 1;
+	interval_tree_insert(&buf_obj->it, &p->bo_interval_tree);
+
+	buf_obj->mem = mem;
+	buf_obj->dev = pdd->dev;
+
+	INIT_LIST_HEAD(&buf_obj->cb_data_head);
+
+	idr_preload(GFP_KERNEL);
+
+	handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID,
+			GFP_NOWAIT);
+
+	idr_preload_end();
+
+	if (handle < 0) {
+		/* Undo the tree insertion so no dangling node is left behind */
+		interval_tree_remove(&buf_obj->it, &p->bo_interval_tree);
+		kfree(buf_obj);
+	}
+
+	return handle;
+}
+
+/*
+ * Look up the kfd_bo registered under @handle in pdd->alloc_idr.
+ * Returns NULL for a negative handle or when nothing is registered.
+ */
+struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd,
+					int handle)
+{
+	struct kfd_bo *found = NULL;
+
+	BUG_ON(pdd == NULL);
+
+	if (handle >= 0)
+		found = (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle);
+
+	return found;
+}
+
+/*
+ * Translate specific handle from process local memory idr
+ * Assumes that the process lock is held.
+ *
+ * Returns the memory object stored for @handle, or NULL when the handle
+ * is negative or not registered in pdd->alloc_idr.
+ */
+void *kfd_process_device_translate_handle(struct kfd_process_device *pdd,
+					int handle)
+{
+	struct kfd_bo *buf_obj;
+
+	buf_obj = kfd_process_device_find_bo(pdd, handle);
+	if (!buf_obj)
+		return NULL;
+
+	return buf_obj->mem;
+}
+
+/*
+ * Find the kfd_bo whose interval overlaps [start_addr, last_addr] in the
+ * interval tree of process @p.
+ *
+ * Returns the kfd_bo (as void *), or NULL after logging an error when no
+ * buffer overlaps the range. Hits BUG_ON() if more than one buffer
+ * overlaps it.
+ */
+void *kfd_process_find_bo_from_interval(struct kfd_process *p,
+					uint64_t start_addr,
+					uint64_t last_addr)
+{
+	struct interval_tree_node *node;
+
+	node = interval_tree_iter_first(&p->bo_interval_tree,
+					start_addr, last_addr);
+	if (node == NULL) {
+		pr_err("%llu - %llu does not relate to an existing buffer\n",
+			start_addr, last_addr);
+		return NULL;
+	}
+
+	/* The requested range must not touch a second buffer */
+	BUG_ON(interval_tree_iter_next(node, start_addr, last_addr) != NULL);
+
+	return container_of(node, struct kfd_bo, it);
+}
+
+/*
+ * Remove specific handle from process local memory idr
+ * Assumes that the process lock is held.
+ *
+ * Drops the idr entry, unlinks the kfd_bo from the process interval tree
+ * and frees it. A negative or unknown handle is ignored.
+ */
+void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
+					int handle)
+{
+	struct kfd_bo *buf_obj;
+	struct kfd_process *p;
+
+	BUG_ON(pdd == NULL);
+
+	p = pdd->process;
+
+	if (handle < 0)
+		return;
+
+	buf_obj = kfd_process_device_find_bo(pdd, handle);
+	if (!buf_obj)
+		return;
+
+	idr_remove(&pdd->alloc_idr, handle);
+
+	interval_tree_remove(&buf_obj->it, &p->bo_interval_tree);
+
+	kfree(buf_obj);
+}
+
+/* This returns with process->lock read-locked. */
struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid)
{
struct kfd_process *p;
@@ -488,7 +901,7 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid)
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
if (p->pasid == pasid) {
- mutex_lock(&p->mutex);
+ down_read(&p->lock);
break;
}
}
@@ -497,3 +910,53 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid)
return p;
}
+
+/*
+ * Find the kfd_process attached to @mm.
+ * On success the process is returned with process->lock read-locked;
+ * NULL is returned when no process matches.
+ */
+struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
+{
+	struct kfd_process *proc;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&kfd_processes_srcu);
+
+	proc = find_process_by_mm(mm);
+	if (proc)
+		down_read(&proc->lock);
+
+	srcu_read_unlock(&kfd_processes_srcu, srcu_idx);
+
+	return proc;
+}
+
+/*
+ * kfd_reserved_mem_mmap - map the device's reserved CWSR pages into @vma
+ * @process: calling process (not used here)
+ * @vma:     user mapping; vm_pgoff is passed to kfd_device_by_id()
+ *
+ * Returns 0 on success, -EINVAL for an unknown device, an unaligned VMA
+ * or a VMA larger than dev->cwsr_size, or the error returned by
+ * remap_pfn_range().
+ */
+int kfd_reserved_mem_mmap(struct kfd_process *process, struct vm_area_struct *vma)
+{
+	unsigned long pfn, i, npages;
+	int ret = 0;
+	struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
+
+	if (dev == NULL)
+		return -EINVAL;
+	if ((vma->vm_start & (PAGE_SIZE - 1)) ||
+		(vma->vm_end & (PAGE_SIZE - 1))) {
+		pr_err("KFD only support page aligned memory map.\n");
+		return -EINVAL;
+	}
+	/*
+	 * The VMA size comes from user space. Never walk dev->cwsr_pages
+	 * beyond the cwsr_size bytes that the driver itself maps.
+	 */
+	if ((vma->vm_end - vma->vm_start) > dev->cwsr_size) {
+		pr_err("KFD reserved memory map is larger than the CWSR area.\n");
+		return -EINVAL;
+	}
+
+	pr_debug("kfd reserved mem mmap been called.\n");
+	/*
+	 * Two kinds of reserved memory mappings are planned:
+	 * 1. Trap handler code and parameters (TBA and TMA, 2 pages total)
+	 * 2. Relaunch stack (control block, 1 page for Carrizo)
+	 */
+
+	npages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+
+	/* The flags are the same for every page; set them once */
+	vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND
+			| VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP;
+
+	for (i = 0; i < npages; ++i) {
+		pfn = page_to_pfn(&dev->cwsr_pages[i]);
+		/* mapping the page to user process */
+		ret = remap_pfn_range(vma, vma->vm_start + (i << PAGE_SHIFT),
+				pfn, PAGE_SIZE, vma->vm_page_prot);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 7b69070f7ecc..8e2c9a7d8957 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -89,23 +89,36 @@ void pqm_uninit(struct process_queue_manager *pqm)
{
int retval;
struct process_queue_node *pqn, *next;
+ struct kfd_process_device *pdd;
+ struct kfd_dev *dev = NULL;
BUG_ON(!pqm);
pr_debug("In func %s\n", __func__);
list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
- retval = pqm_destroy_queue(
- pqm,
- (pqn->q != NULL) ?
- pqn->q->properties.queue_id :
- pqn->kq->queue->properties.queue_id);
-
- if (retval != 0) {
- pr_err("kfd: failed to destroy queue\n");
- return;
+ if (pqn->q)
+ dev = pqn->q->device;
+ else if (pqn->kq)
+ dev = pqn->kq->dev;
+ else
+ BUG();
+
+ pdd = kfd_get_process_device_data(dev, pqm->process);
+ if (pdd) {
+ retval = dev->dqm->ops.process_termination
+ (dev->dqm, &pdd->qpd);
+ if (retval != 0)
+ pdd->reset_wavefronts = true;
}
}
+
+ list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
+ uninit_queue(pqn->q);
+ list_del(&pqn->process_queue_list);
+ kfree(pqn);
+ }
+
kfree(pqm->queue_slot_bitmap);
pqm->queue_slot_bitmap = NULL;
}
@@ -129,7 +142,7 @@ static int create_cp_queue(struct process_queue_manager *pqm,
q_properties->vmid = 0;
q_properties->queue_id = qid;
- retval = init_queue(q, *q_properties);
+ retval = init_queue(q, q_properties);
if (retval != 0)
goto err_init_queue;
@@ -148,23 +161,19 @@ int pqm_create_queue(struct process_queue_manager *pqm,
struct kfd_dev *dev,
struct file *f,
struct queue_properties *properties,
- unsigned int flags,
- enum kfd_queue_type type,
unsigned int *qid)
{
int retval;
struct kfd_process_device *pdd;
- struct queue_properties q_properties;
struct queue *q;
struct process_queue_node *pqn;
struct kernel_queue *kq;
int num_queues = 0;
struct queue *cur;
+ enum kfd_queue_type type = properties->type;
BUG_ON(!pqm || !dev || !properties || !qid);
- memset(&q_properties, 0, sizeof(struct queue_properties));
- memcpy(&q_properties, properties, sizeof(struct queue_properties));
q = NULL;
kq = NULL;
@@ -192,10 +201,9 @@ int pqm_create_queue(struct process_queue_manager *pqm,
if (retval != 0)
return retval;
- if (list_empty(&pqm->queues)) {
- pdd->qpd.pqm = pqm;
+ if (list_empty(&pdd->qpd.queues_list) &&
+ list_empty(&pdd->qpd.priv_queue_list))
dev->dqm->ops.register_process(dev->dqm, &pdd->qpd);
- }
pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL);
if (!pqn) {
@@ -205,17 +213,34 @@ int pqm_create_queue(struct process_queue_manager *pqm,
switch (type) {
case KFD_QUEUE_TYPE_SDMA:
+ if (dev->dqm->sdma_queue_count >= CIK_SDMA_QUEUES) {
+ pr_err("kfd: over-subscription is not allowed for SDMA.\n");
+ retval = -EPERM;
+ goto err_create_queue;
+ }
+
+ retval = create_cp_queue(pqm, dev, &q, properties, f, *qid);
+ if (retval != 0)
+ goto err_create_queue;
+ pqn->q = q;
+ pqn->kq = NULL;
+ retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd,
+ &q->properties.vmid);
+ pr_debug("DQM returned %d for create_queue\n", retval);
+ print_queue(q);
+ break;
+
case KFD_QUEUE_TYPE_COMPUTE:
/* check if there is over subscription */
if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
- ((dev->dqm->processes_count >= VMID_PER_DEVICE) ||
+ ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) ||
(dev->dqm->queue_count >= PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE))) {
pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n");
retval = -EPERM;
goto err_create_queue;
}
- retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid);
+ retval = create_cp_queue(pqm, dev, &q, properties, f, *qid);
if (retval != 0)
goto err_create_queue;
pqn->q = q;
@@ -252,9 +277,8 @@ int pqm_create_queue(struct process_queue_manager *pqm,
list_add(&pqn->process_queue_list, &pqm->queues);
if (q) {
- *properties = q->properties;
pr_debug("kfd: PQM done creating queue\n");
- print_queue_properties(properties);
+ print_queue_properties(&q->properties);
}
return retval;
@@ -264,7 +288,8 @@ err_create_queue:
err_allocate_pqn:
/* check if queues list is empty unregister process from device */
clear_bit(*qid, pqm->queue_slot_bitmap);
- if (list_empty(&pqm->queues))
+ if (list_empty(&pdd->qpd.queues_list) &&
+ list_empty(&pdd->qpd.priv_queue_list))
dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd);
return retval;
}
@@ -313,9 +338,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
if (pqn->q) {
dqm = pqn->q->device->dqm;
retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
- if (retval != 0)
+ if (retval != 0) {
+ if (retval == -ETIME)
+ pdd->reset_wavefronts = true;
return retval;
-
+ }
uninit_queue(pqn->q);
}
@@ -323,7 +350,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
kfree(pqn);
clear_bit(qid, pqm->queue_slot_bitmap);
- if (list_empty(&pqm->queues))
+ if (list_empty(&pdd->qpd.queues_list) &&
+ list_empty(&pdd->qpd.priv_queue_list))
dqm->ops.unregister_process(dqm, &pdd->qpd);
return retval;
@@ -357,6 +385,31 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid,
return 0;
}
+/*
+ * pqm_set_cu_mask - apply a new CU mask to queue @qid
+ * @pqm: process queue manager that owns the queue
+ * @qid: id of the queue to update
+ * @p:   properties carrying the new cu_mask
+ *
+ * Returns 0 on success, -EFAULT when @qid does not name a user-mode
+ * queue, or the error returned by the device queue manager's
+ * update_queue().
+ */
+int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
+			struct queue_properties *p)
+{
+	struct process_queue_node *pqn;
+	struct queue *q;
+
+	BUG_ON(!pqm);
+
+	pqn = get_queue_by_qid(pqm, qid);
+	/* Kernel-queue nodes carry pqn->kq and have no pqn->q */
+	if (!pqn || !pqn->q) {
+		pr_debug("amdkfd: No queue %d exists for update operation\n",
+				qid);
+		return -EFAULT;
+	}
+
+	q = pqn->q;
+	q->properties.cu_mask = p->cu_mask;
+
+	return q->device->dqm->ops.update_queue(q->device->dqm, q);
+}
+
struct kernel_queue *pqm_get_kernel_queue(
struct process_queue_manager *pqm,
unsigned int qid)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 9a0c90b0702e..0ab197077f2d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -63,7 +63,7 @@ void print_queue(struct queue *q)
pr_debug("Queue Device Address: 0x%p\n", q->device);
}
-int init_queue(struct queue **q, struct queue_properties properties)
+int init_queue(struct queue **q, const struct queue_properties *properties)
{
struct queue *tmp;
@@ -73,7 +73,7 @@ int init_queue(struct queue **q, struct queue_properties properties)
if (!tmp)
return -ENOMEM;
- memcpy(&tmp->properties, &properties, sizeof(struct queue_properties));
+ memcpy(&tmp->properties, properties, sizeof(struct queue_properties));
*q = tmp;
return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
new file mode 100644
index 000000000000..69bdaf12a9eb
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/pid.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include "amd_rdma.h"
+#include "kfd_priv.h"
+
+
+struct rdma_cb { /* one entry per successful get_pages() call */
+ struct list_head node; /* linked into the owning kfd_bo's cb_data_head */
+ struct amd_p2p_info amd_p2p_data; /* handed to the client; container_of() maps it back */
+ void (*free_callback)(void *client_priv); /* may be NULL; called from run_rdma_free_callback() */
+ void *client_priv; /* passed to free_callback */
+};
+
+/**
+ * This function makes the pages underlying a range of GPU virtual memory
+ * accessible for DMA operations from another PCIe device
+ *
+ * \param address - The start address in the Unified Virtual Address
+ *                  space in the specified process
+ * \param length - The length of the requested mapping
+ * \param pid - Pointer to the struct pid to which the address belongs.
+ *              Can be NULL for the current process address space.
+ * \param amd_p2p_data - On return: pointer to a structure describing the
+ *                       underlying pages/locations. Set to NULL on
+ *                       failure.
+ * \param free_callback - Pointer to a callback which will be called when
+ *                        access to this memory must be stopped
+ *                        immediately: memory was freed, GECC events, etc.
+ *                        The client should immediately stop any transfer
+ *                        operations and return as soon as possible.
+ *                        After it returns, all resources associated with
+ *                        the address will be released and no access will
+ *                        be allowed.
+ * \param client_priv - Pointer to be passed as parameter to
+ *                      'free_callback'
+ *
+ * \return 0 if the operation was successful, -EINVAL if the process or
+ *         a buffer containing the whole range cannot be found, -ENOMEM
+ *         on allocation failure, or the error from pin_get_sg_table_bo
+ */
+static int get_pages(uint64_t address, uint64_t length, struct pid *pid,
+		struct amd_p2p_info **amd_p2p_data,
+		void (*free_callback)(void *client_priv),
+		void *client_priv)
+{
+	struct kfd_bo *buf_obj;
+	struct kgd_mem *mem;
+	struct sg_table *sg_table_tmp;
+	struct kfd_dev *dev;
+	uint64_t last = address + length - 1;
+	uint64_t offset;
+	struct kfd_process *p;
+	struct rdma_cb *rdma_cb_data;
+	int ret = 0;
+
+	/* Make sure the caller never sees a stale pointer on failure */
+	*amd_p2p_data = NULL;
+
+	p = kfd_lookup_process_by_pid(pid);
+	if (!p) {
+		pr_err("could not find the process in %s.\n",
+				__func__);
+		return -EINVAL;
+	}
+
+	buf_obj = kfd_process_find_bo_from_interval(p, address, last);
+	if (!buf_obj) {
+		pr_err("can not find a kfd_bo for the range\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * The interval lookup only guarantees that the buffer overlaps the
+	 * range. Require full containment, otherwise the offset below
+	 * underflows or the length runs past the end of the buffer.
+	 */
+	if (address < buf_obj->it.start || last > buf_obj->it.last) {
+		pr_err("the range is not fully inside one kfd_bo\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL);
+	if (!rdma_cb_data) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mem = buf_obj->mem;
+	dev = buf_obj->dev;
+	offset = address - buf_obj->it.start;
+
+	ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem,
+			offset, length, &sg_table_tmp);
+
+	if (ret) {
+		pr_err("pin_get_sg_table_bo failed.\n");
+		goto free_mem;
+	}
+
+	rdma_cb_data->amd_p2p_data.va = address;
+	rdma_cb_data->amd_p2p_data.size = length;
+	rdma_cb_data->amd_p2p_data.pid = pid;
+	rdma_cb_data->amd_p2p_data.priv = buf_obj;
+	rdma_cb_data->amd_p2p_data.pages = sg_table_tmp;
+
+	rdma_cb_data->free_callback = free_callback;
+	rdma_cb_data->client_priv = client_priv;
+
+	/* Tracked on the buffer so it can be torn down with the buffer */
+	list_add(&rdma_cb_data->node, &buf_obj->cb_data_head);
+
+	*amd_p2p_data = &rdma_cb_data->amd_p2p_data;
+
+	goto out;
+
+free_mem:
+	kfree(rdma_cb_data);
+out:
+	/* kfd_lookup_process_by_pid() returned with the lock read-held */
+	up_read(&p->lock);
+
+	return ret;
+}
+
+/*
+ * Release one get_pages() mapping: unlink and free its rdma_cb, then
+ * unpin the scatter/gather table of the underlying buffer.
+ * Returns 0, or -EINVAL when @p2p_data is NULL.
+ */
+static int put_pages_helper(struct amd_p2p_info *p2p_data)
+{
+	struct rdma_cb *cb;
+	struct kfd_bo *bo;
+	struct kfd_dev *kfd;
+	struct sg_table *sgt;
+
+	if (p2p_data == NULL) {
+		pr_err("amd_p2p_info pointer is invalid.\n");
+		return -EINVAL;
+	}
+
+	/* Read everything out of p2p_data before its container is freed */
+	bo = p2p_data->priv;
+	kfd = bo->dev;
+	sgt = p2p_data->pages;
+	cb = container_of(p2p_data, struct rdma_cb, amd_p2p_data);
+
+	list_del(&cb->node);
+	kfree(cb);
+
+	kfd->kfd2kgd->unpin_put_sg_table_bo(bo->mem, sgt);
+
+	return 0;
+}
+
+/*
+ * Notify every RDMA client that still holds pages of @buf_obj, then
+ * release each of those mappings.
+ */
+void run_rdma_free_callback(struct kfd_bo *buf_obj)
+{
+	struct rdma_cb *cb, *next;
+
+	/* _safe iteration: put_pages_helper() unlinks and frees the entry */
+	list_for_each_entry_safe(cb, next, &buf_obj->cb_data_head, node) {
+		void (*notify)(void *client_priv) = cb->free_callback;
+
+		if (notify != NULL)
+			notify(cb->client_priv);
+
+		put_pages_helper(&cb->amd_p2p_data);
+	}
+}
+
+/**
+ * Release the resources previously allocated by a get_pages() call.
+ *
+ * \param p_p2p_data - A pointer to the pointer to the amd_p2p_info entry
+ *                     returned by get_pages(). The pointer is reset to
+ *                     NULL on success.
+ *
+ * \return 0 if the operation was successful, -EINVAL if the entry is
+ *         NULL or its process can no longer be found
+ */
+static int put_pages(struct amd_p2p_info **p_p2p_data)
+{
+	struct kfd_process *proc;
+	int rc;
+
+	if (*p_p2p_data == NULL) {
+		pr_err("amd_p2p_info pointer is invalid.\n");
+		return -EINVAL;
+	}
+
+	/* Returns with proc->lock read-held */
+	proc = kfd_lookup_process_by_pid((*p_p2p_data)->pid);
+	if (proc == NULL) {
+		pr_err("could not find the process in %s\n",
+				__func__);
+		return -EINVAL;
+	}
+
+	rc = put_pages_helper(*p_p2p_data);
+	if (rc == 0)
+		*p_p2p_data = NULL;
+
+	up_read(&proc->lock);
+
+	return rc;
+}
+
+/**
+ * Check whether the given address belongs to the GPU address space.
+ *
+ * \param address - Address to check
+ * \param pid - Process to which the address belongs.
+ *              Can be NULL for the current process.
+ *
+ * \return 0 - Not a GPU address managed by the AMD driver (or the
+ *             process could not be found)
+ *         1 - A GPU address managed by the AMD driver
+ */
+static int is_gpu_address(uint64_t address, struct pid *pid)
+{
+	struct kfd_process *proc;
+	int managed;
+
+	proc = kfd_lookup_process_by_pid(pid);
+	if (proc == NULL) {
+		pr_err("could not find the process in %s.\n",
+				__func__);
+		return 0;
+	}
+
+	/* A one-byte interval is enough to test membership */
+	managed = kfd_process_find_bo_from_interval(proc, address,
+						address) != NULL;
+
+	up_read(&proc->lock);
+
+	return managed ? 1 : 0;
+}
+
+/**
+ * Report the page size to use when building the scatter/gather table
+ * for the given range.
+ *
+ * \param address - Address (ignored)
+ * \param length - Range length (ignored)
+ * \param pid - Process id structure (ignored)
+ * \param page_size - On return: page size
+ *
+ * \return always 0
+ */
+static int get_page_size(uint64_t address, uint64_t length, struct pid *pid,
+			unsigned long *page_size)
+{
+	/*
+	 * Local memory is always consecutive, so its page size can be
+	 * chosen freely. For now it is reported as the system page size.
+	 */
+	const unsigned long reported = PAGE_SIZE;
+
+	*page_size = reported;
+
+	return 0;
+}
+
+
+/**
+ * The one and only table of RDMA entry points; handed out by
+ * amdkfd_query_rdma_interface().
+ */
+static const struct amd_rdma_interface rdma_ops = {
+	.is_gpu_address	= is_gpu_address,
+	.get_page_size	= get_page_size,
+	.get_pages	= get_pages,
+	.put_pages	= put_pages,
+};
+
+/**
+ * amdkfd_query_rdma_interface - Return the interface (table of function
+ *                               pointers) for RDMA
+ *
+ * \param ops - OUT: set to point at the interface table
+ *
+ * \return 0 if the operation was successful, -EINVAL if ops is NULL.
+ */
+int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops)
+{
+	/* Exported to other modules: do not trust the out pointer blindly */
+	if (!ops)
+		return -EINVAL;
+
+	*ops = &rdma_ops;
+
+	return 0;
+}
+EXPORT_SYMBOL(amdkfd_query_rdma_interface);
+
+
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 884c96f50c3d..4e357eb068bf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -28,16 +28,19 @@
#include <linux/hash.h>
#include <linux/cpufreq.h>
#include <linux/log2.h>
+#include <linux/dmi.h>
+#include <linux/atomic.h>
#include "kfd_priv.h"
#include "kfd_crat.h"
#include "kfd_topology.h"
-static struct list_head topology_device_list;
-static int topology_crat_parsed;
+/* topology_device_list - Master list of all topology devices */
+struct list_head topology_device_list;
static struct kfd_system_properties sys_props;
static DECLARE_RWSEM(topology_lock);
+static atomic_t topology_crat_proximity_domain;
struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
{
@@ -57,311 +60,61 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
return device;
}
-struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
+uint32_t kfd_get_gpu_id(struct kfd_dev *dev)
{
struct kfd_topology_device *top_dev;
- struct kfd_dev *device = NULL;
+ uint32_t gpu_id = 0;
down_read(&topology_lock);
list_for_each_entry(top_dev, &topology_device_list, list)
- if (top_dev->gpu->pdev == pdev) {
- device = top_dev->gpu;
+ if (top_dev->gpu == dev) {
+ gpu_id = top_dev->gpu_id;
break;
}
up_read(&topology_lock);
- return device;
-}
-
-static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size)
-{
- struct acpi_table_header *crat_table;
- acpi_status status;
-
- if (!size)
- return -EINVAL;
-
- /*
- * Fetch the CRAT table from ACPI
- */
- status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
- if (status == AE_NOT_FOUND) {
- pr_warn("CRAT table not found\n");
- return -ENODATA;
- } else if (ACPI_FAILURE(status)) {
- const char *err = acpi_format_exception(status);
-
- pr_err("CRAT table error: %s\n", err);
- return -EINVAL;
- }
-
- if (*size >= crat_table->length && crat_image != NULL)
- memcpy(crat_image, crat_table, crat_table->length);
-
- *size = crat_table->length;
-
- return 0;
-}
-
-static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
- struct crat_subtype_computeunit *cu)
-{
- BUG_ON(!dev);
- BUG_ON(!cu);
-
- dev->node_props.cpu_cores_count = cu->num_cpu_cores;
- dev->node_props.cpu_core_id_base = cu->processor_id_low;
- if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
- dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
-
- pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
- cu->processor_id_low);
-}
-
-static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
- struct crat_subtype_computeunit *cu)
-{
- BUG_ON(!dev);
- BUG_ON(!cu);
-
- dev->node_props.simd_id_base = cu->processor_id_low;
- dev->node_props.simd_count = cu->num_simd_cores;
- dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
- dev->node_props.max_waves_per_simd = cu->max_waves_simd;
- dev->node_props.wave_front_size = cu->wave_front_size;
- dev->node_props.mem_banks_count = cu->num_banks;
- dev->node_props.array_count = cu->num_arrays;
- dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
- dev->node_props.simd_per_cu = cu->num_simd_per_cu;
- dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
- if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
- dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
- pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores,
- cu->processor_id_low);
-}
-
-/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */
-static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu)
-{
- struct kfd_topology_device *dev;
- int i = 0;
-
- BUG_ON(!cu);
-
- pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
- cu->proximity_domain, cu->hsa_capability);
- list_for_each_entry(dev, &topology_device_list, list) {
- if (cu->proximity_domain == i) {
- if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
- kfd_populated_cu_info_cpu(dev, cu);
-
- if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
- kfd_populated_cu_info_gpu(dev, cu);
- break;
- }
- i++;
- }
-
- return 0;
+ return gpu_id;
}
-/*
- * kfd_parse_subtype_mem is called when the topology mutex is
- * already acquired
- */
-static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem)
+struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
{
- struct kfd_mem_properties *props;
- struct kfd_topology_device *dev;
- int i = 0;
-
- BUG_ON(!mem);
-
- pr_info("Found memory entry in CRAT table with proximity_domain=%d\n",
- mem->promixity_domain);
- list_for_each_entry(dev, &topology_device_list, list) {
- if (mem->promixity_domain == i) {
- props = kfd_alloc_struct(props);
- if (props == NULL)
- return -ENOMEM;
-
- if (dev->node_props.cpu_cores_count == 0)
- props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE;
- else
- props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;
-
- if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
- props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
- if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
- props->flags |= HSA_MEM_FLAGS_NON_VOLATILE;
-
- props->size_in_bytes =
- ((uint64_t)mem->length_high << 32) +
- mem->length_low;
- props->width = mem->width;
+ struct kfd_topology_device *top_dev;
+ struct kfd_dev *device = NULL;
- dev->mem_bank_count++;
- list_add_tail(&props->list, &dev->mem_props);
+ down_read(&topology_lock);
+ list_for_each_entry(top_dev, &topology_device_list, list)
+ if (top_dev->gpu && top_dev->gpu->pdev == pdev) {
+ device = top_dev->gpu;
break;
}
- i++;
- }
-
- return 0;
-}
-/*
- * kfd_parse_subtype_cache is called when the topology mutex
- * is already acquired
- */
-static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache)
-{
- struct kfd_cache_properties *props;
- struct kfd_topology_device *dev;
- uint32_t id;
-
- BUG_ON(!cache);
-
- id = cache->processor_id_low;
-
- pr_info("Found cache entry in CRAT table with processor_id=%d\n", id);
- list_for_each_entry(dev, &topology_device_list, list)
- if (id == dev->node_props.cpu_core_id_base ||
- id == dev->node_props.simd_id_base) {
- props = kfd_alloc_struct(props);
- if (props == NULL)
- return -ENOMEM;
-
- props->processor_id_low = id;
- props->cache_level = cache->cache_level;
- props->cache_size = cache->cache_size;
- props->cacheline_size = cache->cache_line_size;
- props->cachelines_per_tag = cache->lines_per_tag;
- props->cache_assoc = cache->associativity;
- props->cache_latency = cache->cache_latency;
-
- if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
- props->cache_type |= HSA_CACHE_TYPE_DATA;
- if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
- props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
- if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
- props->cache_type |= HSA_CACHE_TYPE_CPU;
- if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
- props->cache_type |= HSA_CACHE_TYPE_HSACU;
-
- dev->cache_count++;
- dev->node_props.caches_count++;
- list_add_tail(&props->list, &dev->cache_props);
-
- break;
- }
+ up_read(&topology_lock);
- return 0;
+ return device;
}
-/*
- * kfd_parse_subtype_iolink is called when the topology mutex
- * is already acquired
- */
-static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink)
+struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd)
{
- struct kfd_iolink_properties *props;
- struct kfd_topology_device *dev;
- uint32_t i = 0;
- uint32_t id_from;
- uint32_t id_to;
-
- BUG_ON(!iolink);
-
- id_from = iolink->proximity_domain_from;
- id_to = iolink->proximity_domain_to;
+ struct kfd_topology_device *top_dev;
+ struct kfd_dev *device = NULL;
- pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from);
- list_for_each_entry(dev, &topology_device_list, list) {
- if (id_from == i) {
- props = kfd_alloc_struct(props);
- if (props == NULL)
- return -ENOMEM;
-
- props->node_from = id_from;
- props->node_to = id_to;
- props->ver_maj = iolink->version_major;
- props->ver_min = iolink->version_minor;
-
- /*
- * weight factor (derived from CDIR), currently always 1
- */
- props->weight = 1;
-
- props->min_latency = iolink->minimum_latency;
- props->max_latency = iolink->maximum_latency;
- props->min_bandwidth = iolink->minimum_bandwidth_mbs;
- props->max_bandwidth = iolink->maximum_bandwidth_mbs;
- props->rec_transfer_size =
- iolink->recommended_transfer_size;
-
- dev->io_link_count++;
- dev->node_props.io_links_count++;
- list_add_tail(&props->list, &dev->io_link_props);
+ down_read(&topology_lock);
+ list_for_each_entry(top_dev, &topology_device_list, list)
+ if (top_dev->gpu && top_dev->gpu->kgd == kgd) {
+ device = top_dev->gpu;
break;
}
- i++;
- }
-
- return 0;
-}
-static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr)
-{
- struct crat_subtype_computeunit *cu;
- struct crat_subtype_memory *mem;
- struct crat_subtype_cache *cache;
- struct crat_subtype_iolink *iolink;
- int ret = 0;
-
- BUG_ON(!sub_type_hdr);
-
- switch (sub_type_hdr->type) {
- case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
- cu = (struct crat_subtype_computeunit *)sub_type_hdr;
- ret = kfd_parse_subtype_cu(cu);
- break;
- case CRAT_SUBTYPE_MEMORY_AFFINITY:
- mem = (struct crat_subtype_memory *)sub_type_hdr;
- ret = kfd_parse_subtype_mem(mem);
- break;
- case CRAT_SUBTYPE_CACHE_AFFINITY:
- cache = (struct crat_subtype_cache *)sub_type_hdr;
- ret = kfd_parse_subtype_cache(cache);
- break;
- case CRAT_SUBTYPE_TLB_AFFINITY:
- /*
- * For now, nothing to do here
- */
- pr_info("Found TLB entry in CRAT table (not processing)\n");
- break;
- case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
- /*
- * For now, nothing to do here
- */
- pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n");
- break;
- case CRAT_SUBTYPE_IOLINK_AFFINITY:
- iolink = (struct crat_subtype_iolink *)sub_type_hdr;
- ret = kfd_parse_subtype_iolink(iolink);
- break;
- default:
- pr_warn("Unknown subtype (%d) in CRAT\n",
- sub_type_hdr->type);
- }
+ up_read(&topology_lock);
- return ret;
+ return device;
}
+/* Called with write topology_lock acquired */
static void kfd_release_topology_device(struct kfd_topology_device *dev)
{
struct kfd_mem_properties *mem;
@@ -398,20 +151,22 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev)
sys_props.num_devices--;
}
-static void kfd_release_live_view(void)
+void kfd_release_live_view(void)
{
struct kfd_topology_device *dev;
+ down_write(&topology_lock);
while (topology_device_list.next != &topology_device_list) {
dev = container_of(topology_device_list.next,
struct kfd_topology_device, list);
kfd_release_topology_device(dev);
-}
-
+ }
+ up_write(&topology_lock);
memset(&sys_props, 0, sizeof(sys_props));
}
-static struct kfd_topology_device *kfd_create_topology_device(void)
+struct kfd_topology_device *kfd_create_topology_device(
+ struct list_head *device_list)
{
struct kfd_topology_device *dev;
@@ -425,65 +180,12 @@ static struct kfd_topology_device *kfd_create_topology_device(void)
INIT_LIST_HEAD(&dev->cache_props);
INIT_LIST_HEAD(&dev->io_link_props);
- list_add_tail(&dev->list, &topology_device_list);
+ list_add_tail(&dev->list, device_list);
sys_props.num_devices++;
return dev;
}
-static int kfd_parse_crat_table(void *crat_image)
-{
- struct kfd_topology_device *top_dev;
- struct crat_subtype_generic *sub_type_hdr;
- uint16_t node_id;
- int ret;
- struct crat_header *crat_table = (struct crat_header *)crat_image;
- uint16_t num_nodes;
- uint32_t image_len;
-
- if (!crat_image)
- return -EINVAL;
-
- num_nodes = crat_table->num_domains;
- image_len = crat_table->length;
-
- pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
-
- for (node_id = 0; node_id < num_nodes; node_id++) {
- top_dev = kfd_create_topology_device();
- if (!top_dev) {
- kfd_release_live_view();
- return -ENOMEM;
- }
- }
-
- sys_props.platform_id =
- (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK;
- sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id);
- sys_props.platform_rev = crat_table->revision;
-
- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
- while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
- ((char *)crat_image) + image_len) {
- if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
- ret = kfd_parse_subtype(sub_type_hdr);
- if (ret != 0) {
- kfd_release_live_view();
- return ret;
- }
- }
-
- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
- sub_type_hdr->length);
- }
-
- sys_props.generation_count++;
- topology_crat_parsed = 1;
-
- return 0;
-}
-
-
#define sysfs_show_gen_prop(buffer, fmt, ...) \
snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__)
#define sysfs_show_32bit_prop(buffer, name, value) \
@@ -593,7 +295,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
char *buffer)
{
ssize_t ret;
- uint32_t i;
+ uint32_t i, j;
struct kfd_cache_properties *cache;
/* Making sure that the buffer is an empty string */
@@ -611,12 +313,18 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency);
sysfs_show_32bit_prop(buffer, "type", cache->cache_type);
snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer);
- for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++)
- ret = snprintf(buffer, PAGE_SIZE, "%s%d%s",
- buffer, cache->sibling_map[i],
- (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ?
- "\n" : ",");
-
+ for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++)
+ for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) {
+ /* Check each bit */
+ if (cache->sibling_map[i] & (1 << j))
+ ret = snprintf(buffer, PAGE_SIZE,
+ "%s%d%s", buffer, 1, ",");
+ else
+ ret = snprintf(buffer, PAGE_SIZE,
+ "%s%d%s", buffer, 0, ",");
+ }
+ /* Replace the last "," with end of line */
+ *(buffer + strlen(buffer) - 1) = 0xA;
return ret;
}
@@ -635,6 +343,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
uint32_t i;
uint32_t log_max_watch_addr;
+ struct kfd_local_mem_info local_mem_info;
/* Making sure that the buffer is an empty string */
buffer[0] = 0;
@@ -665,16 +374,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
sysfs_show_32bit_prop(buffer, "simd_count",
dev->node_props.simd_count);
- if (dev->mem_bank_count < dev->node_props.mem_banks_count) {
- pr_info_once("kfd: mem_banks_count truncated from %d to %d\n",
- dev->node_props.mem_banks_count,
- dev->mem_bank_count);
- sysfs_show_32bit_prop(buffer, "mem_banks_count",
- dev->mem_bank_count);
- } else {
- sysfs_show_32bit_prop(buffer, "mem_banks_count",
- dev->node_props.mem_banks_count);
- }
+ sysfs_show_32bit_prop(buffer, "mem_banks_count",
+ dev->node_props.mem_banks_count);
sysfs_show_32bit_prop(buffer, "caches_count",
dev->node_props.caches_count);
@@ -723,17 +424,30 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
HSA_CAP_WATCH_POINTS_TOTALBITS_MASK);
}
+ if (dev->gpu->device_info->asic_family == CHIP_TONGA)
+ dev->node_props.capability |=
+ HSA_CAP_AQL_QUEUE_DOUBLE_MAP;
+
sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute",
- dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(
- dev->gpu->kgd));
+ dev->node_props.max_engine_clk_fcompute);
- sysfs_show_64bit_prop(buffer, "local_mem_size",
- (unsigned long long int) 0);
+ /*
+ * If the ASIC is CZ, set local memory size to 0 to disable
+ * local memory support
+ */
+ if (dev->gpu->device_info->asic_family != CHIP_CARRIZO) {
+ dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd,
+ &local_mem_info);
+ sysfs_show_64bit_prop(buffer, "local_mem_size",
+ local_mem_info.local_mem_size_private +
+ local_mem_info.local_mem_size_public);
+ }
+ else
+ sysfs_show_64bit_prop(buffer, "local_mem_size",
+ (unsigned long long int) 0);
sysfs_show_32bit_prop(buffer, "fw_version",
- dev->gpu->kfd2kgd->get_fw_version(
- dev->gpu->kgd,
- KGD_ENGINE_MEC1));
+ dev->gpu->mec_fw_version);
sysfs_show_32bit_prop(buffer, "capability",
dev->node_props.capability);
}
@@ -928,6 +642,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
return 0;
}
+/* Called with write topology lock acquired */
static int kfd_build_sysfs_node_tree(void)
{
struct kfd_topology_device *dev;
@@ -944,6 +659,7 @@ static int kfd_build_sysfs_node_tree(void)
return 0;
}
+/* Called with write topology lock acquired */
static void kfd_remove_sysfs_node_tree(void)
{
struct kfd_topology_device *dev;
@@ -1015,101 +731,221 @@ static void kfd_topology_release_sysfs(void)
}
}
+/* kfd_topology_update_device_list - Move every entry of @temp_list to the
+ *	tail of @master_list, leaving @temp_list empty.
+ *	Called with write topology_lock acquired.
+ *	Return - number of entries moved
+ */
+static int kfd_topology_update_device_list(struct list_head *temp_list,
+					struct list_head *master_list)
+{
+	int moved;
+
+	/* Always take the current head entry so the relative order is kept. */
+	for (moved = 0; !list_empty(temp_list); moved++)
+		list_move_tail(temp_list->next, master_list);
+
+	return moved;
+}
+
+/* kfd_debug_print_topology - Log what kind of node sits at the tail of
+ *	topology_device_list (APU, CPU only or dGPU). Nothing is logged when
+ *	the list is empty.
+ */
+static void kfd_debug_print_topology(void)
+{
+	struct kfd_topology_device *dev;
+
+	down_read(&topology_lock);
+
+	/*
+	 * list_last_entry() never yields NULL: on an empty list it returns a
+	 * pointer computed from the list head itself, so emptiness has to be
+	 * tested explicitly before the entry is dereferenced.
+	 */
+	if (list_empty(&topology_device_list))
+		goto out;
+
+	dev = list_last_entry(&topology_device_list,
+			struct kfd_topology_device, list);
+
+	if (dev->node_props.cpu_cores_count && dev->node_props.simd_count)
+		pr_info("Topology: Add APU node [0x%0x:0x%0x]\n",
+			dev->node_props.device_id, dev->node_props.vendor_id);
+	else if (dev->node_props.cpu_cores_count)
+		pr_info("Topology: Add CPU node\n");
+	else if (dev->node_props.simd_count)
+		pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n",
+			dev->node_props.device_id, dev->node_props.vendor_id);
+out:
+	up_read(&topology_lock);
+}
+
+/* Helper function for initializing platform_xx members of
+ * kfd_system_properties from the OEM fields of the device at the tail of
+ * topology_device_list. sys_props is left untouched when the list is empty.
+ */
+static void kfd_update_system_properties(void)
+{
+	struct kfd_topology_device *dev;
+
+	down_read(&topology_lock);
+
+	/*
+	 * list_last_entry() never yields NULL; an empty list must be detected
+	 * with list_empty() or the list head would be read as a device.
+	 */
+	if (list_empty(&topology_device_list))
+		goto out;
+
+	dev = list_last_entry(&topology_device_list,
+			struct kfd_topology_device, list);
+
+	/*
+	 * NOTE(review): sys_props is written here while only the read side
+	 * of topology_lock is held - confirm no concurrent writer exists.
+	 */
+	sys_props.platform_id =
+		(*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK;
+	sys_props.platform_oem = *((uint64_t *)dev->oem_table_id);
+	sys_props.platform_rev = dev->oem_revision;
+out:
+	up_read(&topology_lock);
+}
+
+/* find_system_memory - dmi_walk() callback. For a DMI memory device record
+ *	it copies the width and clock words found in the record into every
+ *	memory bank of the topology device passed through @private.
+ * @dm - DMI record currently being visited
+ * @private - struct kfd_topology_device whose mem_props are updated
+ */
+static void find_system_memory(const struct dmi_header *dm,
+	void *private)
+{
+	struct kfd_mem_properties *mem;
+	u16 mem_width;
+	u16 mem_clock = 0;
+	struct kfd_topology_device *kdev =
+		(struct kfd_topology_device *)private;
+	const u8 *dmi_data = (const u8 *)(dm + 1);
+
+	if (dm->type != DMI_ENTRY_MEM_DEVICE || dm->length < 0x15)
+		return;
+
+	mem_width = (u16)(*(const u16 *)(dmi_data + 0x6));
+
+	/*
+	 * The clock word sits at dmi_data + 0x11, i.e. record bytes
+	 * 0x15..0x16 (assuming the usual 4-byte DMI header - TODO confirm).
+	 * A record whose length is below 0x17 does not contain it, so leave
+	 * mem_clock at 0, which means "do not update" below.
+	 */
+	if (dm->length >= 0x17)
+		mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11));
+
+	list_for_each_entry(mem, &kdev->mem_props, list) {
+		if (mem_width != 0xFFFF && mem_width != 0)
+			mem->width = mem_width;
+		if (mem_clock != 0)
+			mem->mem_clk_max = mem_clock;
+	}
+}
+/* kfd_add_non_crat_information - Add information that is not currently
+ *	defined in CRAT but is necessary for KFD topology
+ * @kdev - topology device to which the additional info is added
+ */
+static void kfd_add_non_crat_information(struct kfd_topology_device *kdev)
+{
+	/* TODO: For GPU node, rearrange code from kfd_topology_add_device */
+	if (kdev->gpu != NULL)
+		return;
+
+	/* CPU only node: add system memory information from the DMI tables */
+	dmi_walk(find_system_memory, kdev);
+}
+
int kfd_topology_init(void)
{
void *crat_image = NULL;
size_t image_size = 0;
int ret;
+ struct list_head temp_topology_device_list;
+ int cpu_only_node = 0;
+ struct kfd_topology_device *kdev;
+ int proximity_domain;
+ int num_nodes;
+
+ /* topology_device_list - Master list of all topology devices
+ * temp_topology_device_list - temporary list created while parsing CRAT
+ * or VCRAT. Once parsing is complete the contents of list is moved to
+ * topology_device_list
+ */
- /*
- * Initialize the head for the topology device list
+ /* Initialize the heads of both lists
*/
INIT_LIST_HEAD(&topology_device_list);
+ INIT_LIST_HEAD(&temp_topology_device_list);
init_rwsem(&topology_lock);
- topology_crat_parsed = 0;
memset(&sys_props, 0, sizeof(sys_props));
+ /* Proximity domains in ACPI CRAT tables start counting at
+ * 0. The same should be true for virtual CRAT tables created
+ * at this stage. GPUs added later in kfd_topology_add_device
+ * use a counter. */
+ proximity_domain = 0;
+
/*
- * Get the CRAT image from the ACPI
+ * Get the CRAT image from the ACPI. If ACPI doesn't have one
+ * create a virtual CRAT.
+ * NOTE: The current implementation expects all AMD APUs to have
+ * CRAT. If no CRAT is available, it is assumed to be a CPU
*/
- ret = kfd_topology_get_crat_acpi(crat_image, &image_size);
- if (ret == 0 && image_size > 0) {
- pr_info("Found CRAT image with size=%zd\n", image_size);
- crat_image = kmalloc(image_size, GFP_KERNEL);
- if (!crat_image) {
- ret = -ENOMEM;
- pr_err("No memory for allocating CRAT image\n");
- goto err;
- }
- ret = kfd_topology_get_crat_acpi(crat_image, &image_size);
-
- if (ret == 0) {
- down_write(&topology_lock);
- ret = kfd_parse_crat_table(crat_image);
- if (ret == 0)
- ret = kfd_topology_update_sysfs();
- up_write(&topology_lock);
- } else {
- pr_err("Couldn't get CRAT table size from ACPI\n");
- }
- kfree(crat_image);
- } else if (ret == -ENODATA) {
- ret = 0;
- } else {
- pr_err("Couldn't get CRAT table size from ACPI\n");
+ ret = kfd_create_crat_image_acpi(&crat_image, &image_size);
+ if (ret != 0) {
+ ret = kfd_create_crat_image_virtual(&crat_image, &image_size,
+ COMPUTE_UNIT_CPU, NULL,
+ proximity_domain);
+ cpu_only_node = 1;
+ }
+
+ if (ret == 0)
+ ret = kfd_parse_crat_table(crat_image,
+ &temp_topology_device_list,
+ proximity_domain);
+ else {
+ pr_err("Error getting/creating CRAT table\n");
+ goto err;
+ }
+
+ down_write(&topology_lock);
+ num_nodes = kfd_topology_update_device_list(&temp_topology_device_list,
+ &topology_device_list);
+ atomic_set(&topology_crat_proximity_domain, num_nodes-1);
+ ret = kfd_topology_update_sysfs();
+ up_write(&topology_lock);
+
+ if (ret == 0) {
+ sys_props.generation_count++;
+ kfd_update_system_properties();
+ kfd_debug_print_topology();
+ pr_info("Finished initializing topology\n");
+ }
+ else
+ pr_err("Failed to update topology in sysfs ret=%d\n", ret);
+
+ /* For nodes with GPU, this information gets added
+ * when GPU is detected (kfd_topology_add_device). */
+ if (cpu_only_node) {
+ /* Add additional information to CPU only node created above */
+ down_write(&topology_lock);
+ kdev = list_first_entry(&topology_device_list,
+ struct kfd_topology_device, list);
+ up_write(&topology_lock);
+ kfd_add_non_crat_information(kdev);
}
err:
- pr_info("Finished initializing topology ret=%d\n", ret);
+ kfd_destroy_crat_image(crat_image);
return ret;
}
+/* kfd_topology_shutdown - Release the topology sysfs entries while holding
+ * the write side of topology_lock, then release the topology device list.
+ */
void kfd_topology_shutdown(void)
{
+ down_write(&topology_lock);
kfd_topology_release_sysfs();
+ up_write(&topology_lock);
kfd_release_live_view();
}
-static void kfd_debug_print_topology(void)
-{
- struct kfd_topology_device *dev;
- uint32_t i = 0;
-
- pr_info("DEBUG PRINT OF TOPOLOGY:");
- list_for_each_entry(dev, &topology_device_list, list) {
- pr_info("Node: %d\n", i);
- pr_info("\tGPU assigned: %s\n", (dev->gpu ? "yes" : "no"));
- pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count);
- pr_info("\tSIMD count: %d", dev->node_props.simd_count);
- i++;
- }
-}
-
static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
{
uint32_t hashout;
uint32_t buf[7];
+ uint64_t local_mem_size;
int i;
+ struct kfd_local_mem_info local_mem_info;
if (!gpu)
return 0;
+ gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info);
+
+ local_mem_size = local_mem_info.local_mem_size_private +
+ local_mem_info.local_mem_size_public;
+
buf[0] = gpu->pdev->devfn;
buf[1] = gpu->pdev->subsystem_vendor;
buf[2] = gpu->pdev->subsystem_device;
buf[3] = gpu->pdev->device;
buf[4] = gpu->pdev->bus->number;
- buf[5] = (uint32_t)(gpu->kfd2kgd->get_vmem_size(gpu->kgd)
- & 0xffffffff);
- buf[6] = (uint32_t)(gpu->kfd2kgd->get_vmem_size(gpu->kgd) >> 32);
+ buf[5] = lower_32_bits(local_mem_size);
+ buf[6] = upper_32_bits(local_mem_size);
for (i = 0, hashout = 0; i < 7; i++)
hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
return hashout;
}
-
+/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
+ * the GPU device is not already present in the topology device list
+ * then return NULL. This means a new topology device has to be
+ * created for this GPU.
+ * TODO: Rather than assigning @gpu to the first topology device without
+ * a gpu attached, it would be better to have a more stringent check.
+ */
static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
{
struct kfd_topology_device *dev;
@@ -1117,13 +953,14 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
BUG_ON(!gpu);
+ down_write(&topology_lock);
list_for_each_entry(dev, &topology_device_list, list)
if (dev->gpu == NULL && dev->node_props.simd_count > 0) {
dev->gpu = gpu;
out_dev = dev;
break;
}
-
+ up_write(&topology_lock);
return out_dev;
}
@@ -1135,70 +972,146 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival)
*/
}
+/* kfd_fill_mem_clk_max_info - CRAT carries no memory clock information, so
+ *	it is patched into the memory banks once CRAT parsing is done.
+ * @dev - topology device whose memory banks are updated; NULL is ignored
+ */
+static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev)
+{
+	struct kfd_local_mem_info mem_info;
+	struct kfd_mem_properties *bank;
+
+	if (!dev)
+		return;
+
+	/* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with
+	 * single bank of VRAM local memory.
+	 * for dGPUs - VCRAT reports only one bank of Local Memory
+	 * for APUs - If CRAT from ACPI reports more than one bank, then
+	 *	all the banks will report the same mem_clk_max information
+	 */
+	dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, &mem_info);
+
+	/* Every bank receives the same clock value. */
+	list_for_each_entry(bank, &dev->mem_props, list)
+		bank->mem_clk_max = mem_info.mem_clk_max;
+}
+
int kfd_topology_add_device(struct kfd_dev *gpu)
{
uint32_t gpu_id;
struct kfd_topology_device *dev;
- int res;
+ struct kfd_cu_info cu_info;
+ int res = 0;
+ struct list_head temp_topology_device_list;
+ void *crat_image = NULL;
+ size_t image_size = 0;
+ int proximity_domain;
BUG_ON(!gpu);
+ INIT_LIST_HEAD(&temp_topology_device_list);
+
gpu_id = kfd_generate_gpu_id(gpu);
pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
- down_write(&topology_lock);
- /*
- * Try to assign the GPU to existing topology device (generated from
- * CRAT table
+ proximity_domain = atomic_inc_return(&
+ topology_crat_proximity_domain);
+
+ /* Check to see if this gpu device exists in the topology_device_list.
+ * If so, assign the gpu to that device,
+ * else create a Virtual CRAT for this gpu device and then parse that CRAT
+ * to create a new topology device. Once created assign the gpu to that
+ * topology device
*/
dev = kfd_assign_gpu(gpu);
if (!dev) {
- pr_info("GPU was not found in the current topology. Extending.\n");
- kfd_debug_print_topology();
- dev = kfd_create_topology_device();
- if (!dev) {
- res = -ENOMEM;
+ res = kfd_create_crat_image_virtual(&crat_image, &image_size,
+ COMPUTE_UNIT_GPU,
+ gpu, proximity_domain);
+ if (res == 0)
+ res = kfd_parse_crat_table(crat_image,
+ &temp_topology_device_list, proximity_domain);
+ else {
+ pr_err("Error in VCRAT for GPU (ID: 0x%x)\n", gpu_id);
goto err;
}
- dev->gpu = gpu;
- /*
- * TODO: Make a call to retrieve topology information from the
- * GPU vBIOS
- */
+ down_write(&topology_lock);
+ kfd_topology_update_device_list(&temp_topology_device_list,
+ &topology_device_list);
/*
* Update the SYSFS tree, since we added another topology device
*/
- if (kfd_topology_update_sysfs() < 0)
- kfd_topology_release_sysfs();
-
+ res = kfd_topology_update_sysfs();
+ up_write(&topology_lock);
+
+ if (res == 0)
+ sys_props.generation_count++;
+ else
+ pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
+ gpu_id, res);
+ dev = kfd_assign_gpu(gpu);
+ BUG_ON(!dev);
}
dev->gpu_id = gpu_id;
gpu->id = gpu_id;
+
+ /* TODO: Move the following lines to function
+ * kfd_add_non_crat_information */
+
+ /* Fill-in additional information that is not available in CRAT but
+ * needed for the topology */
+
+ dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info);
+ dev->node_props.simd_arrays_per_engine = cu_info.num_shader_arrays_per_engine;
+
dev->node_props.vendor_id = gpu->pdev->vendor;
dev->node_props.device_id = gpu->pdev->device;
- dev->node_props.location_id = (gpu->pdev->bus->number << 24) +
- (gpu->pdev->devfn & 0xffffff);
- /*
- * TODO: Retrieve max engine clock values from KGD
- */
-
- if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
- dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE;
- pr_info("amdkfd: adding doorbell packet type capability\n");
+ dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number,
+ gpu->pdev->devfn);
+ dev->node_props.max_engine_clk_fcompute =
+ dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd);
+ dev->node_props.max_engine_clk_ccompute =
+ cpufreq_quick_get_max(0) / 1000;
+
+ kfd_fill_mem_clk_max_info(dev);
+
+ switch (dev->gpu->device_info->asic_family) {
+ case CHIP_KAVERI:
+ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 <<
+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+ break;
+ case CHIP_CARRIZO:
+ case CHIP_TONGA:
+ case CHIP_FIJI:
+ pr_debug("amdkfd: adding doorbell packet type capability\n");
+ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 <<
+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+ break;
}
- res = 0;
+ /* Fix errors in CZ CRAT.
+ * simd_count: Carrizo CRAT reports wrong simd_count, probably because it
+ * doesn't consider masked out CUs
+ * capability flag: Carrizo CRAT doesn't report IOMMU flags.
+ */
+ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
+ dev->node_props.simd_count =
+ cu_info.simd_per_cu * cu_info.cu_active_number;
+ dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
+ }
+ kfd_debug_print_topology();
err:
- up_write(&topology_lock);
-
if (res == 0)
kfd_notify_gpu_change(gpu_id, 1);
+ kfd_destroy_crat_image(crat_image);
return res;
}
@@ -1231,22 +1144,26 @@ int kfd_topology_remove_device(struct kfd_dev *gpu)
return res;
}
-/*
- * When idx is out of bounds, the function will return NULL
+/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD
+ * topology. If GPU device is found @idx, then valid kfd_dev pointer is
+ * returned through @kdev
+ * Return - 0: On success (@kdev will be NULL for non GPU nodes)
+ * -1: If end of list
*/
-struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
+int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
{
struct kfd_topology_device *top_dev;
- struct kfd_dev *device = NULL;
uint8_t device_idx = 0;
+ *kdev = NULL;
down_read(&topology_lock);
list_for_each_entry(top_dev, &topology_device_list, list) {
if (device_idx == idx) {
- device = top_dev->gpu;
- break;
+ *kdev = top_dev->gpu;
+ up_read(&topology_lock);
+ return 0;
}
device_idx++;
@@ -1254,6 +1171,57 @@ struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
up_read(&topology_lock);
- return device;
+ return -1;
+
+}
+
+/* kfd_cpumask_to_apic_id - Return the APIC ID of the first CPU set in
+ *	@cpumask.
+ *	Return -1 if @cpumask is NULL, is cpu_none_mask or has no CPU set.
+ */
+static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask)
+{
+	unsigned int first_cpu_of_numa_node;
+
+	if (cpumask == NULL || cpumask == cpu_none_mask)
+		return -1;
+
+	/*
+	 * cpumask_first() returns a value >= nr_cpu_ids when no bit is set.
+	 * Such a value is not a valid CPU number and must not be used to
+	 * look up per-CPU data.
+	 */
+	first_cpu_of_numa_node = cpumask_first(cpumask);
+	if (first_cpu_of_numa_node >= nr_cpu_ids)
+		return -1;
+
+	return cpu_data(first_cpu_of_numa_node).apicid;
+}
+
+/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical
+ *	processor of the given NUMA node (numa_node_id). A node id of -1
+ *	falls back to the online CPU mask.
+ *	Return -1 on failure
+ */
+int kfd_numa_node_to_apic_id(int numa_node_id)
+{
+	const struct cpumask *mask;
+
+	if (numa_node_id != -1) {
+		mask = cpumask_of_node(numa_node_id);
+	} else {
+		pr_warn("Invalid NUMA Node. Use online CPU mask\n");
+		mask = cpu_online_mask;
+	}
+
+	return kfd_cpumask_to_apic_id(mask);
+}
+
+/* kfd_get_proximity_domain - Find proximity_domain (node id) to which
+ *	given PCI bus belongs to. CRAT table contains only the APIC ID
+ *	of the parent NUMA node. So use that as the search parameter.
+ *	Return -1 on failure, including when no APIC ID can be determined
+ *	for @bus
+ */
+int kfd_get_proximity_domain(const struct pci_bus *bus)
+{
+	struct kfd_topology_device *dev;
+	int proximity_domain = -1;
+	int apic_id;
+
+	/*
+	 * The APIC ID depends only on @bus, so resolve it once instead of
+	 * once per CPU node. A failed lookup (-1) must never be compared
+	 * against a node's cpu_core_id_base.
+	 */
+	apic_id = kfd_cpumask_to_apic_id(cpumask_of_pcibus(bus));
+	if (apic_id < 0)
+		return -1;
+
+	down_read(&topology_lock);
+
+	list_for_each_entry(dev, &topology_device_list, list)
+		if (dev->node_props.cpu_cores_count &&
+			dev->node_props.cpu_core_id_base == apic_id) {
+			proximity_domain = dev->proximity_domain;
+			break;
+		}
+
+	up_read(&topology_lock);
+	return proximity_domain;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index c3ddb9b95ff8..ab28188b492e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -39,8 +39,16 @@
#define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080
#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
-#define HSA_CAP_RESERVED 0xfffff000
+#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000
+#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12
+#define HSA_CAP_RESERVED 0xffffc000
+
+#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0
+#define HSA_CAP_DOORBELL_TYPE_1_0 0x1
+#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 /* NOTE(review): repeats the identical define above */
+#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 /* NOTE(review): repeats the identical define above */
#define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000
+#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 /* NOTE(review): bit 14 is also set in HSA_CAP_RESERVED (0xffffc000) - confirm */
struct kfd_node_properties {
uint32_t cpu_cores_count;
@@ -91,8 +99,6 @@ struct kfd_mem_properties {
struct attribute attr;
};
-#define KFD_TOPOLOGY_CPU_SIBLINGS 256
-
#define HSA_CACHE_TYPE_DATA 0x00000001
#define HSA_CACHE_TYPE_INSTRUCTION 0x00000002
#define HSA_CACHE_TYPE_CPU 0x00000004
@@ -109,7 +115,7 @@ struct kfd_cache_properties {
uint32_t cache_assoc;
uint32_t cache_latency;
uint32_t cache_type;
- uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS];
+ uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
struct kobject *kobj;
struct attribute attr;
};
@@ -135,8 +141,8 @@ struct kfd_iolink_properties {
struct kfd_topology_device {
struct list_head list;
uint32_t gpu_id;
+ uint32_t proximity_domain;
struct kfd_node_properties node_props;
- uint32_t mem_bank_count;
struct list_head mem_props;
uint32_t cache_count;
struct list_head cache_props;
@@ -150,6 +156,9 @@ struct kfd_topology_device {
struct attribute attr_gpuid;
struct attribute attr_name;
struct attribute attr_props;
+ uint8_t oem_id[CRAT_OEMID_LENGTH];
+ uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
+ uint32_t oem_revision;
};
struct kfd_system_properties {
@@ -164,6 +173,8 @@ struct kfd_system_properties {
struct attribute attr_props;
};
-
+struct kfd_topology_device *kfd_create_topology_device(
+ struct list_head *device_list);
+void kfd_release_live_view(void);
#endif /* __KFD_TOPOLOGY_H__ */