author     Ashutosh Dixit <ashutosh.dixit@intel.com>	2024-12-23 19:13:40 -0800
committer  Ashutosh Dixit <ashutosh.dixit@intel.com>	2024-12-23 19:13:40 -0800
commit     acf3ecae2825e0b533c4311cd047e702d29d1843 (patch)
tree       10ce9e4c47b993ac9831d08093023edf934156e8
parent     8a8675bcf6cd02d999d33aaad0a357f9f4098dd0 (diff)
parent     fba0f039affdd0c8767f24e41d5dbef49addea78 (diff)
Merge remote-tracking branch 'drm-xe/drm-xe-next' into drm-tip
# Conflicts:
# drivers/gpu/drm/xe/xe_exec_queue.c
30 files changed, 558 insertions, 273 deletions
diff --git a/drivers/gpu/drm/i915/display/intel_cx0_phy.c b/drivers/gpu/drm/i915/display/intel_cx0_phy.c index 5ebc90d210d4..e768dc6a15b3 100644 --- a/drivers/gpu/drm/i915/display/intel_cx0_phy.c +++ b/drivers/gpu/drm/i915/display/intel_cx0_phy.c @@ -3068,7 +3068,10 @@ int intel_mtl_tbt_calc_port_clock(struct intel_encoder *encoder) val = intel_de_read(display, XELPDP_PORT_CLOCK_CTL(display, encoder->port)); - clock = REG_FIELD_GET(XELPDP_DDI_CLOCK_SELECT_MASK, val); + if (DISPLAY_VER(display) >= 30) + clock = REG_FIELD_GET(XE3_DDI_CLOCK_SELECT_MASK, val); + else + clock = REG_FIELD_GET(XELPDP_DDI_CLOCK_SELECT_MASK, val); drm_WARN_ON(display->drm, !(val & XELPDP_FORWARD_CLOCK_UNGATE)); drm_WARN_ON(display->drm, !(val & XELPDP_TBT_CLOCK_REQUEST)); @@ -3083,13 +3086,18 @@ int intel_mtl_tbt_calc_port_clock(struct intel_encoder *encoder) return 540000; case XELPDP_DDI_CLOCK_SELECT_TBT_810: return 810000; + case XELPDP_DDI_CLOCK_SELECT_TBT_312_5: + return 1000000; + case XELPDP_DDI_CLOCK_SELECT_TBT_625: + return 2000000; default: MISSING_CASE(clock); return 162000; } } -static int intel_mtl_tbt_clock_select(int clock) +static int intel_mtl_tbt_clock_select(struct intel_display *display, + int clock) { switch (clock) { case 162000: @@ -3100,6 +3108,18 @@ static int intel_mtl_tbt_clock_select(int clock) return XELPDP_DDI_CLOCK_SELECT_TBT_540; case 810000: return XELPDP_DDI_CLOCK_SELECT_TBT_810; + case 1000000: + if (DISPLAY_VER(display) < 30) { + drm_WARN_ON(display->drm, "UHBR10 not supported for the platform\n"); + return XELPDP_DDI_CLOCK_SELECT_TBT_162; + } + return XELPDP_DDI_CLOCK_SELECT_TBT_312_5; + case 2000000: + if (DISPLAY_VER(display) < 30) { + drm_WARN_ON(display->drm, "UHBR20 not supported for the platform\n"); + return XELPDP_DDI_CLOCK_SELECT_TBT_162; + } + return XELPDP_DDI_CLOCK_SELECT_TBT_625; default: MISSING_CASE(clock); return XELPDP_DDI_CLOCK_SELECT_TBT_162; @@ -3112,15 +3132,26 @@ static void intel_mtl_tbt_pll_enable(struct intel_encoder *encoder, struct intel_display *display = to_intel_display(encoder); enum phy phy = intel_encoder_to_phy(encoder); u32 val = 0; + u32 mask; /* * 1. Program PORT_CLOCK_CTL REGISTER to configure * clock muxes, gating and SSC */ - val |= XELPDP_DDI_CLOCK_SELECT(intel_mtl_tbt_clock_select(crtc_state->port_clock)); + + if (DISPLAY_VER(display) >= 30) { + mask = XE3_DDI_CLOCK_SELECT_MASK; + val |= XE3_DDI_CLOCK_SELECT(intel_mtl_tbt_clock_select(display, crtc_state->port_clock)); + } else { + mask = XELPDP_DDI_CLOCK_SELECT_MASK; + val |= XELPDP_DDI_CLOCK_SELECT(intel_mtl_tbt_clock_select(display, crtc_state->port_clock)); + } + + mask |= XELPDP_FORWARD_CLOCK_UNGATE; val |= XELPDP_FORWARD_CLOCK_UNGATE; + intel_de_rmw(display, XELPDP_PORT_CLOCK_CTL(display, encoder->port), - XELPDP_DDI_CLOCK_SELECT_MASK | XELPDP_FORWARD_CLOCK_UNGATE, val); + mask, val); /* 2. 
Read back PORT_CLOCK_CTL REGISTER */ val = intel_de_read(display, XELPDP_PORT_CLOCK_CTL(display, encoder->port)); diff --git a/drivers/gpu/drm/i915/display/intel_cx0_phy_regs.h b/drivers/gpu/drm/i915/display/intel_cx0_phy_regs.h index 4dc6e179a774..da154ff26b96 100644 --- a/drivers/gpu/drm/i915/display/intel_cx0_phy_regs.h +++ b/drivers/gpu/drm/i915/display/intel_cx0_phy_regs.h @@ -192,7 +192,9 @@ #define XELPDP_TBT_CLOCK_REQUEST REG_BIT(19) #define XELPDP_TBT_CLOCK_ACK REG_BIT(18) #define XELPDP_DDI_CLOCK_SELECT_MASK REG_GENMASK(15, 12) +#define XE3_DDI_CLOCK_SELECT_MASK REG_GENMASK(16, 12) #define XELPDP_DDI_CLOCK_SELECT(val) REG_FIELD_PREP(XELPDP_DDI_CLOCK_SELECT_MASK, val) +#define XE3_DDI_CLOCK_SELECT(val) REG_FIELD_PREP(XE3_DDI_CLOCK_SELECT_MASK, val) #define XELPDP_DDI_CLOCK_SELECT_NONE 0x0 #define XELPDP_DDI_CLOCK_SELECT_MAXPCLK 0x8 #define XELPDP_DDI_CLOCK_SELECT_DIV18CLK 0x9 @@ -200,6 +202,8 @@ #define XELPDP_DDI_CLOCK_SELECT_TBT_270 0xd #define XELPDP_DDI_CLOCK_SELECT_TBT_540 0xe #define XELPDP_DDI_CLOCK_SELECT_TBT_810 0xf +#define XELPDP_DDI_CLOCK_SELECT_TBT_312_5 0x18 +#define XELPDP_DDI_CLOCK_SELECT_TBT_625 0x19 #define XELPDP_FORWARD_CLOCK_UNGATE REG_BIT(10) #define XELPDP_LANE1_PHY_CLOCK_SELECT REG_BIT(8) #define XELPDP_SSC_ENABLE_PLLA REG_BIT(1) diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h index 10ec2920d31b..f4ee910f0943 100644 --- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h +++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h @@ -33,12 +33,13 @@ #define MI_TOPOLOGY_FILTER __MI_INSTR(0xD) #define MI_FORCE_WAKEUP __MI_INSTR(0x1D) -#define MI_STORE_DATA_IMM __MI_INSTR(0x20) -#define MI_SDI_GGTT REG_BIT(22) -#define MI_SDI_LEN_DW GENMASK(9, 0) -#define MI_SDI_NUM_DW(x) REG_FIELD_PREP(MI_SDI_LEN_DW, (x) + 3 - 2) -#define MI_SDI_NUM_QW(x) (REG_FIELD_PREP(MI_SDI_LEN_DW, 2 * (x) + 3 - 2) | \ - REG_BIT(21)) +#define MI_STORE_DATA_IMM __MI_INSTR(0x20) +#define MI_SDI_GGTT REG_BIT(22) +#define MI_FORCE_WRITE_COMPLETION_CHECK REG_BIT(10) +#define MI_SDI_LEN_DW GENMASK(9, 0) +#define MI_SDI_NUM_DW(x) REG_FIELD_PREP(MI_SDI_LEN_DW, (x) + 3 - 2) +#define MI_SDI_NUM_QW(x) (REG_FIELD_PREP(MI_SDI_LEN_DW, 2 * (x) + 3 - 2) | \ + REG_BIT(21)) #define MI_LOAD_REGISTER_IMM __MI_INSTR(0x22) #define MI_LRI_LRM_CS_MMIO REG_BIT(19) diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index 7c78496e6213..d86219dedde2 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -83,6 +83,8 @@ #define RING_IMR(base) XE_REG((base) + 0xa8) #define RING_INT_STATUS_RPT_PTR(base) XE_REG((base) + 0xac) +#define CS_INT_VEC(base) XE_REG((base) + 0x1b8) + #define RING_EIR(base) XE_REG((base) + 0xb0) #define RING_EMR(base) XE_REG((base) + 0xb4) #define RING_ESR(base) XE_REG((base) + 0xb8) @@ -138,6 +140,7 @@ #define RING_MODE(base) XE_REG((base) + 0x29c) #define GFX_DISABLE_LEGACY_MODE REG_BIT(3) +#define GFX_MSIX_INTERRUPT_ENABLE REG_BIT(13) #define RING_TIMESTAMP(base) XE_REG((base) + 0x358) diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h index 045dfd09db99..57944f90bbf6 100644 --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h @@ -25,6 +25,9 @@ #define CTX_INT_SRC_REPORT_REG (CTX_LRI_INT_REPORT_PTR + 3) #define CTX_INT_SRC_REPORT_PTR (CTX_LRI_INT_REPORT_PTR + 4) +#define CTX_CS_INT_VEC_REG 0x5a +#define CTX_CS_INT_VEC_DATA (CTX_CS_INT_VEC_REG + 1) + #define 
INDIRECT_CTX_RING_HEAD (0x02 + 1) #define INDIRECT_CTX_RING_TAIL (0x04 + 1) #define INDIRECT_CTX_RING_START (0x06 + 1) diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c b/drivers/gpu/drm/xe/tests/xe_bo.c index c9ec7a313c6b..405ff904153e 100644 --- a/drivers/gpu/drm/xe/tests/xe_bo.c +++ b/drivers/gpu/drm/xe/tests/xe_bo.c @@ -606,8 +606,6 @@ static void xe_bo_shrink_kunit(struct kunit *test) static struct kunit_case xe_bo_tests[] = { KUNIT_CASE_PARAM(xe_ccs_migrate_kunit, xe_pci_live_device_gen_param), KUNIT_CASE_PARAM(xe_bo_evict_kunit, xe_pci_live_device_gen_param), - KUNIT_CASE_PARAM_ATTR(xe_bo_shrink_kunit, xe_pci_live_device_gen_param, - {.speed = KUNIT_SPEED_SLOW}), {} }; @@ -618,3 +616,17 @@ struct kunit_suite xe_bo_test_suite = { .init = xe_kunit_helper_xe_device_live_test_init, }; EXPORT_SYMBOL_IF_KUNIT(xe_bo_test_suite); + +static struct kunit_case xe_bo_shrink_test[] = { + KUNIT_CASE_PARAM_ATTR(xe_bo_shrink_kunit, xe_pci_live_device_gen_param, + {.speed = KUNIT_SPEED_SLOW}), + {} +}; + +VISIBLE_IF_KUNIT +struct kunit_suite xe_bo_shrink_test_suite = { + .name = "xe_bo_shrink", + .test_cases = xe_bo_shrink_test, + .init = xe_kunit_helper_xe_device_live_test_init, +}; +EXPORT_SYMBOL_IF_KUNIT(xe_bo_shrink_test_suite); diff --git a/drivers/gpu/drm/xe/tests/xe_live_test_mod.c b/drivers/gpu/drm/xe/tests/xe_live_test_mod.c index 0d36ab864ec0..81277c77016d 100644 --- a/drivers/gpu/drm/xe/tests/xe_live_test_mod.c +++ b/drivers/gpu/drm/xe/tests/xe_live_test_mod.c @@ -6,11 +6,13 @@ #include <kunit/test.h> extern struct kunit_suite xe_bo_test_suite; +extern struct kunit_suite xe_bo_shrink_test_suite; extern struct kunit_suite xe_dma_buf_test_suite; extern struct kunit_suite xe_migrate_test_suite; extern struct kunit_suite xe_mocs_test_suite; kunit_test_suite(xe_bo_test_suite); +kunit_test_suite(xe_bo_shrink_test_suite); kunit_test_suite(xe_dma_buf_test_suite); kunit_test_suite(xe_migrate_test_suite); kunit_test_suite(xe_mocs_test_suite); diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 56d4ffb650da..bf36e4fb4679 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -325,7 +325,9 @@ struct xe_device *xe_device_create(struct pci_dev *pdev, xe->info.revid = pdev->revision; xe->info.force_execlist = xe_modparam.force_execlist; - spin_lock_init(&xe->irq.lock); + err = xe_irq_init(xe); + if (err) + goto err; init_waitqueue_head(&xe->ufence_wq); diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h index f1fbfe916867..fc3c2af3fb7f 100644 --- a/drivers/gpu/drm/xe/xe_device.h +++ b/drivers/gpu/drm/xe/xe_device.h @@ -157,8 +157,7 @@ static inline bool xe_device_has_sriov(struct xe_device *xe) static inline bool xe_device_has_msix(struct xe_device *xe) { - /* TODO: change this when MSI-X support is fully integrated */ - return false; + return xe->irq.msix.nvec > 0; } static inline bool xe_device_has_memirq(struct xe_device *xe) diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index ace22e35e769..8a7b15972413 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -348,6 +348,14 @@ struct xe_device { /** @irq.enabled: interrupts enabled on this device */ atomic_t enabled; + + /** @irq.msix: irq info for platforms that support MSI-X */ + struct { + /** @irq.msix.nvec: number of MSI-X interrupts */ + u16 nvec; + /** @irq.msix.indexes: used to allocate MSI-X indexes */ + struct xarray indexes; + } msix; } irq; /** @ttm: ttm device */ diff --git 
a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 1b4bb083555f..7e1abbbfba12 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -17,6 +17,7 @@ #include "xe_hw_engine_class_sysfs.h" #include "xe_hw_engine_group.h" #include "xe_hw_fence.h" +#include "xe_irq.h" #include "xe_lrc.h" #include "xe_macros.h" #include "xe_migrate.h" @@ -69,6 +70,7 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe, q->gt = gt; q->class = hwe->class; q->width = width; + q->msix_vec = XE_IRQ_DEFAULT_MSIX; q->logical_mask = logical_mask; q->fence_irq = >->fence_irq[hwe->class]; q->ring_ops = gt->ring_ops[hwe->class]; @@ -118,7 +120,7 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q) } for (i = 0; i < q->width; ++i) { - q->lrc[i] = xe_lrc_create(q->hwe, q->vm, SZ_16K); + q->lrc[i] = xe_lrc_create(q->hwe, q->vm, SZ_16K, q->msix_vec); if (IS_ERR(q->lrc[i])) { err = PTR_ERR(q->lrc[i]); goto err_unlock; @@ -768,25 +770,20 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q) void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) { struct xe_device *xe = gt_to_xe(q->gt); - struct xe_file *xef; struct xe_lrc *lrc; u32 old_ts, new_ts; int idx; /* - * Jobs that are run during driver load may use an exec_queue, but are - * not associated with a user xe file, so avoid accumulating busyness - * for kernel specific work. + * Jobs that are executed by kernel doesn't have a corresponding xe_file + * and thus are not accounted. */ - if (!q->vm || !q->vm->xef) + if (!q->xef) return; /* Synchronize with unbind while holding the xe file open */ if (!drm_dev_enter(&xe->drm, &idx)) return; - - xef = q->vm->xef; - /* * Only sample the first LRC. For parallel submission, all of them are * scheduled together and we compensate that below by multiplying by @@ -797,7 +794,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q) */ lrc = q->lrc[0]; new_ts = xe_lrc_update_timestamp(lrc, &old_ts); - xef->run_ticks[q->class] += (new_ts - old_ts) * q->width; + q->xef->run_ticks[q->class] += (new_ts - old_ts) * q->width; drm_dev_exit(idx); } diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 1158b6062a6c..eec8f9935a58 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -63,6 +63,8 @@ struct xe_exec_queue { char name[MAX_FENCE_NAME_LEN]; /** @width: width (number BB submitted per exec) of this exec queue */ u16 width; + /** @msix_vec: MSI-X vector (for platforms that support it) */ + u16 msix_vec; /** @fence_irq: fence IRQ used to signal job completion */ struct xe_hw_fence_irq *fence_irq; diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c index a8c416a48812..5ef96deaa881 100644 --- a/drivers/gpu/drm/xe/xe_execlist.c +++ b/drivers/gpu/drm/xe/xe_execlist.c @@ -17,6 +17,7 @@ #include "xe_exec_queue.h" #include "xe_gt.h" #include "xe_hw_fence.h" +#include "xe_irq.h" #include "xe_lrc.h" #include "xe_macros.h" #include "xe_mmio.h" @@ -47,6 +48,7 @@ static void __start_lrc(struct xe_hw_engine *hwe, struct xe_lrc *lrc, struct xe_mmio *mmio = >->mmio; struct xe_device *xe = gt_to_xe(gt); u64 lrc_desc; + u32 ring_mode = _MASKED_BIT_ENABLE(GFX_DISABLE_LEGACY_MODE); lrc_desc = xe_lrc_descriptor(lrc); @@ -80,8 +82,10 @@ static void __start_lrc(struct xe_hw_engine *hwe, struct xe_lrc *lrc, xe_mmio_write32(mmio, RING_HWS_PGA(hwe->mmio_base), xe_bo_ggtt_addr(hwe->hwsp)); xe_mmio_read32(mmio, 
RING_HWS_PGA(hwe->mmio_base)); - xe_mmio_write32(mmio, RING_MODE(hwe->mmio_base), - _MASKED_BIT_ENABLE(GFX_DISABLE_LEGACY_MODE)); + + if (xe_device_has_msix(gt_to_xe(hwe->gt))) + ring_mode |= _MASKED_BIT_ENABLE(GFX_MSIX_INTERRUPT_ENABLE); + xe_mmio_write32(mmio, RING_MODE(hwe->mmio_base), ring_mode); xe_mmio_write32(mmio, RING_EXECLIST_SQ_CONTENTS_LO(hwe->mmio_base), lower_32_bits(lrc_desc)); @@ -265,7 +269,7 @@ struct xe_execlist_port *xe_execlist_port_create(struct xe_device *xe, port->hwe = hwe; - port->lrc = xe_lrc_create(hwe, NULL, SZ_16K); + port->lrc = xe_lrc_create(hwe, NULL, SZ_16K, XE_IRQ_DEFAULT_MSIX); if (IS_ERR(port->lrc)) { err = PTR_ERR(port->lrc); goto err; diff --git a/drivers/gpu/drm/xe/xe_gt_idle.c b/drivers/gpu/drm/xe/xe_gt_idle.c index fd80afeef56a..ffd3ba7f6656 100644 --- a/drivers/gpu/drm/xe/xe_gt_idle.c +++ b/drivers/gpu/drm/xe/xe_gt_idle.c @@ -122,10 +122,12 @@ void xe_gt_idle_enable_pg(struct xe_gt *gt) if (!xe_gt_is_media_type(gt)) gtidle->powergate_enable |= RENDER_POWERGATE_ENABLE; - for (i = XE_HW_ENGINE_VCS0, j = 0; i <= XE_HW_ENGINE_VCS7; ++i, ++j) { - if ((gt->info.engine_mask & BIT(i))) - gtidle->powergate_enable |= (VDN_HCP_POWERGATE_ENABLE(j) | - VDN_MFXVDENC_POWERGATE_ENABLE(j)); + if (xe->info.platform != XE_DG1) { + for (i = XE_HW_ENGINE_VCS0, j = 0; i <= XE_HW_ENGINE_VCS7; ++i, ++j) { + if ((gt->info.engine_mask & BIT(i))) + gtidle->powergate_enable |= (VDN_HCP_POWERGATE_ENABLE(j) | + VDN_MFXVDENC_POWERGATE_ENABLE(j)); + } } fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c index 4e2868efb620..408365dfe4ee 100644 --- a/drivers/gpu/drm/xe/xe_guc.c +++ b/drivers/gpu/drm/xe/xe_guc.c @@ -147,6 +147,34 @@ static u32 guc_ctl_ads_flags(struct xe_guc *guc) return flags; } +static bool needs_wa_dual_queue(struct xe_gt *gt) +{ + /* + * The DUAL_QUEUE_WA tells the GuC to not allow concurrent submissions + * on RCS and CCSes with different address spaces, which on DG2 is + * required as a WA for an HW bug. + */ + if (XE_WA(gt, 22011391025)) + return true; + + /* + * On newer platforms, the HW has been updated to not allow parallel + * execution of different address spaces, so the RCS/CCS will stall the + * context switch if one of the other RCS/CCSes is busy with a different + * address space. While functionally correct, having a submission + * stalled on the HW limits the GuC ability to shuffle things around and + * can cause complications if the non-stalled submission runs for a long + * time, because the GuC doesn't know that the stalled submission isn't + * actually running and might declare it as hung. Therefore, we enable + * the DUAL_QUEUE_WA on all newer platforms on GTs that have CCS engines + * to move management back to the GuC. 
+ */ + if (CCS_MASK(gt) && GRAPHICS_VERx100(gt_to_xe(gt)) >= 1270) + return true; + + return false; +} + static u32 guc_ctl_wa_flags(struct xe_guc *guc) { struct xe_device *xe = guc_to_xe(guc); @@ -159,7 +187,7 @@ static u32 guc_ctl_wa_flags(struct xe_guc *guc) if (XE_WA(gt, 14014475959)) flags |= GUC_WA_HOLD_CCS_SWITCHOUT; - if (XE_WA(gt, 22011391025)) + if (needs_wa_dual_queue(gt)) flags |= GUC_WA_DUAL_QUEUE; /* diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c index b19366744148..ac9c666a9652 100644 --- a/drivers/gpu/drm/xe/xe_hw_engine.c +++ b/drivers/gpu/drm/xe/xe_hw_engine.c @@ -324,6 +324,7 @@ void xe_hw_engine_enable_ring(struct xe_hw_engine *hwe) { u32 ccs_mask = xe_hw_engine_mask_per_class(hwe->gt, XE_ENGINE_CLASS_COMPUTE); + u32 ring_mode = _MASKED_BIT_ENABLE(GFX_DISABLE_LEGACY_MODE); if (hwe->class == XE_ENGINE_CLASS_COMPUTE && ccs_mask) xe_mmio_write32(&hwe->gt->mmio, RCU_MODE, @@ -332,8 +333,10 @@ void xe_hw_engine_enable_ring(struct xe_hw_engine *hwe) xe_hw_engine_mmio_write32(hwe, RING_HWSTAM(0), ~0x0); xe_hw_engine_mmio_write32(hwe, RING_HWS_PGA(0), xe_bo_ggtt_addr(hwe->hwsp)); - xe_hw_engine_mmio_write32(hwe, RING_MODE(0), - _MASKED_BIT_ENABLE(GFX_DISABLE_LEGACY_MODE)); + + if (xe_device_has_msix(gt_to_xe(hwe->gt))) + ring_mode |= _MASKED_BIT_ENABLE(GFX_MSIX_INTERRUPT_ENABLE); + xe_hw_engine_mmio_write32(hwe, RING_MODE(0), ring_mode); xe_hw_engine_mmio_write32(hwe, RING_MI_MODE(0), _MASKED_BIT_DISABLE(STOP_RING)); xe_hw_engine_mmio_read32(hwe, RING_MI_MODE(0)); @@ -772,7 +775,7 @@ static void check_gsc_availability(struct xe_gt *gt) xe_mmio_write32(>->mmio, GUNIT_GSC_INTR_ENABLE, 0); xe_mmio_write32(>->mmio, GUNIT_GSC_INTR_MASK, ~0); - drm_info(&xe->drm, "gsccs disabled due to lack of FW\n"); + drm_dbg(&xe->drm, "GSC FW not used, disabling gsccs\n"); } } diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c index 1c509e66694d..32f5a67a917b 100644 --- a/drivers/gpu/drm/xe/xe_irq.c +++ b/drivers/gpu/drm/xe/xe_irq.c @@ -10,6 +10,7 @@ #include <drm/drm_managed.h> #include "display/xe_display.h" +#include "regs/xe_guc_regs.h" #include "regs/xe_irq_regs.h" #include "xe_device.h" #include "xe_drv.h" @@ -29,6 +30,11 @@ #define IIR(offset) XE_REG(offset + 0x8) #define IER(offset) XE_REG(offset + 0xc) +static int xe_irq_msix_init(struct xe_device *xe); +static void xe_irq_msix_free(struct xe_device *xe); +static int xe_irq_msix_request_irqs(struct xe_device *xe); +static void xe_irq_msix_synchronize_irq(struct xe_device *xe); + static void assert_iir_is_zero(struct xe_mmio *mmio, struct xe_reg reg) { u32 val = xe_mmio_read32(mmio, reg); @@ -572,6 +578,11 @@ static void xe_irq_reset(struct xe_device *xe) if (IS_SRIOV_VF(xe)) return vf_irq_reset(xe); + if (xe_device_uses_memirq(xe)) { + for_each_tile(tile, xe, id) + xe_memirq_reset(&tile->memirq); + } + for_each_tile(tile, xe, id) { if (GRAPHICS_VERx100(xe) >= 1210) dg1_irq_reset(tile); @@ -614,6 +625,14 @@ static void xe_irq_postinstall(struct xe_device *xe) if (IS_SRIOV_VF(xe)) return vf_irq_postinstall(xe); + if (xe_device_uses_memirq(xe)) { + struct xe_tile *tile; + unsigned int id; + + for_each_tile(tile, xe, id) + xe_memirq_postinstall(&tile->memirq); + } + xe_display_irq_postinstall(xe, xe_root_mmio_gt(xe)); /* @@ -656,60 +675,83 @@ static irq_handler_t xe_irq_handler(struct xe_device *xe) return xelp_irq_handler; } -static void irq_uninstall(void *arg) +static int xe_irq_msi_request_irqs(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + irq_handler_t 
irq_handler; + int irq, err; + + irq_handler = xe_irq_handler(xe); + if (!irq_handler) { + drm_err(&xe->drm, "No supported interrupt handler"); + return -EINVAL; + } + + irq = pci_irq_vector(pdev, 0); + err = request_irq(irq, irq_handler, IRQF_SHARED, DRIVER_NAME, xe); + if (err < 0) { + drm_err(&xe->drm, "Failed to request MSI IRQ %d\n", err); + return err; + } + + return 0; +} + +static void xe_irq_msi_free(struct xe_device *xe) { - struct xe_device *xe = arg; struct pci_dev *pdev = to_pci_dev(xe->drm.dev); int irq; + irq = pci_irq_vector(pdev, 0); + free_irq(irq, xe); +} + +static void irq_uninstall(void *arg) +{ + struct xe_device *xe = arg; + if (!atomic_xchg(&xe->irq.enabled, 0)) return; xe_irq_reset(xe); - irq = pci_irq_vector(pdev, 0); - free_irq(irq, xe); + if (xe_device_has_msix(xe)) + xe_irq_msix_free(xe); + else + xe_irq_msi_free(xe); +} + +int xe_irq_init(struct xe_device *xe) +{ + spin_lock_init(&xe->irq.lock); + + return xe_irq_msix_init(xe); } int xe_irq_install(struct xe_device *xe) { struct pci_dev *pdev = to_pci_dev(xe->drm.dev); - unsigned int irq_flags = PCI_IRQ_MSIX; - irq_handler_t irq_handler; - int err, irq, nvec; - - irq_handler = xe_irq_handler(xe); - if (!irq_handler) { - drm_err(&xe->drm, "No supported interrupt handler"); - return -EINVAL; - } + unsigned int irq_flags = PCI_IRQ_MSI; + int nvec = 1; + int err; xe_irq_reset(xe); - nvec = pci_msix_vec_count(pdev); - if (nvec <= 0) { - if (nvec == -EINVAL) { - /* MSIX capability is not supported in the device, using MSI */ - irq_flags = PCI_IRQ_MSI; - nvec = 1; - } else { - drm_err(&xe->drm, "MSIX: Failed getting count\n"); - return nvec; - } + if (xe_device_has_msix(xe)) { + nvec = xe->irq.msix.nvec; + irq_flags = PCI_IRQ_MSIX; } err = pci_alloc_irq_vectors(pdev, nvec, nvec, irq_flags); if (err < 0) { - drm_err(&xe->drm, "MSI/MSIX: Failed to enable support %d\n", err); + drm_err(&xe->drm, "Failed to allocate IRQ vectors: %d\n", err); return err; } - irq = pci_irq_vector(pdev, 0); - err = request_irq(irq, irq_handler, IRQF_SHARED, DRIVER_NAME, xe); - if (err < 0) { - drm_err(&xe->drm, "Failed to request MSI/MSIX IRQ %d\n", err); + err = xe_device_has_msix(xe) ? xe_irq_msix_request_irqs(xe) : + xe_irq_msi_request_irqs(xe); + if (err) return err; - } atomic_set(&xe->irq.enabled, 1); @@ -722,18 +764,28 @@ int xe_irq_install(struct xe_device *xe) return 0; free_irq_handler: - free_irq(irq, xe); + if (xe_device_has_msix(xe)) + xe_irq_msix_free(xe); + else + xe_irq_msi_free(xe); return err; } -void xe_irq_suspend(struct xe_device *xe) +static void xe_irq_msi_synchronize_irq(struct xe_device *xe) { - int irq = to_pci_dev(xe->drm.dev)->irq; + synchronize_irq(to_pci_dev(xe->drm.dev)->irq); +} +void xe_irq_suspend(struct xe_device *xe) +{ atomic_set(&xe->irq.enabled, 0); /* no new irqs */ - synchronize_irq(irq); /* flush irqs */ + /* flush irqs */ + if (xe_device_has_msix(xe)) + xe_irq_msix_synchronize_irq(xe); + else + xe_irq_msi_synchronize_irq(xe); xe_irq_reset(xe); /* turn irqs off */ } @@ -754,3 +806,198 @@ void xe_irq_resume(struct xe_device *xe) for_each_gt(gt, xe, id) xe_irq_enable_hwe(gt); } + +/* MSI-X related definitions and functions below. 
*/ + +enum xe_irq_msix_static { + GUC2HOST_MSIX = 0, + DEFAULT_MSIX = XE_IRQ_DEFAULT_MSIX, + /* Must be last */ + NUM_OF_STATIC_MSIX, +}; + +static int xe_irq_msix_init(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + int nvec = pci_msix_vec_count(pdev); + + if (nvec == -EINVAL) + return 0; /* MSI */ + + if (nvec < 0) { + drm_err(&xe->drm, "Failed getting MSI-X vectors count: %d\n", nvec); + return nvec; + } + + xe->irq.msix.nvec = nvec; + xa_init_flags(&xe->irq.msix.indexes, XA_FLAGS_ALLOC); + return 0; +} + +static irqreturn_t guc2host_irq_handler(int irq, void *arg) +{ + struct xe_device *xe = arg; + struct xe_tile *tile; + u8 id; + + if (!atomic_read(&xe->irq.enabled)) + return IRQ_NONE; + + for_each_tile(tile, xe, id) + xe_guc_irq_handler(&tile->primary_gt->uc.guc, + GUC_INTR_GUC2HOST); + + return IRQ_HANDLED; +} + +static irqreturn_t xe_irq_msix_default_hwe_handler(int irq, void *arg) +{ + unsigned int tile_id, gt_id; + struct xe_device *xe = arg; + struct xe_memirq *memirq; + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + struct xe_tile *tile; + struct xe_gt *gt; + + if (!atomic_read(&xe->irq.enabled)) + return IRQ_NONE; + + for_each_tile(tile, xe, tile_id) { + memirq = &tile->memirq; + if (!memirq->bo) + continue; + + for_each_gt(gt, xe, gt_id) { + if (gt->tile != tile) + continue; + + for_each_hw_engine(hwe, gt, id) + xe_memirq_hwe_handler(memirq, hwe); + } + } + + return IRQ_HANDLED; +} + +static int xe_irq_msix_alloc_vector(struct xe_device *xe, void *irq_buf, + bool dynamic_msix, u16 *msix) +{ + struct xa_limit limit; + int ret; + u32 id; + + limit = (dynamic_msix) ? XA_LIMIT(NUM_OF_STATIC_MSIX, xe->irq.msix.nvec - 1) : + XA_LIMIT(*msix, *msix); + ret = xa_alloc(&xe->irq.msix.indexes, &id, irq_buf, limit, GFP_KERNEL); + if (ret) + return ret; + + if (dynamic_msix) + *msix = id; + + return 0; +} + +static void xe_irq_msix_release_vector(struct xe_device *xe, u16 msix) +{ + xa_erase(&xe->irq.msix.indexes, msix); +} + +static int xe_irq_msix_request_irq_internal(struct xe_device *xe, irq_handler_t handler, + void *irq_buf, const char *name, u16 msix) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + int ret, irq; + + irq = pci_irq_vector(pdev, msix); + if (irq < 0) + return irq; + + ret = request_irq(irq, handler, IRQF_SHARED, name, irq_buf); + if (ret < 0) + return ret; + + return 0; +} + +int xe_irq_msix_request_irq(struct xe_device *xe, irq_handler_t handler, void *irq_buf, + const char *name, bool dynamic_msix, u16 *msix) +{ + int ret; + + ret = xe_irq_msix_alloc_vector(xe, irq_buf, dynamic_msix, msix); + if (ret) + return ret; + + ret = xe_irq_msix_request_irq_internal(xe, handler, irq_buf, name, *msix); + if (ret) { + drm_err(&xe->drm, "Failed to request IRQ for MSI-X %u\n", *msix); + xe_irq_msix_release_vector(xe, *msix); + return ret; + } + + return 0; +} + +void xe_irq_msix_free_irq(struct xe_device *xe, u16 msix) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + int irq; + void *irq_buf; + + irq_buf = xa_load(&xe->irq.msix.indexes, msix); + if (!irq_buf) + return; + + irq = pci_irq_vector(pdev, msix); + if (irq < 0) { + drm_err(&xe->drm, "MSI-X %u can't be released, there is no matching IRQ\n", msix); + return; + } + + free_irq(irq, irq_buf); + xe_irq_msix_release_vector(xe, msix); +} + +int xe_irq_msix_request_irqs(struct xe_device *xe) +{ + int err; + u16 msix; + + msix = GUC2HOST_MSIX; + err = xe_irq_msix_request_irq(xe, guc2host_irq_handler, xe, + DRIVER_NAME "-guc2host", false, &msix); + if (err) + return err; + + msix = 
DEFAULT_MSIX; + err = xe_irq_msix_request_irq(xe, xe_irq_msix_default_hwe_handler, xe, + DRIVER_NAME "-default-msix", false, &msix); + if (err) { + xe_irq_msix_free_irq(xe, GUC2HOST_MSIX); + return err; + } + + return 0; +} + +void xe_irq_msix_free(struct xe_device *xe) +{ + unsigned long msix; + u32 *dummy; + + xa_for_each(&xe->irq.msix.indexes, msix, dummy) + xe_irq_msix_free_irq(xe, msix); + xa_destroy(&xe->irq.msix.indexes); +} + +void xe_irq_msix_synchronize_irq(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + unsigned long msix; + u32 *dummy; + + xa_for_each(&xe->irq.msix.indexes, msix, dummy) + synchronize_irq(pci_irq_vector(pdev, msix)); +} diff --git a/drivers/gpu/drm/xe/xe_irq.h b/drivers/gpu/drm/xe/xe_irq.h index 067514e13675..a28bd577ba52 100644 --- a/drivers/gpu/drm/xe/xe_irq.h +++ b/drivers/gpu/drm/xe/xe_irq.h @@ -6,13 +6,21 @@ #ifndef _XE_IRQ_H_ #define _XE_IRQ_H_ +#include <linux/interrupt.h> + +#define XE_IRQ_DEFAULT_MSIX 1 + struct xe_device; struct xe_tile; struct xe_gt; +int xe_irq_init(struct xe_device *xe); int xe_irq_install(struct xe_device *xe); void xe_irq_suspend(struct xe_device *xe); void xe_irq_resume(struct xe_device *xe); void xe_irq_enable_hwe(struct xe_gt *gt); +int xe_irq_msix_request_irq(struct xe_device *xe, irq_handler_t handler, void *irq_buf, + const char *name, bool dynamic_msix, u16 *msix); +void xe_irq_msix_free_irq(struct xe_device *xe, u16 msix); #endif diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 22e58c6e2a35..bbb9ffbf6367 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -584,6 +584,7 @@ static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe) { struct xe_memirq *memirq = >_to_tile(hwe->gt)->memirq; struct xe_device *xe = gt_to_xe(hwe->gt); + u8 num_regs; if (!xe_device_uses_memirq(xe)) return; @@ -593,12 +594,18 @@ static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe) regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr; regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq); - regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) | + num_regs = xe_device_has_msix(xe) ? 
3 : 2; + regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) | MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED; regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr; regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe); regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr; regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe); + + if (xe_device_has_msix(xe)) { + regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr; + /* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */ + } } static int lrc_ring_mi_mode(struct xe_hw_engine *hwe) @@ -876,7 +883,7 @@ static void xe_lrc_finish(struct xe_lrc *lrc) #define PVC_CTX_ACC_CTR_THOLD (0x2a + 1) static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, - struct xe_vm *vm, u32 ring_size) + struct xe_vm *vm, u32 ring_size, u16 msix_vec) { struct xe_gt *gt = hwe->gt; struct xe_tile *tile = gt_to_tile(gt); @@ -945,6 +952,14 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, xe_drm_client_add_bo(vm->xef->client, lrc->bo); } + if (xe_device_has_msix(xe)) { + xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR, + xe_memirq_status_ptr(&tile->memirq, hwe)); + xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR, + xe_memirq_source_ptr(&tile->memirq, hwe)); + xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec); + } + if (xe_gt_has_indirect_ring_state(gt)) { xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE, __xe_lrc_indirect_ring_ggtt_addr(lrc)); @@ -1005,6 +1020,7 @@ err_lrc_finish: * @hwe: Hardware Engine * @vm: The VM (address space) * @ring_size: LRC ring size + * @msix_vec: MSI-X interrupt vector (for platforms that support it) * * Allocate and initialize the Logical Ring Context (LRC). * @@ -1012,7 +1028,7 @@ err_lrc_finish: * upon failure. 
*/ struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, - u32 ring_size) + u32 ring_size, u16 msix_vec) { struct xe_lrc *lrc; int err; @@ -1021,7 +1037,7 @@ struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, if (!lrc) return ERR_PTR(-ENOMEM); - err = xe_lrc_init(lrc, hwe, vm, ring_size); + err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec); if (err) { kfree(lrc); return ERR_PTR(err); diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h index b459dcab8787..4206e6a8b50a 100644 --- a/drivers/gpu/drm/xe/xe_lrc.h +++ b/drivers/gpu/drm/xe/xe_lrc.h @@ -42,7 +42,7 @@ struct xe_lrc_snapshot { #define LRC_PPHWSP_SCRATCH_ADDR (0x34 * 4) struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, - u32 ring_size); + u32 ring_size, u16 msix_vec); void xe_lrc_destroy(struct kref *ref); /** diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 1b97d90aadda..8b32fad67878 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -581,7 +581,9 @@ static void emit_pte(struct xe_migrate *m, while (ptes) { u32 chunk = min(MAX_PTE_PER_SDI, ptes); - bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); + bb->cs[bb->len++] = MI_STORE_DATA_IMM | + MI_FORCE_WRITE_COMPLETION_CHECK | + MI_SDI_NUM_QW(chunk); bb->cs[bb->len++] = ofs; bb->cs[bb->len++] = 0; @@ -1223,7 +1225,9 @@ static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs, if (!(bb->len & 1)) bb->cs[bb->len++] = MI_NOOP; - bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); + bb->cs[bb->len++] = MI_STORE_DATA_IMM | + MI_FORCE_WRITE_COMPLETION_CHECK | + MI_SDI_NUM_QW(chunk); bb->cs[bb->len++] = lower_32_bits(addr); bb->cs[bb->len++] = upper_32_bits(addr); if (pt_op->bind) @@ -1388,7 +1392,8 @@ __xe_migrate_update_pgtables(struct xe_migrate *m, u32 idx = 0; bb->cs[bb->len++] = MI_STORE_DATA_IMM | - MI_SDI_NUM_QW(chunk); + MI_FORCE_WRITE_COMPLETION_CHECK | + MI_SDI_NUM_QW(chunk); bb->cs[bb->len++] = ofs; bb->cs[bb->len++] = 0; /* upper_32_bits */ diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index ec88b18e9baa..4e00a77289c5 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -16,7 +16,6 @@ #include "instructions/xe_mi_commands.h" #include "regs/xe_engine_regs.h" #include "regs/xe_gt_regs.h" -#include "regs/xe_lrc_layout.h" #include "regs/xe_oa_regs.h" #include "xe_assert.h" #include "xe_bb.h" @@ -28,7 +27,6 @@ #include "xe_gt_mcr.h" #include "xe_gt_printk.h" #include "xe_guc_pc.h" -#include "xe_lrc.h" #include "xe_macros.h" #include "xe_mmio.h" #include "xe_oa.h" @@ -74,12 +72,6 @@ struct xe_oa_config { struct rcu_head rcu; }; -struct flex { - struct xe_reg reg; - u32 offset; - u32 value; -}; - struct xe_oa_open_param { struct xe_file *xef; u32 oa_unit_id; @@ -97,6 +89,7 @@ struct xe_oa_open_param { int num_syncs; struct xe_sync_entry *syncs; size_t oa_buffer_size; + int wait_num_reports; }; struct xe_oa_config_bo { @@ -241,11 +234,10 @@ static void oa_timestamp_clear(struct xe_oa_stream *stream, u32 *report) static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) { u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); + u32 tail, hw_tail, partial_report_size, available; int report_size = stream->oa_buffer.format->size; - u32 tail, hw_tail; unsigned long flags; bool pollin; - u32 partial_report_size; spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); @@ -289,8 +281,8 @@ static bool xe_oa_buffer_check_unlocked(struct 
xe_oa_stream *stream) stream->oa_buffer.tail = tail; - pollin = xe_oa_circ_diff(stream, stream->oa_buffer.tail, - stream->oa_buffer.head) >= report_size; + available = xe_oa_circ_diff(stream, stream->oa_buffer.tail, stream->oa_buffer.head); + pollin = available >= stream->wait_num_reports * report_size; spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); @@ -605,19 +597,38 @@ static __poll_t xe_oa_poll(struct file *file, poll_table *wait) return ret; } +static void xe_oa_lock_vma(struct xe_exec_queue *q) +{ + if (q->vm) { + down_read(&q->vm->lock); + xe_vm_lock(q->vm, false); + } +} + +static void xe_oa_unlock_vma(struct xe_exec_queue *q) +{ + if (q->vm) { + xe_vm_unlock(q->vm); + up_read(&q->vm->lock); + } +} + static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa_submit_deps deps, struct xe_bb *bb) { + struct xe_exec_queue *q = stream->exec_q ?: stream->k_exec_q; struct xe_sched_job *job; struct dma_fence *fence; int err = 0; - /* Kernel configuration is issued on stream->k_exec_q, not stream->exec_q */ - job = xe_bb_create_job(stream->k_exec_q, bb); + xe_oa_lock_vma(q); + + job = xe_bb_create_job(q, bb); if (IS_ERR(job)) { err = PTR_ERR(job); goto exit; } + job->ggtt = true; if (deps == XE_OA_SUBMIT_ADD_DEPS) { for (int i = 0; i < stream->num_syncs && !err; i++) @@ -632,10 +643,13 @@ static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa fence = dma_fence_get(&job->drm.s_fence->finished); xe_sched_job_push(job); + xe_oa_unlock_vma(q); + return fence; err_put_job: xe_sched_job_put(job); exit: + xe_oa_unlock_vma(q); return ERR_PTR(err); } @@ -684,63 +698,19 @@ static void xe_oa_free_configs(struct xe_oa_stream *stream) dma_fence_put(stream->last_fence); } -static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc, - struct xe_bb *bb, const struct flex *flex, u32 count) -{ - u32 offset = xe_bo_ggtt_addr(lrc->bo); - - do { - bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); - bb->cs[bb->len++] = offset + flex->offset * sizeof(u32); - bb->cs[bb->len++] = 0; - bb->cs[bb->len++] = flex->value; - - } while (flex++, --count); -} - -static int xe_oa_modify_ctx_image(struct xe_oa_stream *stream, struct xe_lrc *lrc, - const struct flex *flex, u32 count) +static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri, u32 count) { struct dma_fence *fence; struct xe_bb *bb; int err; - bb = xe_bb_new(stream->gt, 4 * count, false); + bb = xe_bb_new(stream->gt, 2 * count + 1, false); if (IS_ERR(bb)) { err = PTR_ERR(bb); goto exit; } - xe_oa_store_flex(stream, lrc, bb, flex, count); - - fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb); - if (IS_ERR(fence)) { - err = PTR_ERR(fence); - goto free_bb; - } - xe_bb_free(bb, fence); - dma_fence_put(fence); - - return 0; -free_bb: - xe_bb_free(bb, NULL); -exit: - return err; -} - -static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri) -{ - struct dma_fence *fence; - struct xe_bb *bb; - int err; - - bb = xe_bb_new(stream->gt, 3, false); - if (IS_ERR(bb)) { - err = PTR_ERR(bb); - goto exit; - } - - write_cs_mi_lri(bb, reg_lri, 1); + write_cs_mi_lri(bb, reg_lri, count); fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb); if (IS_ERR(fence)) { @@ -760,71 +730,55 @@ exit: static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable) { const struct xe_oa_format *format = stream->oa_buffer.format; - struct xe_lrc *lrc = stream->exec_q->lrc[0]; - u32 regs_offset = 
xe_lrc_regs_offset(lrc) / sizeof(u32); u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) | (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0); - struct flex regs_context[] = { + struct xe_oa_reg reg_lri[] = { { OACTXCONTROL(stream->hwe->mmio_base), - stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1, enable ? OA_COUNTER_RESUME : 0, }, { + OAR_OACONTROL, + oacontrol, + }, + { RING_CONTEXT_CONTROL(stream->hwe->mmio_base), - regs_offset + CTX_CONTEXT_CONTROL, - _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE), + _MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE, + enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0) }, }; - struct xe_oa_reg reg_lri = { OAR_OACONTROL, oacontrol }; - int err; - - /* Modify stream hwe context image with regs_context */ - err = xe_oa_modify_ctx_image(stream, stream->exec_q->lrc[0], - regs_context, ARRAY_SIZE(regs_context)); - if (err) - return err; - /* Apply reg_lri using LRI */ - return xe_oa_load_with_lri(stream, ®_lri); + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri)); } static int xe_oa_configure_oac_context(struct xe_oa_stream *stream, bool enable) { const struct xe_oa_format *format = stream->oa_buffer.format; - struct xe_lrc *lrc = stream->exec_q->lrc[0]; - u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32); u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) | (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0); - struct flex regs_context[] = { + struct xe_oa_reg reg_lri[] = { { OACTXCONTROL(stream->hwe->mmio_base), - stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1, enable ? OA_COUNTER_RESUME : 0, }, { + OAC_OACONTROL, + oacontrol + }, + { RING_CONTEXT_CONTROL(stream->hwe->mmio_base), - regs_offset + CTX_CONTEXT_CONTROL, - _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE) | + _MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE, + enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0) | _MASKED_FIELD(CTX_CTRL_RUN_ALONE, enable ? 
CTX_CTRL_RUN_ALONE : 0), }, }; - struct xe_oa_reg reg_lri = { OAC_OACONTROL, oacontrol }; - int err; /* Set ccs select to enable programming of OAC_OACONTROL */ xe_mmio_write32(&stream->gt->mmio, __oa_regs(stream)->oa_ctrl, __oa_ccs_select(stream)); - /* Modify stream hwe context image with regs_context */ - err = xe_oa_modify_ctx_image(stream, stream->exec_q->lrc[0], - regs_context, ARRAY_SIZE(regs_context)); - if (err) - return err; - - /* Apply reg_lri using LRI */ - return xe_oa_load_with_lri(stream, ®_lri); + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri)); } static int xe_oa_configure_oa_context(struct xe_oa_stream *stream, bool enable) @@ -1285,6 +1239,17 @@ static int xe_oa_set_prop_oa_buffer_size(struct xe_oa *oa, u64 value, return 0; } +static int xe_oa_set_prop_wait_num_reports(struct xe_oa *oa, u64 value, + struct xe_oa_open_param *param) +{ + if (!value) { + drm_dbg(&oa->xe->drm, "wait_num_reports %llu\n", value); + return -EINVAL; + } + param->wait_num_reports = value; + return 0; +} + static int xe_oa_set_prop_ret_inval(struct xe_oa *oa, u64 value, struct xe_oa_open_param *param) { @@ -1306,6 +1271,7 @@ static const xe_oa_set_property_fn xe_oa_set_property_funcs_open[] = { [DRM_XE_OA_PROPERTY_NUM_SYNCS] = xe_oa_set_prop_num_syncs, [DRM_XE_OA_PROPERTY_SYNCS] = xe_oa_set_prop_syncs_user, [DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE] = xe_oa_set_prop_oa_buffer_size, + [DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS] = xe_oa_set_prop_wait_num_reports, }; static const xe_oa_set_property_fn xe_oa_set_property_funcs_config[] = { @@ -1321,6 +1287,7 @@ static const xe_oa_set_property_fn xe_oa_set_property_funcs_config[] = { [DRM_XE_OA_PROPERTY_NUM_SYNCS] = xe_oa_set_prop_num_syncs, [DRM_XE_OA_PROPERTY_SYNCS] = xe_oa_set_prop_syncs_user, [DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE] = xe_oa_set_prop_ret_inval, + [DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS] = xe_oa_set_prop_ret_inval, }; static int xe_oa_user_ext_set_property(struct xe_oa *oa, enum xe_oa_user_extn_from from, @@ -1704,81 +1671,6 @@ static const struct file_operations xe_oa_fops = { .mmap = xe_oa_mmap, }; -static bool engine_supports_mi_query(struct xe_hw_engine *hwe) -{ - return hwe->class == XE_ENGINE_CLASS_RENDER || - hwe->class == XE_ENGINE_CLASS_COMPUTE; -} - -static bool xe_oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end) -{ - u32 idx = *offset; - u32 len = min(MI_LRI_LEN(state[idx]) + idx, end); - bool found = false; - - idx++; - for (; idx < len; idx += 2) { - if (state[idx] == reg) { - found = true; - break; - } - } - - *offset = idx; - return found; -} - -#define IS_MI_LRI_CMD(x) (REG_FIELD_GET(MI_OPCODE, (x)) == \ - REG_FIELD_GET(MI_OPCODE, MI_LOAD_REGISTER_IMM)) - -static u32 xe_oa_context_image_offset(struct xe_oa_stream *stream, u32 reg) -{ - struct xe_lrc *lrc = stream->exec_q->lrc[0]; - u32 len = (xe_gt_lrc_size(stream->gt, stream->hwe->class) + - lrc->ring.size) / sizeof(u32); - u32 offset = xe_lrc_regs_offset(lrc) / sizeof(u32); - u32 *state = (u32 *)lrc->bo->vmap.vaddr; - - if (drm_WARN_ON(&stream->oa->xe->drm, !state)) - return U32_MAX; - - for (; offset < len; ) { - if (IS_MI_LRI_CMD(state[offset])) { - /* - * We expect reg-value pairs in MI_LRI command, so - * MI_LRI_LEN() should be even - */ - drm_WARN_ON(&stream->oa->xe->drm, - MI_LRI_LEN(state[offset]) & 0x1); - - if (xe_oa_find_reg_in_lri(state, reg, &offset, len)) - break; - } else { - offset++; - } - } - - return offset < len ? 
offset : U32_MAX; -} - -static int xe_oa_set_ctx_ctrl_offset(struct xe_oa_stream *stream) -{ - struct xe_reg reg = OACTXCONTROL(stream->hwe->mmio_base); - u32 offset = stream->oa->ctx_oactxctrl_offset[stream->hwe->class]; - - /* Do this only once. Failure is stored as offset of U32_MAX */ - if (offset) - goto exit; - - offset = xe_oa_context_image_offset(stream, reg.addr); - stream->oa->ctx_oactxctrl_offset[stream->hwe->class] = offset; - - drm_dbg(&stream->oa->xe->drm, "%s oa ctx control at 0x%08x dword offset\n", - stream->hwe->name, offset); -exit: - return offset && offset != U32_MAX ? 0 : -ENODEV; -} - static int xe_oa_stream_init(struct xe_oa_stream *stream, struct xe_oa_open_param *param) { @@ -1797,6 +1689,7 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, stream->periodic = param->period_exponent > 0; stream->period_exponent = param->period_exponent; stream->no_preempt = param->no_preempt; + stream->wait_num_reports = param->wait_num_reports; stream->xef = xe_file_get(param->xef); stream->num_syncs = param->num_syncs; @@ -1815,17 +1708,6 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, else stream->oa_buffer.circ_size = param->oa_buffer_size; - if (stream->exec_q && engine_supports_mi_query(stream->hwe)) { - /* If we don't find the context offset, just return error */ - ret = xe_oa_set_ctx_ctrl_offset(stream); - if (ret) { - drm_err(&stream->oa->xe->drm, - "xe_oa_set_ctx_ctrl_offset failed for %s\n", - stream->hwe->name); - goto exit; - } - } - stream->oa_config = xe_oa_get_oa_config(stream->oa, param->metric_set); if (!stream->oa_config) { drm_dbg(&stream->oa->xe->drm, "Invalid OA config id=%i\n", param->metric_set); @@ -2094,8 +1976,8 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f if (XE_IOCTL_DBG(oa->xe, !param.exec_q)) return -ENOENT; - if (param.exec_q->width > 1) - drm_dbg(&oa->xe->drm, "exec_q->width > 1, programming only exec_q->lrc[0]\n"); + if (XE_IOCTL_DBG(oa->xe, param.exec_q->width > 1)) + return -EOPNOTSUPP; } /* @@ -2156,6 +2038,14 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f if (!param.oa_buffer_size) param.oa_buffer_size = DEFAULT_XE_OA_BUFFER_SIZE; + if (!param.wait_num_reports) + param.wait_num_reports = 1; + if (param.wait_num_reports > param.oa_buffer_size / f->size) { + drm_dbg(&oa->xe->drm, "wait_num_reports %d\n", param.wait_num_reports); + ret = -EINVAL; + goto err_exec_q; + } + ret = xe_oa_parse_syncs(oa, ¶m); if (ret) goto err_exec_q; diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h index df7793915628..52e33c37d5ee 100644 --- a/drivers/gpu/drm/xe/xe_oa_types.h +++ b/drivers/gpu/drm/xe/xe_oa_types.h @@ -138,9 +138,6 @@ struct xe_oa { /** @metrics_idr: List of dynamic configurations (struct xe_oa_config) */ struct idr metrics_idr; - /** @ctx_oactxctrl_offset: offset of OACTXCONTROL register in context image */ - u32 ctx_oactxctrl_offset[XE_ENGINE_CLASS_MAX]; - /** @oa_formats: tracks all OA formats across platforms */ const struct xe_oa_format *oa_formats; @@ -218,6 +215,9 @@ struct xe_oa_stream { /** @pollin: Whether there is data available to read */ bool pollin; + /** @wait_num_reports: Number of reports to wait for before signalling pollin */ + int wait_num_reports; + /** @periodic: Whether periodic sampling is currently enabled */ bool periodic; diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index a6761cb769b2..c6e57af0144c 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ 
-7,6 +7,7 @@ #include <linux/fault-inject.h> #include <linux/pm_runtime.h> +#include <linux/suspend.h> #include <drm/drm_managed.h> #include <drm/ttm/ttm_placement.h> @@ -607,7 +608,8 @@ static bool xe_pm_suspending_or_resuming(struct xe_device *xe) struct device *dev = xe->drm.dev; return dev->power.runtime_status == RPM_SUSPENDING || - dev->power.runtime_status == RPM_RESUMING; + dev->power.runtime_status == RPM_RESUMING || + pm_suspend_target_state != PM_SUSPEND_ON; #else return false; #endif diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c index d2a816f71bf2..c059639613f7 100644 --- a/drivers/gpu/drm/xe/xe_query.c +++ b/drivers/gpu/drm/xe/xe_query.c @@ -672,7 +672,8 @@ static int query_oa_units(struct xe_device *xe, du->oa_unit_type = u->type; du->oa_timestamp_freq = xe_oa_timestamp_frequency(gt); du->capabilities = DRM_XE_OA_CAPS_BASE | DRM_XE_OA_CAPS_SYNCS | - DRM_XE_OA_CAPS_OA_BUFFER_SIZE; + DRM_XE_OA_CAPS_OA_BUFFER_SIZE | + DRM_XE_OA_CAPS_WAIT_NUM_REPORTS; j = 0; for_each_hw_engine(hwe, gt, hwe_id) { diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index 0be4f489d3e1..c8ab37fa0d19 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -72,7 +72,8 @@ static int emit_user_interrupt(u32 *dw, int i) static int emit_store_imm_ggtt(u32 addr, u32 value, u32 *dw, int i) { - dw[i++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); + dw[i++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | + MI_FORCE_WRITE_COMPLETION_CHECK | MI_SDI_NUM_DW(1); dw[i++] = addr; dw[i++] = 0; dw[i++] = value; @@ -162,7 +163,8 @@ static int emit_pipe_invalidate(u32 mask_flags, bool invalidate_tlb, u32 *dw, static int emit_store_imm_ppgtt_posted(u64 addr, u64 value, u32 *dw, int i) { - dw[i++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(1); + dw[i++] = MI_STORE_DATA_IMM | MI_FORCE_WRITE_COMPLETION_CHECK | + MI_SDI_NUM_QW(1); dw[i++] = lower_32_bits(addr); dw[i++] = upper_32_bits(addr); dw[i++] = lower_32_bits(value); @@ -221,7 +223,10 @@ static int emit_pipe_imm_ggtt(u32 addr, u32 value, bool stall_only, u32 *dw, static u32 get_ppgtt_flag(struct xe_sched_job *job) { - return job->q->vm ? BIT(8) : 0; + if (job->q->vm && !job->ggtt) + return BIT(8); + + return 0; } static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i) diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index f13f333f00be..d942b20a9f29 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -56,6 +56,8 @@ struct xe_sched_job { u32 migrate_flush_flags; /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */ bool ring_ops_flush_tlb; + /** @ggtt: mapped in ggtt. */ + bool ggtt; /** @ptrs: per instance pointers. 
*/ struct xe_job_ptrs ptrs[]; }; diff --git a/drivers/gpu/drm/xe/xe_trace_bo.h b/drivers/gpu/drm/xe/xe_trace_bo.h index 1762dd30ba6d..ea50fee50c7d 100644 --- a/drivers/gpu/drm/xe/xe_trace_bo.h +++ b/drivers/gpu/drm/xe/xe_trace_bo.h @@ -60,8 +60,8 @@ TRACE_EVENT(xe_bo_move, TP_STRUCT__entry( __field(struct xe_bo *, bo) __field(size_t, size) - __field(u32, new_placement) - __field(u32, old_placement) + __string(new_placement_name, xe_mem_type_to_name[new_placement]) + __string(old_placement_name, xe_mem_type_to_name[old_placement]) __string(device_id, __dev_name_bo(bo)) __field(bool, move_lacks_source) ), @@ -69,15 +69,15 @@ TRACE_EVENT(xe_bo_move, TP_fast_assign( __entry->bo = bo; __entry->size = bo->size; - __entry->new_placement = new_placement; - __entry->old_placement = old_placement; + __assign_str(new_placement_name); + __assign_str(old_placement_name); __assign_str(device_id); __entry->move_lacks_source = move_lacks_source; ), TP_printk("move_lacks_source:%s, migrate object %p [size %zu] from %s to %s device_id:%s", __entry->move_lacks_source ? "yes" : "no", __entry->bo, __entry->size, - xe_mem_type_to_name[__entry->old_placement], - xe_mem_type_to_name[__entry->new_placement], __get_str(device_id)) + __get_str(old_placement_name), + __get_str(new_placement_name), __get_str(device_id)) ); DECLARE_EVENT_CLASS(xe_vma, diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h index c6518b0992cf..77c826589ec1 100644 --- a/include/drm/intel/pciids.h +++ b/include/drm/intel/pciids.h @@ -858,6 +858,7 @@ MACRO__(0xB092, ## __VA_ARGS__), \ MACRO__(0xB0A0, ## __VA_ARGS__), \ MACRO__(0xB0A1, ## __VA_ARGS__), \ - MACRO__(0xB0A2, ## __VA_ARGS__) + MACRO__(0xB0A2, ## __VA_ARGS__), \ + MACRO__(0xB0B0, ## __VA_ARGS__) #endif /* __PCIIDS_H__ */ diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 0383b52cbd86..f62689ca861a 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -1487,6 +1487,7 @@ struct drm_xe_oa_unit { #define DRM_XE_OA_CAPS_BASE (1 << 0) #define DRM_XE_OA_CAPS_SYNCS (1 << 1) #define DRM_XE_OA_CAPS_OA_BUFFER_SIZE (1 << 2) +#define DRM_XE_OA_CAPS_WAIT_NUM_REPORTS (1 << 3) /** @oa_timestamp_freq: OA timestamp freq */ __u64 oa_timestamp_freq; @@ -1660,6 +1661,12 @@ enum drm_xe_oa_property_id { * buffer is allocated by default. */ DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE, + + /** + * @DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS: Number of reports to wait + * for before unblocking poll or read + */ + DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS, }; /** |
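
Note on the RING_MODE programming in __start_lrc() and xe_hw_engine_enable_ring() above: RING_MODE is a masked register, so every payload bit in the low 16 bits only takes effect if its write-enable bit is also set in the high 16 bits. The standalone sketch below shows how the _MASKED_BIT_ENABLE() composition used in the diff resolves into a single 32-bit write; the macro bodies are paraphrased from the driver's register helpers rather than copied, and the printed value is purely illustrative.

#include <stdio.h>

/* Paraphrased masked-register helpers (see the driver's register headers). */
#define MASKED_FIELD(mask, value)  (((mask) << 16) | (value))
#define MASKED_BIT_ENABLE(bit)     MASKED_FIELD((bit), (bit))

#define GFX_DISABLE_LEGACY_MODE    (1u << 3)   /* REG_BIT(3)  */
#define GFX_MSIX_INTERRUPT_ENABLE  (1u << 13)  /* REG_BIT(13) */

int main(void)
{
	/* Mirrors the diff: legacy mode is always disabled, and MSI-X
	 * delivery is additionally enabled when the device has MSI-X. */
	unsigned int ring_mode = MASKED_BIT_ENABLE(GFX_DISABLE_LEGACY_MODE);
	int has_msix = 1; /* stand-in for xe_device_has_msix(xe) */

	if (has_msix)
		ring_mode |= MASKED_BIT_ENABLE(GFX_MSIX_INTERRUPT_ENABLE);

	/* Upper 16 bits carry the write-enable mask, lower 16 the values,
	 * so bits not named in the mask keep their current hardware state. */
	printf("RING_MODE write: 0x%08x\n", ring_mode);
	return 0;
}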
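Note on the MSI-X bookkeeping added in xe_irq.c: xe_irq_msix_alloc_vector() uses an xarray as an index allocator, pinning static vectors (GUC2HOST_MSIX, XE_IRQ_DEFAULT_MSIX) to fixed slots via a degenerate XA_LIMIT and handing out any free slot above the static range to dynamic requesters. The kernel-context sketch below condenses that allocation pattern; the struct and function names here are hypothetical stand-ins, not the driver's.

#include <linux/xarray.h>

#define NUM_STATIC_VECTORS 2	/* e.g. GuC-to-host + default HW-engine vector */

struct vec_table {
	struct xarray indexes;	/* MSI-X index -> cookie later passed to request_irq() */
	u16 nvec;		/* vectors granted by pci_alloc_irq_vectors() */
};

static void vec_table_init(struct vec_table *t, u16 nvec)
{
	xa_init_flags(&t->indexes, XA_FLAGS_ALLOC);
	t->nvec = nvec;
}

/*
 * Reserve an MSI-X index. With dynamic == false, *msix names the exact
 * slot to pin (the limit collapses to a single value); with
 * dynamic == true, any free slot above the static range is returned.
 */
static int vec_table_alloc(struct vec_table *t, void *cookie,
			   bool dynamic, u16 *msix)
{
	struct xa_limit limit = dynamic ?
		XA_LIMIT(NUM_STATIC_VECTORS, t->nvec - 1) :
		XA_LIMIT(*msix, *msix);
	u32 id;
	int ret;

	ret = xa_alloc(&t->indexes, &id, cookie, limit, GFP_KERNEL);
	if (ret)
		return ret;	/* -EBUSY when the requested slot/range is exhausted */

	*msix = id;
	return 0;
}

static void vec_table_release(struct vec_table *t, u16 msix)
{
	xa_erase(&t->indexes, msix);
}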