From aa375130210a9941e84d4f4cf41e6bef7a728599 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 27 Mar 2015 08:15:35 -0400 Subject: ib/mlx5/hmm: add mlx5 hmm device initialization and callback v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds the core HMM callbacks for the mlx5 device driver and initializes the HMM device for the mlx5 InfiniBand device driver. Changed since v1: - Adapt to new hmm_mirror lifetime rules. - HMM_ISDIRTY no longer exists. Changed since v2: - Adapt to HMM page table changes. Signed-off-by: Jérôme Glisse Signed-off-by: John Hubbard --- drivers/infiniband/hw/mlx5/main.c | 5 + drivers/infiniband/hw/mlx5/mem.c | 38 ++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 16 +++- drivers/infiniband/hw/mlx5/mr.c | 7 ++ drivers/infiniband/hw/mlx5/odp.c | 174 +++++++++++++++++++++++++++++++++++ 5 files changed, 239 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index cccad5113f7c..fe63a302e13c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2413,6 +2413,9 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (err) goto err_rsrc; + /* If HMM initialization fails we just do not enable odp. 
*/ + mlx5_dev_init_odp_hmm(&dev->ib_dev, &mdev->pdev->dev); + err = ib_register_device(&dev->ib_dev, NULL); if (err) goto err_odp; @@ -2437,6 +2440,7 @@ err_umrc: err_dev: ib_unregister_device(&dev->ib_dev); + mlx5_dev_fini_odp_hmm(&dev->ib_dev); err_odp: mlx5_ib_odp_remove_one(dev); @@ -2461,6 +2465,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) ib_unregister_device(&dev->ib_dev); destroy_umrc_res(dev); + mlx5_dev_fini_odp_hmm(&dev->ib_dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); if (ll == IB_LINK_LAYER_ETHERNET) diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 19354b6c8153..0d74eacce922 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -154,6 +154,8 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, __be64 *pas, int access_flags, void *data) { unsigned long umem_page_shift = ilog2(umem->page_size); + unsigned long start = ib_umem_start(umem) + (offset << PAGE_SHIFT); + unsigned long end = start + (num_pages << PAGE_SHIFT); int shift = page_shift - umem_page_shift; int mask = (1 << shift) - 1; int i, k; @@ -164,6 +166,42 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int entry; #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM) + if (umem->odp_data) { + struct ib_mirror *ib_mirror = umem->odp_data->ib_mirror; + struct hmm_mirror *mirror = &ib_mirror->base; + struct hmm_pt_iter *iter = data, local_iter; + unsigned long addr; + + if (iter == NULL) { + iter = &local_iter; + hmm_pt_iter_init(iter, &mirror->pt); + } + + for (i=0, addr=start; i < num_pages; ++i, addr+=PAGE_SIZE) { + unsigned long next = end; + dma_addr_t *ptep, pte; + + /* Get and lock pointer to mirror page table. */ + ptep = hmm_pt_iter_lookup(iter, addr, &next); + pte = ptep ? 
*ptep : 0; + /* + * HMM will not have any page tables set up, if this + * function is called before page faults have happened + * on the MR. In that case, we don't have PA's yet, so + * just set each one to zero and continue on. The hw + * will trigger a page fault. + */ + if (hmm_pte_test_valid_dma(&pte)) + pas[i] = cpu_to_be64(umem_dma_to_mtt(pte)); + else + pas[i] = (__be64)0; + } + + if (iter == &local_iter) + hmm_pt_iter_fini(iter); + + return; + } #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ const bool odp = umem->odp_data != NULL; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5180c395e78d..8e3ea74767a5 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -762,6 +762,7 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) + extern struct workqueue_struct *mlx5_ib_page_fault_wq; void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); @@ -776,7 +777,20 @@ void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM) +void mlx5_dev_init_odp_hmm(struct ib_device *ib_dev, struct device *dev); +void mlx5_dev_fini_odp_hmm(struct ib_device *ib_dev); +int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start, + u64 end, void *cookie); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ +static inline void mlx5_dev_init_odp_hmm(struct ib_device *ib_dev, + struct device *dev) +{ +} + +static inline void mlx5_dev_fini_odp_hmm(struct ib_device *ib_dev) +{ +} + void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end); #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ @@ -794,7 +808,6 @@ static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} static inline void 
mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} - #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ int mlx5_ib_get_vf_config(struct ib_device *device, int vf, @@ -875,4 +888,5 @@ static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, return 0; } + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 69742c8a9deb..5d1f1a87f43c 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1489,6 +1489,13 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) /* Wait for all running page-fault handlers to finish. */ synchronize_srcu(&dev->mr_srcu); #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM) + if (mlx5_ib_umem_invalidate(umem, ib_umem_start(umem), + ib_umem_end(umem), NULL)) + /* + * FIXME do something to kill all mr and umem + * in use by this process. + */ + pr_err("killing all mr with odp due to mtt update failure\n"); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ /* Destroy all page mappings */ mlx5_ib_invalidate_range(umem, ib_umem_start(umem), diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 94395fa9b1e8..b92ff4d1abdc 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -53,6 +53,180 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, } #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM) + +int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start, + u64 end, void *cookie) +{ + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; + u64 idx = 0, blk_start_idx = 0; + struct hmm_pt_iter iter; + struct mlx5_ib_mr *mlx5_ib_mr; + struct hmm_mirror *mirror; + unsigned long addr; + int in_block = 0; + int ret = 0; + + if (!umem || !umem->odp_data) { + pr_err("invalidation called on NULL umem or non-ODP umem\n"); + return -EINVAL; + } + + /* Is this ib_mr active and 
registered yet ? */ + if (umem->odp_data->private == NULL) + return 0; + + mlx5_ib_mr = umem->odp_data->private; + if (!mlx5_ib_mr->ibmr.pd) + return 0; + + mirror = &umem->odp_data->ib_mirror->base; + start = max_t(u64, ib_umem_start(umem), start); + end = min_t(u64, ib_umem_end(umem), end); + hmm_pt_iter_init(&iter, &mirror->pt); + + /* + * Iteration one - zap the HW's MTTs. HMM ensures that while we are + * doing the invalidation, no page fault will attempt to overwrite the + * same MTTs. Concurent invalidations might race us, but they will + * write 0s as well, so no difference in the end result. + */ + for (addr = start; addr < end; addr += (u64)umem->page_size) { + unsigned long next = end; + dma_addr_t *ptep; + + /* Get and lock pointer to mirror page table. */ + ptep = hmm_pt_iter_walk(&iter, &addr, &next); + for (; ptep && addr < next; addr += PAGE_SIZE, ptep++) { + idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + /* + * Strive to write the MTTs in chunks, but avoid + * overwriting non-existing MTTs. The huristic here can + * be improved to estimate the cost of another UMR vs. + * the cost of bigger UMR. + */ + if ((*ptep) & (ODP_READ_ALLOWED_BIT | + ODP_WRITE_ALLOWED_BIT)) { + if ((*ptep) & ODP_WRITE_ALLOWED_BIT) + hmm_pte_set_dirty(ptep); + /* + * Because there can not be concurrent overlapping + * munmap, page migrate, page write protect then it + * is safe here to clear those bits. 
+ */ + hmm_pte_clear_bit(ptep, ODP_READ_ALLOWED_SHIFT); + hmm_pte_clear_bit(ptep, ODP_WRITE_ALLOWED_SHIFT); + if (!in_block) { + blk_start_idx = idx; + in_block = 1; + } + } else { + u64 umr_offset = idx & umr_block_mask; + + if (in_block && umr_offset == 0) { + ret = mlx5_ib_update_mtt(mlx5_ib_mr, + blk_start_idx, + idx - blk_start_idx, + 1, &iter) || ret; + in_block = 0; + } + } + } + } + if (in_block) + ret = mlx5_ib_update_mtt(mlx5_ib_mr, blk_start_idx, + idx - blk_start_idx + 1, 1, + &iter) || ret; + hmm_pt_iter_fini(&iter); + return ret; +} + +static int mlx5_hmm_invalidate_range(struct hmm_mirror *mirror, + unsigned long start, + unsigned long end) +{ + struct ib_mirror *ib_mirror; + int ret; + + ib_mirror = container_of(mirror, struct ib_mirror, base); + + /* Go over all memory region and invalidate them. */ + down_read(&ib_mirror->umem_rwsem); + ret = rbt_ib_umem_for_each_in_range(&ib_mirror->umem_tree, start, end, + mlx5_ib_umem_invalidate, NULL); + up_read(&ib_mirror->umem_rwsem); + return ret; +} + +static void mlx5_hmm_release(struct hmm_mirror *mirror) +{ + struct ib_mirror *ib_mirror; + + ib_mirror = container_of(mirror, struct ib_mirror, base); + + /* Go over all memory region and invalidate them. */ + mlx5_hmm_invalidate_range(mirror, 0, ULLONG_MAX); +} + +static void mlx5_hmm_free(struct hmm_mirror *mirror) +{ + struct ib_mirror *ib_mirror; + + ib_mirror = container_of(mirror, struct ib_mirror, base); + kfree(ib_mirror); +} + +static int mlx5_hmm_update(struct hmm_mirror *mirror, + struct hmm_event *event) +{ + struct device *device = mirror->device->dev; + int ret = 0; + + switch (event->etype) { + case HMM_DEVICE_READ_FAULT: + case HMM_DEVICE_WRITE_FAULT: + /* FIXME implement. */ + break; + case HMM_NONE: + default: + dev_warn(device, "Warning: unhandled HMM event (%d) defaulting to invalidation\n", + event->etype); + /* Fallthrough. */ + /* For write protect and fork we could only invalidate writeable mr. 
*/ + case HMM_WRITE_PROTECT: + case HMM_MIGRATE: + case HMM_MUNMAP: + case HMM_FORK: + ret = mlx5_hmm_invalidate_range(mirror, + event->start, + event->end); + break; + } + + return ret; +} + +static const struct hmm_device_ops mlx5_hmm_ops = { + .release = &mlx5_hmm_release, + .free = &mlx5_hmm_free, + .update = &mlx5_hmm_update, +}; + +void mlx5_dev_init_odp_hmm(struct ib_device *ib_device, struct device *dev) +{ + INIT_LIST_HEAD(&ib_device->ib_mirrors); + ib_device->hmm_dev.dev = dev; + ib_device->hmm_dev.ops = &mlx5_hmm_ops; + ib_device->hmm_ready = !hmm_device_register(&ib_device->hmm_dev); + mutex_init(&ib_device->hmm_mutex); +} + +void mlx5_dev_fini_odp_hmm(struct ib_device *ib_device) +{ + if (!ib_device->hmm_ready) + return; + hmm_device_unregister(&ib_device->hmm_dev); +} + #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ -- cgit v1.2.3