diff options
author | Jérôme Glisse <jglisse@redhat.com> | 2015-03-27 08:15:35 -0400 |
---|---|---|
committer | Jérôme Glisse <jglisse@redhat.com> | 2016-04-07 13:24:05 -0400 |
commit | aa375130210a9941e84d4f4cf41e6bef7a728599 (patch) | |
tree | 49d9e97be15b3e375688e3a2902ead7a7d2fb3a7 /drivers/infiniband | |
parent | d77c4e5adedc4ce363d35a18c3ee380b0eef92a5 (diff) |
ib/mlx5/hmm: add mlx5 hmm device initialization and callback v3
This adds the core HMM callbacks for the mlx5 device driver and initializes
the HMM device for the mlx5 InfiniBand device driver.
Changed since v1:
- Adapt to new hmm_mirror lifetime rules.
- HMM_ISDIRTY no longer exists.
Changed since v2:
- Adapt to HMM page table changes.
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r-- | drivers/infiniband/hw/mlx5/main.c | 5 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mem.c | 38 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mlx5_ib.h | 16 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mr.c | 7 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 174 |
5 files changed, 239 insertions, 1 deletions
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index cccad5113f7c..fe63a302e13c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2413,6 +2413,9 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (err) goto err_rsrc; + /* If HMM initialization fails we just do not enable odp. + * NOTE(review): if ib_register_device() below fails, control jumps to + * err_odp, which sits after the mlx5_dev_fini_odp_hmm() call added + * under err_dev, so that error path appears to skip the HMM device + * teardown - confirm. */ + mlx5_dev_init_odp_hmm(&dev->ib_dev, &mdev->pdev->dev); + err = ib_register_device(&dev->ib_dev, NULL); if (err) goto err_odp; @@ -2437,6 +2440,7 @@ err_umrc: err_dev: ib_unregister_device(&dev->ib_dev); + mlx5_dev_fini_odp_hmm(&dev->ib_dev); err_odp: mlx5_ib_odp_remove_one(dev); @@ -2461,6 +2465,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) ib_unregister_device(&dev->ib_dev); destroy_umrc_res(dev); + mlx5_dev_fini_odp_hmm(&dev->ib_dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); if (ll == IB_LINK_LAYER_ETHERNET) diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 19354b6c8153..0d74eacce922 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -154,6 +154,8 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, __be64 *pas, int access_flags, void *data) { unsigned long umem_page_shift = ilog2(umem->page_size); + unsigned long start = ib_umem_start(umem) + (offset << PAGE_SHIFT); + unsigned long end = start + (num_pages << PAGE_SHIFT); int shift = page_shift - umem_page_shift; int mask = (1 << shift) - 1; int i, k; @@ -164,6 +166,42 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int entry; #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM) + if (umem->odp_data) { + struct ib_mirror *ib_mirror = umem->odp_data->ib_mirror; + struct hmm_mirror *mirror = &ib_mirror->base; + struct hmm_pt_iter *iter = data, local_iter; + unsigned long addr; + + if (iter == NULL) { + iter = &local_iter; + 
hmm_pt_iter_init(iter, &mirror->pt); + } + + for (i=0, addr=start; i < num_pages; ++i, addr+=PAGE_SIZE) { + unsigned long next = end; + dma_addr_t *ptep, pte; + + /* Get and lock pointer to mirror page table. */ + ptep = hmm_pt_iter_lookup(iter, addr, &next); + pte = ptep ? *ptep : 0; + /* + * HMM will not have any page tables set up, if this + * function is called before page faults have happened + * on the MR. In that case, we don't have PA's yet, so + * just set each one to zero and continue on. The hw + * will trigger a page fault. + */ + if (hmm_pte_test_valid_dma(&pte)) + pas[i] = cpu_to_be64(umem_dma_to_mtt(pte)); + else + pas[i] = (__be64)0; + } + + if (iter == &local_iter) + hmm_pt_iter_fini(iter); + + return; + } #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ const bool odp = umem->odp_data != NULL; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5180c395e78d..8e3ea74767a5 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -762,6 +762,7 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) + extern struct workqueue_struct *mlx5_ib_page_fault_wq; void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); @@ -776,7 +777,20 @@ void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM) +void mlx5_dev_init_odp_hmm(struct ib_device *ib_dev, struct device *dev); +void mlx5_dev_fini_odp_hmm(struct ib_device *ib_dev); +int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start, + u64 end, void *cookie); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ +static inline void mlx5_dev_init_odp_hmm(struct ib_device *ib_dev, + struct device *dev) +{ +} + +static inline void mlx5_dev_fini_odp_hmm(struct ib_device *ib_dev) +{ +} + void mlx5_ib_invalidate_range(struct 
ib_umem *umem, unsigned long start, unsigned long end); #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ @@ -794,7 +808,6 @@ static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} - #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ int mlx5_ib_get_vf_config(struct ib_device *device, int vf, @@ -875,4 +888,5 @@ static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, return 0; } + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 69742c8a9deb..5d1f1a87f43c 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1489,6 +1489,13 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) /* Wait for all running page-fault handlers to finish. */ synchronize_srcu(&dev->mr_srcu); #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM) + if (mlx5_ib_umem_invalidate(umem, ib_umem_start(umem), + ib_umem_end(umem), NULL)) + /* + * FIXME do something to kill all mr and umem + * in use by this process. 
+ */ + pr_err("killing all mr with odp due to mtt update failure\n"); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ /* Destroy all page mappings */ mlx5_ib_invalidate_range(umem, ib_umem_start(umem), diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 94395fa9b1e8..b92ff4d1abdc 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -53,6 +53,180 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, } #if IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM) + /* + * mlx5_ib_umem_invalidate - revoke device access to part of an ODP umem. + * + * The requested [start, end) range is first clamped to the umem bounds. + * For every mirror page table entry in that range that has a read or + * write allowed bit set, the entry is marked dirty if write was allowed, + * both allowed bits are cleared, and the matching MTT indices are pushed + * to the hardware in runs through mlx5_ib_update_mtt(). + * + * The cookie argument is not referenced by this function. + * + * Returns -EINVAL when umem is NULL or has no odp_data, and 0 when the + * umem has no private mr attached yet or that mr has no pd. Otherwise the + * result is the logical OR of the mlx5_ib_update_mtt() return values, + * i.e. 0 or 1 rather than an errno. + + */ +int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start, + u64 end, void *cookie) +{ + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; + u64 idx = 0, blk_start_idx = 0; + struct hmm_pt_iter iter; + struct mlx5_ib_mr *mlx5_ib_mr; + struct hmm_mirror *mirror; + unsigned long addr; + int in_block = 0; + int ret = 0; + + if (!umem || !umem->odp_data) { + pr_err("invalidation called on NULL umem or non-ODP umem\n"); + return -EINVAL; + } + + /* Is this ib_mr active and registered yet ? */ + if (umem->odp_data->private == NULL) + return 0; + + mlx5_ib_mr = umem->odp_data->private; + if (!mlx5_ib_mr->ibmr.pd) + return 0; + + mirror = &umem->odp_data->ib_mirror->base; + start = max_t(u64, ib_umem_start(umem), start); + end = min_t(u64, ib_umem_end(umem), end); + hmm_pt_iter_init(&iter, &mirror->pt); + + /* + * Iteration one - zap the HW's MTTs. HMM ensures that while we are + * doing the invalidation, no page fault will attempt to overwrite the + * same MTTs. Concurrent invalidations might race us, but they will + * write 0s as well, so no difference in the end result. + * + * NOTE(review): the inner loop below already advances addr up to + * next, and the outer loop then adds umem->page_size on top of that; + * unless hmm_pt_iter_walk() compensates, one page per walk step looks + * like it is skipped - confirm. + */ + for (addr = start; addr < end; addr += (u64)umem->page_size) { + unsigned long next = end; + dma_addr_t *ptep; + + /* Get and lock pointer to mirror page table. 
*/ + ptep = hmm_pt_iter_walk(&iter, &addr, &next); + for (; ptep && addr < next; addr += PAGE_SIZE, ptep++) { + idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + /* + * Strive to write the MTTs in chunks, but avoid + * overwriting non-existing MTTs. The heuristic here can + * be improved to estimate the cost of another UMR vs. + * the cost of bigger UMR. + * + * A pending run is only flushed when an entry without + * allowed bits is met at an index aligned to the UMR + * block (idx & umr_block_mask is zero); a run still open + * after the loops is flushed once at the end. + */ + if ((*ptep) & (ODP_READ_ALLOWED_BIT | + ODP_WRITE_ALLOWED_BIT)) { + if ((*ptep) & ODP_WRITE_ALLOWED_BIT) + hmm_pte_set_dirty(ptep); + /* + * Because there can not be concurrent overlapping + * munmap, page migrate, page write protect then it + * is safe here to clear those bits. + */ + hmm_pte_clear_bit(ptep, ODP_READ_ALLOWED_SHIFT); + hmm_pte_clear_bit(ptep, ODP_WRITE_ALLOWED_SHIFT); + if (!in_block) { + blk_start_idx = idx; + in_block = 1; + } + } else { + u64 umr_offset = idx & umr_block_mask; + + if (in_block && umr_offset == 0) { + ret = mlx5_ib_update_mtt(mlx5_ib_mr, + blk_start_idx, + idx - blk_start_idx, + 1, &iter) || ret; + in_block = 0; + } + } + } + } + if (in_block) + ret = mlx5_ib_update_mtt(mlx5_ib_mr, blk_start_idx, + idx - blk_start_idx + 1, 1, + &iter) || ret; + hmm_pt_iter_fini(&iter); + return ret; +} + /* + * mlx5_hmm_invalidate_range - invalidate every umem of a mirror in a range. + * + * Resolves the ib_mirror that embeds the given hmm_mirror, takes its + * umem_rwsem for reading, and runs mlx5_ib_umem_invalidate() with a NULL + * cookie through rbt_ib_umem_for_each_in_range() over [start, end). + * Returns whatever that range walk returns. + + */ +static int mlx5_hmm_invalidate_range(struct hmm_mirror *mirror, + unsigned long start, + unsigned long end) +{ + struct ib_mirror *ib_mirror; + int ret; + + ib_mirror = container_of(mirror, struct ib_mirror, base); + + /* Go over all memory regions and invalidate them. */ + down_read(&ib_mirror->umem_rwsem); + ret = rbt_ib_umem_for_each_in_range(&ib_mirror->umem_tree, start, end, + mlx5_ib_umem_invalidate, NULL); + up_read(&ib_mirror->umem_rwsem); + return ret; +} + /* + * mlx5_hmm_release - release callback installed in mlx5_hmm_ops. + * + * Invalidates the range from 0 to ULLONG_MAX on the mirror; the return + * value of the invalidation is discarded. The ib_mirror local is computed + * but not used afterwards. + * + * NOTE(review): the end parameter of mlx5_hmm_invalidate_range() is an + * unsigned long while ULLONG_MAX is an unsigned long long constant; on a + * build where the two widths differ the value is converted - confirm that + * ULONG_MAX was not intended. + + */ +static void mlx5_hmm_release(struct hmm_mirror *mirror) +{ + struct ib_mirror *ib_mirror; + + ib_mirror = container_of(mirror, struct ib_mirror, base); + + /* Go over all memory regions and invalidate them. 
*/ + mlx5_hmm_invalidate_range(mirror, 0, ULLONG_MAX); +} + /* + * mlx5_hmm_free - free callback installed in mlx5_hmm_ops. + * + * Frees, with kfree(), the ib_mirror structure that embeds the mirror. + + */ +static void mlx5_hmm_free(struct hmm_mirror *mirror) +{ + struct ib_mirror *ib_mirror; + + ib_mirror = container_of(mirror, struct ib_mirror, base); + kfree(ib_mirror); +} + /* + * mlx5_hmm_update - update callback installed in mlx5_hmm_ops. + * + * Dispatches on event->etype. Device read and write faults are not + * implemented yet and return 0. Write protect, migrate, munmap and fork + * events invalidate [event->start, event->end). HMM_NONE and any unknown + * event type log a warning and then fall through to the same + * invalidation. Returns 0 or the result of the invalidation. + + */ +static int mlx5_hmm_update(struct hmm_mirror *mirror, + struct hmm_event *event) +{ + struct device *device = mirror->device->dev; + int ret = 0; + + switch (event->etype) { + case HMM_DEVICE_READ_FAULT: + case HMM_DEVICE_WRITE_FAULT: + /* FIXME implement. */ + break; + case HMM_NONE: + default: + dev_warn(device, "Warning: unhandled HMM event (%d) defaulting to invalidation\n", + event->etype); + /* Fallthrough. */ + /* For write protect and fork we could only invalidate writeable mr. */ + case HMM_WRITE_PROTECT: + case HMM_MIGRATE: + case HMM_MUNMAP: + case HMM_FORK: + ret = mlx5_hmm_invalidate_range(mirror, + event->start, + event->end); + break; + } + + return ret; +} + /* HMM device callbacks used by the mlx5 ODP code in this file. + */ +static const struct hmm_device_ops mlx5_hmm_ops = { + .release = &mlx5_hmm_release, + .free = &mlx5_hmm_free, + .update = &mlx5_hmm_update, +}; + /* + * mlx5_dev_init_odp_hmm - set up and register the HMM device of an ib_device. + * + * Initializes the ib_mirrors list, fills in hmm_dev.dev and hmm_dev.ops, + * and records in hmm_ready whether hmm_device_register() returned zero. + * Nothing is returned, so callers learn about a failed registration only + * through hmm_ready. + * + * NOTE(review): hmm_mutex is initialized after hmm_device_register() has + * been called; if anything can take that mutex once the device is + * registered, the mutex_init() should come first - confirm. + + */ +void mlx5_dev_init_odp_hmm(struct ib_device *ib_device, struct device *dev) +{ + INIT_LIST_HEAD(&ib_device->ib_mirrors); + ib_device->hmm_dev.dev = dev; + ib_device->hmm_dev.ops = &mlx5_hmm_ops; + ib_device->hmm_ready = !hmm_device_register(&ib_device->hmm_dev); + mutex_init(&ib_device->hmm_mutex); +} + /* + * mlx5_dev_fini_odp_hmm - unregister the HMM device of an ib_device. + * + * Does nothing when hmm_ready is not set; otherwise calls + * hmm_device_unregister() on the embedded hmm_dev. + + */ +void mlx5_dev_fini_odp_hmm(struct ib_device *ib_device) +{ + if (!ib_device->hmm_ready) + return; + hmm_device_unregister(&ib_device->hmm_dev); +} + #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */ |