diff options
-rw-r--r-- | Documentation/virt/kvm/api.rst | 27 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_asm.h | 4 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_host.h | 15 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_mmu.h | 1 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_pgtable.h | 79 | ||||
-rw-r--r-- | arch/arm64/kvm/arm.c | 28 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/hyp-main.c | 10 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/mem_protect.c | 6 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/tlb.c | 52 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/pgtable.c | 201 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/vhe/tlb.c | 32 | ||||
-rw-r--r-- | arch/arm64/kvm/mmu.c | 207 | ||||
-rw-r--r-- | include/linux/kvm_host.h | 2 | ||||
-rw-r--r-- | include/uapi/linux/kvm.h | 2 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 3 |
15 files changed, 612 insertions, 57 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index add067793b90..656bd293c8f4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8445,6 +8445,33 @@ structure. When getting the Modified Change Topology Report value, the attr->addr must point to a byte where the value will be stored or retrieved from. +8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE +--------------------------------------- + +:Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE +:Architectures: arm64 +:Type: vm +:Parameters: arg[0] is the new split chunk size. +:Returns: 0 on success, -EINVAL if any memslot was already created. + +This capability sets the chunk size used in Eager Page Splitting. + +Eager Page Splitting improves the performance of dirty-logging (used +in live migrations) when guest memory is backed by huge-pages. It +avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing +it eagerly when enabling dirty logging (with the +KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using +KVM_CLEAR_DIRTY_LOG. + +The chunk size specifies how many pages to break at a time, using a +single allocation for each chunk. Bigger the chunk size, more pages +need to be allocated ahead of time. + +The chunk size needs to be a valid block size. The list of acceptable +block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a +64-bit bitmap (each bit describing a block size). The default value is +0, to disable the eager page splitting. + 9. Known KVM API problems ========================= diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 43c3bc0f9544..bb17b2ead4c7 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -68,6 +68,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa_nsh, __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, @@ -225,6 +226,9 @@ extern void __kvm_flush_vm_context(void); extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu); extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, int level); +extern void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, + int level); extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); extern void __kvm_timer_set_cntvoff(u64 cntvoff); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7e7e19ef6993..b743198450b3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -159,6 +159,21 @@ struct kvm_s2_mmu { /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; +#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0 + /* + * Memory cache used to split + * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It + * is used to allocate stage2 page tables while splitting huge + * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE + * influences both the capacity of the split page cache, and + * how often KVM reschedules. Be wary of raising CHUNK_SIZE + * too high. + * + * Protected by kvm->slots_lock. + */ + struct kvm_mmu_memory_cache split_page_cache; + uint64_t split_page_chunk_size; + struct kvm_arch *arch; }; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 27e63c111f78..20c50e00496d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -172,6 +172,7 @@ void __init free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); +void kvm_uninit_stage2_mmu(struct kvm *kvm); void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index dc3c072e862f..a41581c985b7 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -92,6 +92,24 @@ static inline bool kvm_level_supports_block_mapping(u32 level) return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } +static inline u32 kvm_supported_block_sizes(void) +{ + u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; + u32 r = 0; + + for (; level < KVM_PGTABLE_MAX_LEVELS; level++) + r |= BIT(kvm_granule_shift(level)); + + return r; +} + +static inline bool kvm_is_block_size_supported(u64 size) +{ + bool is_power_of_two = IS_ALIGNED(size, size); + + return is_power_of_two && (size & kvm_supported_block_sizes()); +} + /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. @@ -104,7 +122,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level) * allocation is physically contiguous. * @free_pages_exact: Free an exact number of memory pages previously * allocated by zalloc_pages_exact. - * @free_removed_table: Free a removed paging structure by unlinking and + * @free_unlinked_table: Free an unlinked paging structure by unlinking and * dropping references. * @get_page: Increment the refcount on a page. * @put_page: Decrement the refcount on a page. When the @@ -124,7 +142,7 @@ struct kvm_pgtable_mm_ops { void* (*zalloc_page)(void *arg); void* (*zalloc_pages_exact)(size_t size); void (*free_pages_exact)(void *addr, size_t size); - void (*free_removed_table)(void *addr, u32 level); + void (*free_unlinked_table)(void *addr, u32 level); void (*get_page)(void *addr); void (*put_page)(void *addr); int (*page_count)(void *addr); @@ -195,6 +213,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, * with other software walkers. * @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was * invoked from a fault handler. + * @KVM_PGTABLE_WALK_SKIP_BBM_TLBI: Visit and update table entries + * without Break-before-make's + * TLB invalidation. + * @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries + * without Cache maintenance + * operations required. */ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_LEAF = BIT(0), @@ -202,6 +226,8 @@ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_TABLE_POST = BIT(2), KVM_PGTABLE_WALK_SHARED = BIT(3), KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4), + KVM_PGTABLE_WALK_SKIP_BBM_TLBI = BIT(5), + KVM_PGTABLE_WALK_SKIP_CMO = BIT(6), }; struct kvm_pgtable_visit_ctx { @@ -441,7 +467,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); /** - * kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure. + * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. * @mm_ops: Memory management callbacks. * @pgtable: Unlinked stage-2 paging structure to be freed. * @level: Level of the stage-2 paging structure to be freed. @@ -449,7 +475,33 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); * The page-table is assumed to be unreachable by any hardware walkers prior to * freeing and therefore no TLB invalidation is performed. */ -void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); + +/** + * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * @phys: Physical address of the memory to map. + * @level: Starting level of the stage-2 paging structure to be created. + * @prot: Permissions and attributes for the mapping. + * @mc: Cache of pre-allocated and zeroed memory from which to allocate + * page-table pages. + * @force_pte: Force mappings to PAGE_SIZE granularity. + * + * Returns an unlinked page-table tree. This new page-table tree is + * not reachable (i.e., it is unlinked) from the root pgd and it's + * therefore unreachableby the hardware page-table walker. No TLB + * invalidation or CMOs are performed. + * + * If device attributes are not explicitly requested in @prot, then the + * mapping will be normal, cacheable. + * + * Return: The fully populated (unlinked) stage-2 paging structure, or + * an ERR_PTR(error) on failure. + */ +kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, + u64 phys, u32 level, + enum kvm_pgtable_prot prot, + void *mc, bool force_pte); /** * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. @@ -621,6 +673,25 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr); int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); /** + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing + * to PAGE_SIZE guest pages. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to split. + * @size: Size of the range. + * @mc: Cache of pre-allocated and zeroed memory from which to allocate + * page-table pages. + * + * The function tries to split any level 1 or 2 entry that overlaps + * with the input range (given by @addr and @size). + * + * Return: 0 on success, negative error code on failure. Note that + * kvm_pgtable_stage2_split() is best effort: it tries to break as many + * blocks in the input range as allowed by @mc_capacity. + */ +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc); + +/** * kvm_pgtable_walk() - Walk a page-table. * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). * @addr: Input address for the start of the walk. diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 14391826241c..c605626801c4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -65,6 +65,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; + u64 new_cap; if (cap->flags) return -EINVAL; @@ -89,6 +90,24 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = 0; set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + new_cap = cap->args[0]; + + mutex_lock(&kvm->slots_lock); + /* + * To keep things simple, allow changing the chunk + * size only when no memory slots have been created. + */ + if (!kvm_are_all_memslots_empty(kvm)) { + r = -EINVAL; + } else if (new_cap && !kvm_is_block_size_supported(new_cap)) { + r = -EINVAL; + } else { + r = 0; + kvm->arch.mmu.split_page_chunk_size = new_cap; + } + mutex_unlock(&kvm->slots_lock); + break; default: r = -EINVAL; break; @@ -302,6 +321,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_PTRAUTH_GENERIC: r = system_has_full_ptr_auth(); break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + if (kvm) + r = kvm->arch.mmu.split_page_chunk_size; + else + r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + break; + case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES: + r = kvm_supported_block_sizes(); + break; default: r = 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 728e01d4536b..c6bf1e49ca93 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -125,6 +125,15 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); } +static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2); + DECLARE_REG(int, level, host_ctxt, 3); + + __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level); +} + static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); @@ -315,6 +324,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh), HANDLE_FUNC(__kvm_tlb_flush_vmid), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2e9ec4a2a4a3..d35e75b13ffe 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -91,9 +91,9 @@ static void host_s2_put_page(void *addr) hyp_put_page(&host_s2_pool, addr); } -static void host_s2_free_removed_table(void *addr, u32 level) +static void host_s2_free_unlinked_table(void *addr, u32 level) { - kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level); + kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level); } static int prepare_s2_pool(void *pgt_pool_base) @@ -110,7 +110,7 @@ static int prepare_s2_pool(void *pgt_pool_base) host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_pages_exact = host_s2_zalloc_pages_exact, .zalloc_page = host_s2_zalloc_page, - .free_removed_table = host_s2_free_removed_table, + .free_unlinked_table = host_s2_free_unlinked_table, .phys_to_virt = hyp_phys_to_virt, .virt_to_phys = hyp_virt_to_phys, .page_count = hyp_page_count, diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 978179133f4b..b9991bbd8e3f 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -130,6 +130,58 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, __tlb_switch_to_host(&cxt); } +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + /* Switch to requested VMID */ + __tlb_switch_to_guest(mmu, &cxt, true); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); + + /* + * If the host is running at EL1 and we have a VPIPT I-cache, + * then we must perform I-cache maintenance at EL2 in order for + * it to have an effect on the guest. Since the guest cannot hit + * I-cache lines allocated with a different VMID, we don't need + * to worry about junk out of guest reset (we nuke the I-cache on + * VMID rollover), but we do need to be careful when remapping + * executable pages for the same guest. This can happen when KSM + * takes a CoW fault on an executable page, copies the page into + * a page that was previously mapped in the guest and then needs + * to invalidate the guest view of the I-cache for that page + * from EL1. To solve this, we invalidate the entire I-cache when + * unmapping a page from a guest if we have a VPIPT I-cache but + * the host is running at EL1. As above, we could do better if + * we had the VA. + * + * The moral of this story is: if you have a VPIPT I-cache, then + * you should be running with VHE enabled. + */ + if (icache_is_vpipt()) + icache_inval_all_pou(); + + __tlb_switch_to_host(&cxt); +} + void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 5282cb9ca4cf..8acab89080af 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -63,6 +63,16 @@ struct kvm_pgtable_walk_data { const u64 end; }; +static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) +{ + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); +} + +static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) +{ + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); +} + static bool kvm_phys_is_valid(u64 phys) { return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); @@ -743,14 +753,17 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) return false; - /* - * Perform the appropriate TLB invalidation based on the evicted pte - * value (if any). - */ - if (kvm_pte_table(ctx->old, ctx->level)) - kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); - else if (kvm_pte_valid(ctx->old)) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { + /* + * Perform the appropriate TLB invalidation based on the + * evicted pte value (if any). + */ + if (kvm_pte_table(ctx->old, ctx->level)) + kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); + else if (kvm_pte_valid(ctx->old)) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, + ctx->addr, ctx->level); + } if (stage2_pte_is_counted(ctx->old)) mm_ops->put_page(ctx->ptep); @@ -857,11 +870,13 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, return -EAGAIN; /* Perform CMOs before installation of the guest stage-2 PTE */ - if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc && + stage2_pte_cacheable(pgt, new)) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), - granule); + granule); - if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou && + stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); stage2_make_pte(ctx, new); @@ -883,7 +898,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, if (ret) return ret; - mm_ops->free_removed_table(childp, ctx->level); + mm_ops->free_unlinked_table(childp, ctx->level); return 0; } @@ -928,7 +943,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, * The TABLE_PRE callback runs for table entries on the way down, looking * for table entries which we could conceivably replace with a block entry * for this mapping. If it finds one it replaces the entry and calls - * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table. + * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. * * Otherwise, the LEAF callback performs the mapping at the existing leaves * instead. @@ -1197,7 +1212,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED); if (!ret) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; } @@ -1230,6 +1245,162 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) return kvm_pgtable_walk(pgt, addr, size, &walker); } +kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, + u64 phys, u32 level, + enum kvm_pgtable_prot prot, + void *mc, bool force_pte) +{ + struct stage2_map_data map_data = { + .phys = phys, + .mmu = pgt->mmu, + .memcache = mc, + .force_pte = force_pte, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_SKIP_BBM_TLBI | + KVM_PGTABLE_WALK_SKIP_CMO, + .arg = &map_data, + }; + /* + * The input address (.addr) is irrelevant for walking an + * unlinked table. Construct an ambiguous IA range to map + * kvm_granule_size(level) worth of memory. + */ + struct kvm_pgtable_walk_data data = { + .walker = &walker, + .addr = 0, + .end = kvm_granule_size(level), + }; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + kvm_pte_t *pgtable; + int ret; + + if (!IS_ALIGNED(phys, kvm_granule_size(level))) + return ERR_PTR(-EINVAL); + + ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); + if (ret) + return ERR_PTR(ret); + + pgtable = mm_ops->zalloc_page(mc); + if (!pgtable) + return ERR_PTR(-ENOMEM); + + ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, + level + 1); + if (ret) { + kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); + mm_ops->put_page(pgtable); + return ERR_PTR(ret); + } + + return pgtable; +} + +/* + * Get the number of page-tables needed to replace a block with a + * fully populated tree up to the PTE entries. Note that @level is + * interpreted as in "level @level entry". + */ +static int stage2_block_get_nr_page_tables(u32 level) +{ + switch (level) { + case 1: + return PTRS_PER_PTE + 1; + case 2: + return 1; + case 3: + return 0; + default: + WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || + level >= KVM_PGTABLE_MAX_LEVELS); + return -EINVAL; + }; +} + +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + struct kvm_mmu_memory_cache *mc = ctx->arg; + struct kvm_s2_mmu *mmu; + kvm_pte_t pte = ctx->old, new, *childp; + enum kvm_pgtable_prot prot; + u32 level = ctx->level; + bool force_pte; + int nr_pages; + u64 phys; + + /* No huge-pages exist at the last level */ + if (level == KVM_PGTABLE_MAX_LEVELS - 1) + return 0; + + /* We only split valid block mappings */ + if (!kvm_pte_valid(pte)) + return 0; + + nr_pages = stage2_block_get_nr_page_tables(level); + if (nr_pages < 0) + return nr_pages; + + if (mc->nobjs >= nr_pages) { + /* Build a tree mapped down to the PTE granularity. */ + force_pte = true; + } else { + /* + * Don't force PTEs, so create_unlinked() below does + * not populate the tree up to the PTE level. The + * consequence is that the call will require a single + * page of level 2 entries at level 1, or a single + * page of PTEs at level 2. If we are at level 1, the + * PTEs will be created recursively. + */ + force_pte = false; + nr_pages = 1; + } + + if (mc->nobjs < nr_pages) + return -ENOMEM; + + mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); + phys = kvm_pte_to_phys(pte); + prot = kvm_pgtable_stage2_pte_prot(pte); + + childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, + level, prot, mc, force_pte); + if (IS_ERR(childp)) + return PTR_ERR(childp); + + if (!stage2_try_break_pte(ctx, mmu)) { + kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); + mm_ops->put_page(childp); + return -EAGAIN; + } + + /* + * Note, the contents of the page table are guaranteed to be made + * visible before the new PTE is assigned because stage2_make_pte() + * writes the PTE using smp_store_release(). + */ + new = kvm_init_table_pte(childp, mm_ops); + stage2_make_pte(ctx, new); + dsb(ishst); + return 0; +} + +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_split_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = mc, + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, @@ -1299,7 +1470,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->pgd = NULL; } -void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 24cef9b87f9e..e69da550cdc5 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -111,6 +111,38 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, __tlb_switch_to_host(&cxt); } +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + dsb(nshst); + + /* Switch to requested VMID */ + __tlb_switch_to_guest(mmu, &cxt); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); + + __tlb_switch_to_host(&cxt); +} + void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 3b9d4d24c361..6db9ef288ec3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -31,14 +31,21 @@ static phys_addr_t __ro_after_init hyp_idmap_vector; static unsigned long __ro_after_init io_map_base; -static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, + phys_addr_t size) { - phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); phys_addr_t boundary = ALIGN_DOWN(addr + size, size); return (boundary - 1 < end - 1) ? boundary : end; } +static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + + return __stage2_range_addr_end(addr, end, size); +} + /* * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, @@ -75,6 +82,79 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, #define stage2_apply_range_resched(mmu, addr, end, fn) \ stage2_apply_range(mmu, addr, end, fn, true) +/* + * Get the maximum number of page-tables pages needed to split a range + * of blocks into PAGE_SIZE PTEs. It assumes the range is already + * mapped at level 2, or at level 1 if allowed. + */ +static int kvm_mmu_split_nr_page_tables(u64 range) +{ + int n = 0; + + if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) + n += DIV_ROUND_UP(range, PUD_SIZE); + n += DIV_ROUND_UP(range, PMD_SIZE); + return n; +} + +static bool need_split_memcache_topup_or_resched(struct kvm *kvm) +{ + struct kvm_mmu_memory_cache *cache; + u64 chunk_size, min; + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; + + chunk_size = kvm->arch.mmu.split_page_chunk_size; + min = kvm_mmu_split_nr_page_tables(chunk_size); + cache = &kvm->arch.mmu.split_page_cache; + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end) +{ + struct kvm_mmu_memory_cache *cache; + struct kvm_pgtable *pgt; + int ret, cache_capacity; + u64 next, chunk_size; + + lockdep_assert_held_write(&kvm->mmu_lock); + + chunk_size = kvm->arch.mmu.split_page_chunk_size; + cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); + + if (chunk_size == 0) + return 0; + + cache = &kvm->arch.mmu.split_page_cache; + + do { + if (need_split_memcache_topup_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* Eager page splitting is best-effort. */ + ret = __kvm_mmu_topup_memory_cache(cache, + cache_capacity, + cache_capacity); + write_lock(&kvm->mmu_lock); + if (ret) + break; + } + + pgt = kvm->arch.mmu.pgt; + if (!pgt) + return -EINVAL; + + next = __stage2_range_addr_end(addr, end, chunk_size); + ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache); + if (ret) + break; + } while (addr = next, addr != end); + + return ret; +} + static bool memslot_is_logging(struct kvm_memory_slot *memslot) { return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); @@ -131,21 +211,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size) static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; -static void stage2_free_removed_table_rcu_cb(struct rcu_head *head) +static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) { struct page *page = container_of(head, struct page, rcu_head); void *pgtable = page_to_virt(page); u32 level = page_private(page); - kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level); + kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); } -static void stage2_free_removed_table(void *addr, u32 level) +static void stage2_free_unlinked_table(void *addr, u32 level) { struct page *page = virt_to_page(addr); set_page_private(page, (unsigned long)level); - call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb); + call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); } static void kvm_host_get_page(void *addr) @@ -701,7 +781,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, .free_pages_exact = kvm_s2_free_pages_exact, - .free_removed_table = stage2_free_removed_table, + .free_unlinked_table = stage2_free_unlinked_table, .get_page = kvm_host_get_page, .put_page = kvm_s2_put_page, .page_count = kvm_host_page_count, @@ -775,6 +855,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; + /* The eager page splitting is disabled by default */ + mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + mmu->split_page_cache.gfp_zero = __GFP_ZERO; + mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); return 0; @@ -786,6 +870,12 @@ out_free_pgtable: return err; } +void kvm_uninit_stage2_mmu(struct kvm *kvm) +{ + kvm_free_stage2_pgd(&kvm->arch.mmu); + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); +} + static void stage2_unmap_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -989,39 +1079,66 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) } /** - * kvm_mmu_write_protect_pt_masked() - write protect dirty pages + * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE + * pages for memory slot * @kvm: The KVM pointer - * @slot: The memory slot associated with mask - * @gfn_offset: The gfn offset in memory slot - * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory - * slot to be write protected + * @slot: The memory slot to split * - * Walks bits set in mask write protects the associated pte's. Caller must - * acquire kvm_mmu_lock. + * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. */ -static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) +static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) { - phys_addr_t base_gfn = slot->base_gfn + gfn_offset; - phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; - phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + phys_addr_t start, end; - stage2_wp_range(&kvm->arch.mmu, start, end); + lockdep_assert_held(&kvm->slots_lock); + + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, slot); + + start = memslot->base_gfn << PAGE_SHIFT; + end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + write_lock(&kvm->mmu_lock); + kvm_mmu_split_huge_pages(kvm, start, end); + write_unlock(&kvm->mmu_lock); } /* - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * dirty pages. + * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of pages at offset 'gfn_offset' in this memory + * slot to enable dirty logging on * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. + * Writes protect selected pages to enable dirty logging, and then + * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + lockdep_assert_held_write(&kvm->mmu_lock); + + stage2_wp_range(&kvm->arch.mmu, start, end); + + /* + * Eager-splitting is done when manual-protect is set. We + * also check for initially-all-set because we can avoid + * eager-splitting if initially-all-set is false. + * Initially-all-set equal false implies that huge-pages were + * already split when enabling dirty logging: no need to do it + * again. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + kvm_mmu_split_huge_pages(kvm, start, end); } static void kvm_send_hwpoison_signal(unsigned long address, short lsb) @@ -1790,20 +1907,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; + /* * At this point memslot has been committed and there is an * allocated dirty_bitmap[], dirty pages will be tracked while the * memory slot is write protected. */ - if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (log_dirty_pages) { + + if (change == KVM_MR_DELETE) + return; + /* - * If we're with initial-all-set, we don't need to write - * protect any pages because they're all reported as dirty. - * Huge pages and normal pages will be write protect gradually. + * Huge and normal pages are write-protected and split + * on either of these two cases: + * + * 1. with initial-all-set: gradually with CLEAR ioctls, */ - if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) { - kvm_mmu_wp_memory_region(kvm, new->id); - } + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + return; + /* + * or + * 2. without initial-all-set: all in one shot when + * enabling dirty logging. + */ + kvm_mmu_wp_memory_region(kvm, new->id); + kvm_mmu_split_memory_region(kvm, new->id); + } else { + /* + * Free any leftovers from the eager page splitting cache. Do + * this when deleting, moving, disabling dirty logging, or + * creating the memslot (a nop). Doing it for deletes makes + * sure we don't leak memory, and there's no need to keep the + * cache around for any of the other cases. + */ + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); } } @@ -1877,7 +2016,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_free_stage2_pgd(&kvm->arch.mmu); + kvm_uninit_stage2_mmu(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0e571e973bc2..7651069ada46 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -991,6 +991,8 @@ static inline bool kvm_memslots_empty(struct kvm_memslots *slots) return RB_EMPTY_ROOT(&slots->gfn_tree); } +bool kvm_are_all_memslots_empty(struct kvm *kvm); + #define kvm_for_each_memslot(memslot, bkt, slots) \ hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \ if (WARN_ON_ONCE(!memslot->npages)) { \ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 737318b1c1d9..44edee0211fb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1190,6 +1190,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 #define KVM_CAP_COUNTER_OFFSET 227 +#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 +#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 479802a892d4..c2723ccf6f9c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4602,7 +4602,7 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, return -EINVAL; } -static bool kvm_are_all_memslots_empty(struct kvm *kvm) +bool kvm_are_all_memslots_empty(struct kvm *kvm) { int i; @@ -4615,6 +4615,7 @@ static bool kvm_are_all_memslots_empty(struct kvm *kvm) return true; } +EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty); static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) |