Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--	mm/vmalloc.c	176
1 file changed, 73 insertions(+), 103 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5a2b55c8dd9a..b482d240f9a2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -7,6 +7,7 @@
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
* Numa awareness, Christoph Lameter, SGI, June 2005
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
*/
#include <linux/vmalloc.h>
@@ -25,7 +26,7 @@
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
@@ -41,6 +42,7 @@
#include <asm/shmparam.h>
#include "internal.h"
+#include "pgalloc-track.h"
bool is_vmalloc_addr(const void *x)
{
@@ -173,7 +175,6 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
- start = addr;
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
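For context, the loop above follows the standard kernel page-table walk idiom. A minimal self-contained sketch of that idiom (walk_kernel_range is a hypothetical name, not the patched function; the pgd helpers are the real ones):

	/*
	 * Sketch of the canonical pgd walk: visit each PGD entry
	 * covering [addr, end), clamping every step at a PGD boundary.
	 */
	static void walk_kernel_range(unsigned long addr, unsigned long end)
	{
		pgd_t *pgd = pgd_offset_k(addr);	/* PGD slot of a kernel address */
		unsigned long next;

		do {
			next = pgd_addr_end(addr, end);	/* end of this PGD's range */
			if (pgd_none_or_clear_bad(pgd))
				continue;	/* the comma expression below still runs */
			/* ... descend to p4d/pud/pmd/pte for [addr, next) ... */
		} while (pgd++, addr = next, addr != end);
	}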
@@ -511,6 +512,10 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
/*
* This function returns back addresses of parent node
* and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case any further
+ * steps of inserting the conflicting, overlapping range
+ * have to be declined; such an overlap is considered a bug.
*/
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
@@ -549,8 +554,12 @@ find_va_links(struct vmap_area *va,
else if (va->va_end > tmp_va->va_start &&
va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
- else
- BUG();
+ else {
+ WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+ va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+ return NULL;
+ }
} while (*link);
*parent = &tmp_va->rb_node;
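The descent above reduces to a strict interval comparison. A simplified model of the decision (classify_range is a hypothetical helper, not part of the patch):

	/*
	 * A new range goes left if it ends at or before a node's start,
	 * right if it begins at or after the node's end; anything else
	 * overlaps and now takes the WARN()-and-return-NULL path
	 * instead of BUG().
	 */
	enum va_fit { VA_FIT_LEFT, VA_FIT_RIGHT, VA_OVERLAP };

	static enum va_fit
	classify_range(unsigned long start, unsigned long end,
		       unsigned long node_start, unsigned long node_end)
	{
		if (end <= node_start)
			return VA_FIT_LEFT;
		if (start >= node_end)
			return VA_FIT_RIGHT;
		return VA_OVERLAP;
	}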
@@ -632,43 +641,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
-augment_tree_propagate_check(struct rb_node *n)
+augment_tree_propagate_check(void)
{
struct vmap_area *va;
- struct rb_node *node;
- unsigned long size;
- bool found = false;
-
- if (n == NULL)
- return;
+ unsigned long computed_size;
- va = rb_entry(n, struct vmap_area, rb_node);
- size = va->subtree_max_size;
- node = n;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
-
- if (get_subtree_max_size(node->rb_left) == size) {
- node = node->rb_left;
- } else {
- if (va_size(va) == size) {
- found = true;
- break;
- }
-
- node = node->rb_right;
- }
- }
-
- if (!found) {
- va = rb_entry(n, struct vmap_area, rb_node);
- pr_emerg("tree is corrupted: %lu, %lu\n",
- va_size(va), va->subtree_max_size);
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ computed_size = compute_subtree_max_size(va);
+ if (computed_size != va->subtree_max_size)
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
}
-
- augment_tree_propagate_check(n->rb_left);
- augment_tree_propagate_check(n->rb_right);
}
#endif
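The rewritten check can walk the flat free list because each node's expected augmented value is derivable locally from its children. For reference, a sketch of that recomputation in the style of the helpers this file already uses (the exact in-tree helpers may differ slightly):

	/*
	 * subtree_max_size of a node is the largest free-block size
	 * anywhere in its subtree: its own size or a child's maximum.
	 */
	static unsigned long get_subtree_max_size(struct rb_node *node)
	{
		struct vmap_area *va;

		va = rb_entry_safe(node, struct vmap_area, rb_node);
		return va ? va->subtree_max_size : 0;
	}

	static unsigned long compute_subtree_max_size(struct vmap_area *va)
	{
		return max3(va_size(va),
			get_subtree_max_size(va->rb_node.rb_left),
			get_subtree_max_size(va->rb_node.rb_right));
	}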
@@ -702,28 +685,15 @@ augment_tree_propagate_check(struct rb_node *n)
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
- struct rb_node *node = &va->rb_node;
- unsigned long new_va_sub_max_size;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
- new_va_sub_max_size = compute_subtree_max_size(va);
-
- /*
- * If the newly calculated maximum available size of the
- * subtree is equal to the current one, then it means that
- * the tree is propagated correctly. So we have to stop at
- * this point to save cycles.
- */
- if (va->subtree_max_size == new_va_sub_max_size)
- break;
-
- va->subtree_max_size = new_va_sub_max_size;
- node = rb_parent(&va->rb_node);
- }
+ /*
+ * Populate the tree from the bottom towards the root until
+ * the calculated maximum available size of a checked node
+ * is equal to its current one.
+ */
+ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
#if DEBUG_AUGMENT_PROPAGATE_CHECK
- augment_tree_propagate_check(free_vmap_area_root.rb_node);
+ augment_tree_propagate_check();
#endif
}
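free_vmap_area_rb_augment_cb_propagate() is not hand-written; it comes from the augmented-rbtree callback template in <linux/rbtree_augmented.h>. A sketch of how such a callback set is declared with RB_DECLARE_CALLBACKS_MAX (the in-tree declaration should look close to this):

	/*
	 * Generates free_vmap_area_rb_augment_cb_propagate(), _copy()
	 * and _rotate(): the rbtree core then keeps each node's
	 * subtree_max_size equal to the maximum va_size() below it.
	 */
	RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
		struct vmap_area, rb_node, unsigned long,
		subtree_max_size, va_size)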
@@ -735,7 +705,8 @@ insert_vmap_area(struct vmap_area *va,
struct rb_node *parent;
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
+ if (link)
+ link_va(va, root, parent, link, head);
}
static void
@@ -751,8 +722,10 @@ insert_vmap_area_augment(struct vmap_area *va,
else
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
+ if (link) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
}
/*
@@ -760,6 +733,11 @@ insert_vmap_area_augment(struct vmap_area *va,
* and next free blocks. If coalesce is not done a new
* free area is inserted. If VA has been merged, it is
* freed.
+ *
+ * Please note, it can return NULL in case of overlapping
+ * ranges, following a WARN() report. Despite this being
+ * buggy behaviour, the system can stay alive and keep
+ * going.
*/
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
@@ -776,6 +754,8 @@ merge_or_add_vmap_area(struct vmap_area *va,
* inserted, unless it is merged with its sibling/siblings.
*/
link = find_va_links(va, root, NULL, &parent);
+ if (!link)
+ return NULL;
/*
* Get next node of VA to check if merging can be done.
@@ -796,9 +776,6 @@ merge_or_add_vmap_area(struct vmap_area *va,
if (sibling->va_start == va->va_end) {
sibling->va_start = va->va_start;
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -818,14 +795,18 @@ merge_or_add_vmap_area(struct vmap_area *va,
if (next->prev != head) {
sibling = list_entry(next->prev, struct vmap_area, list);
if (sibling->va_end == va->va_start) {
- sibling->va_end = va->va_end;
-
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
+ /*
+ * If both neighbors are coalesced, it is important
+ * to unlink the "next" node first and then merge with
+ * the "previous" one. Otherwise the tree might not be
+ * fully populated if a sibling's augmented value is
+ * "normalized" because of rotation operations.
+ */
if (merged)
unlink_va(va, root);
+ sibling->va_end = va->va_end;
+
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -836,11 +817,13 @@ merge_or_add_vmap_area(struct vmap_area *va,
}
insert:
- if (!merged) {
+ if (!merged)
link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
- }
+ /*
+ * The last step is to check and update the tree.
+ */
+ augment_tree_propagate_from(va);
return va;
}
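With this change merge_or_add_vmap_area() can return NULL, so every caller has to tolerate it, as the hunks below do. A hedged sketch of the caller-side pattern (release_free_range is a hypothetical example, not a function in this patch):

	static void release_free_range(struct vmap_area *va)
	{
		struct vmap_area *merged;

		merged = merge_or_add_vmap_area(va, &free_vmap_area_root,
						&free_vmap_area_list);
		if (!merged)
			return;	/* overlap already WARN()ed; do not touch va */

		/* only a non-NULL result is safe to dereference */
		pr_debug("free range: 0x%lx-0x%lx\n",
			merged->va_start, merged->va_end);
	}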
@@ -1381,6 +1364,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
va = merge_or_add_vmap_area(va, &free_vmap_area_root,
&free_vmap_area_list);
+ if (!va)
+ continue;
+
if (is_vmalloc_or_module_addr((void *)orig_start))
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
@@ -1513,12 +1499,11 @@ struct vmap_block {
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
* in the free path. Could get rid of this if we change the API to return a
* "cookie" from alloc, to be passed to free. But no big deal yet.
*/
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+static DEFINE_XARRAY(vmap_blocks);
/*
* We should probably have a fallback mechanism to allocate virtual memory
@@ -1575,13 +1560,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
- err = radix_tree_preload(gfp_mask);
- if (unlikely(err)) {
- kfree(vb);
- free_vmap_area(va);
- return ERR_PTR(err);
- }
-
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -1594,11 +1572,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
- spin_lock(&vmap_block_tree_lock);
- err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
- spin_unlock(&vmap_block_tree_lock);
- BUG_ON(err);
- radix_tree_preload_end();
+ err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+ if (err) {
+ kfree(vb);
+ free_vmap_area(va);
+ return ERR_PTR(err);
+ }
vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
@@ -1612,12 +1591,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
- unsigned long vb_idx;
- vb_idx = addr_to_vb_idx(vb->va->va_start);
- spin_lock(&vmap_block_tree_lock);
- tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
- spin_unlock(&vmap_block_tree_lock);
+ tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
free_vmap_area_noflush(vb->va);
@@ -1723,7 +1698,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
- unsigned long vb_idx;
unsigned int order;
struct vmap_block *vb;
@@ -1733,14 +1707,8 @@ static void vb_free(unsigned long addr, unsigned long size)
flush_cache_vunmap(addr, addr + size);
order = get_order(size);
-
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
-
- vb_idx = addr_to_vb_idx(addr);
- rcu_read_lock();
- vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
- rcu_read_unlock();
- BUG_ON(!vb);
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
unmap_kernel_range_noflush(addr, size);
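Taken together, the hunks above replace a spinlock-protected radix tree with a single XArray, which provides its own internal locking and RCU-safe lookups. A condensed, standalone sketch of the resulting lifecycle (the demo_* names are hypothetical; the xa_* calls are the real <linux/xarray.h> API):

	#include <linux/xarray.h>

	static DEFINE_XARRAY(demo_blocks);	/* stand-in for vmap_blocks */

	static int demo_store(unsigned long idx, void *block, gfp_t gfp)
	{
		/* fails with -EBUSY if idx is already occupied; no
		 * external lock or preload step is needed anymore */
		return xa_insert(&demo_blocks, idx, block, gfp);
	}

	static void *demo_find(unsigned long idx)
	{
		/* lockless lookup; RCU protection is internal, so the
		 * explicit rcu_read_lock()/unlock() pair goes away */
		return xa_load(&demo_blocks, idx);
	}

	static void *demo_remove(unsigned long idx)
	{
		/* returns the erased entry, mirroring radix_tree_delete() */
		return xa_erase(&demo_blocks, idx);
	}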
@@ -3383,8 +3351,9 @@ recovery:
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
}
@@ -3432,8 +3401,9 @@ err_free_shadow:
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
kfree(vms[area]);
}