Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--  mm/huge_memory.c  193
1 file changed, 136 insertions, 57 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2120f7478e55..f9696c94e211 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -20,6 +20,7 @@
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
+#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
@@ -150,10 +151,15 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
* Must be done before hugepage flags check since shmem has its
* own flags.
*/
- if (!in_pf && shmem_file(vma->vm_file))
- return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
- !enforce_sysfs, vma->vm_mm, vm_flags)
- ? orders : 0;
+ if (!in_pf && shmem_file(vma->vm_file)) {
+ bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
+ !enforce_sysfs, vma->vm_mm, vm_flags);
+
+ if (!vma_is_anon_shmem(vma))
+ return global_huge ? orders : 0;
+ return shmem_allowable_huge_orders(file_inode(vma->vm_file),
+ vma, vma->vm_pgoff, global_huge);
+ }
if (!vma_is_anonymous(vma)) {
/*
@@ -449,14 +455,6 @@ static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);
-struct thpsize {
- struct kobject kobj;
- struct list_head node;
- int order;
-};
-
-#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
-
static ssize_t thpsize_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -509,6 +507,13 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
} else
ret = -EINVAL;
+ if (ret > 0) {
+ int err;
+
+ err = start_stop_khugepaged();
+ if (err)
+ ret = err;
+ }
return ret;
}
@@ -517,6 +522,9 @@ static struct kobj_attribute thpsize_enabled_attr =
static struct attribute *thpsize_attrs[] = {
&thpsize_enabled_attr.attr,
+#ifdef CONFIG_SHMEM
+ &thpsize_shmem_enabled_attr.attr,
+#endif
NULL,
};
@@ -560,6 +568,12 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
+DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
+DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
+DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
static struct attribute *stats_attrs[] = {
&anon_fault_alloc_attr.attr,
@@ -567,6 +581,12 @@ static struct attribute *stats_attrs[] = {
&anon_fault_fallback_charge_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
+ &shmem_alloc_attr.attr,
+ &shmem_fallback_attr.attr,
+ &shmem_fallback_charge_attr.attr,
+ &split_attr.attr,
+ &split_failed_attr.attr,
+ &split_deferred_attr.attr,
NULL,
};
@@ -942,10 +962,10 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
goto release;
}
- clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
+ folio_zero_user(folio, vmf->address);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
- * clear_huge_page writes become visible before the set_pmd_at()
+ * folio_zero_user writes become visible before the set_pmd_at()
* write.
*/
__folio_mark_uptodate(folio);
@@ -972,7 +992,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- folio_add_new_anon_rmap(folio, vma, haddr);
+ folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
@@ -1624,7 +1644,7 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
return false;
/* Do we need write faults for softdirty tracking? */
- if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+ if (pmd_needs_soft_dirty_wp(vma, pmd))
return false;
/* Do we need write faults for uffd-wp tracking? */
@@ -1651,7 +1671,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int nid = NUMA_NO_NODE;
int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
- bool migrated = false, writable = false;
+ bool writable = false;
int flags = 0;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1687,16 +1707,17 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
if (node_is_toptier(nid))
last_cpupid = folio_last_cpupid(folio);
target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
- if (target_nid == NUMA_NO_NODE) {
- folio_put(folio);
+ if (target_nid == NUMA_NO_NODE)
+ goto out_map;
+ if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
+ flags |= TNF_MIGRATE_FAIL;
goto out_map;
}
-
+ /* The folio is isolated and isolation code holds a folio reference. */
spin_unlock(vmf->ptl);
writable = false;
- migrated = migrate_misplaced_folio(folio, vma, target_nid);
- if (migrated) {
+ if (!migrate_misplaced_folio(folio, vma, target_nid)) {
flags |= TNF_MIGRATED;
nid = target_nid;
} else {
@@ -2581,6 +2602,27 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pmd_populate(mm, pmd, pgtable);
}
+void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, bool freeze, struct folio *folio)
+{
+ VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
+ VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
+ VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
+ VM_BUG_ON(freeze && !folio);
+
+ /*
+ * When the caller requests to set up a migration entry, we
+ * require a folio to check the PMD against. Otherwise, there
+ * is a risk of replacing the wrong folio.
+ */
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
+ is_pmd_migration_entry(*pmd)) {
+ if (folio && folio != pmd_folio(*pmd))
+ return;
+ __split_huge_pmd_locked(vma, pmd, address, freeze);
+ }
+}
+
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct folio *folio)
{
@@ -2592,26 +2634,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
-
- /*
- * If caller asks to setup a migration entry, we need a folio to check
- * pmd against. Otherwise we can end up replacing wrong folio.
- */
- VM_BUG_ON(freeze && !folio);
- VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
-
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
- is_pmd_migration_entry(*pmd)) {
- /*
- * It's safe to call pmd_page when folio is set because it's
- * guaranteed that pmd is present.
- */
- if (folio && folio != pmd_folio(*pmd))
- goto out;
- __split_huge_pmd_locked(vma, pmd, range.start, freeze);
- }
-
-out:
+ split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
}
@@ -2685,6 +2708,71 @@ static void unmap_folio(struct folio *folio)
try_to_unmap_flush();
}
+static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp,
+ struct folio *folio)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int ref_count, map_count;
+ pmd_t orig_pmd = *pmdp;
+
+ if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
+ return false;
+
+ orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
+
+ /*
+ * Syncing against concurrent GUP-fast:
+ * - clear PMD; barrier; read refcount
+ * - inc refcount; barrier; read PMD
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for folio refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * If the folio or its PMD is redirtied at this point, or if there
+ * are unexpected references, we will give up to discard this folio
+ * and remap it.
+ *
+ * The only folio refs must be one from isolation plus the rmap(s).
+ */
+ if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
+ ref_count != map_count + 1) {
+ set_pmd_at(mm, addr, pmdp, orig_pmd);
+ return false;
+ }
+
+ folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
+ zap_deposited_table(mm, pmdp);
+ add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_drain_local();
+ folio_put(folio);
+
+ return true;
+}
+
+bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmdp, struct folio *folio)
+{
+ VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
+
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
+ return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
+
+ return false;
+}
+
static void remap_page(struct folio *folio, unsigned long nr)
{
int i = 0;
@@ -2838,7 +2926,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
split_page_memcg(head, order, new_order);
if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
- offset = swp_offset(folio->swap);
+ offset = swap_cache_index(folio->swap);
swap_cache = swap_address_space(folio->swap);
xa_lock(&swap_cache->i_pages);
}
@@ -2998,7 +3086,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- bool is_thp = folio_test_pmd_mappable(folio);
+ int order = folio_order(folio);
int extra_pins, ret;
pgoff_t end;
bool is_hzp;
@@ -3183,27 +3271,17 @@ out_unlock:
i_mmap_unlock_read(mapping);
out:
xas_destroy(&xas);
- if (is_thp)
+ if (order == HPAGE_PMD_ORDER)
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+ count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
return ret;
}
-void folio_undo_large_rmappable(struct folio *folio)
+void __folio_undo_large_rmappable(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
- if (folio_order(folio) <= 1)
- return;
-
- /*
- * At this point, there is no one trying to add the folio to
- * deferred_list. If folio is not in deferred_list, it's safe
- * to check without acquiring the split_queue_lock.
- */
- if (data_race(list_empty(&folio->_deferred_list)))
- return;
-
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(&folio->_deferred_list)) {
@@ -3248,6 +3326,7 @@ void deferred_split_folio(struct folio *folio)
if (list_empty(&folio->_deferred_list)) {
if (folio_test_pmd_mappable(folio))
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG