summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-08 17:52:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-08 17:52:23 -0700
commitf6f7a6369203fa3e07efb7f35cfd81efe9f25b07 (patch)
tree97bec9ddd999040822acf314647eaf4208213589 /mm
parent839fe9156fbe89c3157aa6146d22090f8cffddd8 (diff)
parentdf69f52d990bd85159727bd26e819d3a6e49c666 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton: "Almost all of the rest of MM. There was an unusually large amount of MM material this time" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits) zpool: remove no-op module init/exit mm: zbud: constify the zbud_ops mm: zpool: constify the zpool_ops mm: swap: zswap: maybe_preload & refactoring zram: unify error reporting zsmalloc: remove null check from destroy_handle_cache() zsmalloc: do not take class lock in zs_shrinker_count() zsmalloc: use class->pages_per_zspage zsmalloc: consider ZS_ALMOST_FULL as migrate source zsmalloc: partial page ordering within a fullness_list zsmalloc: use shrinker to trigger auto-compaction zsmalloc: account the number of compacted pages zsmalloc/zram: introduce zs_pool_stats api zsmalloc: cosmetic compaction code adjustments zsmalloc: introduce zs_can_compact() function zsmalloc: always keep per-class stats zsmalloc: drop unused variable `nr_to_migrate' mm/memblock.c: fix comment in __next_mem_range() mm/page_alloc.c: fix type information of memoryless node memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_spanned_pages_in_node() ...
Diffstat (limited to 'mm')
-rw-r--r--mm/bootmem.c7
-rw-r--r--mm/compaction.c175
-rw-r--r--mm/dmapool.c12
-rw-r--r--mm/early_ioremap.c22
-rw-r--r--mm/filemap.c36
-rw-r--r--mm/huge_memory.c163
-rw-r--r--mm/hugetlb.c432
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/internal.h1
-rw-r--r--mm/kmemleak.c3
-rw-r--r--mm/list_lru.c4
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memblock.c31
-rw-r--r--mm/memcontrol.c394
-rw-r--r--mm/memory-failure.c103
-rw-r--r--mm/memory.c48
-rw-r--r--mm/mempolicy.c7
-rw-r--r--mm/mempool.c3
-rw-r--r--mm/memtest.c27
-rw-r--r--mm/migrate.c13
-rw-r--r--mm/mmap.c71
-rw-r--r--mm/oom_kill.c142
-rw-r--r--mm/page_alloc.c80
-rw-r--r--mm/page_isolation.c35
-rw-r--r--mm/shmem.c16
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slab_common.c5
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c2
-rw-r--r--mm/swap_state.c37
-rw-r--r--mm/swapfile.c42
-rw-r--r--mm/vmscan.c14
-rw-r--r--mm/zbud.c10
-rw-r--r--mm/zpool.c18
-rw-r--r--mm/zsmalloc.c235
-rw-r--r--mm/zswap.c75
36 files changed, 1243 insertions, 1030 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a23dd1934654..3b6380784c28 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -236,6 +236,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
count += pages;
while (pages--)
__free_pages_bootmem(page++, cur++, 0);
+ bdata->node_bootmem_map = NULL;
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
@@ -294,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata,
sidx + bdata->node_min_pfn,
eidx + bdata->node_min_pfn);
+ if (WARN_ON(bdata->node_bootmem_map == NULL))
+ return;
+
if (bdata->hint_idx > sidx)
bdata->hint_idx = sidx;
@@ -314,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
eidx + bdata->node_min_pfn,
flags);
+ if (WARN_ON(bdata->node_bootmem_map == NULL))
+ return 0;
+
for (idx = sidx; idx < eidx; idx++)
if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
if (exclusive) {
diff --git a/mm/compaction.c b/mm/compaction.c
index 018f08da99a2..c5c627aae996 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -207,6 +207,13 @@ static inline bool isolation_suitable(struct compact_control *cc,
return !get_pageblock_skip(page);
}
+static void reset_cached_positions(struct zone *zone)
+{
+ zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+ zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+ zone->compact_cached_free_pfn = zone_end_pfn(zone);
+}
+
/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
@@ -218,9 +225,6 @@ static void __reset_isolation_suitable(struct zone *zone)
unsigned long end_pfn = zone_end_pfn(zone);
unsigned long pfn;
- zone->compact_cached_migrate_pfn[0] = start_pfn;
- zone->compact_cached_migrate_pfn[1] = start_pfn;
- zone->compact_cached_free_pfn = end_pfn;
zone->compact_blockskip_flush = false;
/* Walk the zone and mark every pageblock as suitable for isolation */
@@ -238,6 +242,8 @@ static void __reset_isolation_suitable(struct zone *zone)
clear_pageblock_skip(page);
}
+
+ reset_cached_positions(zone);
}
void reset_isolation_suitable(pg_data_t *pgdat)
@@ -431,6 +437,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (!valid_page)
valid_page = page;
+
+ /*
+ * For compound pages such as THP and hugetlbfs, we can save
+ * potentially a lot of iterations if we skip them at once.
+ * The check is racy, but we can consider only valid values
+ * and the only danger is skipping too much.
+ */
+ if (PageCompound(page)) {
+ unsigned int comp_order = compound_order(page);
+
+ if (likely(comp_order < MAX_ORDER)) {
+ blockpfn += (1UL << comp_order) - 1;
+ cursor += (1UL << comp_order) - 1;
+ }
+
+ goto isolate_fail;
+ }
+
if (!PageBuddy(page))
goto isolate_fail;
@@ -490,6 +514,13 @@ isolate_fail:
}
+ /*
+ * There is a tiny chance that we have read bogus compound_order(),
+ * so be careful to not go outside of the pageblock.
+ */
+ if (unlikely(blockpfn > end_pfn))
+ blockpfn = end_pfn;
+
trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
nr_scanned, total_isolated);
@@ -674,6 +705,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
+ bool is_lru;
+
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort async compaction
@@ -717,36 +750,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* It's possible to migrate LRU pages and balloon pages
* Skip any other type of page
*/
- if (!PageLRU(page)) {
+ is_lru = PageLRU(page);
+ if (!is_lru) {
if (unlikely(balloon_page_movable(page))) {
if (balloon_page_isolate(page)) {
/* Successfully isolated */
goto isolate_success;
}
}
- continue;
}
/*
- * PageLRU is set. lru_lock normally excludes isolation
- * splitting and collapsing (collapsing has already happened
- * if PageLRU is set) but the lock is not necessarily taken
- * here and it is wasteful to take it just to check transhuge.
- * Check TransHuge without lock and skip the whole pageblock if
- * it's either a transhuge or hugetlbfs page, as calling
- * compound_order() without preventing THP from splitting the
- * page underneath us may return surprising results.
+ * Regardless of being on LRU, compound pages such as THP and
+ * hugetlbfs are not to be compacted. We can potentially save
+ * a lot of iterations if we skip them at once. The check is
+ * racy, but we can consider only valid values and the only
+ * danger is skipping too much.
*/
- if (PageTransHuge(page)) {
- if (!locked)
- low_pfn = ALIGN(low_pfn + 1,
- pageblock_nr_pages) - 1;
- else
- low_pfn += (1 << compound_order(page)) - 1;
+ if (PageCompound(page)) {
+ unsigned int comp_order = compound_order(page);
+
+ if (likely(comp_order < MAX_ORDER))
+ low_pfn += (1UL << comp_order) - 1;
continue;
}
+ if (!is_lru)
+ continue;
+
/*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
@@ -763,11 +795,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!locked)
break;
- /* Recheck PageLRU and PageTransHuge under lock */
+ /* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
continue;
- if (PageTransHuge(page)) {
- low_pfn += (1 << compound_order(page)) - 1;
+
+ /*
+ * Page become compound since the non-locked check,
+ * and it's on LRU. It can only be a THP so the order
+ * is safe to read and it's 0 for tail pages.
+ */
+ if (unlikely(PageCompound(page))) {
+ low_pfn += (1UL << compound_order(page)) - 1;
continue;
}
}
@@ -778,7 +816,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (__isolate_lru_page(page, isolate_mode) != 0)
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page), page);
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -898,6 +936,16 @@ static bool suitable_migration_target(struct page *page)
}
/*
+ * Test whether the free scanner has reached the same or lower pageblock than
+ * the migration scanner, and compaction should thus terminate.
+ */
+static inline bool compact_scanners_met(struct compact_control *cc)
+{
+ return (cc->free_pfn >> pageblock_order)
+ <= (cc->migrate_pfn >> pageblock_order);
+}
+
+/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
@@ -933,8 +981,7 @@ static void isolate_freepages(struct compact_control *cc)
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; block_start_pfn >= low_pfn &&
- cc->nr_migratepages > cc->nr_freepages;
+ for (; block_start_pfn >= low_pfn;
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
@@ -966,6 +1013,8 @@ static void isolate_freepages(struct compact_control *cc)
block_end_pfn, freelist, false);
/*
+ * If we isolated enough freepages, or aborted due to async
+ * compaction being contended, terminate the loop.
* Remember where the free scanner should restart next time,
* which is where isolate_freepages_block() left off.
* But if it scanned the whole pageblock, isolate_start_pfn
@@ -974,27 +1023,31 @@ static void isolate_freepages(struct compact_control *cc)
* In that case we will however want to restart at the start
* of the previous pageblock.
*/
- cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
- isolate_start_pfn :
- block_start_pfn - pageblock_nr_pages;
-
- /*
- * isolate_freepages_block() might have aborted due to async
- * compaction being contended
- */
- if (cc->contended)
+ if ((cc->nr_freepages >= cc->nr_migratepages)
+ || cc->contended) {
+ if (isolate_start_pfn >= block_end_pfn)
+ isolate_start_pfn =
+ block_start_pfn - pageblock_nr_pages;
break;
+ } else {
+ /*
+ * isolate_freepages_block() should not terminate
+ * prematurely unless contended, or isolated enough
+ */
+ VM_BUG_ON(isolate_start_pfn < block_end_pfn);
+ }
}
/* split_free_page does not map the pages */
map_pages(freelist);
/*
- * If we crossed the migrate scanner, we want to keep it that way
- * so that compact_finished() may detect this
+ * Record where the free scanner will restart next time. Either we
+ * broke from the loop and set isolate_start_pfn based on the last
+ * call to isolate_freepages_block(), or we met the migration scanner
+ * and the loop terminated due to isolate_start_pfn < low_pfn
*/
- if (block_start_pfn < low_pfn)
- cc->free_pfn = cc->migrate_pfn;
+ cc->free_pfn = isolate_start_pfn;
}
/*
@@ -1062,6 +1115,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
+ unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1110,6 +1164,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
/* Perform the isolation */
+ isolate_start_pfn = low_pfn;
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
isolate_mode);
@@ -1119,6 +1174,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
/*
+ * Record where we could have freed pages by migration and not
+ * yet flushed them to buddy allocator.
+ * - this is the lowest page that could have been isolated and
+ * then freed by migration.
+ */
+ if (cc->nr_migratepages && !cc->last_migrated_pfn)
+ cc->last_migrated_pfn = isolate_start_pfn;
+
+ /*
* Either we isolated something and proceed with migration. Or
* we failed and compact_zone should decide if we should
* continue or not.
@@ -1127,12 +1191,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
acct_isolated(zone, cc);
- /*
- * Record where migration scanner will be restarted. If we end up in
- * the same pageblock as the free scanner, make the scanners fully
- * meet so that compact_finished() terminates compaction.
- */
- cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
+ /* Record where migration scanner will be restarted. */
+ cc->migrate_pfn = low_pfn;
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
@@ -1147,11 +1207,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_PARTIAL;
/* Compaction run completes if the migrate and free scanner meet */
- if (cc->free_pfn <= cc->migrate_pfn) {
+ if (compact_scanners_met(cc)) {
/* Let the next compaction start anew. */
- zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
- zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
+ reset_cached_positions(zone);
/*
* Mark that the PG_migrate_skip information should be cleared
@@ -1295,7 +1353,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
unsigned long end_pfn = zone_end_pfn(zone);
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
- unsigned long last_migrated_pfn = 0;
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
@@ -1333,6 +1390,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
+ cc->last_migrated_pfn = 0;
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
@@ -1342,7 +1400,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
while ((ret = compact_finished(zone, cc, migratetype)) ==
COMPACT_CONTINUE) {
int err;
- unsigned long isolate_start_pfn = cc->migrate_pfn;
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
@@ -1376,22 +1433,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
* migrate_pages() may return -ENOMEM when scanners meet
* and we want compact_finished() to detect it
*/
- if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
+ if (err == -ENOMEM && !compact_scanners_met(cc)) {
ret = COMPACT_PARTIAL;
goto out;
}
}
- /*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator. We use the pfn that
- * isolate_migratepages() started from in this loop iteration
- * - this is the lowest page that could have been isolated and
- * then freed by migration.
- */
- if (!last_migrated_pfn)
- last_migrated_pfn = isolate_start_pfn;
-
check_drain:
/*
* Has the migration scanner moved away from the previous
@@ -1400,18 +1447,18 @@ check_drain:
* compact_finished() can detect immediately if allocation
* would succeed.
*/
- if (cc->order > 0 && last_migrated_pfn) {
+ if (cc->order > 0 && cc->last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
cc->migrate_pfn & ~((1UL << cc->order) - 1);
- if (last_migrated_pfn < current_block_start) {
+ if (cc->last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
drain_local_pages(zone);
put_cpu();
/* No more flushing until we migrate again */
- last_migrated_pfn = 0;
+ cc->last_migrated_pfn = 0;
}
}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 59d10d16f0a5..71a8998cd03a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool)
{
bool empty = false;
+ if (unlikely(!pool))
+ return;
+
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
list_del(&pool->pools);
@@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
spin_unlock_irqrestore(&pool->lock, flags);
- page = pool_alloc_page(pool, mem_flags);
+ page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
if (!page)
return NULL;
@@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
break;
}
}
- memset(retval, POOL_POISON_ALLOCATED, pool->size);
+ if (!(mem_flags & __GFP_ZERO))
+ memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
spin_unlock_irqrestore(&pool->lock, flags);
+
+ if (mem_flags & __GFP_ZERO)
+ memset(retval, 0, pool->size);
+
return retval;
}
EXPORT_SYMBOL(dma_pool_alloc);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 0cfadafb3fb0..23f744d77ce0 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -224,6 +224,28 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size)
return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
}
#endif
+
+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+ unsigned long slop, clen;
+ char *p;
+
+ while (size) {
+ slop = src & ~PAGE_MASK;
+ clen = size;
+ if (clen > MAX_MAP_CHUNK - slop)
+ clen = MAX_MAP_CHUNK - slop;
+ p = early_memremap(src & PAGE_MASK, clen + slop);
+ memcpy(dest, p + slop, clen);
+ early_memunmap(p, clen + slop);
+ dest += clen;
+ src += clen;
+ size -= clen;
+ }
+}
+
#else /* CONFIG_MMU */
void __init __iomem *
diff --git a/mm/filemap.c b/mm/filemap.c
index 1283fc825458..72940fb38666 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -674,7 +674,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
do {
cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
- page = alloc_pages_exact_node(n, gfp, 0);
+ page = __alloc_pages_node(n, gfp, 0);
} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
return page;
@@ -2473,21 +2473,6 @@ ssize_t generic_perform_write(struct file *file,
iov_iter_count(i));
again:
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
- status = -EFAULT;
- break;
- }
-
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status < 0))
@@ -2495,8 +2480,17 @@ again:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
-
+ /*
+ * 'page' is now locked. If we are trying to copy from a
+ * mapping of 'page' in userspace, the copy might fault and
+ * would need PageUptodate() to complete. But, page can not be
+ * made Uptodate without acquiring the page lock, which we hold.
+ * Deadlock. Avoid with pagefault_disable(). Fix up below with
+ * iov_iter_fault_in_readable().
+ */
+ pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
flush_dcache_page(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
@@ -2519,6 +2513,14 @@ again:
*/
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_single_seg_count(i));
+ /*
+ * This is the fallback to recover if the copy from
+ * userspace above faults.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
goto again;
}
pos += copied;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 279a818a39b1..b16279cbd91d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
+#include <linux/dax.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
@@ -105,7 +106,7 @@ static struct khugepaged_scan khugepaged_scan = {
};
-static int set_recommended_min_free_kbytes(void)
+static void set_recommended_min_free_kbytes(void)
{
struct zone *zone;
int nr_zones = 0;
@@ -140,7 +141,6 @@ static int set_recommended_min_free_kbytes(void)
min_free_kbytes = recommended_min;
}
setup_per_zone_wmarks();
- return 0;
}
static int start_stop_khugepaged(void)
@@ -172,12 +172,7 @@ fail:
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
- return is_huge_zero_page(pmd_page(pmd));
-}
-
-static struct page *get_huge_zero_page(void)
+struct page *get_huge_zero_page(void)
{
struct page *zero_page;
retry:
@@ -794,16 +789,19 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
}
/* Caller must hold page table lock. */
-static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
{
pmd_t entry;
+ if (!pmd_none(*pmd))
+ return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
atomic_long_inc(&mm->nr_ptes);
+ return true;
}
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -870,6 +868,49 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
flags);
}
+static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t entry;
+ spinlock_t *ptl;
+
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_none(*pmd)) {
+ entry = pmd_mkhuge(pfn_pmd(pfn, prot));
+ if (write) {
+ entry = pmd_mkyoung(pmd_mkdirty(entry));
+ entry = maybe_pmd_mkwrite(entry, vma);
+ }
+ set_pmd_at(mm, addr, pmd, entry);
+ update_mmu_cache_pmd(vma, addr, pmd);
+ }
+ spin_unlock(ptl);
+}
+
+int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned long pfn, bool write)
+{
+ pgprot_t pgprot = vma->vm_page_prot;
+ /*
+ * If we had pmd_special, we could avoid all these restrictions,
+ * but we need to be consistent with PTEs and architectures that
+ * can't support a 'special' bit.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+ BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+ if (track_pfn_insert(vma, &pgprot, pfn))
+ return VM_FAULT_SIGBUS;
+ insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
+ return VM_FAULT_NOPAGE;
+}
+
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
@@ -1414,41 +1455,41 @@ out:
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
+ pmd_t orig_pmd;
spinlock_t *ptl;
- int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- struct page *page;
- pgtable_t pgtable;
- pmd_t orig_pmd;
- /*
- * For architectures like ppc64 we look at deposited pgtable
- * when calling pmdp_huge_get_and_clear. So do the
- * pgtable_trans_huge_withdraw after finishing pmdp related
- * operations.
- */
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
- tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
- if (is_huge_zero_pmd(orig_pmd)) {
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+ return 0;
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pmdp_huge_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pmdp related
+ * operations.
+ */
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ if (vma_is_dax(vma)) {
+ spin_unlock(ptl);
+ if (is_huge_zero_pmd(orig_pmd))
put_huge_zero_page();
- } else {
- page = pmd_page(orig_pmd);
- page_remove_rmap(page);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(!PageHead(page), page);
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
- tlb_remove_page(tlb, page);
- }
- pte_free(tlb->mm, pgtable);
- ret = 1;
+ } else if (is_huge_zero_pmd(orig_pmd)) {
+ pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
+ put_huge_zero_page();
+ } else {
+ struct page *page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
+ tlb_remove_page(tlb, page);
}
- return ret;
+ return 1;
}
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
@@ -2285,8 +2326,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
static void khugepaged_alloc_sleep(void)
{
- wait_event_freezable_timeout(khugepaged_wait, false,
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ DEFINE_WAIT(wait);
+
+ add_wait_queue(&khugepaged_wait, &wait);
+ freezable_schedule_timeout_interruptible(
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
}
static int khugepaged_node_load[MAX_NUMNODES];
@@ -2373,7 +2418,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
*/
up_read(&mm->mmap_sem);
- *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
+ *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
@@ -2911,7 +2956,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd)
{
spinlock_t *ptl;
- struct page *page;
+ struct page *page = NULL;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2924,25 +2969,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- return;
- }
- if (is_huge_zero_pmd(*pmd)) {
+ if (unlikely(!pmd_trans_huge(*pmd)))
+ goto unlock;
+ if (vma_is_dax(vma)) {
+ pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ if (is_huge_zero_pmd(_pmd))
+ put_huge_zero_page();
+ } else if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- return;
+ } else {
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ get_page(page);
}
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
- get_page(page);
+ unlock:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- split_huge_page(page);
+ if (!page)
+ return;
+ split_huge_page(page);
put_page(page);
/*
@@ -2991,7 +3038,7 @@ static void split_huge_page_address(struct mm_struct *mm,
split_huge_page_pmd_mm(mm, address, pmd);
}
-void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51ae41d0fbc0..999fb0aef8f1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
* prevent spurious OOMs when the hugepage pool is fully utilized.
*/
static int num_fault_mutexes;
-static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -240,11 +240,14 @@ struct file_region {
/*
* Add the huge page range represented by [f, t) to the reserve
- * map. Existing regions will be expanded to accommodate the
- * specified range. We know only existing regions need to be
- * expanded, because region_add is only called after region_chg
- * with the same range. If a new file_region structure must
- * be allocated, it is done in region_chg.
+ * map. In the normal case, existing regions will be expanded
+ * to accommodate the specified range. Sufficient regions should
+ * exist for expansion due to the previous call to region_chg
+ * with the same range. However, it is possible that region_del
+ * could have been called after region_chg and modifed the map
+ * in such a way that no region exists to be expanded. In this
+ * case, pull a region descriptor from the cache associated with
+ * the map and use that for the new range.
*
* Return the number of new huge pages added to the map. This
* number is greater than or equal to zero.
@@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
if (f <= rg->to)
break;
+ /*
+ * If no region exists which can be expanded to include the
+ * specified range, the list must have been modified by an
+ * interleving call to region_del(). Pull a region descriptor
+ * from the cache and use it for this range.
+ */
+ if (&rg->link == head || t < rg->from) {
+ VM_BUG_ON(resv->region_cache_count <= 0);
+
+ resv->region_cache_count--;
+ nrg = list_first_entry(&resv->region_cache, struct file_region,
+ link);
+ list_del(&nrg->link);
+
+ nrg->from = f;
+ nrg->to = t;
+ list_add(&nrg->link, rg->link.prev);
+
+ add += t - f;
+ goto out_locked;
+ }
+
/* Round our left edge to the current segment if it encloses us. */
if (f > rg->from)
f = rg->from;
@@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
add += t - nrg->to; /* Added to end of region */
nrg->to = t;
+out_locked:
+ resv->adds_in_progress--;
spin_unlock(&resv->lock);
VM_BUG_ON(add < 0);
return add;
@@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
* so that the subsequent region_add call will have all the
* regions it needs and will not fail.
*
- * Returns the number of huge pages that need to be added
- * to the existing reservation map for the range [f, t).
- * This number is greater or equal to zero. -ENOMEM is
- * returned if a new file_region structure is needed and can
- * not be allocated.
+ * Upon entry, region_chg will also examine the cache of region descriptors
+ * associated with the map. If there are not enough descriptors cached, one
+ * will be allocated for the in progress add operation.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t). This number is greater or equal to
+ * zero. -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
*/
static long region_chg(struct resv_map *resv, long f, long t)
{
@@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
retry:
spin_lock(&resv->lock);
+retry_locked:
+ resv->adds_in_progress++;
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * the number of in progress add operations.
+ */
+ if (resv->adds_in_progress > resv->region_cache_count) {
+ struct file_region *trg;
+
+ VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+ /* Must drop lock to allocate a new descriptor. */
+ resv->adds_in_progress--;
+ spin_unlock(&resv->lock);
+
+ trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+ if (!trg)
+ return -ENOMEM;
+
+ spin_lock(&resv->lock);
+ list_add(&trg->link, &resv->region_cache);
+ resv->region_cache_count++;
+ goto retry_locked;
+ }
+
/* Locate the region we are before or in. */
list_for_each_entry(rg, head, link)
if (f <= rg->to)
@@ -336,6 +391,7 @@ retry:
* size such that we can guarantee to record the reservation. */
if (&rg->link == head || t < rg->from) {
if (!nrg) {
+ resv->adds_in_progress--;
spin_unlock(&resv->lock);
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
if (!nrg)
@@ -385,43 +441,131 @@ out_nrg:
}
/*
- * Truncate the reserve map at index 'end'. Modify/truncate any
- * region which contains end. Delete any regions past end.
- * Return the number of huge pages removed from the map.
+ * Abort the in progress add operation. The adds_in_progress field
+ * of the resv_map keeps track of the operations in progress between
+ * calls to region_chg and region_add. Operations are sometimes
+ * aborted after the call to region_chg. In such cases, region_abort
+ * is called to decrement the adds_in_progress counter.
+ *
+ * NOTE: The range arguments [f, t) are not needed or used in this
+ * routine. They are kept to make reading the calling code easier as
+ * arguments will match the associated region_chg call.
*/
-static long region_truncate(struct resv_map *resv, long end)
+static void region_abort(struct resv_map *resv, long f, long t)
+{
+ spin_lock(&resv->lock);
+ VM_BUG_ON(!resv->region_cache_count);
+ resv->adds_in_progress--;
+ spin_unlock(&resv->lock);
+}
+
+/*
+ * Delete the specified range [f, t) from the reserve map. If the
+ * t parameter is LONG_MAX, this indicates that ALL regions after f
+ * should be deleted. Locate the regions which intersect [f, t)
+ * and either trim, delete or split the existing regions.
+ *
+ * Returns the number of huge pages deleted from the reserve map.
+ * In the normal case, the return value is zero or more. In the
+ * case where a region must be split, a new region descriptor must
+ * be allocated. If the allocation fails, -ENOMEM will be returned.
+ * NOTE: If the parameter t == LONG_MAX, then we will never split
+ * a region and possibly return -ENOMEM. Callers specifying
+ * t == LONG_MAX do not need to check for -ENOMEM error.
+ */
+static long region_del(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg, *trg;
- long chg = 0;
+ struct file_region *nrg = NULL;
+ long del = 0;
+retry:
spin_lock(&resv->lock);
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (end <= rg->to)
+ list_for_each_entry_safe(rg, trg, head, link) {
+ if (rg->to <= f)
+ continue;
+ if (rg->from >= t)
break;
- if (&rg->link == head)
- goto out;
- /* If we are in the middle of a region then adjust it. */
- if (end > rg->from) {
- chg = rg->to - end;
- rg->to = end;
- rg = list_entry(rg->link.next, typeof(*rg), link);
- }
+ if (f > rg->from && t < rg->to) { /* Must split region */
+ /*
+ * Check for an entry in the cache before dropping
+ * lock and attempting allocation.
+ */
+ if (!nrg &&
+ resv->region_cache_count > resv->adds_in_progress) {
+ nrg = list_first_entry(&resv->region_cache,
+ struct file_region,
+ link);
+ list_del(&nrg->link);
+ resv->region_cache_count--;
+ }
- /* Drop any remaining regions. */
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
+ if (!nrg) {
+ spin_unlock(&resv->lock);
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+ goto retry;
+ }
+
+ del += t - f;
+
+ /* New entry for end of split region */
+ nrg->from = t;
+ nrg->to = rg->to;
+ INIT_LIST_HEAD(&nrg->link);
+
+ /* Original entry is trimmed */
+ rg->to = f;
+
+ list_add(&nrg->link, &rg->link);
+ nrg = NULL;
break;
- chg += rg->to - rg->from;
- list_del(&rg->link);
- kfree(rg);
+ }
+
+ if (f <= rg->from && t >= rg->to) { /* Remove entire region */
+ del += rg->to - rg->from;
+ list_del(&rg->link);
+ kfree(rg);
+ continue;
+ }
+
+ if (f <= rg->from) { /* Trim beginning of region */
+ del += t - rg->from;
+ rg->from = t;
+ } else { /* Trim end of region */
+ del += rg->to - f;
+ rg->to = f;
+ }
}
-out:
spin_unlock(&resv->lock);
- return chg;
+ kfree(nrg);
+ return del;
+}
+
+/*
+ * A rare out of memory error was encountered which prevented removal of
+ * the reserve map region for a page. The huge page itself was free'ed
+ * and removed from the page cache. This routine will adjust the subpool
+ * usage count, and the global reserve count if needed. By incrementing
+ * these counts, the reserve map entry which could not be deleted will
+ * appear as a "reserved" entry instead of simply dangling with incorrect
+ * counts.
+ */
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+{
+ struct hugepage_subpool *spool = subpool_inode(inode);
+ long rsv_adjust;
+
+ rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+ if (restore_reserve && rsv_adjust) {
+ struct hstate *h = hstate_inode(inode);
+
+ hugetlb_acct_memory(h, 1);
+ }
}
/*
@@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
- if (!resv_map)
+ struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
+
+ if (!resv_map || !rg) {
+ kfree(resv_map);
+ kfree(rg);
return NULL;
+ }
kref_init(&resv_map->refs);
spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
+ resv_map->adds_in_progress = 0;
+
+ INIT_LIST_HEAD(&resv_map->region_cache);
+ list_add(&rg->link, &resv_map->region_cache);
+ resv_map->region_cache_count = 1;
+
return resv_map;
}
void resv_map_release(struct kref *ref)
{
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+ struct list_head *head = &resv_map->region_cache;
+ struct file_region *rg, *trg;
/* Clear out any active regions before we release the map. */
- region_truncate(resv_map, 0);
+ region_del(resv_map, 0, LONG_MAX);
+
+ /* ... and any entries left in the cache */
+ list_for_each_entry_safe(rg, trg, head, link) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+
+ VM_BUG_ON(resv_map->adds_in_progress);
+
kfree(resv_map);
}
@@ -635,8 +801,19 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
}
/* Shared mappings always use reserves */
- if (vma->vm_flags & VM_MAYSHARE)
- return true;
+ if (vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * We know VM_NORESERVE is not set. Therefore, there SHOULD
+ * be a region map for all pages. The only situation where
+ * there is no region map is if a hole was punched via
+ * fallocate. In this case, there really are no reverves to
+ * use. This situation is indicated if chg != 0.
+ */
+ if (chg)
+ return false;
+ else
+ return true;
+ }
/*
* Only the process that called mmap() has reserves for
@@ -1154,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
- page = alloc_pages_exact_node(nid,
+ page = __alloc_pages_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
@@ -1306,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
else
- page = alloc_pages_exact_node(nid,
+ page = __alloc_pages_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
@@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h,
}
}
+
/*
- * vma_needs_reservation and vma_commit_reservation are used by the huge
- * page allocation routines to manage reservations.
+ * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
+ * are used by the huge page allocation routines to manage reservations.
*
* vma_needs_reservation is called to determine if the huge page at addr
* within the vma has an associated reservation. If a reservation is
* needed, the value 1 is returned. The caller is then responsible for
* managing the global reservation and subpool usage counts. After
* the huge page has been allocated, vma_commit_reservation is called
- * to add the page to the reservation map.
+ * to add the page to the reservation map. If the page allocation fails,
+ * the reservation must be ended instead of committed. vma_end_reservation
+ * is called in such cases.
*
* In the normal case, vma_commit_reservation returns the same value
* as the preceding vma_needs_reservation call. The only time this
@@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h,
* is the responsibility of the caller to notice the difference and
* take appropriate action.
*/
+enum vma_resv_mode {
+ VMA_NEEDS_RESV,
+ VMA_COMMIT_RESV,
+ VMA_END_RESV,
+};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
- bool commit)
+ enum vma_resv_mode mode)
{
struct resv_map *resv;
pgoff_t idx;
@@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h,
return 1;
idx = vma_hugecache_offset(h, vma, addr);
- if (commit)
- ret = region_add(resv, idx, idx + 1);
- else
+ switch (mode) {
+ case VMA_NEEDS_RESV:
ret = region_chg(resv, idx, idx + 1);
+ break;
+ case VMA_COMMIT_RESV:
+ ret = region_add(resv, idx, idx + 1);
+ break;
+ case VMA_END_RESV:
+ region_abort(resv, idx, idx + 1);
+ ret = 0;
+ break;
+ default:
+ BUG();
+ }
if (vma->vm_flags & VM_MAYSHARE)
return ret;
@@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h,
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __vma_reservation_common(h, vma, addr, false);
+ return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
}
static long vma_commit_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __vma_reservation_common(h, vma, addr, true);
+ return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
+}
+
+static void vma_end_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
}
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- long chg, commit;
+ long map_chg, map_commit;
+ long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
idx = hstate_index(h);
/*
- * Processes that did not create the mapping will have no
- * reserves and will not have accounted against subpool
- * limit. Check that the subpool limit can be made before
- * satisfying the allocation MAP_NORESERVE mappings may also
- * need pages and subpool limit allocated allocated if no reserve
- * mapping overlaps.
+ * Examine the region/reserve map to determine if the process
+ * has a reservation for the page to be allocated. A return
+ * code of zero indicates a reservation exists (no change).
*/
- chg = vma_needs_reservation(h, vma, addr);
- if (chg < 0)
+ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
+ if (map_chg < 0)
return ERR_PTR(-ENOMEM);
- if (chg || avoid_reserve)
- if (hugepage_subpool_get_pages(spool, 1) < 0)
+
+ /*
+ * Processes that did not create the mapping will have no
+ * reserves as indicated by the region/reserve map. Check
+ * that the allocation will not exceed the subpool limit.
+ * Allocations for MAP_NORESERVE mappings also need to be
+ * checked against any subpool limit.
+ */
+ if (map_chg || avoid_reserve) {
+ gbl_chg = hugepage_subpool_get_pages(spool, 1);
+ if (gbl_chg < 0) {
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
+ }
+
+ /*
+ * Even though there was no reservation in the region/reserve
+ * map, there could be reservations associated with the
+ * subpool that can be used. This would be indicated if the
+ * return value of hugepage_subpool_get_pages() is zero.
+ * However, if avoid_reserve is specified we still avoid even
+ * the subpool reservations.
+ */
+ if (avoid_reserve)
+ gbl_chg = 1;
+ }
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret)
goto out_subpool_put;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
+ /*
+ * glb_chg is passed to indicate whether or not a page must be taken
+ * from the global free pool (global change). gbl_chg == 0 indicates
+ * a reservation exists for the allocation.
+ */
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
@@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
set_page_private(page, (unsigned long)spool);
- commit = vma_commit_reservation(h, vma, addr);
- if (unlikely(chg > commit)) {
+ map_commit = vma_commit_reservation(h, vma, addr);
+ if (unlikely(map_chg > map_commit)) {
/*
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
@@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_subpool_put:
- if (chg || avoid_reserve)
+ if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
@@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void)
}
kobject_put(hugepages_kobj);
- kfree(htlb_fault_mutex_table);
+ kfree(hugetlb_fault_mutex_table);
}
module_exit(hugetlb_exit);
@@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void)
#else
num_fault_mutexes = 1;
#endif
- htlb_fault_mutex_table =
+ hugetlb_fault_mutex_table =
kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
- BUG_ON(!htlb_fault_mutex_table);
+ BUG_ON(!hugetlb_fault_mutex_table);
for (i = 0; i < num_fault_mutexes; i++)
- mutex_init(&htlb_fault_mutex_table[i]);
+ mutex_init(&hugetlb_fault_mutex_table[i]);
return 0;
}
module_init(hugetlb_init);
@@ -3147,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
return page != NULL;
}
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t idx)
+{
+ struct inode *inode = mapping->host;
+ struct hstate *h = hstate_inode(inode);
+ int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+
+ if (err)
+ return err;
+ ClearPagePrivate(page);
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += blocks_per_huge_page(h);
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
unsigned long address, pte_t *ptep, unsigned int flags)
@@ -3194,21 +3439,13 @@ retry:
set_page_huge_active(page);
if (vma->vm_flags & VM_MAYSHARE) {
- int err;
- struct inode *inode = mapping->host;
-
- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ int err = huge_add_to_page_cache(page, mapping, idx);
if (err) {
put_page(page);
if (err == -EEXIST)
goto retry;
goto out;
}
- ClearPagePrivate(page);
-
- spin_lock(&inode->i_lock);
- inode->i_blocks += blocks_per_huge_page(h);
- spin_unlock(&inode->i_lock);
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -3236,11 +3473,14 @@ retry:
* any allocations necessary to record that reservation occur outside
* the spinlock.
*/
- if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
if (vma_needs_reservation(h, vma, address) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, address);
+ }
ptl = huge_pte_lockptr(h, mm, ptep);
spin_lock(ptl);
@@ -3280,7 +3520,7 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx, unsigned long address)
@@ -3305,7 +3545,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx, unsigned long address)
@@ -3353,8 +3593,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
- mutex_lock(&htlb_fault_mutex_table[hash]);
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
@@ -3387,6 +3627,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = VM_FAULT_OOM;
goto out_mutex;
}
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, address);
if (!(vma->vm_flags & VM_MAYSHARE))
pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3437,7 +3679,7 @@ out_ptl:
put_page(pagecache_page);
}
out_mutex:
- mutex_unlock(&htlb_fault_mutex_table[hash]);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -3726,12 +3968,15 @@ int hugetlb_reserve_pages(struct inode *inode,
}
return 0;
out_err:
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
+ region_abort(resv_map, from, to);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
}
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+ long freed)
{
struct hstate *h = hstate_inode(inode);
struct resv_map *resv_map = inode_resv_map(inode);
@@ -3739,8 +3984,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
- if (resv_map)
- chg = region_truncate(resv_map, offset);
+ if (resv_map) {
+ chg = region_del(resv_map, start, end);
+ /*
+ * region_del() can fail in the rare case where a region
+ * must be split and another region descriptor can not be
+ * allocated. If end == LONG_MAX, it will not fail.
+ */
+ if (chg < 0)
+ return chg;
+ }
+
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
@@ -3751,6 +4005,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
*/
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
+
+ return 0;
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index bf73ac17dad4..aeba0edd6e44 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -58,7 +58,7 @@ inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
put_out:
- put_page(p);
+ put_hwpoison_page(p);
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 1195dd2d6a2b..bc0fa9a69e46 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -182,6 +182,7 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
+ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
int order; /* order a direct compactor needs */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index cf79f110157c..f532f6a37b55 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -838,6 +838,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
}
if (crt_early_log >= ARRAY_SIZE(early_log)) {
+ crt_early_log++;
kmemleak_disable();
return;
}
@@ -1882,7 +1883,7 @@ void __init kmemleak_init(void)
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
- if (crt_early_log >= ARRAY_SIZE(early_log))
+ if (crt_early_log > ARRAY_SIZE(early_log))
pr_warning("Early log buffer exceeded (%d), please increase "
"DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 909eca2c820e..e1da19fac1b3 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -99,8 +99,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
struct list_lru_one *l;
spin_lock(&nlru->lock);
- l = list_lru_from_kmem(nlru, item);
if (list_empty(item)) {
+ l = list_lru_from_kmem(nlru, item);
list_add_tail(item, &l->list);
l->nr_items++;
spin_unlock(&nlru->lock);
@@ -118,8 +118,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
struct list_lru_one *l;
spin_lock(&nlru->lock);
- l = list_lru_from_kmem(nlru, item);
if (!list_empty(item)) {
+ l = list_lru_from_kmem(nlru, item);
list_del_init(item);
l->nr_items--;
spin_unlock(&nlru->lock);
diff --git a/mm/madvise.c b/mm/madvise.c
index ce3a4222c7e7..c889fcbb530e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -301,7 +301,7 @@ static long madvise_remove(struct vm_area_struct *vma,
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
- if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
+ if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
f = vma->vm_file;
diff --git a/mm/memblock.c b/mm/memblock.c
index 95ce68c6da8a..1c7b647e5897 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
-static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size)
{
unsigned long i;
@@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
break;
}
- return (i < type->cnt) ? i : -1;
+ return i < type->cnt;
}
/*
@@ -569,6 +569,7 @@ repeat:
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
+ WARN_ON(flags != rgn->flags);
nr_new++;
if (insert)
memblock_insert_region(type, i++, base,
@@ -614,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *_rgn = &memblock.memory;
+ struct memblock_type *type = &memblock.memory;
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(_rgn, base, size, nid, flags);
+ return memblock_add_range(type, base, size, nid, flags);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -761,7 +762,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
*
* This function isolates region [@base, @base + @size), and sets/clears flag
*
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
*/
static int __init_memblock memblock_setclr_flag(phys_addr_t base,
phys_addr_t size, int set, int flag)
@@ -788,7 +789,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
{
@@ -800,7 +801,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
*/
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
{
@@ -812,7 +813,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
{
@@ -834,10 +835,10 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
phys_addr_t *out_start,
phys_addr_t *out_end)
{
- struct memblock_type *rsv = &memblock.reserved;
+ struct memblock_type *type = &memblock.reserved;
- if (*idx >= 0 && *idx < rsv->cnt) {
- struct memblock_region *r = &rsv->regions[*idx];
+ if (*idx >= 0 && *idx < type->cnt) {
+ struct memblock_region *r = &type->regions[*idx];
phys_addr_t base = r->base;
phys_addr_t size = r->size;
@@ -975,7 +976,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
* in type_b.
*
* @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
@@ -1565,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
* Check if the region [@base, @base+@size) intersects a reserved memory block.
*
* RETURNS:
- * 0 if false, non-zero if true
+ * True if they intersect, false if not.
*/
-int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
memblock_cap_size(base, &size);
- return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+ return memblock_overlaps_region(&memblock.reserved, base, size);
}
void __init_memblock memblock_trim_memory(phys_addr_t align)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1af057575ce9..1742a2db89c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
"unevictable",
};
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
- MEM_CGROUP_TARGET_THRESH,
- MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_TARGET_NUMAINFO,
- MEM_CGROUP_NTARGETS,
-};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024
-struct mem_cgroup_stat_cpu {
- long count[MEM_CGROUP_STAT_NSTATS];
- unsigned long events[MEMCG_NR_EVENTS];
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
- struct mem_cgroup *position;
- /* scan generation, increased every round-trip */
- unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
- struct lruvec lruvec;
- unsigned long lru_size[NR_LRU_LISTS];
-
- struct reclaim_iter iter[DEF_PRIORITY + 1];
-
- struct rb_node tree_node; /* RB tree node */
- unsigned long usage_in_excess;/* Set to the value by which */
- /* the soft limit is exceeded*/
- bool on_tree;
- struct mem_cgroup *memcg; /* Back pointer, we cannot */
- /* use container_of */
-};
-
-struct mem_cgroup_per_node {
- struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-struct mem_cgroup_threshold {
- struct eventfd_ctx *eventfd;
- unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
- /* An array index points to threshold just below or equal to usage. */
- int current_threshold;
- /* Size of entries[] */
- unsigned int size;
- /* Array of thresholds */
- struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
- /* Primary thresholds array */
- struct mem_cgroup_threshold_ary *primary;
- /*
- * Spare threshold array.
- * This is needed to make mem_cgroup_unregister_event() "never fail".
- * It must be able to store at least primary->size - 1 entries.
- */
- struct mem_cgroup_threshold_ary *spare;
-};
-
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
- struct cgroup_subsys_state css;
-
- /* Accounted resources */
- struct page_counter memory;
- struct page_counter memsw;
- struct page_counter kmem;
-
- /* Normal memory consumption range */
- unsigned long low;
- unsigned long high;
-
- unsigned long soft_limit;
-
- /* vmpressure notifications */
- struct vmpressure vmpressure;
-
- /* css_online() has been completed */
- int initialized;
-
- /*
- * Should the accounting and control be hierarchical, per subtree?
- */
- bool use_hierarchy;
-
- /* protected by memcg_oom_lock */
- bool oom_lock;
- int under_oom;
-
- int swappiness;
- /* OOM-Killer disable */
- int oom_kill_disable;
-
- /* protect arrays of thresholds */
- struct mutex thresholds_lock;
-
- /* thresholds for memory usage. RCU-protected */
- struct mem_cgroup_thresholds thresholds;
-
- /* thresholds for mem+swap usage. RCU-protected */
- struct mem_cgroup_thresholds memsw_thresholds;
-
- /* For oom notifier event fd */
- struct list_head oom_notify;
-
- /*
- * Should we move charges of a task when a task is moved into this
- * mem_cgroup ? And what type of charges should we move ?
- */
- unsigned long move_charge_at_immigrate;
- /*
- * set > 0 if pages under this cgroup are moving to other cgroup.
- */
- atomic_t moving_account;
- /* taken only while moving_account > 0 */
- spinlock_t move_lock;
- struct task_struct *move_lock_task;
- unsigned long move_lock_flags;
- /*
- * percpu counter.
- */
- struct mem_cgroup_stat_cpu __percpu *stat;
- spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
- /* Index in the kmem_cache->memcg_params.memcg_caches array */
- int kmemcg_id;
- bool kmem_acct_activated;
- bool kmem_acct_active;
-#endif
-
- int last_scanned_node;
-#if MAX_NUMNODES > 1
- nodemask_t scan_nodes;
- atomic_t numainfo_events;
- atomic_t numainfo_updating;
-#endif
-
-#ifdef CONFIG_CGROUP_WRITEBACK
- struct list_head cgwb_list;
- struct wb_domain cgwb_domain;
-#endif
-
- /* List of events which userspace want to receive */
- struct list_head event_list;
- spinlock_t event_list_lock;
-
- struct mem_cgroup_per_node *nodeinfo[0];
- /* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
- return memcg->kmem_acct_active;
-}
-#endif
-
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
*/
static DEFINE_MUTEX(memcg_create_mutex);
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
- return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (!mem_cgroup_is_root(memcg) &&
- memcg_proto_active(cg_proto) &&
+ if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
css_tryget_online(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
@@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
-{
- return &memcg->css;
-}
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -876,14 +686,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- struct mem_cgroup_per_zone *mz;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- return mz->lru_size[lru];
-}
-
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid,
unsigned int lru_mask)
@@ -986,6 +788,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
+EXPORT_SYMBOL(mem_cgroup_from_task);
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
@@ -1031,7 +834,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct reclaim_iter *uninitialized_var(iter);
+ struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
@@ -1173,30 +976,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- goto out;
-
- switch (idx) {
- case PGFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
- break;
- case PGMAJFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
- break;
- default:
- BUG();
- }
-out:
- rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
@@ -1295,15 +1074,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
VM_BUG_ON((long)(*lru_size) < 0);
}
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
- if (root == memcg)
- return true;
- if (!root->use_hierarchy)
- return false;
- return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
struct mem_cgroup *task_memcg;
@@ -1330,39 +1100,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
return ret;
}
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
- unsigned long inactive_ratio;
- unsigned long inactive;
- unsigned long active;
- unsigned long gb;
-
- inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
- active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
-
- return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
- struct mem_cgroup_per_zone *mz;
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return true;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- memcg = mz->memcg;
-
- return !!(memcg->css.flags & CSS_ONLINE);
-}
-
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1394,15 +1131,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
return margin;
}
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
- /* root ? */
- if (mem_cgroup_disabled() || !memcg->css.parent)
- return vm_swappiness;
-
- return memcg->swappiness;
-}
-
/*
* A routine for checking "mem" is under move_account() or not.
*
@@ -1545,6 +1273,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
@@ -1563,7 +1297,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
goto unlock;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+ check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1571,8 +1305,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(task, totalpages, NULL,
- false)) {
+ switch (oom_scan_process_thread(&oc, task, totalpages)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
@@ -1610,8 +1343,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (chosen) {
points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages,
- memcg, NULL, "Memory cgroup out of memory");
+ oom_kill_process(&oc, chosen, points, totalpages, memcg,
+ "Memory cgroup out of memory");
}
unlock:
mutex_unlock(&oom_lock);
@@ -2062,23 +1795,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(mem_cgroup_end_page_stat);
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx, int val)
-{
- VM_BUG_ON(!rcu_read_lock_held());
-
- if (memcg)
- this_cpu_add(memcg->stat->count[idx], val);
-}
-
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
@@ -2504,16 +2220,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
css_put_many(&memcg->css, nr_pages);
}
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
- return memcg ? memcg->kmemcg_id : -1;
-}
-
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -5127,10 +4833,12 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
- struct task_struct *p = cgroup_taskset_first(tset);
- int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *from;
+ struct task_struct *p;
+ struct mm_struct *mm;
unsigned long move_flags;
+ int ret = 0;
/*
* We are now commited to this value whatever it is. Changes in this
@@ -5138,36 +4846,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
* So we need to save it, and keep it going.
*/
move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
- if (move_flags) {
- struct mm_struct *mm;
- struct mem_cgroup *from = mem_cgroup_from_task(p);
+ if (!move_flags)
+ return 0;
- VM_BUG_ON(from == memcg);
+ p = cgroup_taskset_first(tset);
+ from = mem_cgroup_from_task(p);
- mm = get_task_mm(p);
- if (!mm)
- return 0;
- /* We move charges only when we move a owner of the mm */
- if (mm->owner == p) {
- VM_BUG_ON(mc.from);
- VM_BUG_ON(mc.to);
- VM_BUG_ON(mc.precharge);
- VM_BUG_ON(mc.moved_charge);
- VM_BUG_ON(mc.moved_swap);
-
- spin_lock(&mc.lock);
- mc.from = from;
- mc.to = memcg;
- mc.flags = move_flags;
- spin_unlock(&mc.lock);
- /* We set mc.moving_task later */
-
- ret = mem_cgroup_precharge_mc(mm);
- if (ret)
- mem_cgroup_clear_mc();
- }
- mmput(mm);
+ VM_BUG_ON(from == memcg);
+
+ mm = get_task_mm(p);
+ if (!mm)
+ return 0;
+ /* We move charges only when we move a owner of the mm */
+ if (mm->owner == p) {
+ VM_BUG_ON(mc.from);
+ VM_BUG_ON(mc.to);
+ VM_BUG_ON(mc.precharge);
+ VM_BUG_ON(mc.moved_charge);
+ VM_BUG_ON(mc.moved_swap);
+
+ spin_lock(&mc.lock);
+ mc.from = from;
+ mc.to = memcg;
+ mc.flags = move_flags;
+ spin_unlock(&mc.lock);
+ /* We set mc.moving_task later */
+
+ ret = mem_cgroup_precharge_mc(mm);
+ if (ret)
+ mem_cgroup_clear_mc();
}
+ mmput(mm);
return ret;
}
@@ -5521,19 +5230,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
};
/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx,
- unsigned int nr)
-{
- this_cpu_add(memcg->stat->events[idx], nr);
-}
-
-/**
* mem_cgroup_low - check if memory consumption is below the normal range
* @root: the highest ancestor to consider
* @memcg: the memory cgroup to check
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1f4446a90cef..eeda6485e76c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -146,7 +146,7 @@ static int hwpoison_filter_task(struct page *p)
if (!mem)
return -EINVAL;
- css = mem_cgroup_css(mem);
+ css = &mem->css;
ino = cgroup_ino(css->cgroup);
css_put(css);
@@ -934,6 +934,27 @@ int get_hwpoison_page(struct page *page)
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
+/**
+ * put_hwpoison_page() - Put refcount for memory error handling:
+ * @page: raw error page (hit by memory error)
+ */
+void put_hwpoison_page(struct page *page)
+{
+ struct page *head = compound_head(page);
+
+ if (PageHuge(head)) {
+ put_page(head);
+ return;
+ }
+
+ if (PageTransHuge(head))
+ if (page != head)
+ put_page(head);
+
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(put_hwpoison_page);
+
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -1100,7 +1121,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
nr_pages = 1 << compound_order(hpage);
else /* normal page or thp */
nr_pages = 1;
- atomic_long_add(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_add(nr_pages);
/*
* We need/can do nothing about count=0 pages.
@@ -1128,7 +1149,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (PageHWPoison(hpage)) {
if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
|| (p != hpage && TestSetPageHWPoison(hpage))) {
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
return 0;
}
@@ -1152,10 +1173,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
else
pr_err("MCE: %#lx: thp split failed\n", pfn);
if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- put_page(p);
- if (p != hpage)
- put_page(hpage);
+ num_poisoned_pages_sub(nr_pages);
+ put_hwpoison_page(p);
return -EBUSY;
}
VM_BUG_ON_PAGE(!page_count(p), p);
@@ -1214,16 +1233,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
return 0;
}
@@ -1237,7 +1256,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
return 0;
}
/*
@@ -1426,6 +1445,22 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
+ if (page_count(page) > 1) {
+ pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
+ return 0;
+ }
+
+ if (page_mapped(page)) {
+ pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
+ return 0;
+ }
+
+ if (page_mapping(page)) {
+ pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+ pfn);
+ return 0;
+ }
+
/*
* unpoison_memory() can encounter thp only when the thp is being
* worked by memory_failure() and the page lock is not held yet.
@@ -1450,7 +1485,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
if (TestClearPageHWPoison(p))
- atomic_long_dec(&num_poisoned_pages);
+ num_poisoned_pages_dec();
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1464,16 +1499,16 @@ int unpoison_memory(unsigned long pfn)
*/
if (TestClearPageHWPoison(page)) {
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
freeit = 1;
if (PageHuge(page))
clear_page_hwpoison_huge_page(page);
}
unlock_page(page);
- put_page(page);
+ put_hwpoison_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
- put_page(page);
+ put_hwpoison_page(page);
return 0;
}
@@ -1486,7 +1521,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
return alloc_huge_page_node(page_hstate(compound_head(p)),
nid);
else
- return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1533,7 +1568,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
/*
* Try to free it.
*/
- put_page(page);
+ put_hwpoison_page(page);
shake_page(page, 1);
/*
@@ -1542,7 +1577,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
ret = __get_any_page(page, pfn, 0);
if (!PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
- put_page(page);
+ put_hwpoison_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
@@ -1565,7 +1600,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
lock_page(hpage);
if (PageHWPoison(hpage)) {
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
return -EBUSY;
}
@@ -1576,7 +1611,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
* get_any_page() and isolate_huge_page() takes a refcount each,
* so need to drop one here.
*/
- put_page(hpage);
+ put_hwpoison_page(hpage);
if (!ret) {
pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
return -EBUSY;
@@ -1600,11 +1635,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
+ num_poisoned_pages_add(1 << compound_order(hpage));
} else {
SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ num_poisoned_pages_inc();
}
}
return ret;
@@ -1625,7 +1659,7 @@ static int __soft_offline_page(struct page *page, int flags)
wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
- put_page(page);
+ put_hwpoison_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
@@ -1640,10 +1674,10 @@ static int __soft_offline_page(struct page *page, int flags)
* would need to fix isolation locking first.
*/
if (ret == 1) {
- put_page(page);
+ put_hwpoison_page(page);
pr_info("soft_offline: %#lx: invalidated\n", pfn);
SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ num_poisoned_pages_inc();
return 0;
}
@@ -1657,14 +1691,12 @@ static int __soft_offline_page(struct page *page, int flags)
* Drop page reference which is came from get_any_page()
* successful isolate_lru_page() already took another one.
*/
- put_page(page);
+ put_hwpoison_page(page);
if (!ret) {
LIST_HEAD(pagelist);
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
- if (!TestSetPageHWPoison(page))
- atomic_long_inc(&num_poisoned_pages);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
@@ -1679,8 +1711,6 @@ static int __soft_offline_page(struct page *page, int flags)
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
- if (TestClearPageHWPoison(page))
- atomic_long_dec(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1719,12 +1749,16 @@ int soft_offline_page(struct page *page, int flags)
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_hwpoison_page(page);
return -EBUSY;
}
if (!PageHuge(page) && PageTransHuge(hpage)) {
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
pr_info("soft offline: %#lx: failed to split THP\n",
pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_hwpoison_page(page);
return -EBUSY;
}
}
@@ -1742,11 +1776,10 @@ int soft_offline_page(struct page *page, int flags)
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
if (!dequeue_hwpoisoned_huge_page(hpage))
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
+ num_poisoned_pages_add(1 << compound_order(hpage));
} else {
if (!TestSetPageHWPoison(page))
- atomic_long_inc(&num_poisoned_pages);
+ num_poisoned_pages_inc();
}
}
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index bb04d8f2f86c..6cd0b2160401 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2426,8 +2426,6 @@ void unmap_mapping_range(struct address_space *mapping,
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
-
- /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -3015,9 +3013,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
/*
* The fault handler has no page to lock, so it holds
- * i_mmap_lock for read to protect against truncate.
+ * i_mmap_lock for write to protect against truncate.
*/
- i_mmap_unlock_read(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
goto uncharge_out;
}
@@ -3031,9 +3029,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
/*
* The fault handler has no page to lock, so it holds
- * i_mmap_lock for read to protect against truncate.
+ * i_mmap_lock for write to protect against truncate.
*/
- i_mmap_unlock_read(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
return ret;
uncharge_out:
@@ -3232,6 +3230,27 @@ out:
return 0;
}
+static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+ if (!vma->vm_ops)
+ return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+ if (vma->vm_ops->pmd_fault)
+ return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
+ unsigned int flags)
+{
+ if (!vma->vm_ops)
+ return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
+ if (vma->vm_ops->pmd_fault)
+ return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ return VM_FAULT_FALLBACK;
+}
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3267,12 +3286,12 @@ static int handle_pte_fault(struct mm_struct *mm,
barrier();
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (vma->vm_ops)
+ if (vma_is_anonymous(vma))
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, flags);
+ else
return do_fault(mm, vma, address, pte, pmd,
flags, entry);
-
- return do_anonymous_page(mm, vma, address, pte, pmd,
- flags);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
@@ -3334,10 +3353,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
- int ret = VM_FAULT_FALLBACK;
- if (!vma->vm_ops)
- ret = do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
+ int ret = create_huge_pmd(mm, vma, address, pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
@@ -3361,8 +3377,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
orig_pmd, pmd);
if (dirty && !pmd_write(orig_pmd)) {
- ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
- orig_pmd);
+ ret = wp_huge_pmd(mm, vma, address, pmd,
+ orig_pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a7f1e0d1d6b8..87a177917cb2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
qp->prev = vma;
- if (vma->vm_flags & VM_PFNMAP)
- return 1;
-
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
@@ -945,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
else
- return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
+ return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE, 0);
}
@@ -2001,7 +1998,7 @@ retry_cpuset:
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
- page = alloc_pages_exact_node(hpage_node,
+ page = __alloc_pages_node(hpage_node,
gfp | __GFP_THISNODE, order);
goto out;
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 2cc08de8b1db..4c533bc51d73 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool)
*/
void mempool_destroy(mempool_t *pool)
{
+ if (unlikely(!pool))
+ return;
+
while (pool->curr_nr) {
void *element = remove_element(pool);
pool->free(element, pool->pool_data);
diff --git a/mm/memtest.c b/mm/memtest.c
index 0a1cc133f6d7..8eaa4c3a5f65 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -1,11 +1,6 @@
#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
#include <linux/init.h>
-#include <linux/pfn.h>
#include <linux/memblock.h>
static u64 patterns[] __initdata = {
@@ -31,10 +26,8 @@ static u64 patterns[] __initdata = {
static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
{
- printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
- (unsigned long long) pattern,
- (unsigned long long) start_bad,
- (unsigned long long) end_bad);
+ pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
+ cpu_to_be64(pattern), &start_bad, &end_bad);
memblock_reserve(start_bad, end_bad - start_bad);
}
@@ -79,26 +72,26 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
if (this_start < this_end) {
- printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
- (unsigned long long)this_start,
- (unsigned long long)this_end,
- (unsigned long long)cpu_to_be64(pattern));
+ pr_info(" %pa - %pa pattern %016llx\n",
+ &this_start, &this_end, cpu_to_be64(pattern));
memtest(pattern, this_start, this_end - this_start);
}
}
}
/* default is disabled */
-static int memtest_pattern __initdata;
+static unsigned int memtest_pattern __initdata;
static int __init parse_memtest(char *arg)
{
+ int ret = 0;
+
if (arg)
- memtest_pattern = simple_strtoul(arg, NULL, 0);
+ ret = kstrtouint(arg, 0, &memtest_pattern);
else
memtest_pattern = ARRAY_SIZE(patterns);
- return 0;
+ return ret;
}
early_param("memtest", parse_memtest);
@@ -111,7 +104,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
if (!memtest_pattern)
return;
- printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+ pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
for (i = memtest_pattern-1; i < UINT_MAX; --i) {
idx = i % ARRAY_SIZE(patterns);
do_one_pass(patterns[idx], start, end);
diff --git a/mm/migrate.c b/mm/migrate.c
index 5c08cab5419e..02ce25df16c2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -880,8 +880,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes or remove ptes */
if (page_mapped(page)) {
try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
- TTU_IGNORE_HWPOISON);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
page_was_mapped = 1;
}
@@ -952,9 +951,11 @@ out:
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
/* Soft-offlined page shouldn't go through lru cache list */
- if (reason == MR_MEMORY_FAILURE)
+ if (reason == MR_MEMORY_FAILURE) {
put_page(page);
- else
+ if (!test_set_page_hwpoison(page))
+ num_poisoned_pages_inc();
+ } else
putback_lru_page(page);
}
@@ -1194,7 +1195,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
return alloc_huge_page_node(page_hstate(compound_head(p)),
pm->node);
else
- return alloc_pages_exact_node(pm->node,
+ return __alloc_pages_node(pm->node,
GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}
@@ -1554,7 +1555,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
int nid = (int) data;
struct page *newpage;
- newpage = alloc_pages_exact_node(nid,
+ newpage = __alloc_pages_node(nid,
(GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE | __GFP_NOMEMALLOC |
__GFP_NORETRY | __GFP_NOWARN) &
diff --git a/mm/mmap.c b/mm/mmap.c
index 82db4fc0a9d3..b6be3249f0a9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2455,7 +2455,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
- int err = -ENOMEM;
+ int err;
if (is_vm_hugetlb_page(vma) && (addr &
~(huge_page_mask(hstate_vma(vma)))))
@@ -2463,7 +2463,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
- goto out_err;
+ return -ENOMEM;
/* most fields are the same, copy all, and then fixup */
*new = *vma;
@@ -2511,7 +2511,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
mpol_put(vma_policy(new));
out_free_vma:
kmem_cache_free(vm_area_cachep, new);
- out_err:
return err;
}
@@ -2872,6 +2871,13 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
struct vm_area_struct *prev;
struct rb_node **rb_link, *rb_parent;
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
+ return -ENOMEM;
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, vma_pages(vma)))
+ return -ENOMEM;
+
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
* until its first write fault, when page's anon_vma and index
@@ -2884,16 +2890,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
* using the existing file pgoff checks and manipulations.
* Similarly in do_mmap_pgoff and in do_brk.
*/
- if (!vma->vm_file) {
+ if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
- return -ENOMEM;
- if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, vma_pages(vma)))
- return -ENOMEM;
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
@@ -2918,7 +2918,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
- if (unlikely(!vma->vm_file && !vma->anon_vma)) {
+ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
pgoff = addr >> PAGE_SHIFT;
faulted_in_anon_vma = false;
}
@@ -2952,30 +2952,31 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
} else {
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new_vma) {
- *new_vma = *vma;
- new_vma->vm_start = addr;
- new_vma->vm_end = addr + len;
- new_vma->vm_pgoff = pgoff;
- if (vma_dup_policy(vma, new_vma))
- goto out_free_vma;
- INIT_LIST_HEAD(&new_vma->anon_vma_chain);
- if (anon_vma_clone(new_vma, vma))
- goto out_free_mempol;
- if (new_vma->vm_file)
- get_file(new_vma->vm_file);
- if (new_vma->vm_ops && new_vma->vm_ops->open)
- new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
- *need_rmap_locks = false;
- }
+ if (!new_vma)
+ goto out;
+ *new_vma = *vma;
+ new_vma->vm_start = addr;
+ new_vma->vm_end = addr + len;
+ new_vma->vm_pgoff = pgoff;
+ if (vma_dup_policy(vma, new_vma))
+ goto out_free_vma;
+ INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+ if (anon_vma_clone(new_vma, vma))
+ goto out_free_mempol;
+ if (new_vma->vm_file)
+ get_file(new_vma->vm_file);
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+ new_vma->vm_ops->open(new_vma);
+ vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ *need_rmap_locks = false;
}
return new_vma;
- out_free_mempol:
+out_free_mempol:
mpol_put(vma_policy(new_vma));
- out_free_vma:
+out_free_vma:
kmem_cache_free(vm_area_cachep, new_vma);
+out:
return NULL;
}
@@ -3027,21 +3028,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
pgoff_t pgoff;
struct page **pages;
- /*
- * special mappings have no vm_file, and in that case, the mm
- * uses vm_pgoff internally. So we have to subtract it from here.
- * We are allowed to do this because we are the mm; do not copy
- * this code into drivers!
- */
- pgoff = vmf->pgoff - vma->vm_pgoff;
-
if (vma->vm_ops == &legacy_special_mapping_vmops)
pages = vma->vm_private_data;
else
pages = ((struct vm_special_mapping *)vma->vm_private_data)->
pages;
- for (; pgoff && *pages; ++pages)
+ for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
if (*pages) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dff991e0681e..1ecc0bcaecc5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* Determine the type of allocation constraint.
*/
#ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
struct zone *zone;
struct zoneref *z;
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;
/* Default to all available memory */
*totalpages = totalram_pages + total_swap_pages;
- if (!zonelist)
+ if (!oc->zonelist)
return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
*/
- if (gfp_mask & __GFP_THISNODE)
+ if (oc->gfp_mask & __GFP_THISNODE)
return CONSTRAINT_NONE;
/*
@@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+ if (oc->nodemask &&
+ !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
*totalpages = total_swap_pages;
- for_each_node_mask(nid, *nodemask)
+ for_each_node_mask(nid, *oc->nodemask)
*totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
/* Check this allocation failure is caused by cpuset's wall function */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask)
- if (!cpuset_zone_allowed(zone, gfp_mask))
+ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
+ high_zoneidx, oc->nodemask)
+ if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;
if (cpuset_limited) {
@@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
return CONSTRAINT_NONE;
}
#else
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
*totalpages = totalram_pages + total_swap_pages;
return CONSTRAINT_NONE;
}
#endif
-enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages)
{
- if (oom_unkillable_task(task, NULL, nodemask))
+ if (oom_unkillable_task(task, NULL, oc->nodemask))
return OOM_SCAN_CONTINUE;
/*
@@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!force_kill)
+ if (oc->order != -1)
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && !force_kill)
+ if (task_will_free_mem(task) && oc->order != -1)
return OOM_SCAN_ABORT;
return OOM_SCAN_OK;
@@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. Returns -1 on scan abort.
- *
- * (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned int *ppoints,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+static struct task_struct *select_bad_process(struct oom_control *oc,
+ unsigned int *ppoints, unsigned long totalpages)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
for_each_process_thread(g, p) {
unsigned int points;
- switch (oom_scan_process_thread(p, totalpages, nodemask,
- force_kill)) {
+ switch (oom_scan_process_thread(oc, p, totalpages)) {
case OOM_SCAN_SELECT:
chosen = p;
chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
case OOM_SCAN_OK:
break;
};
- points = oom_badness(p, NULL, nodemask, totalpages);
+ points = oom_badness(p, NULL, oc->nodemask, totalpages);
if (!points || points < chosen_points)
continue;
/* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
rcu_read_unlock();
}
-static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_header(struct oom_control *oc, struct task_struct *p,
+ struct mem_cgroup *memcg)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_score_adj=%hd\n",
- current->comm, gfp_mask, order,
+ current->comm, oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
@@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
else
show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
- dump_tasks(memcg, nodemask);
+ dump_tasks(memcg, oc->nodemask);
}
/*
@@ -487,10 +481,9 @@ void oom_killer_enable(void)
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
-void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message)
+ struct mem_cgroup *memcg, const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
@@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
task_unlock(p);
if (__ratelimit(&oom_rs))
- dump_header(p, gfp_mask, order, memcg, nodemask);
+ dump_header(oc, p, memcg);
task_lock(p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, memcg, nodemask,
+ child_points = oom_badness(child, memcg, oc->nodemask,
totalpages);
if (child_points > victim_points) {
put_task_struct(victim);
@@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, memcg, nodemask);
+ /* Do not panic for oom kills triggered by sysrq */
+ if (oc->order == -1)
+ return;
+ dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
/**
- * __out_of_memory - kill the "best" process when we run out of memory
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @oc: pointer to struct oom_control
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct oom_control *oc)
{
- const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
unsigned int uninitialized_var(points);
enum oom_constraint constraint = CONSTRAINT_NONE;
- int killed = 0;
if (oom_killer_disabled)
return false;
@@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
- goto out;
+ return true;
/*
* If current has a pending SIGKILL or is exiting, then automatically
@@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
if (current->mm &&
(fatal_signal_pending(current) || task_will_free_mem(current))) {
mark_oom_victim(current);
- goto out;
+ return true;
}
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
- &totalpages);
- mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
+ constraint = constrained_alloc(oc, &totalpages);
+ if (constraint != CONSTRAINT_MEMORY_POLICY)
+ oc->nodemask = NULL;
+ check_panic_on_oom(oc, constraint, NULL);
if (sysctl_oom_kill_allocating_task && current->mm &&
- !oom_unkillable_task(current, NULL, nodemask) &&
+ !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
- nodemask,
+ oom_kill_process(oc, current, 0, totalpages, NULL,
"Out of memory (oom_kill_allocating_task)");
- goto out;
+ return true;
}
- p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+ p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+ if (!p && oc->order != -1) {
+ dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (p != (void *)-1UL) {
- oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
- nodemask, "Out of memory");
- killed = 1;
- }
-out:
- /*
- * Give the killed threads a good chance of exiting before trying to
- * allocate memory again.
- */
- if (killed)
+ if (p && p != (void *)-1UL) {
+ oom_kill_process(oc, p, points, totalpages, NULL,
+ "Out of memory");
+ /*
+ * Give the killed process a good chance to exit before trying
+ * to allocate memory again.
+ */
schedule_timeout_killable(1);
-
+ }
return true;
}
@@ -728,13 +711,20 @@ out:
*/
void pagefault_out_of_memory(void)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = 0,
+ .order = 0,
+ };
+
if (mem_cgroup_oom_synchronize(true))
return;
if (!mutex_trylock(&oom_lock))
return;
- if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+ if (!out_of_memory(&oc)) {
/*
* There shouldn't be any user tasks runnable while the
* OOM killer is disabled, so the current task has to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b401d40cb4fd..48aaf7b9f253 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+ return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+ page->index = migratetype;
+}
+
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
@@ -791,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
- mt = get_freepage_migratetype(page);
+
+ mt = get_pcppage_migratetype(page);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
if (unlikely(has_isolate_pageblock(zone)))
mt = get_pageblock_migratetype(page);
@@ -955,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- set_freepage_migratetype(page, migratetype);
free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}
@@ -1383,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
return page;
}
@@ -1460,7 +1481,6 @@ int move_freepages(struct zone *zone,
order = page_order(page);
list_move(&page->lru,
&zone->free_area[order].free_list[migratetype]);
- set_freepage_migratetype(page, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -1630,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
expand(zone, page, order, current_order, area,
start_migratetype);
/*
- * The freepage_migratetype may differ from pageblock's
+ * The pcppage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
+ * find_suitable_fallback(). This is OK as long as it does not
+ * differ for MIGRATE_CMA pageblocks. Those can be used as
+ * fallback only via special __rmqueue_cma_fallback() function
*/
- set_freepage_migratetype(page, start_migratetype);
+ set_pcppage_migratetype(page, start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -1713,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
else
list_add_tail(&page->lru, list);
list = &page->lru;
- if (is_migrate_cma(get_freepage_migratetype(page)))
+ if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1910,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold)
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
local_irq_save(flags);
__count_vm_event(PGFREE);
@@ -2115,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
- get_freepage_migratetype(page));
+ get_pcppage_migratetype(page));
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -2696,6 +2715,12 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
+ struct oom_control oc = {
+ .zonelist = ac->zonelist,
+ .nodemask = ac->nodemask,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct page *page;
*did_some_progress = 0;
@@ -2747,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
- || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
mutex_unlock(&oom_lock);
@@ -3490,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact);
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
- * Note this is not alloc_pages_exact_node() which allocates on a specific node,
- * but is not exact.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
@@ -5066,7 +5088,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
{
unsigned long zone_start_pfn, zone_end_pfn;
- /* When hotadd a new node, the node should be empty */
+ /* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
@@ -5133,7 +5155,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
- /* When hotadd a new node, the node should be empty */
+ /* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
@@ -5306,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
*
* NOTE: pgdat should get zeroed by caller.
*/
-static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long node_start_pfn, unsigned long node_end_pfn)
+static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
@@ -5458,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
+ (u64)start_pfn << PAGE_SHIFT,
+ end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5470,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, start_pfn, end_pfn);
+ free_area_init_core(pgdat);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5481,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
*/
void __init setup_nr_node_ids(void)
{
- unsigned int node;
- unsigned int highest = 0;
+ unsigned int highest;
- for_each_node_mask(node, node_possible_map)
- highest = node;
+ highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
nr_node_ids = highest + 1;
}
#endif
@@ -6006,7 +6026,7 @@ void __init mem_init_print_info(const char *str)
* set_dma_reserve - set the specified number of pages reserved in the first zone
* @new_dma_reserve: The number of pages to mark reserved
*
- * The per-cpu batchsize and zone watermarks are determined by present_pages.
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
* In the DMA zone, a significant percentage may be consumed by kernel image
* and other unfreeable allocations which can skew the watermarks badly. This
* function may optionally be used to account for unfreeable pages in the
@@ -6059,7 +6079,7 @@ void __init page_alloc_init(void)
}
/*
- * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
+ * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
* or min_free_kbytes changes.
*/
static void calculate_totalreserve_pages(void)
@@ -6103,7 +6123,7 @@ static void calculate_totalreserve_pages(void)
/*
* setup_per_zone_lowmem_reserve - called whenever
- * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
+ * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
* has a correct pages reserved value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
*/
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 303c908790ef..4568fd58f70a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,7 +9,8 @@
#include <linux/hugetlb.h>
#include "internal.h"
-int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
+static int set_migratetype_isolate(struct page *page,
+ bool skip_hwpoisoned_pages)
{
struct zone *zone;
unsigned long flags, pfn;
@@ -72,7 +73,7 @@ out:
return ret;
}
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
unsigned long flags, nr_pages;
@@ -223,34 +224,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
continue;
}
page = pfn_to_page(pfn);
- if (PageBuddy(page)) {
+ if (PageBuddy(page))
/*
- * If race between isolatation and allocation happens,
- * some free pages could be in MIGRATE_MOVABLE list
- * although pageblock's migratation type of the page
- * is MIGRATE_ISOLATE. Catch it and move the page into
- * MIGRATE_ISOLATE list.
+ * If the page is on a free list, it has to be on
+ * the correct MIGRATE_ISOLATE freelist. There is no
+ * simple way to verify that as VM_BUG_ON(), though.
*/
- if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
- struct page *end_page;
-
- end_page = page + (1 << page_order(page)) - 1;
- move_freepages(page_zone(page), page, end_page,
- MIGRATE_ISOLATE);
- }
pfn += 1 << page_order(page);
- }
- else if (page_count(page) == 0 &&
- get_freepage_migratetype(page) == MIGRATE_ISOLATE)
- pfn += 1;
- else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
- /*
- * The HWPoisoned page may be not in buddy
- * system, and page_count() is not 0.
- */
+ else if (skip_hwpoisoned_pages && PageHWPoison(page))
+ /* A HWPoisoned page cannot be also PageBuddy */
pfn++;
- continue;
- }
else
break;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index dbe0c1e8349c..48ce82926d93 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -542,6 +542,21 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
+static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ spin_lock(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+
+ generic_fillattr(inode, stat);
+
+ return 0;
+}
+
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -3122,6 +3137,7 @@ static const struct file_operations shmem_file_operations = {
};
static const struct inode_operations shmem_inode_operations = {
+ .getattr = shmem_getattr,
.setattr = shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
.setxattr = shmem_setxattr,
diff --git a/mm/slab.c b/mm/slab.c
index 60c936938b84..c77ebe6cc87c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1595,7 +1595,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
if (memcg_charge_slab(cachep, flags, cachep->gfporder))
return NULL;
- page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+ page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page) {
memcg_uncharge_slab(cachep, cachep->gfporder);
slab_out_of_memory(cachep, flags, nodeid);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c26829fe4e37..5ce4faeb16fb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
struct kmem_cache *root_cache)
{
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
- struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+ struct cgroup_subsys_state *css = &memcg->css;
struct memcg_cache_array *arr;
struct kmem_cache *s = NULL;
char *cache_name;
@@ -640,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
bool need_rcu_barrier = false;
bool busy = false;
+ if (unlikely(!s))
+ return;
+
BUG_ON(!is_root_cache(s));
get_online_cpus();
diff --git a/mm/slob.c b/mm/slob.c
index 165bbd3cd606..0d7e5df74d1f 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -45,7 +45,7 @@
* NUMA support in SLOB is fairly simplistic, pushing most of the real
* logic down to the page allocator, and simply doing the node accounting
* on the upper levels. In the event that a node id is explicitly
- * provided, alloc_pages_exact_node() with the specified node id is used
+ * provided, __alloc_pages_node() with the specified node id is used
* instead. The common case (or when the node id isn't explicitly provided)
* will default to the current node, as per numa_node_id().
*
@@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
#ifdef CONFIG_NUMA
if (node != NUMA_NO_NODE)
- page = alloc_pages_exact_node(node, gfp, order);
+ page = __alloc_pages_node(node, gfp, order);
else
#endif
page = alloc_pages(gfp, order);
diff --git a/mm/slub.c b/mm/slub.c
index 084184e706c6..f614b5dc396b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1334,7 +1334,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
if (node == NUMA_NO_NODE)
page = alloc_pages(flags, order);
else
- page = alloc_pages_exact_node(node, flags, order);
+ page = __alloc_pages_node(node, flags, order);
if (!page)
memcg_uncharge_slab(s, order);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8bc8e66138da..d504adb7fa5f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry)
return page;
}
-/*
- * Locate a page of swap in physical memory, reserving swap cache space
- * and reading the disk if it is not already cached.
- * A failure return means that either the page allocation failed or that
- * the swap entry is no longer in use.
- */
-struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr)
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool *new_page_allocated)
{
struct page *found_page, *new_page = NULL;
+ struct address_space *swapper_space = swap_address_space(entry);
int err;
+ *new_page_allocated = false;
do {
/*
@@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swap_address_space(entry),
- entry.val);
+ found_page = find_get_page(swapper_space, entry.val);
if (found_page)
break;
@@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* Initiate read into locked page and return.
*/
lru_cache_add_anon(new_page);
- swap_readpage(new_page);
+ *new_page_allocated = true;
return new_page;
}
radix_tree_preload_end();
@@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return found_page;
}
+/*
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ bool page_was_allocated;
+ struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
+ vma, addr, &page_was_allocated);
+
+ if (page_was_allocated)
+ swap_readpage(retpage);
+
+ return retpage;
+}
+
static unsigned long swapin_nr_pages(unsigned long offset)
{
static unsigned long prev_offset;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index aebc2dd6e649..58877312cf6b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -875,6 +875,48 @@ int page_swapcount(struct page *page)
}
/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns exact answer.
+ */
+int swp_swapcount(swp_entry_t entry)
+{
+ int count, tmp_count, n;
+ struct swap_info_struct *p;
+ struct page *page;
+ pgoff_t offset;
+ unsigned char *map;
+
+ p = swap_info_get(entry);
+ if (!p)
+ return 0;
+
+ count = swap_count(p->swap_map[swp_offset(entry)]);
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ count &= ~COUNT_CONTINUED;
+ n = SWAP_MAP_MAX + 1;
+
+ offset = swp_offset(entry);
+ page = vmalloc_to_page(p->swap_map + offset);
+ offset &= ~PAGE_MASK;
+ VM_BUG_ON(page_private(page) != SWP_CONTINUED);
+
+ do {
+ page = list_entry(page->lru.next, struct page, lru);
+ map = kmap_atomic(page);
+ tmp_count = map[offset];
+ kunmap_atomic(map);
+
+ count += (tmp_count & ~COUNT_CONTINUED) * n;
+ n *= (SWAP_CONT_MAX + 1);
+ } while (tmp_count & COUNT_CONTINUED);
+out:
+ spin_unlock(&p->lock);
+ return count;
+}
+
+/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1139039122a..2d978b28a410 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
if (!memcg)
return true;
#ifdef CONFIG_CGROUP_WRITEBACK
- if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+ if (memcg->css.cgroup)
return true;
#endif
return false;
@@ -985,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * 3) Legacy memcg encounters a page that is not already marked
+ * 3) Legacy memcg encounters a page that is already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
* pages are in writeback and there is nothing else to
@@ -1015,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
SetPageReclaim(page);
nr_writeback++;
-
goto keep_locked;
/* Case 3 above */
} else {
+ unlock_page(page);
wait_on_page_writeback(page);
+ /* then go back and try same page again */
+ list_add_tail(&page->lru, page_list);
+ continue;
}
}
@@ -1196,7 +1199,7 @@ cull_mlocked:
if (PageSwapCache(page))
try_to_free_swap(page);
unlock_page(page);
- putback_lru_page(page);
+ list_add(&page->lru, &ret_pages);
continue;
activate_locked:
@@ -1359,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long scan;
- for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+ for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
+ !list_empty(src); scan++) {
struct page *page;
int nr_pages;
diff --git a/mm/zbud.c b/mm/zbud.c
index f3bf6f7627d8..fa48bcdff9d5 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -96,10 +96,10 @@ struct zbud_pool {
struct list_head buddied;
struct list_head lru;
u64 pages_nr;
- struct zbud_ops *ops;
+ const struct zbud_ops *ops;
#ifdef CONFIG_ZPOOL
struct zpool *zpool;
- struct zpool_ops *zpool_ops;
+ const struct zpool_ops *zpool_ops;
#endif
};
@@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
return -ENOENT;
}
-static struct zbud_ops zbud_zpool_ops = {
+static const struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
static void *zbud_zpool_create(char *name, gfp_t gfp,
- struct zpool_ops *zpool_ops,
+ const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
struct zbud_pool *pool;
@@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
* Return: pointer to the new zbud pool or NULL if the metadata allocation
* failed.
*/
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
{
struct zbud_pool *pool;
int i;
diff --git a/mm/zpool.c b/mm/zpool.c
index 722a4f60e90b..68d2dd8ed2d8 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -22,7 +22,7 @@ struct zpool {
struct zpool_driver *driver;
void *pool;
- struct zpool_ops *ops;
+ const struct zpool_ops *ops;
struct list_head list;
};
@@ -115,7 +115,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
* Returns: New zpool on success, NULL on failure.
*/
struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
- struct zpool_ops *ops)
+ const struct zpool_ops *ops)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -320,20 +320,6 @@ u64 zpool_get_total_size(struct zpool *zpool)
return zpool->driver->total_size(zpool->pool);
}
-static int __init init_zpool(void)
-{
- pr_info("loaded\n");
- return 0;
-}
-
-static void __exit exit_zpool(void)
-{
- pr_info("unloaded\n");
-}
-
-module_init(init_zpool);
-module_exit(exit_zpool);
-
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0a7f81aa2249..f135b1b6fcdc 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -169,14 +169,12 @@ enum zs_stat_type {
NR_ZS_STAT_TYPE,
};
-#ifdef CONFIG_ZSMALLOC_STAT
-
-static struct dentry *zs_stat_root;
-
struct zs_size_stat {
unsigned long objs[NR_ZS_STAT_TYPE];
};
+#ifdef CONFIG_ZSMALLOC_STAT
+static struct dentry *zs_stat_root;
#endif
/*
@@ -201,6 +199,8 @@ static int zs_size_classes;
static const int fullness_threshold_frac = 4;
struct size_class {
+ spinlock_t lock;
+ struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
/*
* Size of objects stored in this class. Must be multiple
* of ZS_ALIGN.
@@ -210,16 +210,10 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
- /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
- bool huge;
-
-#ifdef CONFIG_ZSMALLOC_STAT
struct zs_size_stat stats;
-#endif
-
- spinlock_t lock;
- struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+ /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+ bool huge;
};
/*
@@ -251,6 +245,15 @@ struct zs_pool {
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
+ struct zs_pool_stats stats;
+
+ /* Compact classes */
+ struct shrinker shrinker;
+ /*
+ * To signify that register_shrinker() was successful
+ * and unregister_shrinker() will not Oops.
+ */
+ bool shrinker_enabled;
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
@@ -285,8 +288,7 @@ static int create_handle_cache(struct zs_pool *pool)
static void destroy_handle_cache(struct zs_pool *pool)
{
- if (pool->handle_cachep)
- kmem_cache_destroy(pool->handle_cachep);
+ kmem_cache_destroy(pool->handle_cachep);
}
static unsigned long alloc_handle(struct zs_pool *pool)
@@ -309,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
+static void *zs_zpool_create(char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
return zs_create_pool(name, gfp);
@@ -441,8 +444,6 @@ static int get_size_class_index(int size)
return min(zs_size_classes - 1, idx);
}
-#ifdef CONFIG_ZSMALLOC_STAT
-
static inline void zs_stat_inc(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
@@ -461,6 +462,8 @@ static inline unsigned long zs_stat_get(struct size_class *class,
return class->stats.objs[type];
}
+#ifdef CONFIG_ZSMALLOC_STAT
+
static int __init zs_stat_init(void)
{
if (!debugfs_initialized())
@@ -576,23 +579,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
}
#else /* CONFIG_ZSMALLOC_STAT */
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return 0;
-}
-
static int __init zs_stat_init(void)
{
return 0;
@@ -610,7 +596,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
-
#endif
@@ -658,13 +643,22 @@ static void insert_zspage(struct page *page, struct size_class *class,
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
return;
- head = &class->fullness_list[fullness];
- if (*head)
- list_add_tail(&page->lru, &(*head)->lru);
-
- *head = page;
zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
+
+ head = &class->fullness_list[fullness];
+ if (!*head) {
+ *head = page;
+ return;
+ }
+
+ /*
+ * We want to see more ZS_FULL pages and less almost
+ * empty/full. Put pages with higher ->inuse first.
+ */
+ list_add_tail(&page->lru, &(*head)->lru);
+ if (page->inuse >= (*head)->inuse)
+ *head = page;
}
/*
@@ -1495,7 +1489,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_free);
-static void zs_object_copy(unsigned long src, unsigned long dst,
+static void zs_object_copy(unsigned long dst, unsigned long src,
struct size_class *class)
{
struct page *s_page, *d_page;
@@ -1602,8 +1596,6 @@ struct zs_compact_control {
/* Starting object index within @s_page which used for live object
* in the subpage. */
int index;
- /* how many of objects are migrated */
- int nr_migrated;
};
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1614,7 +1606,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
struct page *s_page = cc->s_page;
struct page *d_page = cc->d_page;
unsigned long index = cc->index;
- int nr_migrated = 0;
int ret = 0;
while (1) {
@@ -1636,23 +1627,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
used_obj = handle_to_obj(handle);
free_obj = obj_malloc(d_page, class, handle);
- zs_object_copy(used_obj, free_obj, class);
+ zs_object_copy(free_obj, used_obj, class);
index++;
record_obj(handle, free_obj);
unpin_tag(handle);
obj_free(pool, class, used_obj);
- nr_migrated++;
}
/* Remember last position in this iteration */
cc->s_page = s_page;
cc->index = index;
- cc->nr_migrated = nr_migrated;
return ret;
}
-static struct page *alloc_target_page(struct size_class *class)
+static struct page *isolate_target_page(struct size_class *class)
{
int i;
struct page *page;
@@ -1668,8 +1657,17 @@ static struct page *alloc_target_page(struct size_class *class)
return page;
}
-static void putback_zspage(struct zs_pool *pool, struct size_class *class,
- struct page *first_page)
+/*
+ * putback_zspage - add @first_page into right class's fullness list
+ * @pool: target pool
+ * @class: destination class
+ * @first_page: target page
+ *
+ * Return @fist_page's fullness_group
+ */
+static enum fullness_group putback_zspage(struct zs_pool *pool,
+ struct size_class *class,
+ struct page *first_page)
{
enum fullness_group fullness;
@@ -1687,50 +1685,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class,
free_zspage(first_page);
}
+
+ return fullness;
}
static struct page *isolate_source_page(struct size_class *class)
{
- struct page *page;
+ int i;
+ struct page *page = NULL;
- page = class->fullness_list[ZS_ALMOST_EMPTY];
- if (page)
- remove_zspage(page, class, ZS_ALMOST_EMPTY);
+ for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
+ page = class->fullness_list[i];
+ if (!page)
+ continue;
+
+ remove_zspage(page, class, i);
+ break;
+ }
return page;
}
-static unsigned long __zs_compact(struct zs_pool *pool,
- struct size_class *class)
+/*
+ *
+ * Based on the number of unused allocated objects calculate
+ * and return the number of pages that we can free.
+ */
+static unsigned long zs_can_compact(struct size_class *class)
+{
+ unsigned long obj_wasted;
+
+ obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
+ zs_stat_get(class, OBJ_USED);
+
+ obj_wasted /= get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+
+ return obj_wasted * class->pages_per_zspage;
+}
+
+static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
- int nr_to_migrate;
struct zs_compact_control cc;
struct page *src_page;
struct page *dst_page = NULL;
- unsigned long nr_total_migrated = 0;
spin_lock(&class->lock);
while ((src_page = isolate_source_page(class))) {
BUG_ON(!is_first_page(src_page));
- /* The goal is to migrate all live objects in source page */
- nr_to_migrate = src_page->inuse;
+ if (!zs_can_compact(class))
+ break;
+
cc.index = 0;
cc.s_page = src_page;
- while ((dst_page = alloc_target_page(class))) {
+ while ((dst_page = isolate_target_page(class))) {
cc.d_page = dst_page;
/*
- * If there is no more space in dst_page, try to
- * allocate another zspage.
+ * If there is no more space in dst_page, resched
+ * and see if anyone had allocated another zspage.
*/
if (!migrate_zspage(pool, class, &cc))
break;
putback_zspage(pool, class, dst_page);
- nr_total_migrated += cc.nr_migrated;
- nr_to_migrate -= cc.nr_migrated;
}
/* Stop if we couldn't find slot */
@@ -1738,9 +1758,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
break;
putback_zspage(pool, class, dst_page);
- putback_zspage(pool, class, src_page);
+ if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+ pool->stats.pages_compacted += class->pages_per_zspage;
spin_unlock(&class->lock);
- nr_total_migrated += cc.nr_migrated;
cond_resched();
spin_lock(&class->lock);
}
@@ -1749,14 +1769,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
putback_zspage(pool, class, src_page);
spin_unlock(&class->lock);
-
- return nr_total_migrated;
}
unsigned long zs_compact(struct zs_pool *pool)
{
int i;
- unsigned long nr_migrated = 0;
struct size_class *class;
for (i = zs_size_classes - 1; i >= 0; i--) {
@@ -1765,13 +1782,80 @@ unsigned long zs_compact(struct zs_pool *pool)
continue;
if (class->index != i)
continue;
- nr_migrated += __zs_compact(pool, class);
+ __zs_compact(pool, class);
}
- return nr_migrated;
+ return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
+{
+ memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
+}
+EXPORT_SYMBOL_GPL(zs_pool_stats);
+
+static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long pages_freed;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ pages_freed = pool->stats.pages_compacted;
+ /*
+ * Compact classes and calculate compaction delta.
+ * Can run concurrently with a manually triggered
+ * (by user) compaction.
+ */
+ pages_freed = zs_compact(pool) - pages_freed;
+
+ return pages_freed ? pages_freed : SHRINK_STOP;
+}
+
+static unsigned long zs_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int i;
+ struct size_class *class;
+ unsigned long pages_to_free = 0;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ if (!pool->shrinker_enabled)
+ return 0;
+
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+
+ pages_to_free += zs_can_compact(class);
+ }
+
+ return pages_to_free;
+}
+
+static void zs_unregister_shrinker(struct zs_pool *pool)
+{
+ if (pool->shrinker_enabled) {
+ unregister_shrinker(&pool->shrinker);
+ pool->shrinker_enabled = false;
+ }
+}
+
+static int zs_register_shrinker(struct zs_pool *pool)
+{
+ pool->shrinker.scan_objects = zs_shrinker_scan;
+ pool->shrinker.count_objects = zs_shrinker_count;
+ pool->shrinker.batch = 0;
+ pool->shrinker.seeks = DEFAULT_SEEKS;
+
+ return register_shrinker(&pool->shrinker);
+}
+
/**
* zs_create_pool - Creates an allocation pool to work from.
* @flags: allocation flags used to allocate pool metadata
@@ -1857,6 +1941,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
if (zs_pool_stat_create(name, pool))
goto err;
+ /*
+ * Not critical, we still can use the pool
+ * and user can trigger compaction manually.
+ */
+ if (zs_register_shrinker(pool) == 0)
+ pool->shrinker_enabled = true;
return pool;
err:
@@ -1869,6 +1959,7 @@ void zs_destroy_pool(struct zs_pool *pool)
{
int i;
+ zs_unregister_shrinker(pool);
zs_pool_stat_destroy(pool);
for (i = 0; i < zs_size_classes; i++) {
diff --git a/mm/zswap.c b/mm/zswap.c
index 2d5727baed59..48a1d081e2a5 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -446,75 +446,14 @@ enum zswap_get_swap_ret {
static int zswap_get_swap_cache_page(swp_entry_t entry,
struct page **retpage)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
- int err;
+ bool page_was_allocated;
- *retpage = NULL;
- do {
- /*
- * First check the swap cache. Since this is normally
- * called after lookup_swap_cache() failed, re-calling
- * that would confuse statistics.
- */
- found_page = find_get_page(swapper_space, entry.val);
- if (found_page)
- break;
-
- /*
- * Get a new page to read into from swap.
- */
- if (!new_page) {
- new_page = alloc_page(GFP_KERNEL);
- if (!new_page)
- break; /* Out of memory */
- }
-
- /*
- * call radix_tree_preload() while we can wait.
- */
- err = radix_tree_preload(GFP_KERNEL);
- if (err)
- break;
-
- /*
- * Swap entry may have been freed since our caller observed it.
- */
- err = swapcache_prepare(entry);
- if (err == -EEXIST) { /* seems racy */
- radix_tree_preload_end();
- continue;
- }
- if (err) { /* swp entry is obsolete ? */
- radix_tree_preload_end();
- break;
- }
-
- /* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
- SetPageSwapBacked(new_page);
- err = __add_to_swap_cache(new_page, entry);
- if (likely(!err)) {
- radix_tree_preload_end();
- lru_cache_add_anon(new_page);
- *retpage = new_page;
- return ZSWAP_SWAPCACHE_NEW;
- }
- radix_tree_preload_end();
- ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
- swapcache_free(entry);
- } while (err != -ENOMEM);
-
- if (new_page)
- page_cache_release(new_page);
- if (!found_page)
+ *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
+ NULL, 0, &page_was_allocated);
+ if (page_was_allocated)
+ return ZSWAP_SWAPCACHE_NEW;
+ if (!*retpage)
return ZSWAP_SWAPCACHE_FAIL;
- *retpage = found_page;
return ZSWAP_SWAPCACHE_EXIST;
}
@@ -816,7 +755,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
zswap_trees[type] = NULL;
}
-static struct zpool_ops zswap_zpool_ops = {
+static const struct zpool_ops zswap_zpool_ops = {
.evict = zswap_writeback_entry
};