summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/bootmem.c24
-rw-r--r--mm/compaction.c98
-rw-r--r--mm/huge_memory.c15
-rw-r--r--mm/internal.h1
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memory.c13
-rw-r--r--mm/mempolicy.c130
-rw-r--r--mm/migrate.c14
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/page_alloc.c64
-rw-r--r--mm/page_isolation.c26
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/vmscan.c111
13 files changed, 208 insertions, 297 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1324cd74faec..b93376c39b61 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
while (start < end) {
unsigned long *map, idx, vec;
+ unsigned shift;
map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
+ shift = idx & (BITS_PER_LONG - 1);
+ /*
+ * vec holds at most BITS_PER_LONG map bits,
+ * bit 0 corresponds to start.
+ */
vec = ~map[idx / BITS_PER_LONG];
+
+ if (shift) {
+ vec >>= shift;
+ if (end - start >= BITS_PER_LONG)
+ vec |= ~map[idx / BITS_PER_LONG + 1] <<
+ (BITS_PER_LONG - shift);
+ }
/*
* If we have a properly aligned and fully unreserved
* BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
count += BITS_PER_LONG;
start += BITS_PER_LONG;
} else {
- unsigned long off = 0;
+ unsigned long cur = start;
- vec >>= start & (BITS_PER_LONG - 1);
- while (vec) {
+ start = ALIGN(start + 1, BITS_PER_LONG);
+ while (vec && cur != start) {
if (vec & 1) {
- page = pfn_to_page(start + off);
+ page = pfn_to_page(cur);
__free_pages_bootmem(page, 0);
count++;
}
vec >>= 1;
- off++;
+ ++cur;
}
- start = ALIGN(start + 1, BITS_PER_LONG);
}
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e466497..c62bd063d766 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
static int compact_finished(struct zone *zone,
struct compact_control *cc)
{
+ unsigned int order;
unsigned long watermark;
if (fatal_signal_pending(current))
@@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone,
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
- if (cc->page) {
- /* Was a suitable page captured? */
- if (*cc->page)
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[order];
+
+ /* Job done if page is free of the right migratetype */
+ if (!list_empty(&area->free_list[cc->migratetype]))
+ return COMPACT_PARTIAL;
+
+ /* Job done if allocation would set block type */
+ if (cc->order >= pageblock_order && area->nr_free)
return COMPACT_PARTIAL;
- } else {
- unsigned int order;
- for (order = cc->order; order < MAX_ORDER; order++) {
- struct free_area *area = &zone->free_area[cc->order];
- /* Job done if page is free of the right migratetype */
- if (!list_empty(&area->free_list[cc->migratetype]))
- return COMPACT_PARTIAL;
-
- /* Job done if allocation would set block type */
- if (cc->order >= pageblock_order && area->nr_free)
- return COMPACT_PARTIAL;
- }
}
return COMPACT_CONTINUE;
@@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
return COMPACT_CONTINUE;
}
-static void compact_capture_page(struct compact_control *cc)
-{
- unsigned long flags;
- int mtype, mtype_low, mtype_high;
-
- if (!cc->page || *cc->page)
- return;
-
- /*
- * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
- * regardless of the migratetype of the freelist is is captured from.
- * This is fine because the order for a high-order MIGRATE_MOVABLE
- * allocation is typically at least a pageblock size and overall
- * fragmentation is not impaired. Other allocation types must
- * capture pages from their own migratelist because otherwise they
- * could pollute other pageblocks like MIGRATE_MOVABLE with
- * difficult to move pages and making fragmentation worse overall.
- */
- if (cc->migratetype == MIGRATE_MOVABLE) {
- mtype_low = 0;
- mtype_high = MIGRATE_PCPTYPES;
- } else {
- mtype_low = cc->migratetype;
- mtype_high = cc->migratetype + 1;
- }
-
- /* Speculatively examine the free lists without zone lock */
- for (mtype = mtype_low; mtype < mtype_high; mtype++) {
- int order;
- for (order = cc->order; order < MAX_ORDER; order++) {
- struct page *page;
- struct free_area *area;
- area = &(cc->zone->free_area[order]);
- if (list_empty(&area->free_list[mtype]))
- continue;
-
- /* Take the lock and attempt capture of the page */
- if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
- return;
- if (!list_empty(&area->free_list[mtype])) {
- page = list_entry(area->free_list[mtype].next,
- struct page, lru);
- if (capture_free_page(page, cc->order, mtype)) {
- spin_unlock_irqrestore(&cc->zone->lock,
- flags);
- *cc->page = page;
- return;
- }
- }
- spin_unlock_irqrestore(&cc->zone->lock, flags);
- }
- }
-}
-
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
@@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
goto out;
}
}
-
- /* Capture a page now if it is a suitable size */
- compact_capture_page(cc);
}
out:
@@ -1069,8 +1007,7 @@ out:
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
- bool sync, bool *contended,
- struct page **page)
+ bool sync, bool *contended)
{
unsigned long ret;
struct compact_control cc = {
@@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
.sync = sync,
- .page = page,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- bool sync, bool *contended, struct page **page)
+ bool sync, bool *contended)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
int status;
status = compact_zone_order(zone, order, gfp_mask, sync,
- contended, page);
+ contended);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
@@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
struct compact_control cc = {
.order = order,
.sync = false,
- .page = NULL,
};
return __compact_pgdat(pgdat, &cc);
@@ -1203,14 +1138,13 @@ static int compact_node(int nid)
struct compact_control cc = {
.order = -1,
.sync = true,
- .page = NULL,
};
return __compact_pgdat(NODE_DATA(nid), &cc);
}
/* Compact all nodes in the system */
-static int compact_nodes(void)
+static void compact_nodes(void)
{
int nid;
@@ -1219,8 +1153,6 @@ static int compact_nodes(void)
for_each_online_node(nid)
compact_node(nid);
-
- return COMPACT_COMPLETE;
}
/* The written value is actually unused, all memory is compacted */
@@ -1231,7 +1163,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
if (write)
- return compact_nodes();
+ compact_nodes();
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9e894edc7811..6001ee6347a9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1819,9 +1819,19 @@ int split_huge_page(struct page *page)
BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
BUG_ON(!PageAnon(page));
- anon_vma = page_lock_anon_vma_read(page);
+
+ /*
+ * The caller does not necessarily hold an mmap_sem that would prevent
+ * the anon_vma disappearing so we first we take a reference to it
+ * and then lock the anon_vma for write. This is similar to
+ * page_lock_anon_vma_read except the write lock is taken to serialise
+ * against parallel split or collapse operations.
+ */
+ anon_vma = page_get_anon_vma(page);
if (!anon_vma)
goto out;
+ anon_vma_lock_write(anon_vma);
+
ret = 0;
if (!PageCompound(page))
goto out_unlock;
@@ -1832,7 +1842,8 @@ int split_huge_page(struct page *page)
BUG_ON(PageCompound(page));
out_unlock:
- page_unlock_anon_vma_read(anon_vma);
+ anon_vma_unlock(anon_vma);
+ put_anon_vma(anon_vma);
out:
return ret;
}
diff --git a/mm/internal.h b/mm/internal.h
index d597f94cc205..9ba21100ebf3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,7 +135,6 @@ struct compact_control {
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
bool contended; /* True if a lock was contended */
- struct page **page; /* Page captured of requested size */
};
unsigned long
diff --git a/mm/memblock.c b/mm/memblock.c
index 625905523c2a..88adc8afb610 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -314,7 +314,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
}
this->size += next->size;
- memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next));
+ /* move forward from next + 1, index of which is i + 2 */
+ memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
type->cnt--;
}
}
diff --git a/mm/memory.c b/mm/memory.c
index e0a9b0ce4f10..bb1369f7b9b4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -184,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
return 1;
}
+ if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+ return 0;
+
batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
return 0;
+ tlb->batch_count++;
batch->next = NULL;
batch->nr = 0;
batch->max = MAX_GATHER_BATCH;
@@ -216,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
tlb->active = &tlb->local;
+ tlb->batch_count = 0;
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb->batch = NULL;
@@ -3706,6 +3711,14 @@ retry:
if (pmd_trans_huge(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
+ /*
+ * If the pmd is splitting, return and retry the
+ * the fault. Alternative: wait until the split
+ * is done, and goto retry.
+ */
+ if (pmd_trans_splitting(orig_pmd))
+ return 0;
+
if (pmd_numa(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d1b315e98627..e2df1c1fb41f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2132,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*/
/* lookup first element intersecting start-end */
-/* Caller holds sp->mutex */
+/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2196,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
- mutex_lock(&sp->mutex);
+ spin_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- mutex_unlock(&sp->mutex);
+ spin_unlock(&sp->lock);
return pol;
}
@@ -2328,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
sp_free(n);
}
+static void sp_node_init(struct sp_node *node, unsigned long start,
+ unsigned long end, struct mempolicy *pol)
+{
+ node->start = start;
+ node->end = end;
+ node->policy = pol;
+}
+
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
@@ -2344,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
return NULL;
}
newpol->flags |= MPOL_F_SHARED;
-
- n->start = start;
- n->end = end;
- n->policy = newpol;
+ sp_node_init(n, start, end, newpol);
return n;
}
@@ -2357,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
struct sp_node *n;
+ struct sp_node *n_new = NULL;
+ struct mempolicy *mpol_new = NULL;
int ret = 0;
- mutex_lock(&sp->mutex);
+restart:
+ spin_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2372,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
- struct sp_node *new2;
- new2 = sp_alloc(end, n->end, n->policy);
- if (!new2) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!n_new)
+ goto alloc_new;
+
+ *mpol_new = *n->policy;
+ atomic_set(&mpol_new->refcnt, 1);
+ sp_node_init(n_new, n->end, end, mpol_new);
+ sp_insert(sp, n_new);
n->end = start;
- sp_insert(sp, new2);
+ n_new = NULL;
+ mpol_new = NULL;
break;
} else
n->end = start;
@@ -2390,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
}
if (new)
sp_insert(sp, new);
-out:
- mutex_unlock(&sp->mutex);
+ spin_unlock(&sp->lock);
+ ret = 0;
+
+err_out:
+ if (mpol_new)
+ mpol_put(mpol_new);
+ if (n_new)
+ kmem_cache_free(sn_cache, n_new);
+
return ret;
+
+alloc_new:
+ spin_unlock(&sp->lock);
+ ret = -ENOMEM;
+ n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ if (!n_new)
+ goto err_out;
+ mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ if (!mpol_new)
+ goto err_out;
+ goto restart;
}
/**
@@ -2410,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- mutex_init(&sp->mutex);
+ spin_lock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
@@ -2476,14 +2504,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
- mutex_lock(&p->mutex);
+ spin_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
}
- mutex_unlock(&p->mutex);
+ spin_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
@@ -2595,8 +2623,7 @@ void numa_default_policy(void)
*/
/*
- * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
- * Used only for mpol_parse_str() and mpol_to_str()
+ * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
*/
static const char * const policy_modes[] =
{
@@ -2610,28 +2637,20 @@ static const char * const policy_modes[] =
#ifdef CONFIG_TMPFS
/**
- * mpol_parse_str - parse string to mempolicy
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
* @mpol: pointer to struct mempolicy pointer, returned on success.
- * @no_context: flag whether to "contextualize" the mempolicy
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
- * if @no_context is true, save the input nodemask in w.user_nodemask in
- * the returned mempolicy. This will be used to "clone" the mempolicy in
- * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
- * mount option. Note that if 'static' or 'relative' mode flags were
- * specified, the input nodemask will already have been saved. Saving
- * it again is redundant, but safe.
- *
* On success, returns 0, else 1
*/
-int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
+int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
unsigned short mode;
- unsigned short uninitialized_var(mode_flags);
+ unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
@@ -2719,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (IS_ERR(new))
goto out;
- if (no_context) {
- /* save for contextualization */
- new->w.user_nodemask = nodes;
- } else {
- int ret;
- NODEMASK_SCRATCH(scratch);
- if (scratch) {
- task_lock(current);
- ret = mpol_set_nodemask(new, &nodes, scratch);
- task_unlock(current);
- } else
- ret = -ENOMEM;
- NODEMASK_SCRATCH_FREE(scratch);
- if (ret) {
- mpol_put(new);
- goto out;
- }
- }
+ /*
+ * Save nodes for mpol_to_str() to show the tmpfs mount options
+ * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+ */
+ if (mode != MPOL_PREFERRED)
+ new->v.nodes = nodes;
+ else if (nodelist)
+ new->v.preferred_node = first_node(nodes);
+ else
+ new->flags |= MPOL_F_LOCAL;
+
+ /*
+ * Save nodes for contextualization: this will be used to "clone"
+ * the mempolicy in a specific context [cpuset] at a later time.
+ */
+ new->w.user_nodemask = nodes;
+
err = 0;
out:
@@ -2756,13 +2774,12 @@ out:
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
- * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
*
* Convert a mempolicy into a string.
* Returns the number of characters in buffer (if positive)
* or an error (negative)
*/
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
int l;
@@ -2788,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_PREFERRED:
nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
- mode = MPOL_LOCAL; /* pseudo-policy */
+ mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
@@ -2796,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
- if (no_context)
- nodes = pol->w.user_nodemask;
- else
- nodes = pol->v.nodes;
+ nodes = pol->v.nodes;
break;
default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 3b676b0c5c3e..c38778610aa8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1679,9 +1679,21 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
page_xchg_last_nid(new_page, page_last_nid(page));
isolated = numamigrate_isolate_page(pgdat, page);
- if (!isolated) {
+
+ /*
+ * Failing to isolate or a GUP pin prevents migration. The expected
+ * page count is 2. 1 for anonymous pages without a mapping and 1
+ * for the callers pin. If the page was isolated, the page will
+ * need to be put back on the LRU.
+ */
+ if (!isolated || page_count(page) != 2) {
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
put_page(new_page);
+ if (isolated) {
+ putback_lru_page(page);
+ isolated = 0;
+ goto out;
+ }
goto out_keep_locked;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index f54b235f29a9..35730ee9d515 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2886,7 +2886,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- down_write(&anon_vma->root->rwsem);
+ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
* anon_vma->root->rwsem. If some other vma in this mm shares
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37127fc..df2022ff0c8a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -221,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
-/*
- * NOTE:
- * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
- * Instead, use {un}set_pageblock_isolate.
- */
void set_pageblock_migratetype(struct page *page, int migratetype)
{
@@ -1389,14 +1384,8 @@ void split_page(struct page *page, unsigned int order)
set_page_refcounted(page + i);
}
-/*
- * Similar to the split_page family of functions except that the page
- * required at the given order and being isolated now to prevent races
- * with parallel allocators
- */
-int capture_free_page(struct page *page, int alloc_order, int migratetype)
+static int __isolate_free_page(struct page *page, unsigned int order)
{
- unsigned int order;
unsigned long watermark;
struct zone *zone;
int mt;
@@ -1404,7 +1393,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
BUG_ON(!PageBuddy(page));
zone = page_zone(page);
- order = page_order(page);
mt = get_pageblock_migratetype(page);
if (mt != MIGRATE_ISOLATE) {
@@ -1413,7 +1401,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
- __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+ __mod_zone_freepage_state(zone, -(1UL << order), mt);
}
/* Remove page from free list */
@@ -1421,11 +1409,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
zone->free_area[order].nr_free--;
rmv_page_order(page);
- if (alloc_order != order)
- expand(zone, page, alloc_order, order,
- &zone->free_area[order], migratetype);
-
- /* Set the pageblock if the captured page is at least a pageblock */
+ /* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
@@ -1436,7 +1420,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
}
}
- return 1UL << alloc_order;
+ return 1UL << order;
}
/*
@@ -1454,10 +1438,9 @@ int split_free_page(struct page *page)
unsigned int order;
int nr_pages;
- BUG_ON(!PageBuddy(page));
order = page_order(page);
- nr_pages = capture_free_page(page, order, 0);
+ nr_pages = __isolate_free_page(page, order);
if (!nr_pages)
return 0;
@@ -1655,20 +1638,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return true;
}
-#ifdef CONFIG_MEMORY_ISOLATION
-static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
-{
- if (unlikely(zone->nr_pageblock_isolate))
- return zone->nr_pageblock_isolate * pageblock_nr_pages;
- return 0;
-}
-#else
-static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
-{
- return 0;
-}
-#endif
-
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
@@ -1684,14 +1653,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
- /*
- * If the zone has MIGRATE_ISOLATE type free pages, we should consider
- * it. nr_zone_isolate_freepages is never accurate so kswapd might not
- * sleep although it could do so. But this is more desirable for memory
- * hotplug than sleeping which can cause a livelock in the direct
- * reclaim path.
- */
- free_pages -= nr_zone_isolate_freepages(z);
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
free_pages);
}
@@ -2163,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
- struct page *page = NULL;
-
if (!order)
return NULL;
@@ -2176,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration,
- contended_compaction, &page);
+ contended_compaction);
current->flags &= ~PF_MEMALLOC;
- /* If compaction captured a page, prep and use it */
- if (page) {
- prep_new_page(page, order, gfp_mask);
- goto got_page;
- }
-
if (*did_some_progress != COMPACT_SKIPPED) {
+ struct page *page;
+
/* Page migration frees to the PCP lists but we want merging */
drain_pages(get_cpu());
put_cpu();
@@ -2195,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page) {
-got_page:
preferred_zone->compact_blockskip_flush = false;
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
@@ -5631,7 +5585,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
pfn &= (PAGES_PER_SECTION-1);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
- pfn = pfn - zone->zone_start_pfn;
+ pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 9d2264ea4606..383bdbb98b04 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,28 +8,6 @@
#include <linux/memory.h>
#include "internal.h"
-/* called while holding zone->lock */
-static void set_pageblock_isolate(struct page *page)
-{
- if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
- return;
-
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- page_zone(page)->nr_pageblock_isolate++;
-}
-
-/* called while holding zone->lock */
-static void restore_pageblock_isolate(struct page *page, int migratetype)
-{
- struct zone *zone = page_zone(page);
- if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
- return;
-
- BUG_ON(zone->nr_pageblock_isolate <= 0);
- set_pageblock_migratetype(page, migratetype);
- zone->nr_pageblock_isolate--;
-}
-
int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
{
struct zone *zone;
@@ -80,7 +58,7 @@ out:
unsigned long nr_pages;
int migratetype = get_pageblock_migratetype(page);
- set_pageblock_isolate(page);
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
__mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -103,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
goto out;
nr_pages = move_freepages_block(zone, page, migratetype);
__mod_zone_freepage_state(zone, nr_pages, migratetype);
- restore_pageblock_isolate(page, migratetype);
+ set_pageblock_migratetype(page, migratetype);
out:
spin_unlock_irqrestore(&zone->lock, flags);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 5c90d84c2b02..5dd56f6efdbd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
if (!mpol || mpol->mode == MPOL_DEFAULT)
return; /* show nothing */
- mpol_to_str(buffer, sizeof(buffer), mpol, 1);
+ mpol_to_str(buffer, sizeof(buffer), mpol);
seq_printf(seq, ",mpol=%s", buffer);
}
@@ -2463,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (!gid_valid(sbinfo->gid))
goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
- if (mpol_parse_str(value, &sbinfo->mpol, 1))
+ if (mpol_parse_str(value, &sbinfo->mpol))
goto bad_val;
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
diff --git a/mm/vmscan.c b/mm/vmscan.c
index adc7e9058181..196709f5ee58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2452,12 +2452,16 @@ static bool zone_balanced(struct zone *zone, int order,
}
/*
- * pgdat_balanced is used when checking if a node is balanced for high-order
- * allocations. Only zones that meet watermarks and are in a zone allowed
- * by the callers classzone_idx are added to balanced_pages. The total of
- * balanced pages must be at least 25% of the zones allowed by classzone_idx
- * for the node to be considered balanced. Forcing all zones to be balanced
- * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * pgdat_balanced() is used when checking if a node is balanced.
+ *
+ * For order-0, all zones must be balanced!
+ *
+ * For high-order allocations only zones that meet watermarks and are in a
+ * zone allowed by the callers classzone_idx are added to balanced_pages. The
+ * total of balanced pages must be at least 25% of the zones allowed by
+ * classzone_idx for the node to be considered balanced. Forcing all zones to
+ * be balanced for high orders can cause excessive reclaim when there are
+ * imbalanced zones.
* The choice of 25% is due to
* o a 16M DMA zone that is balanced will not balance a zone on any
* reasonable sized machine
@@ -2467,17 +2471,43 @@ static bool zone_balanced(struct zone *zone, int order,
* Similarly, on x86-64 the Normal zone would need to be at least 1G
* to balance a node on its own. These seemed like reasonable ratios.
*/
-static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
- int classzone_idx)
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
unsigned long present_pages = 0;
+ unsigned long balanced_pages = 0;
int i;
- for (i = 0; i <= classzone_idx; i++)
- present_pages += pgdat->node_zones[i].present_pages;
+ /* Check the watermark levels */
+ for (i = 0; i <= classzone_idx; i++) {
+ struct zone *zone = pgdat->node_zones + i;
- /* A special case here: if zone has no page, we think it's balanced */
- return balanced_pages >= (present_pages >> 2);
+ if (!populated_zone(zone))
+ continue;
+
+ present_pages += zone->present_pages;
+
+ /*
+ * A special case here:
+ *
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well!
+ */
+ if (zone->all_unreclaimable) {
+ balanced_pages += zone->present_pages;
+ continue;
+ }
+
+ if (zone_balanced(zone, order, 0, i))
+ balanced_pages += zone->present_pages;
+ else if (!order)
+ return false;
+ }
+
+ if (order)
+ return balanced_pages >= (present_pages >> 2);
+ else
+ return true;
}
/*
@@ -2489,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
int classzone_idx)
{
- int i;
- unsigned long balanced = 0;
- bool all_zones_ok = true;
-
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
return false;
@@ -2511,39 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
return false;
}
- /* Check the watermark levels */
- for (i = 0; i <= classzone_idx; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- /*
- * balance_pgdat() skips over all_unreclaimable after
- * DEF_PRIORITY. Effectively, it considers them balanced so
- * they must be considered balanced here as well if kswapd
- * is to sleep
- */
- if (zone->all_unreclaimable) {
- balanced += zone->present_pages;
- continue;
- }
-
- if (!zone_balanced(zone, order, 0, i))
- all_zones_ok = false;
- else
- balanced += zone->present_pages;
- }
-
- /*
- * For high-order requests, the balanced zones must contain at least
- * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
- * must be balanced
- */
- if (order)
- return pgdat_balanced(pgdat, balanced, classzone_idx);
- else
- return all_zones_ok;
+ return pgdat_balanced(pgdat, order, classzone_idx);
}
/*
@@ -2571,7 +2565,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
struct zone *unbalanced_zone;
- unsigned long balanced;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
@@ -2605,7 +2598,6 @@ loop_again:
int has_under_min_watermark_zone = 0;
unbalanced_zone = NULL;
- balanced = 0;
/*
* Scan in the highmem->dma direction for the highest
@@ -2761,8 +2753,6 @@ loop_again:
* speculatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
- if (i <= *classzone_idx)
- balanced += zone->present_pages;
}
}
@@ -2776,7 +2766,7 @@ loop_again:
pfmemalloc_watermark_ok(pgdat))
wake_up(&pgdat->pfmemalloc_wait);
- if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
+ if (pgdat_balanced(pgdat, order, *classzone_idx))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
@@ -2785,7 +2775,7 @@ loop_again:
if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
if (has_under_min_watermark_zone)
count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
- else
+ else if (unbalanced_zone)
wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
}
@@ -2800,12 +2790,7 @@ loop_again:
} while (--sc.priority >= 0);
out:
- /*
- * order-0: All zones must meet high watermark for a balanced node
- * high-order: Balanced zones must make up at least 25% of the node
- * for the node to be balanced
- */
- if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) {
+ if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
cond_resched();
try_to_freeze();
@@ -3137,8 +3122,8 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
-static int __devinit cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
{
int nid;