Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  557
1 file changed, 242 insertions, 315 deletions
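
One of the central conversions in the hunks below is the dirty-throttling check in get_page_from_freelist(): the per-zone zone_dirty_ok() test is replaced by a per-node node_dirty_ok() test, with the last node found over its limit cached so that sibling zones of the same node are skipped without re-checking. The following is a minimal sketch of that skip logic only; the struct layouts and the node_dirty_ok() stub are simplified placeholders, not the kernel's definitions.

/*
 * Simplified sketch of the node-based dirty-limit skip added to the
 * zonelist walk.  Types and node_dirty_ok() are placeholders.
 */
#include <stdbool.h>
#include <stddef.h>

struct pglist_data { int node_id; long nr_dirty, dirty_limit; };
struct zone { struct pglist_data *zone_pgdat; };

static bool node_dirty_ok(struct pglist_data *pgdat)
{
	/* stand-in for the real per-node dirty accounting */
	return pgdat->nr_dirty <= pgdat->dirty_limit;
}

/* Returns true if this zone must be skipped for a dirtying allocation. */
static bool skip_for_dirty_limit(struct zone *zone,
				 struct pglist_data **last_pgdat_dirty_limit)
{
	/* zones of a node already known to be over its limit: skip cheaply */
	if (*last_pgdat_dirty_limit == zone->zone_pgdat)
		return true;

	if (!node_dirty_ok(zone->zone_pgdat)) {
		/* remember the node so its other zones are skipped too */
		*last_pgdat_dirty_limit = zone->zone_pgdat;
		return true;
	}
	return false;
}
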
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 452513bf02ce..ea759b935360 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -295,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn) return false; } -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - if (pfn >= NODE_DATA(nid)->first_deferred_pfn) - return true; - - return false; -} - /* * Returns false when the remaining initialisation should be deferred until * later in the boot cycle when it can be parallelised. @@ -342,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn) return false; } -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - return false; -} - static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) @@ -1091,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); while (count) { struct page *page; @@ -1148,9 +1135,9 @@ static void free_one_page(struct zone *zone, { unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { @@ -2517,7 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); - /* Set the pageblock if the isolated page is at least a pageblock */ + /* + * Set the pageblock if the isolated page is at least half of a + * pageblock + */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -2597,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, else page = list_first_entry(list, struct page, lru); - __dec_zone_state(zone, NR_ALLOC_BATCH); list_del(&page->lru); pcp->count--; @@ -2623,16 +2612,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, spin_unlock(&zone->lock); if (!page) goto failed; - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); } - if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && - !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) - set_bit(ZONE_FAIR_DEPLETED, &zone->flags); - - __count_zone_vm_events(PGALLOC, zone, 1 << order); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -2842,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, } #ifdef CONFIG_NUMA -static bool zone_local(struct zone *local_zone, struct zone *zone) -{ - return local_zone->node == zone->node; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ -static bool zone_local(struct zone 
*local_zone, struct zone *zone) -{ - return true; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } #endif /* CONFIG_NUMA */ -static void reset_alloc_batches(struct zone *preferred_zone) -{ - struct zone *zone = preferred_zone->zone_pgdat->node_zones; - - do { - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); - } while (zone++ != preferred_zone); -} - /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -2886,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, { struct zoneref *z = ac->preferred_zoneref; struct zone *zone; - bool fair_skipped = false; - bool apply_fair = (alloc_flags & ALLOC_FAIR); + struct pglist_data *last_pgdat_dirty_limit = NULL; -zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. @@ -2904,50 +2864,33 @@ zonelist_scan: !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* - * Distribute pages in proportion to the individual - * zone size to ensure fair page aging. The zone a - * page was allocated in should have no effect on the - * time the page has in memory before being reclaimed. - */ - if (apply_fair) { - if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { - fair_skipped = true; - continue; - } - if (!zone_local(ac->preferred_zoneref->zone, zone)) { - if (fair_skipped) - goto reset_fair; - apply_fair = false; - } - } - /* * When allocating a page cache page for writing, we - * want to get it from a zone that is within its dirty - * limit, such that no single zone holds more than its + * want to get it from a node that is within its dirty + * limit, such that no single node holds more than its * proportional share of globally allowed dirty pages. - * The dirty limits take into account the zone's + * The dirty limits take into account the node's * lowmem reserves and high watermark so that kswapd * should be able to balance it without having to * write pages from its LRU list. * - * This may look like it could increase pressure on - * lower zones by failing allocations in higher zones - * before they are full. But the pages that do spill - * over are limited as the lower zones are protected - * by this very same mechanism. It should not become - * a practical burden to them. - * * XXX: For now, allow allocations to potentially - * exceed the per-zone dirty limit in the slowpath + * exceed the per-node dirty limit in the slowpath * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed - * zones are together not big enough to reach the + * nodes are together not big enough to reach the * global limit. The proper fix for these situations - * will require awareness of zones in the + * will require awareness of nodes in the * dirty-throttling and the flusher threads. 
*/ - if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) - continue; + if (ac->spread_dirty_pages) { + if (last_pgdat_dirty_limit == zone->zone_pgdat) + continue; + + if (!node_dirty_ok(zone->zone_pgdat)) { + last_pgdat_dirty_limit = zone->zone_pgdat; + continue; + } + } mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_fast(zone, order, mark, @@ -2959,16 +2902,16 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (zone_reclaim_mode == 0 || + if (node_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; - ret = zone_reclaim(zone, gfp_mask, order); + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); switch (ret) { - case ZONE_RECLAIM_NOSCAN: + case NODE_RECLAIM_NOSCAN: /* did not scan */ continue; - case ZONE_RECLAIM_FULL: + case NODE_RECLAIM_FULL: /* scanned but unreclaimable */ continue; default: @@ -2998,23 +2941,6 @@ try_this_zone: } } - /* - * The first pass makes sure allocations are spread fairly within the - * local node. However, the local node might have free pages left - * after the fairness batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without fairness, and - * include remote zones now, before entering the slowpath and waking - * kswapd: prefer spilling to a remote zone over swapping locally. - */ - if (fair_skipped) { -reset_fair: - apply_fair = false; - fair_skipped = false; - reset_alloc_batches(ac->preferred_zoneref->zone); - z = ac->preferred_zoneref; - goto zonelist_scan; - } - return NULL; } @@ -3159,7 +3085,6 @@ out: return page; } - /* * Maximum number of compaction retries wit a progress before OOM * killer is consider as the only way to move forward. @@ -3171,17 +3096,16 @@ out: static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, enum compact_result *compact_result) + enum compact_priority prio, enum compact_result *compact_result) { struct page *page; - int contended_compaction; if (!order) return NULL; current->flags |= PF_MEMALLOC; *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - mode, &contended_compaction); + prio); current->flags &= ~PF_MEMALLOC; if (*compact_result <= COMPACT_INACTIVE) @@ -3193,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) { struct zone *zone = page_zone(page); @@ -3211,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTFAIL); - /* - * In all zones where compaction was attempted (and not - * deferred or skipped), lock contention has been detected. - * For THP allocation we do not want to disrupt the others - * so we fallback to base pages instead. - */ - if (contended_compaction == COMPACT_CONTENDED_LOCK) - *compact_result = COMPACT_CONTENDED; - - /* - * If compaction was aborted due to need_resched(), we do not - * want to further increase allocation latency, unless it is - * khugepaged trying to collapse. 
- */ - if (contended_compaction == COMPACT_CONTENDED_SCHED - && !(current->flags & PF_KTHREAD)) - *compact_result = COMPACT_CONTENDED; - cond_resched(); return NULL; @@ -3236,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, - enum compact_result compact_result, enum migrate_mode *migrate_mode, + enum compact_result compact_result, + enum compact_priority *compact_priority, int compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; @@ -3247,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, /* * compaction considers all the zone as desperately out of memory * so it doesn't really make much sense to retry except when the - * failure could be caused by weak migration mode. + * failure could be caused by insufficient priority */ if (compaction_failed(compact_result)) { - if (*migrate_mode == MIGRATE_ASYNC) { - *migrate_mode = MIGRATE_SYNC_LIGHT; + if (*compact_priority > MIN_COMPACT_PRIORITY) { + (*compact_priority)--; return true; } return false; @@ -3285,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, enum compact_result *compact_result) + enum compact_priority prio, enum compact_result *compact_result) { *compact_result = COMPACT_SKIPPED; return NULL; @@ -3294,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, - enum migrate_mode *migrate_mode, + enum compact_priority *compact_priority, int compaction_retries) { struct zone *zone; @@ -3362,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return NULL; retry: - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); /* * If an allocation failed after direct reclaim, it could be because @@ -3384,10 +3289,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; + pg_data_t *last_pgdat = NULL; for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, ac_classzone_idx(ac)); + ac->high_zoneidx, ac->nodemask) { + if (last_pgdat != zone->zone_pgdat) + wakeup_kswapd(zone, order, ac->high_zoneidx); + last_pgdat = zone->zone_pgdat; + } } static inline unsigned int @@ -3421,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { - if (gfp_mask & __GFP_MEMALLOC) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) - alloc_flags |= ALLOC_NO_WATERMARKS; - } #ifdef CONFIG_CMA if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; @@ -3440,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask) bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) { - return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); -} + if 
(unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return false; -static inline bool is_thp_gfp_mask(gfp_t gfp_mask) -{ - return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; + if (gfp_mask & __GFP_MEMALLOC) + return true; + if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + return true; + if (!in_interrupt() && + ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + return true; + + return false; } /* @@ -3481,10 +3387,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, return false; /* - * Keep reclaiming pages while there is a chance this will lead somewhere. - * If none of the target zones can satisfy our allocation request even - * if all reclaimable pages are considered then we are screwed and have - * to go OOM. + * Keep reclaiming pages while there is a chance this will lead + * somewhere. If none of the target zones can satisfy our allocation + * request even if all reclaimable pages are considered then we are + * screwed and have to go OOM. */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { @@ -3509,14 +3415,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * prevent from pre mature OOM */ if (!did_some_progress) { - unsigned long writeback; - unsigned long dirty; + unsigned long write_pending; - writeback = zone_page_state_snapshot(zone, - NR_WRITEBACK); - dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY); + write_pending = zone_page_state_snapshot(zone, + NR_ZONE_WRITE_PENDING); - if (2*(writeback + dirty) > reclaimable) { + if (2 * write_pending > reclaimable) { congestion_wait(BLK_RW_ASYNC, HZ/10); return true; } @@ -3551,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; - enum migrate_mode migration_mode = MIGRATE_ASYNC; + enum compact_priority compact_priority = DEF_COMPACT_PRIORITY; enum compact_result compact_result; int compaction_retries = 0; int no_progress_loops = 0; @@ -3575,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry: + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* - * OK, we're below the kswapd watermark and have kicked background - * reclaim. Now things get more complex, so set up alloc_flags according - * to how we want to proceed. + * The adjusted alloc_flags might result in immediate success, so try + * that first */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + if (page) + goto got_pg; + + /* + * For costly allocations, try direct compaction first, as it's likely + * that we have enough base pages and don't need to reclaim. Don't try + * that for allocations that are allowed to ignore watermarks, as the + * ALLOC_NO_WATERMARKS attempt didn't yet happen. 
+ */ + if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && + !gfp_pfmemalloc_allowed(gfp_mask)) { + page = __alloc_pages_direct_compact(gfp_mask, order, + alloc_flags, ac, + INIT_COMPACT_PRIORITY, + &compact_result); + if (page) + goto got_pg; + + /* + * Checks for costly allocations with __GFP_NORETRY, which + * includes THP page fault allocations + */ + if (gfp_mask & __GFP_NORETRY) { + /* + * If compaction is deferred for high-order allocations, + * it is because sync compaction recently failed. If + * this is the case and the caller requested a THP + * allocation, we do not want to heavily disrupt the + * system, so we fail the allocation instead of entering + * direct reclaim. + */ + if (compact_result == COMPACT_DEFERRED) + goto nopage; + + /* + * Looks like reclaim/compaction is worth trying, but + * sync compaction could be very expensive, so keep + * using async compaction. + */ + compact_priority = INIT_COMPACT_PRIORITY; + } + } + +retry: + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_kswapds(order, ac); + + if (gfp_pfmemalloc_allowed(gfp_mask)) + alloc_flags = ALLOC_NO_WATERMARKS; /* * Reset the zonelist iterators if memory policies can be ignored. * These allocations are high priority and system rather than user * orientated. */ - if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) { + if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } - /* This is the last chance, in general, before the goto nopage. */ - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + /* Attempt with potentially adjusted zonelist and alloc_flags */ + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) goto got_pg; - /* Allocate without watermarks if the context allows */ - if (alloc_flags & ALLOC_NO_WATERMARKS) { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); - if (page) - goto got_pg; - } - /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) { /* @@ -3640,38 +3590,6 @@ retry: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* - * Try direct compaction. The first pass is asynchronous. Subsequent - * attempts after direct reclaim are synchronous - */ - page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, - migration_mode, - &compact_result); - if (page) - goto got_pg; - - /* Checks for THP-specific high-order allocations */ - if (is_thp_gfp_mask(gfp_mask)) { - /* - * If compaction is deferred for high-order allocations, it is - * because sync compaction recently failed. If this is the case - * and the caller requested a THP allocation, we do not want - * to heavily disrupt the system, so we fail the allocation - * instead of entering direct reclaim. - */ - if (compact_result == COMPACT_DEFERRED) - goto nopage; - - /* - * Compaction is contended so rather back off than cause - * excessive stalls. 
- */ - if(compact_result == COMPACT_CONTENDED) - goto nopage; - } - - if (order && compaction_made_progress(compact_result)) - compaction_retries++; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, @@ -3679,16 +3597,25 @@ retry: if (page) goto got_pg; + /* Try direct compaction and then allocating */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, + compact_priority, &compact_result); + if (page) + goto got_pg; + + if (order && compaction_made_progress(compact_result)) + compaction_retries++; + /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) - goto noretry; + goto nopage; /* * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) - goto noretry; + goto nopage; /* * Costly allocations might have made a progress but this doesn't mean @@ -3712,7 +3639,7 @@ retry: */ if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, - compact_result, &migration_mode, + compact_result, &compact_priority, compaction_retries)) goto retry; @@ -3727,25 +3654,6 @@ retry: goto retry; } -noretry: - /* - * High-order allocations do not necessarily loop after direct reclaim - * and reclaim/compaction depends on compaction being called after - * reclaim so call directly if necessary. - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. All other requests should tolerate - * at least light sync migration. - */ - if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_ASYNC; - else - migration_mode = MIGRATE_SYNC_LIGHT; - page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, - ac, migration_mode, - &compact_result); - if (page) - goto got_pg; nopage: warn_alloc_failed(gfp_mask, order, NULL); got_pg: @@ -3761,7 +3669,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { struct page *page; unsigned int cpuset_mems_cookie; - unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + unsigned int alloc_flags = ALLOC_WMARK_LOW; gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), @@ -4192,7 +4100,7 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = global_page_state(NR_SHMEM); + val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -4214,8 +4122,8 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; - val->sharedram = node_page_state(nid, NR_SHMEM); - val->freeram = node_page_state(nid, NR_FREE_PAGES); + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; @@ -4298,6 +4206,7 @@ void show_free_areas(unsigned int filter) unsigned long free_pcp = 0; int cpu; struct zone *zone; + pg_data_t *pgdat; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -4312,35 +4221,74 
@@ void show_free_areas(unsigned int filter) " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n" -#endif " free:%lu free_pcp:%lu free_cma:%lu\n", - global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_INACTIVE_ANON), - global_page_state(NR_ISOLATED_ANON), - global_page_state(NR_ACTIVE_FILE), - global_page_state(NR_INACTIVE_FILE), - global_page_state(NR_ISOLATED_FILE), - global_page_state(NR_UNEVICTABLE), - global_page_state(NR_FILE_DIRTY), - global_page_state(NR_WRITEBACK), - global_page_state(NR_UNSTABLE_NFS), + global_node_page_state(NR_ACTIVE_ANON), + global_node_page_state(NR_INACTIVE_ANON), + global_node_page_state(NR_ISOLATED_ANON), + global_node_page_state(NR_ACTIVE_FILE), + global_node_page_state(NR_INACTIVE_FILE), + global_node_page_state(NR_ISOLATED_FILE), + global_node_page_state(NR_UNEVICTABLE), + global_node_page_state(NR_FILE_DIRTY), + global_node_page_state(NR_WRITEBACK), + global_node_page_state(NR_UNSTABLE_NFS), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), - global_page_state(NR_FILE_MAPPED), - global_page_state(NR_SHMEM), + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR, - global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR, - global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR, -#endif global_page_state(NR_FREE_PAGES), free_pcp, global_page_state(NR_FREE_CMA_PAGES)); + for_each_online_pgdat(pgdat) { + printk("Node %d" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" + " mapped:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " shmem:%lukB" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " shmem_thp: %lukB" + " shmem_pmdmapped: %lukB" + " anon_thp: %lukB" +#endif + " writeback_tmp:%lukB" + " unstable:%lukB" + " pages_scanned:%lu" + " all_unreclaimable? %s" + "\n", + pgdat->node_id, + K(node_page_state(pgdat, NR_ACTIVE_ANON)), + K(node_page_state(pgdat, NR_INACTIVE_ANON)), + K(node_page_state(pgdat, NR_ACTIVE_FILE)), + K(node_page_state(pgdat, NR_INACTIVE_FILE)), + K(node_page_state(pgdat, NR_UNEVICTABLE)), + K(node_page_state(pgdat, NR_ISOLATED_ANON)), + K(node_page_state(pgdat, NR_ISOLATED_FILE)), + K(node_page_state(pgdat, NR_FILE_MAPPED)), + K(node_page_state(pgdat, NR_FILE_DIRTY)), + K(node_page_state(pgdat, NR_WRITEBACK)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) + * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), +#endif + K(node_page_state(pgdat, NR_SHMEM)), + K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + node_page_state(pgdat, NR_PAGES_SCANNED), + !pgdat_reclaimable(pgdat) ? 
"yes" : "no"); + } + for_each_populated_zone(zone) { int i; @@ -4362,72 +4310,41 @@ void show_free_areas(unsigned int filter) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" - " isolated(anon):%lukB" - " isolated(file):%lukB" + " writepending:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " dirty:%lukB" - " writeback:%lukB" - " mapped:%lukB" - " shmem:%lukB" -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - " shmem_thp: %lukB" - " shmem_pmdmapped: %lukB" - " anon_thp: %lukB" -#endif " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" - " unstable:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" " free_cma:%lukB" - " writeback_tmp:%lukB" - " pages_scanned:%lu" - " all_unreclaimable? %s" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), - K(zone_page_state(zone, NR_ACTIVE_ANON)), - K(zone_page_state(zone, NR_INACTIVE_ANON)), - K(zone_page_state(zone, NR_ACTIVE_FILE)), - K(zone_page_state(zone, NR_INACTIVE_FILE)), - K(zone_page_state(zone, NR_UNEVICTABLE)), - K(zone_page_state(zone, NR_ISOLATED_ANON)), - K(zone_page_state(zone, NR_ISOLATED_FILE)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), + K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_FILE_DIRTY)), - K(zone_page_state(zone, NR_WRITEBACK)), - K(zone_page_state(zone, NR_FILE_MAPPED)), - K(zone_page_state(zone, NR_SHMEM)), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR), - K(zone_page_state(zone, NR_SHMEM_PMDMAPPED) - * HPAGE_PMD_NR), - K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR), -#endif K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), - zone_page_state(zone, NR_KERNEL_STACK) * - THREAD_SIZE / 1024, + zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), - K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), - K(zone_page_state(zone, NR_FREE_CMA_PAGES)), - K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - K(zone_page_state(zone, NR_PAGES_SCANNED)), - (!zone_reclaimable(zone) ? 
"yes" : "no") - ); + K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(" %ld", zone->lowmem_reserve[i]); @@ -4469,7 +4386,7 @@ void show_free_areas(unsigned int filter) hugetlb_show_meminfo(); - printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); + printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); show_swap_cache_info(); } @@ -5340,6 +5257,11 @@ static void __meminit setup_zone_pageset(struct zone *zone) zone->pageset = alloc_percpu(struct per_cpu_pageset); for_each_possible_cpu(cpu) zone_pageset_init(zone, cpu); + + if (!zone->zone_pgdat->per_cpu_nodestats) { + zone->zone_pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); + } } /* @@ -5909,6 +5831,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kcompactd_wait); #endif pgdat_page_ext_init(pgdat); + spin_lock_init(&pgdat->lru_lock); + lruvec_init(node_lruvec(pgdat)); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; @@ -5958,21 +5882,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) + pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio) / 100; - zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; + pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; + zone->zone_pgdat = pgdat; spin_lock_init(&zone->lock); - spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); - zone->zone_pgdat = pgdat; zone_pcp_init(zone); - /* For bootup, initialized properly in watermark setup */ - mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); - - lruvec_init(&zone->lruvec); if (!size) continue; @@ -6038,11 +5957,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; + pgdat->per_cpu_nodestats = NULL; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, @@ -6699,6 +6619,9 @@ static void calculate_totalreserve_pages(void) enum zone_type i, j; for_each_online_pgdat(pgdat) { + + pgdat->totalreserve_pages = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; @@ -6715,7 +6638,7 @@ static void calculate_totalreserve_pages(void) if (max > zone->managed_pages) max = zone->managed_pages; - zone->totalreserve_pages = max; + pgdat->totalreserve_pages += max; reserve_pages += max; } @@ -6816,10 +6739,6 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - spin_unlock_irqrestore(&zone->lock, flags); } @@ -6930,6 +6849,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int 
write, void __user *buffer, size_t *length, loff_t *ppos) { + struct pglist_data *pgdat; struct zone *zone; int rc; @@ -6937,8 +6857,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, if (rc) return rc; + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + for_each_zone(zone) - zone->min_unmapped_pages = (zone->managed_pages * + zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * sysctl_min_unmapped_ratio) / 100; return 0; } @@ -6946,6 +6869,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + struct pglist_data *pgdat; struct zone *zone; int rc; @@ -6953,8 +6877,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, if (rc) return rc; + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + for_each_zone(zone) - zone->min_slab_pages = (zone->managed_pages * + zone->zone_pgdat->min_slab_pages += (zone->managed_pages * sysctl_min_slab_ratio) / 100; return 0; } |
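
Read together, the __alloc_pages_slowpath() hunks above reorder the slow path: alloc_flags are computed up front and the freelist is retried once with them, costly requests (order > PAGE_ALLOC_COSTLY_ORDER) attempt direct compaction before reclaim unless watermarks may be ignored, ALLOC_NO_WATERMARKS is decided per retry via gfp_pfmemalloc_allowed(), and the retry loop runs reclaim before compaction while should_compact_retry() steps compact_priority toward MIN_COMPACT_PRIORITY instead of flipping migrate modes. The sketch below is a condensed, non-authoritative outline of that control flow: every helper is a stub standing in for the kernel function of a similar name, and the priority constants are placeholders rather than the kernel's values.

/*
 * Condensed outline of the reworked __alloc_pages_slowpath() ordering.
 * Stubs only; the control flow mirrors the patch, nothing else does.
 */
#include <stdbool.h>
#include <stddef.h>

#define MIN_COMPACT_PRIORITY	0	/* placeholder: highest priority */
#define DEF_COMPACT_PRIORITY	2	/* placeholder: starting priority */
#define INIT_COMPACT_PRIORITY	3	/* placeholder: lightest, used early */

struct page;

static struct page *try_freelist(void)       { return NULL; }
static struct page *direct_reclaim(void)     { return NULL; }
static struct page *direct_compact(int prio) { (void)prio; return NULL; }
static bool pfmemalloc_allowed(unsigned gfp) { (void)gfp; return false; }
static bool should_retry_reclaim(void)       { return false; }

static bool should_retry_compaction(int *prio)
{
	/* the patch escalates priority instead of switching migrate modes */
	if (*prio > MIN_COMPACT_PRIORITY) {
		(*prio)--;
		return true;
	}
	return false;
}

static struct page *slowpath_outline(unsigned gfp_mask, bool can_direct_reclaim,
				     bool costly_order, bool noretry)
{
	int compact_priority = DEF_COMPACT_PRIORITY;
	struct page *page;

	/* 1. alloc_flags are computed up front; kswapd is woken and the
	 *    freelist is retried once with the adjusted flags. */
	page = try_freelist();
	if (page)
		return page;

	/* 2. Costly orders try direct compaction before reclaim, unless the
	 *    context may ignore watermarks (that attempt comes later). */
	if (can_direct_reclaim && costly_order && !pfmemalloc_allowed(gfp_mask)) {
		page = direct_compact(INIT_COMPACT_PRIORITY);
		if (page)
			return page;
		/* __GFP_NORETRY (e.g. THP faults) either bails out on deferred
		 * compaction or pins the priority to the light initial level. */
	}

	for (;;) {
		/* 3. ALLOC_NO_WATERMARKS is decided per iteration via
		 *    gfp_pfmemalloc_allowed(), then the freelist is retried. */
		page = try_freelist();
		if (page || !can_direct_reclaim || noretry)
			return page;

		/* 4. Reclaim now runs before compaction... */
		page = direct_reclaim();
		if (page)
			return page;
		page = direct_compact(compact_priority);
		if (page)
			return page;

		/* 5. ...and the retry decision may raise compact_priority. */
		if (!should_retry_reclaim() &&
		    !should_retry_compaction(&compact_priority))
			return NULL;
	}
}

The outline deliberately omits the OOM, cpuset and nodemask handling visible in the diff; it is only meant to make the new reclaim/compaction ordering easier to follow than the flattened hunks.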