diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4 | ||||
-rw-r--r-- | mm/backing-dev.c | 16 | ||||
-rw-r--r-- | mm/filemap.c | 26 | ||||
-rw-r--r-- | mm/filemap_xip.c | 22 | ||||
-rw-r--r-- | mm/hugetlb.c | 24 | ||||
-rw-r--r-- | mm/madvise.c | 6 | ||||
-rw-r--r-- | mm/memory.c | 25 | ||||
-rw-r--r-- | mm/mempolicy.c | 51 | ||||
-rw-r--r-- | mm/mempool.c | 3 | ||||
-rw-r--r-- | mm/mlock.c | 5 | ||||
-rw-r--r-- | mm/mmap.c | 38 | ||||
-rw-r--r-- | mm/mremap.c | 13 | ||||
-rw-r--r-- | mm/nommu.c | 7 | ||||
-rw-r--r-- | mm/page-writeback.c | 10 | ||||
-rw-r--r-- | mm/page_alloc.c | 332 | ||||
-rw-r--r-- | mm/rmap.c | 24 | ||||
-rw-r--r-- | mm/shmem.c | 44 | ||||
-rw-r--r-- | mm/slab.c | 67 | ||||
-rw-r--r-- | mm/slob.c | 538 | ||||
-rw-r--r-- | mm/slub.c | 142 | ||||
-rw-r--r-- | mm/sparse.c | 42 | ||||
-rw-r--r-- | mm/swap_state.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/truncate.c | 42 | ||||
-rw-r--r-- | mm/vmstat.c | 2 |
25 files changed, 1009 insertions, 478 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 62e5d0d0bd5a..086af703da43 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -168,3 +168,7 @@ config NR_QUICK depends on QUICKLIST default "2" if (SUPERH && !SUPERH64) default "1" + +config VIRT_TO_BUS + def_bool y + depends on !ARCH_NO_VIRT_TO_BUS diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e5de3781d3fe..f50a2811f9dc 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -55,22 +55,6 @@ long congestion_wait(int rw, long timeout) } EXPORT_SYMBOL(congestion_wait); -long congestion_wait_interruptible(int rw, long timeout) -{ - long ret; - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[rw]; - - prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE); - if (signal_pending(current)) - ret = -ERESTARTSYS; - else - ret = io_schedule_timeout(timeout); - finish_wait(wqh, &wait); - return ret; -} -EXPORT_SYMBOL(congestion_wait_interruptible); - /** * congestion_end - wake up sleepers on a congested backing_dev_info * @rw: READ or WRITE diff --git a/mm/filemap.c b/mm/filemap.c index edb1b0b5cc8d..100b99c2d504 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -120,6 +120,7 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + BUG_ON(page_mapped(page)); } void remove_from_page_cache(struct page *page) @@ -1218,6 +1219,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = retval ?: desc.error; break; } + if (desc.count > 0) + break; } } out: @@ -1245,26 +1248,6 @@ int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long o return written; } -ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, void *target) -{ - read_descriptor_t desc; - - if (!count) - return 0; - - desc.written = 0; - desc.count = count; - desc.arg.data = target; - desc.error = 0; - - do_generic_file_read(in_file, ppos, &desc, actor); - if (desc.written) - return desc.written; - return desc.error; -} -EXPORT_SYMBOL(generic_file_sendfile); - static ssize_t do_readahead(struct address_space *mapping, struct file *filp, unsigned long index, unsigned long nr) @@ -1786,7 +1769,6 @@ retry: page = __read_cache_page(mapping, index, filler, data); if (IS_ERR(page)) return page; - mark_page_accessed(page); if (PageUptodate(page)) goto out; @@ -1985,7 +1967,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (unlikely(*pos + *count > MAX_NON_LFS && !(file->f_flags & O_LARGEFILE))) { if (*pos >= MAX_NON_LFS) { - send_sig(SIGXFSZ, current, 0); return -EFBIG; } if (*count > MAX_NON_LFS - (unsigned long)*pos) { @@ -2003,7 +1984,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (likely(!isblk)) { if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { if (*count || *pos > inode->i_sb->s_maxbytes) { - send_sig(SIGXFSZ, current, 0); return -EFBIG; } /* zero-length writes at ->s_maxbytes are OK */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index fa360e566d88..65ffc321f0c0 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -159,28 +159,6 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) } EXPORT_SYMBOL_GPL(xip_file_read); -ssize_t -xip_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, void *target) -{ - read_descriptor_t desc; - - if (!count) - return 0; - - desc.written = 0; - desc.count = count; - desc.arg.data = target; - desc.error = 0; - - do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file, - ppos, &desc, actor); - if (desc.written) - return desc.written; - return desc.error; -} -EXPORT_SYMBOL_GPL(xip_file_sendfile); - /* * __xip_unmap is invoked from xip_unmap and * xip_write diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eb7180db3033..acc0fb3cf067 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -66,7 +66,7 @@ static void enqueue_huge_page(struct page *page) static struct page *dequeue_huge_page(struct vm_area_struct *vma, unsigned long address) { - int nid = numa_node_id(); + int nid; struct page *page = NULL; struct zonelist *zonelist = huge_zonelist(vma, address); struct zone **z; @@ -101,13 +101,20 @@ static void free_huge_page(struct page *page) static int alloc_fresh_huge_page(void) { - static int nid = 0; + static int prev_nid; struct page *page; - page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); - nid = next_node(nid, node_online_map); + static DEFINE_SPINLOCK(nid_lock); + int nid; + + spin_lock(&nid_lock); + nid = next_node(prev_nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); + prev_nid = nid; + spin_unlock(&nid_lock); + + page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, + HUGETLB_PAGE_ORDER); if (page) { set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); @@ -326,9 +333,10 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, pte_t entry; entry = pte_mkwrite(pte_mkdirty(*ptep)); - ptep_set_access_flags(vma, address, ptep, entry, 1); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); + if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + } } diff --git a/mm/madvise.c b/mm/madvise.c index 60542d006ec1..93ee375b38e7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -287,9 +287,11 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) struct vm_area_struct * vma, *prev; int unmapped_error = 0; int error = -EINVAL; + int write; size_t len; - if (madvise_need_mmap_write(behavior)) + write = madvise_need_mmap_write(behavior); + if (write) down_write(¤t->mm->mmap_sem); else down_read(¤t->mm->mmap_sem); @@ -354,7 +356,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) vma = find_vma(current->mm, start); } out: - if (madvise_need_mmap_write(behavior)) + if (write) up_write(¤t->mm->mmap_sem); else up_read(¤t->mm->mmap_sem); diff --git a/mm/memory.c b/mm/memory.c index cb94488ab96d..b3d73bb1f680 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -78,11 +78,9 @@ unsigned long num_physpages; * and ZONE_HIGHMEM. */ void * high_memory; -unsigned long vmalloc_earlyreserve; EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); -EXPORT_SYMBOL(vmalloc_earlyreserve); int randomize_va_space __read_mostly = 1; @@ -1055,6 +1053,14 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, do { struct page *page; + /* + * If tsk is ooming, cut off its access to large memory + * allocations. It has a pending SIGKILL, but it can't + * be processed until returning to user space. + */ + if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) + return -ENOMEM; + if (write) foll_flags |= FOLL_WRITE; @@ -1691,9 +1697,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_set_access_flags(vma, address, page_table, entry, 1); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); + if (ptep_set_access_flags(vma, address, page_table, entry,1)) { + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + } ret |= VM_FAULT_WRITE; goto unlock; } @@ -2525,10 +2532,9 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte_t *pte, pmd_t *pmd, int write_access) { pte_t entry; - pte_t old_entry; spinlock_t *ptl; - old_entry = entry = *pte; + entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { @@ -2561,8 +2567,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (!pte_same(old_entry, entry)) { - ptep_set_access_flags(vma, address, pte, entry, write_access); + if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); } else { @@ -2674,7 +2679,7 @@ int make_pages_present(unsigned long addr, unsigned long end) write = (vma->vm_flags & VM_WRITE) != 0; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); if (ret < 0) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d76e8eb342d0..188f8d9c4aed 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -101,8 +101,6 @@ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; -#define PDprintk(fmt...) - /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; @@ -175,7 +173,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) { struct mempolicy *policy; - PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); + pr_debug("setting mode %d nodes[0] %lx\n", + mode, nodes ? nodes_addr(*nodes)[0] : -1); + if (mode == MPOL_DEFAULT) return NULL; policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -379,7 +379,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) int err = 0; struct mempolicy *old = vma->vm_policy; - PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_ops, vma->vm_file, vma->vm_ops ? vma->vm_ops->set_policy : NULL); @@ -776,8 +776,8 @@ long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); + pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode, nmask ? nodes_addr(*nmask)[0] : -1); down_write(&mm->mmap_sem); vma = check_range(mm, start, end, nmask, @@ -1434,7 +1434,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); - PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, + pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, new->policy ? new->policy->policy : 0); } @@ -1459,7 +1459,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) static void sp_delete(struct shared_policy *sp, struct sp_node *n) { - PDprintk("deleting %lx-l%x\n", n->start, n->end); + pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); mpol_free(n->policy); kmem_cache_free(sn_cache, n); @@ -1558,10 +1558,10 @@ int mpol_set_shared_policy(struct shared_policy *info, struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - PDprintk("set_shared_policy %lx sz %lu %d %lx\n", + pr_debug("set_shared_policy %lx sz %lu %d %lx\n", vma->vm_pgoff, sz, npol? npol->policy : -1, - npol ? nodes_addr(npol->v.nodes)[0] : -1); + npol ? nodes_addr(npol->v.nodes)[0] : -1); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -1597,6 +1597,10 @@ void mpol_free_shared_policy(struct shared_policy *p) /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { + nodemask_t interleave_nodes; + unsigned long largest = 0; + int nid, prefer = 0; + policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL, NULL); @@ -1605,10 +1609,31 @@ void __init numa_policy_init(void) sizeof(struct sp_node), 0, SLAB_PANIC, NULL, NULL); - /* Set interleaving policy for system init. This way not all - the data structures allocated at system boot end up in node zero. */ + /* + * Set interleaving policy for system init. Interleaving is only + * enabled across suitably sized nodes (default is >= 16MB), or + * fall back to the largest node if they're all smaller. + */ + nodes_clear(interleave_nodes); + for_each_online_node(nid) { + unsigned long total_pages = node_present_pages(nid); + + /* Preserve the largest node */ + if (largest < total_pages) { + largest = total_pages; + prefer = nid; + } + + /* Interleave this node? */ + if ((total_pages << PAGE_SHIFT) >= (16 << 20)) + node_set(nid, interleave_nodes); + } + + /* All too small, use the largest */ + if (unlikely(nodes_empty(interleave_nodes))) + node_set(prefer, interleave_nodes); - if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) + if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); } diff --git a/mm/mempool.c b/mm/mempool.c index cc1ca86dfc24..3e8f1fed0e1f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -263,6 +263,9 @@ void mempool_free(void *element, mempool_t *pool) { unsigned long flags; + if (unlikely(element == NULL)) + return; + smp_mb(); if (pool->curr_nr < pool->min_nr) { spin_lock_irqsave(&pool->lock, flags); diff --git a/mm/mlock.c b/mm/mlock.c index 4d3fea267e0d..7b2656055d6a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -244,9 +244,12 @@ int user_shm_lock(size_t size, struct user_struct *user) locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + if (lock_limit == RLIM_INFINITY) + allowed = 1; lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); - if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + if (!allowed && + locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) goto out; get_uid(user); user->locked_shm += locked; diff --git a/mm/mmap.c b/mm/mmap.c index 68b9ad2ef1d6..144b4a290f2c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -894,14 +894,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long flags, unsigned long pgoff) { struct mm_struct * mm = current->mm; - struct vm_area_struct * vma, * prev; struct inode *inode; unsigned int vm_flags; - int correct_wcount = 0; int error; - struct rb_node ** rb_link, * rb_parent; int accountable = 1; - unsigned long charged = 0, reqprot = prot; + unsigned long reqprot = prot; /* * Does the application expect PROT_READ to imply PROT_EXEC? @@ -1023,10 +1020,28 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } } - error = security_file_mmap(file, reqprot, prot, flags); + error = security_file_mmap(file, reqprot, prot, flags, addr, 0); if (error) return error; - + + return mmap_region(file, addr, len, flags, vm_flags, pgoff, + accountable); +} +EXPORT_SYMBOL(do_mmap_pgoff); + +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, unsigned long flags, + unsigned int vm_flags, unsigned long pgoff, + int accountable) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + int correct_wcount = 0; + int error; + struct rb_node **rb_link, *rb_parent; + unsigned long charged = 0; + struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; + /* Clear old maps */ error = -ENOMEM; munmap_back: @@ -1175,8 +1190,6 @@ unacct_error: return error; } -EXPORT_SYMBOL(do_mmap_pgoff); - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * @@ -1536,9 +1549,14 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. + * Also guard against wrapping around to address 0. */ - address += 4 + PAGE_SIZE - 1; - address &= PAGE_MASK; + if (address < PAGE_ALIGN(address+4)) + address = PAGE_ALIGN(address+4); + else { + anon_vma_unlock(vma); + return -ENOMEM; + } error = 0; /* Somebody else might have raced and expanded it already */ diff --git a/mm/mremap.c b/mm/mremap.c index 5d4bd4f95b8e..bc7c52efc71b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -291,6 +291,10 @@ unsigned long do_mremap(unsigned long addr, if ((addr <= new_addr) && (addr+old_len) > new_addr) goto out; + ret = security_file_mmap(0, 0, 0, 0, new_addr, 1); + if (ret) + goto out; + ret = do_munmap(mm, new_addr, new_len); if (ret) goto out; @@ -390,8 +394,13 @@ unsigned long do_mremap(unsigned long addr, new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); - ret = new_addr; - if (new_addr & ~PAGE_MASK) + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; + } + + ret = security_file_mmap(0, 0, 0, 0, new_addr, 1); + if (ret) goto out; } ret = move_vma(vma, addr, old_len, new_len, new_addr); diff --git a/mm/nommu.c b/mm/nommu.c index 2b16b00a5b11..8bbbf147a794 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -367,6 +367,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) return find_vma(mm, addr); } +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return -ENOMEM; +} + /* * look up the first VMA exactly that exactly matches addr * - should be called with mm->mmap_sem at least held readlocked @@ -639,7 +644,7 @@ static int validate_mmap_request(struct file *file, } /* allow the security API to have its say */ - ret = security_file_mmap(file, reqprot, prot, flags); + ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); if (ret < 0) return ret; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index eec1481ba44f..ea9da3bed3e9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -476,15 +476,13 @@ static void wb_kupdate(unsigned long arg) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) { - mod_timer(&wb_timer, - jiffies + dirty_writeback_interval); - } else { + if (dirty_writeback_interval) + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); + else del_timer(&wb_timer); - } return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd8e33582d25..f9e4e647d7e8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -126,13 +126,13 @@ static unsigned long __meminitdata dma_reserve; #endif #endif - struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; - int __meminitdata nr_nodemap_entries; - unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; - unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; + static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; + static int __meminitdata nr_nodemap_entries; + static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; + static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; - unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; + static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; + static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ @@ -900,11 +900,13 @@ static struct fail_page_alloc_attr { u32 ignore_gfp_highmem; u32 ignore_gfp_wait; + u32 min_order; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS struct dentry *ignore_gfp_highmem_file; struct dentry *ignore_gfp_wait_file; + struct dentry *min_order_file; #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ @@ -912,6 +914,7 @@ static struct fail_page_alloc_attr { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, .ignore_gfp_highmem = 1, + .min_order = 1, }; static int __init setup_fail_page_alloc(char *str) @@ -922,6 +925,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc); static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { + if (order < fail_page_alloc.min_order) + return 0; if (gfp_mask & __GFP_NOFAIL) return 0; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) @@ -953,12 +958,17 @@ static int __init fail_page_alloc_debugfs(void) fail_page_alloc.ignore_gfp_highmem_file = debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem); + fail_page_alloc.min_order_file = + debugfs_create_u32("min-order", mode, dir, + &fail_page_alloc.min_order); if (!fail_page_alloc.ignore_gfp_wait_file || - !fail_page_alloc.ignore_gfp_highmem_file) { + !fail_page_alloc.ignore_gfp_highmem_file || + !fail_page_alloc.min_order_file) { err = -ENOMEM; debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); + debugfs_remove(fail_page_alloc.min_order_file); cleanup_fault_attr_dentries(&fail_page_alloc.attr); } @@ -1621,8 +1631,8 @@ void show_free_areas(void) * * Add all populated zones of a node to the zonelist. */ -static int __meminit build_zonelists_node(pg_data_t *pgdat, - struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) +static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, + int nr_zones, enum zone_type zone_type) { struct zone *zone; @@ -1641,9 +1651,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat, return nr_zones; } + +/* + * zonelist_order: + * 0 = automatic detection of better ordering. + * 1 = order by ([node] distance, -zonetype) + * 2 = order by (-zonetype, [node] distance) + * + * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create + * the same zonelist. So only NUMA can configure this param. + */ +#define ZONELIST_ORDER_DEFAULT 0 +#define ZONELIST_ORDER_NODE 1 +#define ZONELIST_ORDER_ZONE 2 + +/* zonelist order in the kernel. + * set_zonelist_order() will set this to NODE or ZONE. + */ +static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; +static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; + + #ifdef CONFIG_NUMA +/* The value user specified ....changed by config */ +static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; +/* string for sysctl */ +#define NUMA_ZONELIST_ORDER_LEN 16 +char numa_zonelist_order[16] = "default"; + +/* + * interface for configure zonelist ordering. + * command line option "numa_zonelist_order" + * = "[dD]efault - default, automatic configuration. + * = "[nN]ode - order by node locality, then by zone within node + * = "[zZ]one - order by zone, then by locality within zone + */ + +static int __parse_numa_zonelist_order(char *s) +{ + if (*s == 'd' || *s == 'D') { + user_zonelist_order = ZONELIST_ORDER_DEFAULT; + } else if (*s == 'n' || *s == 'N') { + user_zonelist_order = ZONELIST_ORDER_NODE; + } else if (*s == 'z' || *s == 'Z') { + user_zonelist_order = ZONELIST_ORDER_ZONE; + } else { + printk(KERN_WARNING + "Ignoring invalid numa_zonelist_order value: " + "%s\n", s); + return -EINVAL; + } + return 0; +} + +static __init int setup_numa_zonelist_order(char *s) +{ + if (s) + return __parse_numa_zonelist_order(s); + return 0; +} +early_param("numa_zonelist_order", setup_numa_zonelist_order); + +/* + * sysctl handler for numa_zonelist_order + */ +int numa_zonelist_order_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, + loff_t *ppos) +{ + char saved_string[NUMA_ZONELIST_ORDER_LEN]; + int ret; + + if (write) + strncpy(saved_string, (char*)table->data, + NUMA_ZONELIST_ORDER_LEN); + ret = proc_dostring(table, write, file, buffer, length, ppos); + if (ret) + return ret; + if (write) { + int oldval = user_zonelist_order; + if (__parse_numa_zonelist_order((char*)table->data)) { + /* + * bogus value. restore saved string + */ + strncpy((char*)table->data, saved_string, + NUMA_ZONELIST_ORDER_LEN); + user_zonelist_order = oldval; + } else if (oldval != user_zonelist_order) + build_all_zonelists(); + } + return 0; +} + + #define MAX_NODE_LOAD (num_online_nodes()) -static int __meminitdata node_load[MAX_NUMNODES]; +static int node_load[MAX_NUMNODES]; + /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending @@ -1658,7 +1761,7 @@ static int __meminitdata node_load[MAX_NUMNODES]; * on them otherwise. * It returns -1 if no node is found. */ -static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) +static int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; @@ -1704,13 +1807,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) return best_node; } -static void __meminit build_zonelists(pg_data_t *pgdat) + +/* + * Build zonelists ordered by node and zones within node. + * This results in maximum locality--normal zone overflows into local + * DMA zone, if any--but risks exhausting DMA zone. + */ +static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) { - int j, node, local_node; enum zone_type i; - int prev_node, load; + int j; struct zonelist *zonelist; + + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + for (j = 0; zonelist->zones[j] != NULL; j++) + ; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); + zonelist->zones[j] = NULL; + } +} + +/* + * Build zonelists ordered by zone and nodes within zones. + * This results in conserving DMA zone[s] until all Normal memory is + * exhausted, but results in overflowing to remote node while memory + * may still exist in local DMA zone. + */ +static int node_order[MAX_NUMNODES]; + +static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) +{ + enum zone_type i; + int pos, j, node; + int zone_type; /* needs to be signed */ + struct zone *z; + struct zonelist *zonelist; + + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + pos = 0; + for (zone_type = i; zone_type >= 0; zone_type--) { + for (j = 0; j < nr_nodes; j++) { + node = node_order[j]; + z = &NODE_DATA(node)->node_zones[zone_type]; + if (populated_zone(z)) { + zonelist->zones[pos++] = z; + check_highest_zone(zone_type); + } + } + } + zonelist->zones[pos] = NULL; + } +} + +static int default_zonelist_order(void) +{ + int nid, zone_type; + unsigned long low_kmem_size,total_size; + struct zone *z; + int average_size; + /* + * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. + * If they are really small and used heavily, the system can fall + * into OOM very easily. + * This function detect ZONE_DMA/DMA32 size and confgigures zone order. + */ + /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ + low_kmem_size = 0; + total_size = 0; + for_each_online_node(nid) { + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + z = &NODE_DATA(nid)->node_zones[zone_type]; + if (populated_zone(z)) { + if (zone_type < ZONE_NORMAL) + low_kmem_size += z->present_pages; + total_size += z->present_pages; + } + } + } + if (!low_kmem_size || /* there are no DMA area. */ + low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ + return ZONELIST_ORDER_NODE; + /* + * look into each node's config. + * If there is a node whose DMA/DMA32 memory is very big area on + * local memory, NODE_ORDER may be suitable. + */ + average_size = total_size / (num_online_nodes() + 1); + for_each_online_node(nid) { + low_kmem_size = 0; + total_size = 0; + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + z = &NODE_DATA(nid)->node_zones[zone_type]; + if (populated_zone(z)) { + if (zone_type < ZONE_NORMAL) + low_kmem_size += z->present_pages; + total_size += z->present_pages; + } + } + if (low_kmem_size && + total_size > average_size && /* ignore small node */ + low_kmem_size > total_size * 70/100) + return ZONELIST_ORDER_NODE; + } + return ZONELIST_ORDER_ZONE; +} + +static void set_zonelist_order(void) +{ + if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) + current_zonelist_order = default_zonelist_order(); + else + current_zonelist_order = user_zonelist_order; +} + +static void build_zonelists(pg_data_t *pgdat) +{ + int j, node, load; + enum zone_type i; nodemask_t used_mask; + int local_node, prev_node; + struct zonelist *zonelist; + int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_NR_ZONES; i++) { @@ -1723,6 +1942,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat) load = num_online_nodes(); prev_node = local_node; nodes_clear(used_mask); + + memset(node_load, 0, sizeof(node_load)); + memset(node_order, 0, sizeof(node_order)); + j = 0; + while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { int distance = node_distance(local_node, node); @@ -1738,23 +1962,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat) * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (distance != node_distance(local_node, prev_node)) - node_load[node] += load; + node_load[node] = load; + prev_node = node; load--; - for (i = 0; i < MAX_NR_ZONES; i++) { - zonelist = pgdat->node_zonelists + i; - for (j = 0; zonelist->zones[j] != NULL; j++); + if (order == ZONELIST_ORDER_NODE) + build_zonelists_in_node_order(pgdat, node); + else + node_order[j++] = node; /* remember order */ + } - j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); - zonelist->zones[j] = NULL; - } + if (order == ZONELIST_ORDER_ZONE) { + /* calculate node order -- i.e., DMA last! */ + build_zonelists_in_zone_order(pgdat, j); } } /* Construct the zonelist performance cache - see further mmzone.h */ -static void __meminit build_zonelist_cache(pg_data_t *pgdat) +static void build_zonelist_cache(pg_data_t *pgdat) { int i; @@ -1771,9 +1997,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat) } } + #else /* CONFIG_NUMA */ -static void __meminit build_zonelists(pg_data_t *pgdat) +static void set_zonelist_order(void) +{ + current_zonelist_order = ZONELIST_ORDER_ZONE; +} + +static void build_zonelists(pg_data_t *pgdat) { int node, local_node; enum zone_type i,j; @@ -1809,7 +2041,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat) } /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ -static void __meminit build_zonelist_cache(pg_data_t *pgdat) +static void build_zonelist_cache(pg_data_t *pgdat) { int i; @@ -1820,7 +2052,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ /* return values int ....just for stop_machine_run() */ -static int __meminit __build_all_zonelists(void *dummy) +static int __build_all_zonelists(void *dummy) { int nid; @@ -1831,8 +2063,10 @@ static int __meminit __build_all_zonelists(void *dummy) return 0; } -void __meminit build_all_zonelists(void) +void build_all_zonelists(void) { + set_zonelist_order(); + if (system_state == SYSTEM_BOOTING) { __build_all_zonelists(NULL); cpuset_init_current_mems_allowed(); @@ -1843,8 +2077,13 @@ void __meminit build_all_zonelists(void) /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); - printk("Built %i zonelists. Total pages: %ld\n", - num_online_nodes(), vm_total_pages); + printk("Built %i zonelists in %s order. Total pages: %ld\n", + num_online_nodes(), + zonelist_order_name[current_zonelist_order], + vm_total_pages); +#ifdef CONFIG_NUMA + printk("Policy zone: %s\n", zone_names[policy_zone]); +#endif } /* @@ -1953,8 +2192,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, } } -void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, - unsigned long size) +static void __meminit zone_init_free_lists(struct pglist_data *pgdat, + struct zone *zone, unsigned long size) { int order; for (order = 0; order < MAX_ORDER ; order++) { @@ -1968,7 +2207,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __cpuinit zone_batchsize(struct zone *zone) +static int __devinit zone_batchsize(struct zone *zone) { int batch; @@ -2370,7 +2609,7 @@ void __init push_node_boundaries(unsigned int nid, } /* If necessary, push the node boundary out for reserve hotadd */ -static void __init account_node_boundary(unsigned int nid, +static void __meminit account_node_boundary(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", @@ -2390,7 +2629,7 @@ static void __init account_node_boundary(unsigned int nid, void __init push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn) {} -static void __init account_node_boundary(unsigned int nid, +static void __meminit account_node_boundary(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) {} #endif @@ -2431,7 +2670,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, * Return the number of pages a zone spans in a node, including holes * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() */ -unsigned long __meminit zone_spanned_pages_in_node(int nid, +static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { @@ -2519,7 +2758,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, } /* Return the number of page frames in holes in a zone on a node */ -unsigned long __meminit zone_absent_pages_in_node(int nid, +static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { @@ -2536,14 +2775,14 @@ unsigned long __meminit zone_absent_pages_in_node(int nid, } #else -static inline unsigned long zone_spanned_pages_in_node(int nid, +static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long *zones_size) { return zones_size[zone_type]; } -static inline unsigned long zone_absent_pages_in_node(int nid, +static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long *zholes_size) { @@ -3355,13 +3594,28 @@ void *__init alloc_large_system_hash(const char *tablename, for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) ; table = (void*) __get_free_pages(GFP_ATOMIC, order); + /* + * If bucketsize is not a power-of-two, we may free + * some pages at the end of hash table. + */ + if (table) { + unsigned long alloc_end = (unsigned long)table + + (PAGE_SIZE << order); + unsigned long used = (unsigned long)table + + PAGE_ALIGN(size); + split_page(virt_to_page(table), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } } } while (!table && size > PAGE_SIZE && --log2qty); if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk("%s hash table entries: %d (order: %d, %lu bytes)\n", + printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", tablename, (1U << log2qty), ilog2(size) - PAGE_SHIFT, diff --git a/mm/rmap.c b/mm/rmap.c index 850165d32b7a..61e492597a0b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -53,24 +53,6 @@ struct kmem_cache *anon_vma_cachep; -static inline void validate_anon_vma(struct vm_area_struct *find_vma) -{ -#ifdef CONFIG_DEBUG_VM - struct anon_vma *anon_vma = find_vma->anon_vma; - struct vm_area_struct *vma; - unsigned int mapcount = 0; - int found = 0; - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - mapcount++; - BUG_ON(mapcount > 100000); - if (vma == find_vma) - found = 1; - } - BUG_ON(!found); -#endif -} - /* This must be called under the mmap_sem. */ int anon_vma_prepare(struct vm_area_struct *vma) { @@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) { + if (anon_vma) list_add_tail(&vma->anon_vma_node, &anon_vma->head); - validate_anon_vma(vma); - } } void anon_vma_link(struct vm_area_struct *vma) @@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma) if (anon_vma) { spin_lock(&anon_vma->lock); list_add_tail(&vma->anon_vma_node, &anon_vma->head); - validate_anon_vma(vma); spin_unlock(&anon_vma->lock); } } @@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma) return; spin_lock(&anon_vma->lock); - validate_anon_vma(vma); list_del(&vma->anon_vma_node); /* We must garbage collect the anon_vma if it's empty */ diff --git a/mm/shmem.c b/mm/shmem.c index e537317bec4d..0493e4d0bcaa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -967,6 +967,8 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ *nodelist++ = '\0'; if (nodelist_parse(nodelist, *policy_nodes)) goto out; + if (!nodes_subset(*policy_nodes, node_online_map)) + goto out; } if (!strcmp(value, "default")) { *policy = MPOL_DEFAULT; @@ -1098,9 +1100,9 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, * Normally, filepage is NULL on entry, and either found * uptodate immediately, or allocated and zeroed, or read * in under swappage, which is then assigned to filepage. - * But shmem_prepare_write passes in a locked filepage, - * which may be found not uptodate by other callers too, - * and may need to be copied from the swappage read in. + * But shmem_readpage and shmem_prepare_write pass in a locked + * filepage, which may be found not uptodate by other callers + * too, and may need to be copied from the swappage read in. */ repeat: if (!filepage) @@ -1483,9 +1485,18 @@ static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_symlink_inline_operations; /* - * Normally tmpfs makes no use of shmem_prepare_write, but it - * lets a tmpfs file be used read-write below the loop driver. + * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; + * but providing them allows a tmpfs file to be used for splice, sendfile, and + * below the loop driver, in the generic fashion that many filesystems support. */ +static int shmem_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); + unlock_page(page); + return error; +} + static int shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) { @@ -1709,25 +1720,6 @@ static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count return desc.error; } -static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, void *target) -{ - read_descriptor_t desc; - - if (!count) - return 0; - - desc.written = 0; - desc.count = count; - desc.arg.data = target; - desc.error = 0; - - do_shmem_file_read(in_file, ppos, &desc, actor); - if (desc.written) - return desc.written; - return desc.error; -} - static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -2384,6 +2376,7 @@ static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS + .readpage = shmem_readpage, .prepare_write = shmem_prepare_write, .commit_write = simple_commit_write, #endif @@ -2397,7 +2390,8 @@ static const struct file_operations shmem_file_operations = { .read = shmem_file_read, .write = shmem_file_write, .fsync = simple_sync_file, - .sendfile = shmem_file_sendfile, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, #endif }; diff --git a/mm/slab.c b/mm/slab.c index 2e71a328aa09..a453383333fc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -137,6 +137,7 @@ /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) +#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -547,7 +548,7 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) if (cachep->flags & SLAB_STORE_USER) return (unsigned long long *)(objp + cachep->buffer_size - sizeof(unsigned long long) - - BYTES_PER_WORD); + REDZONE_ALIGN); return (unsigned long long *) (objp + cachep->buffer_size - sizeof(unsigned long long)); } @@ -774,7 +775,6 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, */ BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); #endif - WARN_ON_ONCE(size == 0); while (size > csizep->cs_size) csizep++; @@ -929,7 +929,7 @@ static void next_reap_node(void) * the CPUs getting into lockstep and contending for the global cache chain * lock. */ -static void __devinit start_cpu_timer(int cpu) +static void __cpuinit start_cpu_timer(int cpu) { struct delayed_work *reap_work = &per_cpu(reap_work, cpu); @@ -2179,7 +2179,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ - if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) + if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; @@ -2220,12 +2221,20 @@ kmem_cache_create (const char *name, size_t size, size_t align, } /* - * Redzoning and user store require word alignment. Note this will be - * overridden by architecture or caller mandated alignment if either - * is greater than BYTES_PER_WORD. + * Redzoning and user store require word alignment or possibly larger. + * Note this will be overridden by architecture or caller mandated + * alignment if either is greater than BYTES_PER_WORD. */ - if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) - ralign = __alignof__(unsigned long long); + if (flags & SLAB_STORE_USER) + ralign = BYTES_PER_WORD; + + if (flags & SLAB_RED_ZONE) { + ralign = REDZONE_ALIGN; + /* If redzoning, ensure that the second redzone is suitably + * aligned, by adjusting the object size accordingly. */ + size += REDZONE_ALIGN - 1; + size &= ~(REDZONE_ALIGN - 1); + } /* 2) arch mandated alignment */ if (ralign < ARCH_SLAB_MINALIGN) { @@ -2262,9 +2271,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of - * the real object. + * the real object. But if the second red zone needs to be + * aligned to 64 bits, we must allow that much space. */ - size += BYTES_PER_WORD; + if (flags & SLAB_RED_ZONE) + size += REDZONE_ALIGN; + else + size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size @@ -3539,7 +3552,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (use_alien_caches && cache_free_alien(cachep, objp)) + if (cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { @@ -4144,26 +4157,17 @@ static void print_slabinfo_header(struct seq_file *m) static void *s_start(struct seq_file *m, loff_t *pos) { loff_t n = *pos; - struct list_head *p; mutex_lock(&cache_chain_mutex); if (!n) print_slabinfo_header(m); - p = cache_chain.next; - while (n--) { - p = p->next; - if (p == &cache_chain) - return NULL; - } - return list_entry(p, struct kmem_cache, next); + + return seq_list_start(&cache_chain, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct kmem_cache *cachep = p; - ++*pos; - return cachep->next.next == &cache_chain ? - NULL : list_entry(cachep->next.next, struct kmem_cache, next); + return seq_list_next(p, &cache_chain, pos); } static void s_stop(struct seq_file *m, void *p) @@ -4173,7 +4177,7 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = p; + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); struct slab *slabp; unsigned long active_objs; unsigned long num_objs; @@ -4342,17 +4346,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, static void *leaks_start(struct seq_file *m, loff_t *pos) { - loff_t n = *pos; - struct list_head *p; - mutex_lock(&cache_chain_mutex); - p = cache_chain.next; - while (n--) { - p = p->next; - if (p == &cache_chain) - return NULL; - } - return list_entry(p, struct kmem_cache, next); + return seq_list_start(&cache_chain, *pos); } static inline int add_caller(unsigned long *n, unsigned long v) @@ -4417,7 +4412,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = p; + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); struct slab *slabp; struct kmem_list3 *l3; const char *name; diff --git a/mm/slob.c b/mm/slob.c index 71976c5d40d3..b4899079d8b0 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -3,57 +3,159 @@ * * Matt Mackall <mpm@selenic.com> 12/30/03 * + * NUMA support by Paul Mundt, 2007. + * * How SLOB works: * * The core of SLOB is a traditional K&R style heap allocator, with * support for returning aligned objects. The granularity of this - * allocator is 8 bytes on x86, though it's perhaps possible to reduce - * this to 4 if it's deemed worth the effort. The slob heap is a - * singly-linked list of pages from __get_free_page, grown on demand - * and allocation from the heap is currently first-fit. + * allocator is as little as 2 bytes, however typically most architectures + * will require 4 bytes on 32-bit and 8 bytes on 64-bit. + * + * The slob heap is a linked list of pages from alloc_pages(), and + * within each page, there is a singly-linked list of free blocks (slob_t). + * The heap is grown on demand and allocation from the heap is currently + * first-fit. * * Above this is an implementation of kmalloc/kfree. Blocks returned - * from kmalloc are 8-byte aligned and prepended with a 8-byte header. + * from kmalloc are prepended with a 4-byte header with the kmalloc size. * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls - * __get_free_pages directly so that it can return page-aligned blocks - * and keeps a linked list of such pages and their orders. These - * objects are detected in kfree() by their page alignment. + * alloc_pages() directly, allocating compound pages so the page order + * does not have to be separately tracked, and also stores the exact + * allocation size in page->private so that it can be used to accurately + * provide ksize(). These objects are detected in kfree() because slob_page() + * is false for them. * * SLAB is emulated on top of SLOB by simply calling constructors and - * destructors for every SLAB allocation. Objects are returned with - * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is - * set, in which case the low-level allocator will fragment blocks to - * create the proper alignment. Again, objects of page-size or greater - * are allocated by calling __get_free_pages. As SLAB objects know - * their size, no separate size bookkeeping is necessary and there is - * essentially no allocation space overhead. + * destructors for every SLAB allocation. Objects are returned with the + * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which + * case the low-level allocator will fragment blocks to create the proper + * alignment. Again, objects of page-size or greater are allocated by + * calling alloc_pages(). As SLAB objects know their size, no separate + * size bookkeeping is necessary and there is essentially no allocation + * space overhead, and compound pages aren't needed for multi-page + * allocations. + * + * NUMA support in SLOB is fairly simplistic, pushing most of the real + * logic down to the page allocator, and simply doing the node accounting + * on the upper levels. In the event that a node id is explicitly + * provided, alloc_pages_node() with the specified node id is used + * instead. The common case (or when the node id isn't explicitly provided) + * will default to the current node, as per numa_node_id(). + * + * Node aware pages are still inserted in to the global freelist, and + * these are scanned for by matching against the node id encoded in the + * page flags. As a result, block allocations that can be satisfied from + * the freelist will only be done so on pages residing on the same node, + * in order to prevent random node placement. */ +#include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/timer.h> #include <linux/rcupdate.h> +#include <linux/list.h> +#include <asm/atomic.h> + +/* + * slob_block has a field 'units', which indicates size of block if +ve, + * or offset of next block if -ve (in SLOB_UNITs). + * + * Free blocks of size 1 unit simply contain the offset of the next block. + * Those with larger size contain their size in the first SLOB_UNIT of + * memory, and the offset of the next free block in the second SLOB_UNIT. + */ +#if PAGE_SIZE <= (32767 * 2) +typedef s16 slobidx_t; +#else +typedef s32 slobidx_t; +#endif struct slob_block { - int units; - struct slob_block *next; + slobidx_t units; }; typedef struct slob_block slob_t; +/* + * We use struct page fields to manage some slob allocation aspects, + * however to avoid the horrible mess in include/linux/mm_types.h, we'll + * just define our own struct page type variant here. + */ +struct slob_page { + union { + struct { + unsigned long flags; /* mandatory */ + atomic_t _count; /* mandatory */ + slobidx_t units; /* free units left in page */ + unsigned long pad[2]; + slob_t *free; /* first free slob_t in page */ + struct list_head list; /* linked list of free pages */ + }; + struct page page; + }; +}; +static inline void struct_slob_page_wrong_size(void) +{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } + +/* + * free_slob_page: call before a slob_page is returned to the page allocator. + */ +static inline void free_slob_page(struct slob_page *sp) +{ + reset_page_mapcount(&sp->page); + sp->page.mapping = NULL; +} + +/* + * All (partially) free slob pages go on this list. + */ +static LIST_HEAD(free_slob_pages); + +/* + * slob_page: True for all slob pages (false for bigblock pages) + */ +static inline int slob_page(struct slob_page *sp) +{ + return test_bit(PG_active, &sp->flags); +} + +static inline void set_slob_page(struct slob_page *sp) +{ + __set_bit(PG_active, &sp->flags); +} + +static inline void clear_slob_page(struct slob_page *sp) +{ + __clear_bit(PG_active, &sp->flags); +} + +/* + * slob_page_free: true for pages on free_slob_pages list. + */ +static inline int slob_page_free(struct slob_page *sp) +{ + return test_bit(PG_private, &sp->flags); +} + +static inline void set_slob_page_free(struct slob_page *sp) +{ + list_add(&sp->list, &free_slob_pages); + __set_bit(PG_private, &sp->flags); +} + +static inline void clear_slob_page_free(struct slob_page *sp) +{ + list_del(&sp->list); + __clear_bit(PG_private, &sp->flags); +} + #define SLOB_UNIT sizeof(slob_t) #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) #define SLOB_ALIGN L1_CACHE_BYTES -struct bigblock { - int order; - void *pages; - struct bigblock *next; -}; -typedef struct bigblock bigblock_t; - /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free @@ -64,133 +166,285 @@ struct slob_rcu { int size; }; -static slob_t arena = { .next = &arena, .units = 1 }; -static slob_t *slobfree = &arena; -static bigblock_t *bigblocks; +/* + * slob_lock protects all slob allocator structures. + */ static DEFINE_SPINLOCK(slob_lock); -static DEFINE_SPINLOCK(block_lock); -static void slob_free(void *b, int size); -static void slob_timer_cbk(void); +/* + * Encode the given size and next info into a free slob block s. + */ +static void set_slob(slob_t *s, slobidx_t size, slob_t *next) +{ + slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); + slobidx_t offset = next - base; + if (size > 1) { + s[0].units = size; + s[1].units = offset; + } else + s[0].units = -offset; +} -static void *slob_alloc(size_t size, gfp_t gfp, int align) +/* + * Return the size of a slob block. + */ +static slobidx_t slob_units(slob_t *s) +{ + if (s->units > 0) + return s->units; + return 1; +} + +/* + * Return the next free slob block pointer after this one. + */ +static slob_t *slob_next(slob_t *s) +{ + slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); + slobidx_t next; + + if (s[0].units < 0) + next = -s[0].units; + else + next = s[1].units; + return base+next; +} + +/* + * Returns true if s is the last free block in its page. + */ +static int slob_last(slob_t *s) +{ + return !((unsigned long)slob_next(s) & ~PAGE_MASK); +} + +static void *slob_new_page(gfp_t gfp, int order, int node) +{ + void *page; + +#ifdef CONFIG_NUMA + if (node != -1) + page = alloc_pages_node(node, gfp, order); + else +#endif + page = alloc_pages(gfp, order); + + if (!page) + return NULL; + + return page_address(page); +} + +/* + * Allocate a slob block within a given slob_page sp. + */ +static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) { slob_t *prev, *cur, *aligned = 0; int delta = 0, units = SLOB_UNITS(size); - unsigned long flags; - spin_lock_irqsave(&slob_lock, flags); - prev = slobfree; - for (cur = prev->next; ; prev = cur, cur = cur->next) { + for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { + slobidx_t avail = slob_units(cur); + if (align) { aligned = (slob_t *)ALIGN((unsigned long)cur, align); delta = aligned - cur; } - if (cur->units >= units + delta) { /* room enough? */ + if (avail >= units + delta) { /* room enough? */ + slob_t *next; + if (delta) { /* need to fragment head to align? */ - aligned->units = cur->units - delta; - aligned->next = cur->next; - cur->next = aligned; - cur->units = delta; + next = slob_next(cur); + set_slob(aligned, avail - delta, next); + set_slob(cur, delta, aligned); prev = cur; cur = aligned; + avail = slob_units(cur); } - if (cur->units == units) /* exact fit? */ - prev->next = cur->next; /* unlink */ - else { /* fragment */ - prev->next = cur + units; - prev->next->units = cur->units - units; - prev->next->next = cur->next; - cur->units = units; + next = slob_next(cur); + if (avail == units) { /* exact fit? unlink. */ + if (prev) + set_slob(prev, slob_units(prev), next); + else + sp->free = next; + } else { /* fragment */ + if (prev) + set_slob(prev, slob_units(prev), cur + units); + else + sp->free = cur + units; + set_slob(cur + units, avail - units, next); } - slobfree = prev; - spin_unlock_irqrestore(&slob_lock, flags); + sp->units -= units; + if (!sp->units) + clear_slob_page_free(sp); return cur; } - if (cur == slobfree) { - spin_unlock_irqrestore(&slob_lock, flags); - - if (size == PAGE_SIZE) /* trying to shrink arena? */ - return 0; + if (slob_last(cur)) + return NULL; + } +} - cur = (slob_t *)__get_free_page(gfp); - if (!cur) - return 0; +/* + * slob_alloc: entry point into the slob allocator. + */ +static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) +{ + struct slob_page *sp; + slob_t *b = NULL; + unsigned long flags; - slob_free(cur, PAGE_SIZE); - spin_lock_irqsave(&slob_lock, flags); - cur = slobfree; + spin_lock_irqsave(&slob_lock, flags); + /* Iterate through each partially free page, try to find room */ + list_for_each_entry(sp, &free_slob_pages, list) { +#ifdef CONFIG_NUMA + /* + * If there's a node specification, search for a partial + * page with a matching node id in the freelist. + */ + if (node != -1 && page_to_nid(&sp->page) != node) + continue; +#endif + + if (sp->units >= SLOB_UNITS(size)) { + b = slob_page_alloc(sp, size, align); + if (b) + break; } } + spin_unlock_irqrestore(&slob_lock, flags); + + /* Not enough space: must allocate a new page */ + if (!b) { + b = slob_new_page(gfp, 0, node); + if (!b) + return 0; + sp = (struct slob_page *)virt_to_page(b); + set_slob_page(sp); + + spin_lock_irqsave(&slob_lock, flags); + sp->units = SLOB_UNITS(PAGE_SIZE); + sp->free = b; + INIT_LIST_HEAD(&sp->list); + set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); + set_slob_page_free(sp); + b = slob_page_alloc(sp, size, align); + BUG_ON(!b); + spin_unlock_irqrestore(&slob_lock, flags); + } + return b; } +/* + * slob_free: entry point into the slob allocator. + */ static void slob_free(void *block, int size) { - slob_t *cur, *b = (slob_t *)block; + struct slob_page *sp; + slob_t *prev, *next, *b = (slob_t *)block; + slobidx_t units; unsigned long flags; if (!block) return; + BUG_ON(!size); - if (size) - b->units = SLOB_UNITS(size); + sp = (struct slob_page *)virt_to_page(block); + units = SLOB_UNITS(size); - /* Find reinsertion point */ spin_lock_irqsave(&slob_lock, flags); - for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) - if (cur >= cur->next && (b > cur || b < cur->next)) - break; - if (b + b->units == cur->next) { - b->units += cur->next->units; - b->next = cur->next->next; - } else - b->next = cur->next; + if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) { + /* Go directly to page allocator. Do not pass slob allocator */ + if (slob_page_free(sp)) + clear_slob_page_free(sp); + clear_slob_page(sp); + free_slob_page(sp); + free_page((unsigned long)b); + goto out; + } - if (cur + cur->units == b) { - cur->units += b->units; - cur->next = b->next; - } else - cur->next = b; + if (!slob_page_free(sp)) { + /* This slob page is about to become partially free. Easy! */ + sp->units = units; + sp->free = b; + set_slob(b, units, + (void *)((unsigned long)(b + + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); + set_slob_page_free(sp); + goto out; + } - slobfree = cur; + /* + * Otherwise the page is already partially free, so find reinsertion + * point. + */ + sp->units += units; + if (b < sp->free) { + set_slob(b, units, sp->free); + sp->free = b; + } else { + prev = sp->free; + next = slob_next(prev); + while (b > next) { + prev = next; + next = slob_next(prev); + } + + if (!slob_last(prev) && b + units == next) { + units += slob_units(next); + set_slob(b, units, slob_next(next)); + } else + set_slob(b, units, next); + + if (prev + slob_units(prev) == b) { + units = slob_units(b) + slob_units(prev); + set_slob(prev, units, slob_next(b)); + } else + set_slob(prev, slob_units(prev), b); + } +out: spin_unlock_irqrestore(&slob_lock, flags); } -void *__kmalloc(size_t size, gfp_t gfp) -{ - slob_t *m; - bigblock_t *bb; - unsigned long flags; +/* + * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. + */ - if (size < PAGE_SIZE - SLOB_UNIT) { - m = slob_alloc(size + SLOB_UNIT, gfp, 0); - return m ? (void *)(m + 1) : 0; - } +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) +#endif - bb = slob_alloc(sizeof(bigblock_t), gfp, 0); - if (!bb) - return 0; +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long) +#endif - bb->order = get_order(size); - bb->pages = (void *)__get_free_pages(gfp, bb->order); +void *__kmalloc_node(size_t size, gfp_t gfp, int node) +{ + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + + if (size < PAGE_SIZE - align) { + unsigned int *m; + m = slob_alloc(size + align, gfp, align, node); + if (m) + *m = size; + return (void *)m + align; + } else { + void *ret; - if (bb->pages) { - spin_lock_irqsave(&block_lock, flags); - bb->next = bigblocks; - bigblocks = bb; - spin_unlock_irqrestore(&block_lock, flags); - return bb->pages; + ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + if (ret) { + struct page *page; + page = virt_to_page(ret); + page->private = size; + } + return ret; } - - slob_free(bb, sizeof(bigblock_t)); - return 0; } -EXPORT_SYMBOL(__kmalloc); +EXPORT_SYMBOL(__kmalloc_node); /** * krealloc - reallocate memory. The contents will remain unchanged. @@ -227,52 +481,34 @@ EXPORT_SYMBOL(krealloc); void kfree(const void *block) { - bigblock_t *bb, **last = &bigblocks; - unsigned long flags; + struct slob_page *sp; if (!block) return; - if (!((unsigned long)block & (PAGE_SIZE-1))) { - /* might be on the big block list */ - spin_lock_irqsave(&block_lock, flags); - for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { - if (bb->pages == block) { - *last = bb->next; - spin_unlock_irqrestore(&block_lock, flags); - free_pages((unsigned long)block, bb->order); - slob_free(bb, sizeof(bigblock_t)); - return; - } - } - spin_unlock_irqrestore(&block_lock, flags); - } - - slob_free((slob_t *)block - 1, 0); - return; + sp = (struct slob_page *)virt_to_page(block); + if (slob_page(sp)) { + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int *m = (unsigned int *)(block - align); + slob_free(m, *m + align); + } else + put_page(&sp->page); } - EXPORT_SYMBOL(kfree); +/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ size_t ksize(const void *block) { - bigblock_t *bb; - unsigned long flags; + struct slob_page *sp; if (!block) return 0; - if (!((unsigned long)block & (PAGE_SIZE-1))) { - spin_lock_irqsave(&block_lock, flags); - for (bb = bigblocks; bb; bb = bb->next) - if (bb->pages == block) { - spin_unlock_irqrestore(&slob_lock, flags); - return PAGE_SIZE << bb->order; - } - spin_unlock_irqrestore(&block_lock, flags); - } - - return ((slob_t *)block - 1)->units * SLOB_UNIT; + sp = (struct slob_page *)virt_to_page(block); + if (slob_page(sp)) + return ((slob_t *)block - 1)->units + SLOB_UNIT; + else + return sp->page.private; } struct kmem_cache { @@ -289,7 +525,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, { struct kmem_cache *c; - c = slob_alloc(sizeof(struct kmem_cache), flags, 0); + c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1); if (c) { c->name = name; @@ -302,6 +538,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, c->ctor = ctor; /* ignore alignment unless it's forced */ c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; + if (c->align < ARCH_SLAB_MINALIGN) + c->align = ARCH_SLAB_MINALIGN; if (c->align < align) c->align = align; } else if (flags & SLAB_PANIC) @@ -317,21 +555,21 @@ void kmem_cache_destroy(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_destroy); -void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) +void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; if (c->size < PAGE_SIZE) - b = slob_alloc(c->size, flags, c->align); + b = slob_alloc(c->size, flags, c->align, node); else - b = (void *)__get_free_pages(flags, get_order(c->size)); + b = slob_new_page(flags, get_order(c->size), node); if (c->ctor) c->ctor(b, c, 0); return b; } -EXPORT_SYMBOL(kmem_cache_alloc); +EXPORT_SYMBOL(kmem_cache_alloc_node); void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) { @@ -385,9 +623,6 @@ const char *kmem_cache_name(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_name); -static struct timer_list slob_timer = TIMER_INITIALIZER( - (void (*)(unsigned long))slob_timer_cbk, 0, 0); - int kmem_cache_shrink(struct kmem_cache *d) { return 0; @@ -399,17 +634,14 @@ int kmem_ptr_validate(struct kmem_cache *a, const void *b) return 0; } -void __init kmem_cache_init(void) +static unsigned int slob_ready __read_mostly; + +int slab_is_available(void) { - slob_timer_cbk(); + return slob_ready; } -static void slob_timer_cbk(void) +void __init kmem_cache_init(void) { - void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); - - if (p) - free_page((unsigned long)p); - - mod_timer(&slob_timer, jiffies + HZ); + slob_ready = 1; } diff --git a/mm/slub.c b/mm/slub.c index 51663a3c3c24..6aea48942c29 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -323,7 +323,11 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) /* * Debug settings: */ +#ifdef CONFIG_SLUB_DEBUG_ON +static int slub_debug = DEBUG_DEFAULT_FLAGS; +#else static int slub_debug; +#endif static char *slub_debug_slabs; @@ -888,38 +892,57 @@ fail: static int __init setup_slub_debug(char *str) { - if (!str || *str != '=') - slub_debug = DEBUG_DEFAULT_FLAGS; - else { - str++; - if (*str == 0 || *str == ',') - slub_debug = DEBUG_DEFAULT_FLAGS; - else - for( ;*str && *str != ','; str++) - switch (*str) { - case 'f' : case 'F' : - slub_debug |= SLAB_DEBUG_FREE; - break; - case 'z' : case 'Z' : - slub_debug |= SLAB_RED_ZONE; - break; - case 'p' : case 'P' : - slub_debug |= SLAB_POISON; - break; - case 'u' : case 'U' : - slub_debug |= SLAB_STORE_USER; - break; - case 't' : case 'T' : - slub_debug |= SLAB_TRACE; - break; - default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n",*str); - } + slub_debug = DEBUG_DEFAULT_FLAGS; + if (*str++ != '=' || !*str) + /* + * No options specified. Switch on full debugging. + */ + goto out; + + if (*str == ',') + /* + * No options but restriction on slabs. This means full + * debugging for slabs matching a pattern. + */ + goto check_slabs; + + slub_debug = 0; + if (*str == '-') + /* + * Switch off all debugging measures. + */ + goto out; + + /* + * Determine which debug features should be switched on + */ + for ( ;*str && *str != ','; str++) { + switch (tolower(*str)) { + case 'f': + slub_debug |= SLAB_DEBUG_FREE; + break; + case 'z': + slub_debug |= SLAB_RED_ZONE; + break; + case 'p': + slub_debug |= SLAB_POISON; + break; + case 'u': + slub_debug |= SLAB_STORE_USER; + break; + case 't': + slub_debug |= SLAB_TRACE; + break; + default: + printk(KERN_ERR "slub_debug option '%c' " + "unknown. skipped\n",*str); + } } +check_slabs: if (*str == ',') slub_debug_slabs = str + 1; +out: return 1; } @@ -1798,8 +1821,6 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); - /* new_slab() disables interupts */ - local_irq_enable(); BUG_ON(!page); n = page->freelist; @@ -1811,6 +1832,12 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag init_kmem_cache_node(n); atomic_long_inc(&n->nr_slabs); add_partial(n, page); + + /* + * new_slab() disables interupts. If we do not reenable interrupts here + * then bootup would continue with interrupts disabled. + */ + local_irq_enable(); return n; } @@ -2016,7 +2043,6 @@ error: s->offset, flags); return 0; } -EXPORT_SYMBOL(kmem_cache_open); /* * Check if a given pointer is valid @@ -2241,7 +2267,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (s) return slab_alloc(s, flags, -1, __builtin_return_address(0)); - return NULL; + return ZERO_SIZE_PTR; } EXPORT_SYMBOL(__kmalloc); @@ -2252,16 +2278,20 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (s) return slab_alloc(s, flags, node, __builtin_return_address(0)); - return NULL; + return ZERO_SIZE_PTR; } EXPORT_SYMBOL(__kmalloc_node); #endif size_t ksize(const void *object) { - struct page *page = get_object_page(object); + struct page *page; struct kmem_cache *s; + if (object == ZERO_SIZE_PTR) + return 0; + + page = get_object_page(object); BUG_ON(!page); s = page->slab; BUG_ON(!s); @@ -2293,7 +2323,13 @@ void kfree(const void *x) struct kmem_cache *s; struct page *page; - if (!x) + /* + * This has to be an unsigned comparison. According to Linus + * some gcc version treat a pointer as a signed entity. Then + * this comparison would be true for all "negative" pointers + * (which would cover the whole upper half of the address space). + */ + if ((unsigned long)x <= (unsigned long)ZERO_SIZE_PTR) return; page = virt_to_head_page(x); @@ -2398,12 +2434,12 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) void *ret; size_t ks; - if (unlikely(!p)) + if (unlikely(!p || p == ZERO_SIZE_PTR)) return kmalloc(new_size, flags); if (unlikely(!new_size)) { kfree(p); - return NULL; + return ZERO_SIZE_PTR; } ks = ksize(p); @@ -2426,6 +2462,7 @@ EXPORT_SYMBOL(krealloc); void __init kmem_cache_init(void) { int i; + int caches = 0; #ifdef CONFIG_NUMA /* @@ -2436,20 +2473,29 @@ void __init kmem_cache_init(void) create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", sizeof(struct kmem_cache_node), GFP_KERNEL); kmalloc_caches[0].refcount = -1; + caches++; #endif /* Able to allocate the per node structures */ slab_state = PARTIAL; /* Caches that are not of the two-to-the-power-of size */ - create_kmalloc_cache(&kmalloc_caches[1], + if (KMALLOC_MIN_SIZE <= 64) { + create_kmalloc_cache(&kmalloc_caches[1], "kmalloc-96", 96, GFP_KERNEL); - create_kmalloc_cache(&kmalloc_caches[2], + caches++; + } + if (KMALLOC_MIN_SIZE <= 128) { + create_kmalloc_cache(&kmalloc_caches[2], "kmalloc-192", 192, GFP_KERNEL); + caches++; + } - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); + caches++; + } slab_state = UP; @@ -2466,8 +2512,8 @@ void __init kmem_cache_init(void) nr_cpu_ids * sizeof(struct page *); printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," - " Processors=%d, Nodes=%d\n", - KMALLOC_SHIFT_HIGH, cache_line_size(), + " CPUs=%d, Nodes=%d\n", + caches, cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); } @@ -2652,7 +2698,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) struct kmem_cache *s = get_slab(size, gfpflags); if (!s) - return NULL; + return ZERO_SIZE_PTR; return slab_alloc(s, gfpflags, -1, caller); } @@ -2663,7 +2709,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, struct kmem_cache *s = get_slab(size, gfpflags); if (!s) - return NULL; + return ZERO_SIZE_PTR; return slab_alloc(s, gfpflags, node, caller); } @@ -2857,7 +2903,7 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max) order = get_order(sizeof(struct location) * max); - l = (void *)__get_free_pages(GFP_KERNEL, order); + l = (void *)__get_free_pages(GFP_ATOMIC, order); if (!l) return 0; @@ -3022,13 +3068,15 @@ static int list_locations(struct kmem_cache *s, char *buf, n += sprintf(buf + n, " pid=%ld", l->min_pid); - if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) { + if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && + n < PAGE_SIZE - 60) { n += sprintf(buf + n, " cpus="); n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, l->cpus); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) { + if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + n < PAGE_SIZE - 60) { n += sprintf(buf + n, " nodes="); n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, l->nodes); diff --git a/mm/sparse.c b/mm/sparse.c index 545e4d3afcdf..e03b39f3540f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -240,6 +240,27 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) return NULL; } +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. + */ +void __init sparse_init(void) +{ + unsigned long pnum; + struct page *map; + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + if (!valid_section_nr(pnum)) + continue; + + map = sparse_early_mem_map_alloc(pnum); + if (!map) + continue; + sparse_init_one_section(__nr_to_section(pnum), pnum, map); + } +} + +#ifdef CONFIG_MEMORY_HOTPLUG static struct page *__kmalloc_section_memmap(unsigned long nr_pages) { struct page *page, *ret; @@ -280,27 +301,6 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) } /* - * Allocate the accumulated non-linear sections, allocate a mem_map - * for each and record the physical to section mapping. - */ -void __init sparse_init(void) -{ - unsigned long pnum; - struct page *map; - - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - if (!valid_section_nr(pnum)) - continue; - - map = sparse_early_mem_map_alloc(pnum); - if (!map) - continue; - sparse_init_one_section(__nr_to_section(pnum), pnum, map); - } -} - -#ifdef CONFIG_MEMORY_HOTPLUG -/* * returns the number of sections whose mem_maps were properly * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. diff --git a/mm/swap_state.c b/mm/swap_state.c index 5f7cf2a4cb55..925d5c50f18d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,7 +21,7 @@ /* * swapper_space is a fiction, retained to simplify the path through - * vmscan's shrink_list, to make sync_page look nicer, and to allow + * vmscan's shrink_page_list, to make sync_page look nicer, and to allow * future use of radix_tree tags in the swap cache. */ static const struct address_space_operations swap_aops = { diff --git a/mm/swapfile.c b/mm/swapfile.c index acc172cbe3aa..7ff0a81c7b01 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -885,7 +885,7 @@ static int try_to_unuse(unsigned int type) /* * So we could skip searching mms once swap count went * to 1, we did not mark any present ptes as dirty: must - * mark page dirty so shrink_list will preserve it. + * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); unlock_page(page); diff --git a/mm/truncate.c b/mm/truncate.c index 4fbe1a2da5fb..7c994f2d6145 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -253,21 +253,8 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) +unsigned long __invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end, bool be_atomic) { struct pagevec pvec; pgoff_t next = start; @@ -308,17 +295,38 @@ unlock: break; } pagevec_release(&pvec); + if (likely(!be_atomic)) + cond_resched(); } return ret; } + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + return __invalidate_mapping_pages(mapping, start, end, false); +} EXPORT_SYMBOL(invalidate_mapping_pages); /* * This is like invalidate_complete_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave pages behind because - * shrink_list() has a temp ref on them, or because they're transiently sitting - * in the lru_cache_add() pagevecs. + * shrink_page_list() has a temp ref on them, or because they're transiently + * sitting in the lru_cache_add() pagevecs. */ static int invalidate_complete_page2(struct address_space *mapping, struct page *page) diff --git a/mm/vmstat.c b/mm/vmstat.c index 38254297a494..eceaf496210f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -477,8 +477,8 @@ const struct seq_operations fragmentation_op = { static const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_free_pages", - "nr_active", "nr_inactive", + "nr_active", "nr_anon_pages", "nr_mapped", "nr_file_pages", |