diff options
161 files changed, 2390 insertions, 1724 deletions
@@ -133,6 +133,7 @@ James Ketrenos <jketreno@io.(none)> Jan Glauber <jan.glauber@gmail.com> <jang@de.ibm.com> Jan Glauber <jan.glauber@gmail.com> <jang@linux.vnet.ibm.com> Jan Glauber <jan.glauber@gmail.com> <jglauber@cavium.com> +Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@linux.intel.com> Jason Gunthorpe <jgg@ziepe.ca> <jgg@mellanox.com> Jason Gunthorpe <jgg@ziepe.ca> <jgg@nvidia.com> Jason Gunthorpe <jgg@ziepe.ca> <jgunthorpe@obsidianresearch.com> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0f1fb7e86237..d246ad46d845 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1343,6 +1343,7 @@ current integrity status. failslab= + fail_usercopy= fail_page_alloc= fail_make_request=[KNL] General fault injection mechanism. diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index 640934b6f7b4..a137a0e6d068 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -475,13 +475,15 @@ or iterations will move the index to the first index in the range. Each entry will only be returned once, no matter how many indices it occupies. -Using xas_next() or xas_prev() with a multi-index xa_state -is not supported. Using either of these functions on a multi-index entry -will reveal sibling entries; these should be skipped over by the caller. - -Storing ``NULL`` into any index of a multi-index entry will set the entry -at every index to ``NULL`` and dissolve the tie. Splitting a multi-index -entry into entries occupying smaller ranges is not yet supported. +Using xas_next() or xas_prev() with a multi-index xa_state is not +supported. Using either of these functions on a multi-index entry will +reveal sibling entries; these should be skipped over by the caller. + +Storing ``NULL`` into any index of a multi-index entry will set the +entry at every index to ``NULL`` and dissolve the tie. A multi-index +entry can be split into entries occupying smaller ranges by calling +xas_split_alloc() without the xa_lock held, followed by taking the lock +and calling xas_split(). Functions and structures ======================== diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index f850ad018b70..31ecfe44e5b4 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -16,6 +16,10 @@ Available fault injection capabilities injects page allocation failures. (alloc_pages(), get_free_pages(), ...) +- fail_usercopy + + injects failures in user memory access functions. (copy_from_user(), get_user(), ...) + - fail_futex injects futex deadlock and uaddr fault errors. @@ -177,6 +181,7 @@ use the boot option:: failslab= fail_page_alloc= + fail_usercopy= fail_make_request= fail_futex= mmc_core.fail_request=<interval>,<probability>,<space>,<times> @@ -222,7 +227,7 @@ How to add new fault injection capability - debugfs entries - failslab, fail_page_alloc, and fail_make_request use this way. + failslab, fail_page_alloc, fail_usercopy, and fail_make_request use this way. Helper functions: fault_create_debugfs_attr(name, parent, attr); diff --git a/MAINTAINERS b/MAINTAINERS index 0f59b0412953..2c31900a6f11 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9715,7 +9715,7 @@ F: security/keys/encrypted-keys/ KEYS-TRUSTED M: James Bottomley <jejb@linux.ibm.com> -M: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> +M: Jarkko Sakkinen <jarkko@kernel.org> M: Mimi Zohar <zohar@linux.ibm.com> L: linux-integrity@vger.kernel.org L: keyrings@vger.kernel.org @@ -9727,7 +9727,7 @@ F: security/keys/trusted-keys/ KEYS/KEYRINGS M: David Howells <dhowells@redhat.com> -M: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> +M: Jarkko Sakkinen <jarkko@kernel.org> L: keyrings@vger.kernel.org S: Maintained F: Documentation/security/keys/core.rst @@ -17717,7 +17717,7 @@ F: drivers/platform/x86/toshiba-wmi.c TPM DEVICE DRIVER M: Peter Huewe <peterhuewe@gmx.de> -M: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> +M: Jarkko Sakkinen <jarkko@kernel.org> R: Jason Gunthorpe <jgg@ziepe.ca> L: linux-integrity@vger.kernel.org S: Maintained diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d8686bf3ae2f..ef12e097f318 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -537,7 +537,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); return 0; } @@ -547,7 +547,7 @@ memmap_init (unsigned long size, int nid, unsigned long zone, { if (!vmem_map) { memmap_init_zone(size, nid, zone, start_pfn, - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; struct memmap_init_callback_data args; diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 495fc0ccb453..f6d2c4449aeb 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -615,7 +615,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) VM_BUG_ON(pfn >> (64 - PAGE_SHIFT)); VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK); - return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot)); + return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE); } static inline unsigned long pte_pfn(pte_t pte) @@ -651,11 +651,6 @@ static inline pte_t pte_mkexec(pte_t pte) return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC)); } -static inline pte_t pte_mkpte(pte_t pte) -{ - return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); -} - static inline pte_t pte_mkwrite(pte_t pte) { /* @@ -819,6 +814,14 @@ static inline int pte_none(pte_t pte) static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, int percpu) { + + VM_WARN_ON(!(pte_raw(pte) & cpu_to_be64(_PAGE_PTE))); + /* + * Keep the _PAGE_PTE added till we are sure we handle _PAGE_PTE + * in all the callers. + */ + pte = __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE)); + if (radix_enabled()) return radix__set_pte_at(mm, addr, ptep, pte, percpu); return hash__set_pte_at(mm, addr, ptep, pte, percpu); @@ -866,6 +869,13 @@ static inline bool pte_ci(pte_t pte) static inline void pmd_clear(pmd_t *pmdp) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) { + /* + * Don't use this if we can possibly have a hash page table + * entry mapping this. + */ + WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE)); + } *pmdp = __pmd(0); } @@ -914,6 +924,13 @@ static inline int pmd_bad(pmd_t pmd) static inline void pud_clear(pud_t *pudp) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) { + /* + * Don't use this if we can possibly have a hash page table + * entry mapping this. + */ + WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE)); + } *pudp = __pud(0); } diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 4b7c3472eab1..6277e7596ae5 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -140,11 +140,6 @@ static inline pte_t pte_mkold(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } -static inline pte_t pte_mkpte(pte_t pte) -{ - return pte; -} - static inline pte_t pte_mkspecial(pte_t pte) { return __pte(pte_val(pte) | _PAGE_SPECIAL); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 9c0547d77af3..ab57b07ef39a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -184,9 +184,6 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - /* Add the pte bit when trying to set a pte */ - pte = pte_mkpte(pte); - /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this * is called. @@ -275,8 +272,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_ */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - pte = pte_mkpte(pte); - pte = set_pte_filter(pte); val = pte_val(pte); diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 13b369d2cc45..6828108486f8 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -224,7 +224,7 @@ static int memtrace_online(void) ent->mem = 0; } - if (add_memory(ent->nid, ent->start, ent->size)) { + if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) { pr_err("Failed to add trace memory to node %d\n", ent->nid); ret += 1; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 5d545b78111f..d8bbf0cc1601 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -606,7 +606,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) block_sz = memory_block_size_bytes(); /* Add the memory */ - rc = __add_memory(lmb->nid, lmb->base_addr, block_sz); + rc = __add_memory(lmb->nid, lmb->base_addr, block_sz, MHP_NONE); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index ad6e90fbc813..b02fd51e5589 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -194,7 +194,8 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (node < 0) node = memory_add_physaddr_to_nid(info->start_addr); - result = __add_memory(node, info->start_addr, info->length); + result = __add_memory(node, info->start_addr, info->length, + MHP_NONE); /* * If the memory block has been used by the kernel, add_memory() diff --git a/drivers/base/memory.c b/drivers/base/memory.c index adf828dfccf0..eef4ffb6122c 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -432,7 +432,8 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr, nid = memory_add_physaddr_to_nid(phys_addr); ret = __add_memory(nid, phys_addr, - MIN_MEMORY_BLOCK_SIZE * sections_per_block); + MIN_MEMORY_BLOCK_SIZE * sections_per_block, + MHP_NONE); if (ret) goto out; diff --git a/drivers/base/node.c b/drivers/base/node.c index 43d21f9e88b1..6ffa470e2984 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -772,8 +772,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn) return pfn_to_nid(pfn); } -static int do_register_memory_block_under_node(int nid, - struct memory_block *mem_blk) +static void do_register_memory_block_under_node(int nid, + struct memory_block *mem_blk) { int ret; @@ -786,12 +786,19 @@ static int do_register_memory_block_under_node(int nid, ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); - if (ret) - return ret; + if (ret && ret != -EEXIST) + dev_err_ratelimited(&node_devices[nid]->dev, + "can't create link to %s in sysfs (%d)\n", + kobject_name(&mem_blk->dev.kobj), ret); - return sysfs_create_link_nowarn(&mem_blk->dev.kobj, + ret = sysfs_create_link_nowarn(&mem_blk->dev.kobj, &node_devices[nid]->dev.kobj, kobject_name(&node_devices[nid]->dev.kobj)); + if (ret && ret != -EEXIST) + dev_err_ratelimited(&mem_blk->dev, + "can't create link to %s in sysfs (%d)\n", + kobject_name(&node_devices[nid]->dev.kobj), + ret); } /* register memory section under specified node if it spans that node */ @@ -827,7 +834,8 @@ static int register_mem_block_under_node_early(struct memory_block *mem_blk, if (page_nid != nid) continue; - return do_register_memory_block_under_node(nid, mem_blk); + do_register_memory_block_under_node(nid, mem_blk); + return 0; } /* mem section does not span the specified node */ return 0; @@ -842,7 +850,8 @@ static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk, { int nid = *(int *)arg; - return do_register_memory_block_under_node(nid, mem_blk); + do_register_memory_block_under_node(nid, mem_blk); + return 0; } /* @@ -860,8 +869,8 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); } -int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, - enum meminit_context context) +void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, + enum meminit_context context) { walk_memory_blocks_func_t func; @@ -870,9 +879,9 @@ int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, else func = register_mem_block_under_node_early; - return walk_memory_blocks(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), (void *)&nid, - func); + walk_memory_blocks(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), + (void *)&nid, func); + return; } #ifdef CONFIG_HUGETLBFS diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bff3d4021c18..029403c18ca3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1270,7 +1270,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret)) + if (WARN_ON(ret)) pr_err("Decompression failed! err=%d, page=%u\n", ret, index); return ret; diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 6c933f2b604e..b4368c5b6a0c 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -35,11 +35,17 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) return 0; } +struct dax_kmem_data { + const char *res_name; + struct resource *res[]; +}; + static int dev_dax_kmem_probe(struct dev_dax *dev_dax) { struct device *dev = &dev_dax->dev; + struct dax_kmem_data *data; + int rc = -ENOMEM; int i, mapped = 0; - char *res_name; int numa_node; /* @@ -55,14 +61,17 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) return -EINVAL; } - res_name = kstrdup(dev_name(dev), GFP_KERNEL); - if (!res_name) + data = kzalloc(sizeof(*data) + sizeof(struct resource *) * dev_dax->nr_range, GFP_KERNEL); + if (!data) return -ENOMEM; + data->res_name = kstrdup(dev_name(dev), GFP_KERNEL); + if (!data->res_name) + goto err_res_name; + for (i = 0; i < dev_dax->nr_range; i++) { struct resource *res; struct range range; - int rc; rc = dax_kmem_range(dev_dax, i, &range); if (rc) { @@ -72,7 +81,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) } /* Region is permanently reserved if hotremove fails. */ - res = request_mem_region(range.start, range_len(&range), res_name); + res = request_mem_region(range.start, range_len(&range), data->res_name); if (!res) { dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n", i, range.start, range.end); @@ -82,9 +91,10 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) */ if (mapped) continue; - kfree(res_name); - return -EBUSY; + rc = -EBUSY; + goto err_request_mem; } + data->res[i] = res; /* * Set flags appropriate for System RAM. Leave ..._BUSY clear @@ -99,23 +109,30 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) * this as RAM automatically. */ rc = add_memory_driver_managed(numa_node, range.start, - range_len(&range), kmem_name); + range_len(&range), kmem_name, MHP_NONE); if (rc) { dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", i, range.start, range.end); - release_mem_region(range.start, range_len(&range)); + release_resource(res); + kfree(res); + data->res[i] = NULL; if (mapped) continue; - kfree(res_name); - return rc; + goto err_request_mem; } mapped++; } - dev_set_drvdata(dev, res_name); + dev_set_drvdata(dev, data); return 0; + +err_request_mem: + kfree(data->res_name); +err_res_name: + kfree(data); + return rc; } #ifdef CONFIG_MEMORY_HOTREMOVE @@ -123,7 +140,7 @@ static int dev_dax_kmem_remove(struct dev_dax *dev_dax) { int i, success = 0; struct device *dev = &dev_dax->dev; - const char *res_name = dev_get_drvdata(dev); + struct dax_kmem_data *data = dev_get_drvdata(dev); /* * We have one shot for removing memory, if some memory blocks were not @@ -142,7 +159,9 @@ static int dev_dax_kmem_remove(struct dev_dax *dev_dax) rc = remove_memory(dev_dax->target_node, range.start, range_len(&range)); if (rc == 0) { - release_mem_region(range.start, range_len(&range)); + release_resource(data->res[i]); + kfree(data->res[i]); + data->res[i] = NULL; success++; continue; } @@ -153,7 +172,8 @@ static int dev_dax_kmem_remove(struct dev_dax *dev_dax) } if (success >= dev_dax->nr_range) { - kfree(res_name); + kfree(data->res_name); + kfree(data); dev_set_drvdata(dev, NULL); } diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 32e3bc0aa665..b64d2efbefe7 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT)); + (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 37794d88b1f3..a4ba0b87d6de 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -845,8 +845,6 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) * will only be one mm, so no big deal. */ mmap_read_lock(mm); - if (!mmget_still_valid(mm)) - goto skip_mm; mutex_lock(&ufile->umap_lock); list_for_each_entry_safe (priv, next_priv, &ufile->umaps, list) { @@ -865,7 +863,6 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) } } mutex_unlock(&ufile->umap_lock); - skip_mm: mmap_read_unlock(mm); mmput(mm); } diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index a30342942e26..94331d999d27 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -871,15 +871,16 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, rmcd_error("pin_user_pages_fast err=%ld", pinned); nr_pages = 0; - } else + } else { rmcd_error("pinned %ld out of %ld pages", pinned, nr_pages); + /* + * Set nr_pages up to mean "how many pages to unpin, in + * the error handler: + */ + nr_pages = pinned; + } ret = -EFAULT; - /* - * Set nr_pages up to mean "how many pages to unpin, in - * the error handler: - */ - nr_pages = pinned; goto err_pg; } @@ -1679,6 +1680,7 @@ static int rio_mport_add_riodev(struct mport_cdev_priv *priv, struct rio_dev *rdev; struct rio_switch *rswitch = NULL; struct rio_mport *mport; + struct device *dev; size_t size; u32 rval; u32 swpinfo = 0; @@ -1693,8 +1695,10 @@ static int rio_mport_add_riodev(struct mport_cdev_priv *priv, rmcd_debug(RDEV, "name:%s ct:0x%x did:0x%x hc:0x%x", dev_info.name, dev_info.comptag, dev_info.destid, dev_info.hopcount); - if (bus_find_device_by_name(&rio_bus_type, NULL, dev_info.name)) { + dev = bus_find_device_by_name(&rio_bus_type, NULL, dev_info.name); + if (dev) { rmcd_debug(RDEV, "device %s already exists", dev_info.name); + put_device(dev); return -EEXIST; } diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index a864b21af602..f6e97f0830f6 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -406,7 +406,7 @@ static void __init add_memory_merged(u16 rn) if (!size) goto skip_add; for (addr = start; addr < start + size; addr += block_size) - add_memory(0, addr, block_size); + add_memory(0, addr, block_size, MHP_NONE); skip_add: first_rn = rn; num = 1; diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 1ab1f5cda4ac..b0f4b92a87ed 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1480,31 +1480,29 @@ static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) } else { mmap_read_lock(mm); } - if (mmget_still_valid(mm)) { - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) { - mmap_read_unlock(mm); - mmput(mm); - return 0; - } - } else { - mutex_lock(&vdev->vma_lock); + if (try) { + if (!mutex_trylock(&vdev->vma_lock)) { + mmap_read_unlock(mm); + mmput(mm); + return 0; } - list_for_each_entry_safe(mmap_vma, tmp, - &vdev->vma_list, vma_next) { - struct vm_area_struct *vma = mmap_vma->vma; + } else { + mutex_lock(&vdev->vma_lock); + } + list_for_each_entry_safe(mmap_vma, tmp, + &vdev->vma_list, vma_next) { + struct vm_area_struct *vma = mmap_vma->vma; - if (vma->vm_mm != mm) - continue; + if (vma->vm_mm != mm) + continue; - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); + list_del(&mmap_vma->vma_next); + kfree(mmap_vma); - zap_vma_ptes(vma, vma->vm_start, - vma->vm_end - vma->vm_start); - } - mutex_unlock(&vdev->vma_lock); + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); } + mutex_unlock(&vdev->vma_lock); mmap_read_unlock(mm); mmput(mm); } diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 834b7c13ef3d..ba4de598f663 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -424,7 +424,8 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); return add_memory_driver_managed(nid, addr, memory_block_size_bytes(), - vm->resource_name); + vm->resource_name, + MEMHP_MERGE_RESOURCE); } /* diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 51427c752b37..b57b2067ecbf 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -331,7 +331,7 @@ static enum bp_state reserve_additional_memory(void) mutex_unlock(&balloon_mutex); /* add_memory_resource() requires the device_hotplug lock */ lock_device_hotplug(); - rc = add_memory_resource(nid, resource); + rc = add_memory_resource(nid, resource, MEMHP_MERGE_RESOURCE); unlock_device_hotplug(); mutex_lock(&balloon_mutex); diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 75105f45c51a..322b7dfb4ea0 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -8,6 +8,7 @@ #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/magic.h> +#include <linux/nospec.h> #include "autofs_i.h" @@ -563,7 +564,7 @@ out: static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { - static ioctl_fn _ioctls[] = { + static const ioctl_fn _ioctls[] = { autofs_dev_ioctl_version, autofs_dev_ioctl_protover, autofs_dev_ioctl_protosubver, @@ -581,7 +582,10 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd) }; unsigned int idx = cmd_idx(cmd); - return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx]; + if (idx >= ARRAY_SIZE(_ioctls)) + return NULL; + idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls)); + return _ioctls[idx]; } /* ioctl dispatcher */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 13d053982dd7..e7e9d0cde51a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/log2.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> @@ -421,6 +422,26 @@ static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) return 0; } +static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) +{ + unsigned long alignment = 0; + int i; + + for (i = 0; i < nr; i++) { + if (cmds[i].p_type == PT_LOAD) { + unsigned long p_align = cmds[i].p_align; + + /* skip non-power of two alignments as invalid */ + if (!is_power_of_2(p_align)) + continue; + alignment = max(alignment, p_align); + } + } + + /* ensure we align to at least one page */ + return ELF_PAGEALIGN(alignment); +} + /** * load_elf_phdrs() - load ELF program headers * @elf_ex: ELF header of the binary whose program headers should be loaded @@ -1008,6 +1029,7 @@ out_free_interp: int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; + unsigned long alignment; if (elf_ppnt->p_type != PT_LOAD) continue; @@ -1086,6 +1108,9 @@ out_free_interp: load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + if (alignment) + load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED; } else load_bias = 0; @@ -1389,126 +1414,6 @@ out: * Jeremy Fitzhardinge <jeremy@sw.oz.au> */ -/* - * The purpose of always_dump_vma() is to make sure that special kernel mappings - * that are useful for post-mortem analysis are included in every core dump. - * In that way we ensure that the core dump is fully interpretable later - * without matching up the same kernel and hardware config to see what PC values - * meant. These special mappings include - vDSO, vsyscall, and other - * architecture specific mappings - */ -static bool always_dump_vma(struct vm_area_struct *vma) -{ - /* Any vsyscall mappings? */ - if (vma == get_gate_vma(vma->vm_mm)) - return true; - - /* - * Assume that all vmas with a .name op should always be dumped. - * If this changes, a new vm_ops field can easily be added. - */ - if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) - return true; - - /* - * arch_vma_name() returns non-NULL for special architecture mappings, - * such as vDSO sections. - */ - if (arch_vma_name(vma)) - return true; - - return false; -} - -/* - * Decide what to dump of a segment, part, all or none. - */ -static unsigned long vma_dump_size(struct vm_area_struct *vma, - unsigned long mm_flags) -{ -#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - - /* always dump the vdso and vsyscall sections */ - if (always_dump_vma(vma)) - goto whole; - - if (vma->vm_flags & VM_DONTDUMP) - return 0; - - /* support for DAX */ - if (vma_is_dax(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) - goto whole; - return 0; - } - - /* Hugetlb memory check */ - if (is_vm_hugetlb_page(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) - goto whole; - return 0; - } - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) - return 0; - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0 ? - FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) - goto whole; - return 0; - } - - /* Dump segments that have been written to. */ - if (vma->anon_vma && FILTER(ANON_PRIVATE)) - goto whole; - if (vma->vm_file == NULL) - return 0; - - if (FILTER(MAPPED_PRIVATE)) - goto whole; - - /* - * If this looks like the beginning of a DSO or executable mapping, - * check for an ELF header. If we find one, dump the first page to - * aid in determining what was mapped here. - */ - if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { - u32 __user *header = (u32 __user *) vma->vm_start; - u32 word; - /* - * Doing it this way gets the constant folded by GCC. - */ - union { - u32 cmp; - char elfmag[SELFMAG]; - } magic; - BUILD_BUG_ON(SELFMAG != sizeof word); - magic.elfmag[EI_MAG0] = ELFMAG0; - magic.elfmag[EI_MAG1] = ELFMAG1; - magic.elfmag[EI_MAG2] = ELFMAG2; - magic.elfmag[EI_MAG3] = ELFMAG3; - if (unlikely(get_user(word, header))) - word = 0; - if (word == magic.cmp) - return PAGE_SIZE; - } - -#undef FILTER - - return 0; - -whole: - return vma->vm_end - vma->vm_start; -} - /* An ELF note in memory */ struct memelfnote { @@ -2220,32 +2125,6 @@ static void free_note_info(struct elf_note_info *info) #endif -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} -/* - * Helper function for iterating across a vma list. It ensures that the caller - * will visit `gate_vma' prior to terminating the search. - */ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) - return NULL; - return gate_vma; -} - static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { @@ -2272,9 +2151,8 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs, i; - size_t vma_data_size = 0; - struct vm_area_struct *vma, *gate_vma; + int vma_count, segs, i; + size_t vma_data_size; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2282,30 +2160,16 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - elf_addr_t *vma_filesz = NULL; + struct core_vma_metadata *vma_meta; + + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + return 0; - /* - * We no longer stop all VM operations. - * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. - */ - /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); - - gate_vma = get_gate_vma(current->mm); - if (gate_vma != NULL) - segs++; + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2343,24 +2207,6 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - /* - * Zero vma process will get ZERO_SIZE_PTR here. - * Let coredump continue for register state at least. - */ - vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), - GFP_KERNEL); - if (!vma_filesz) - goto end_coredump; - - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - unsigned long dump_size; - - dump_size = vma_dump_size(vma, cprm->mm_flags); - vma_filesz[i++] = dump_size; - vma_data_size += dump_size; - } - offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2381,21 +2227,23 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = vma_filesz[i++]; - phdr.p_memsz = vma->vm_end - vma->vm_start; + phdr.p_filesz = meta->dump_size; + phdr.p_memsz = meta->end - meta->start; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -2417,28 +2265,11 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - unsigned long addr; - unsigned long end; + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; - end = vma->vm_start + vma_filesz[i++]; - - for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { - struct page *page; - int stop; - - page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else - stop = !dump_skip(cprm, PAGE_SIZE); - if (stop) - goto end_coredump; - } + if (!dump_user_range(cprm, meta->start, meta->dump_size)) + goto end_coredump; } dump_truncate(cprm); @@ -2453,7 +2284,7 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_filesz); + kvfree(vma_meta); kfree(phdr4note); return has_dumped; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 50f845702b92..be4062b8ba75 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1215,76 +1215,6 @@ struct elf_prstatus_fdpic int pr_fpvalid; /* True if math co-processor being used. */ }; -/* - * Decide whether a segment is worth dumping; default is yes to be - * sure (missing info is worse than too much; etc). - * Personally I'd include everything, and use the coredump limit... - * - * I think we should skip something. But I am not sure how. H.J. - */ -static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) -{ - int dump_ok; - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) { - kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* If we may not read the contents, don't allow us to dump - * them either. "dump_write()" can't handle it anyway. - */ - if (!(vma->vm_flags & VM_READ)) { - kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* support for DAX */ - if (vma_is_dax(vma)) { - if (vma->vm_flags & VM_SHARED) { - dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } else { - dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } - return dump_ok; - } - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0) { - dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } - - dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } - -#ifdef CONFIG_MMU - /* By default, if it hasn't been written to, don't write it out */ - if (!vma->anon_vma) { - dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } -#endif - - dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, - dump_ok ? "yes" : "no"); - return dump_ok; -} - /* An ELF note in memory */ struct memelfnote { @@ -1524,54 +1454,21 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, /* * dump the segments for an MMU process */ -static bool elf_fdpic_dump_segments(struct coredump_params *cprm) +static bool elf_fdpic_dump_segments(struct coredump_params *cprm, + struct core_vma_metadata *vma_meta, + int vma_count) { - struct vm_area_struct *vma; + int i; - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { -#ifdef CONFIG_MMU - unsigned long addr; -#endif + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; - if (!maydump(vma, cprm->mm_flags)) - continue; - -#ifdef CONFIG_MMU - for (addr = vma->vm_start; addr < vma->vm_end; - addr += PAGE_SIZE) { - bool res; - struct page *page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - res = dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else { - res = dump_skip(cprm, PAGE_SIZE); - } - if (!res) - return false; - } -#else - if (!dump_emit(cprm, (void *) vma->vm_start, - vma->vm_end - vma->vm_start)) + if (!dump_user_range(cprm, meta->start, meta->dump_size)) return false; -#endif } return true; } -static size_t elf_core_vma_data_size(unsigned long mm_flags) -{ - struct vm_area_struct *vma; - size_t size = 0; - - for (vma = current->mm->mmap; vma; vma = vma->vm_next) - if (maydump(vma, mm_flags)) - size += vma->vm_end - vma->vm_start; - return size; -} - /* * Actual dumper * @@ -1582,9 +1479,8 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags) static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs; + int vma_count, segs; int i; - struct vm_area_struct *vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; struct memelfnote psinfo_note, auxv_note; @@ -1598,18 +1494,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - - /* - * We no longer stop all VM operations. - * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. - */ + struct core_vma_metadata *vma_meta = NULL; + size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1619,6 +1505,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + goto end_coredump; + for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1638,8 +1527,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1684,7 +1572,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += elf_core_vma_data_size(cprm->mm_flags); + offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1704,23 +1592,26 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; size_t sz; - sz = vma->vm_end - vma->vm_start; + sz = meta->end - meta->start; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; + phdr.p_filesz = meta->dump_size; phdr.p_memsz = sz; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -1752,7 +1643,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - if (!elf_fdpic_dump_segments(cprm)) + if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1776,6 +1667,7 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } + kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ca2273727225..b0983e2a4e2c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1168,7 +1168,7 @@ EXPORT_SYMBOL(configfs_depend_item); /* * Release the dependent linkage. This is much simpler than - * configfs_depend_item() because we know that that the client driver is + * configfs_depend_item() because we know that the client driver is * pinned, thus the subsystem is pinned, and therefore configfs is pinned. */ void configfs_undepend_item(struct config_item *target) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index fb65b706cc0d..1f0270229d7b 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -267,7 +267,7 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come * on the first write. - * Hint: if you're writing a value, first read the file, modify only the + * Hint: if you're writing a value, first read the file, modify only * the value you're changing, then write entire buffer back. */ diff --git a/fs/coredump.c b/fs/coredump.c index 76e7c10edfc0..0cd9056d79cc 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -840,17 +840,17 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) ssize_t n; if (cprm->written + nr > cprm->limit) return 0; - while (nr) { - if (dump_interrupted()) - return 0; - n = __kernel_write(file, addr, nr, &pos); - if (n <= 0) - return 0; - file->f_pos = pos; - cprm->written += n; - cprm->pos += n; - nr -= n; - } + + + if (dump_interrupted()) + return 0; + n = __kernel_write(file, addr, nr, &pos); + if (n != nr) + return 0; + file->f_pos = pos; + cprm->written += n; + cprm->pos += n; + return 1; } EXPORT_SYMBOL(dump_emit); @@ -876,6 +876,40 @@ int dump_skip(struct coredump_params *cprm, size_t nr) } EXPORT_SYMBOL(dump_skip); +#ifdef CONFIG_ELF_CORE +int dump_user_range(struct coredump_params *cprm, unsigned long start, + unsigned long len) +{ + unsigned long addr; + + for (addr = start; addr < start + len; addr += PAGE_SIZE) { + struct page *page; + int stop; + + /* + * To avoid having to allocate page tables for virtual address + * ranges that have never been used yet, and also to make it + * easy to generate sparse core files, use a helper that returns + * NULL when encountering an empty page table entry that would + * otherwise have been filled with the zero page. + */ + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + + stop = !dump_emit(cprm, kaddr, PAGE_SIZE); + kunmap(page); + put_page(page); + } else { + stop = !dump_skip(cprm, PAGE_SIZE); + } + if (stop) + return 0; + } + return 1; +} +#endif + int dump_align(struct coredump_params *cprm, int align) { unsigned mod = cprm->pos & (align - 1); @@ -902,3 +936,183 @@ void dump_truncate(struct coredump_params *cprm) } } EXPORT_SYMBOL(dump_truncate); + +/* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + /* + * Assume that all vmas with a .name op should always be dumped. + * If this changes, a new vm_ops field can easily be added. + */ + if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) + return true; + + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + +/* + * Decide how much of @vma's contents should be included in a core dump. + */ +static unsigned long vma_dump_size(struct vm_area_struct *vma, + unsigned long mm_flags) +{ +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) + goto whole; + + if (vma->vm_flags & VM_DONTDUMP) + return 0; + + /* support for DAX */ + if (vma_is_dax(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) + goto whole; + return 0; + } + + /* Hugetlb memory check */ + if (is_vm_hugetlb_page(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + return 0; + } + + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & VM_IO) + return 0; + + /* By default, dump shared memory if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (file_inode(vma->vm_file)->i_nlink == 0 ? + FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) + goto whole; + return 0; + } + + /* Dump segments that have been written to. */ + if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE)) + goto whole; + if (vma->vm_file == NULL) + return 0; + + if (FILTER(MAPPED_PRIVATE)) + goto whole; + + /* + * If this is the beginning of an executable file mapping, + * dump the first page to aid in determining what was mapped here. + */ + if (FILTER(ELF_HEADERS) && + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && + (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + +#undef FILTER + + return 0; + +whole: + return vma->vm_end - vma->vm_start; +} + +static struct vm_area_struct *first_vma(struct task_struct *tsk, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret = tsk->mm->mmap; + + if (ret) + return ret; + return gate_vma; +} + +/* + * Helper function for iterating across a vma list. It ensures that the caller + * will visit `gate_vma' prior to terminating the search. + */ +static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret; + + ret = this_vma->vm_next; + if (ret) + return ret; + if (this_vma == gate_vma) + return NULL; + return gate_vma; +} + +/* + * Under the mmap_lock, take a snapshot of relevant information about the task's + * VMAs. + */ +int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, + struct core_vma_metadata **vma_meta, + size_t *vma_data_size_ptr) +{ + struct vm_area_struct *vma, *gate_vma; + struct mm_struct *mm = current->mm; + int i; + size_t vma_data_size = 0; + + /* + * Once the stack expansion code is fixed to not change VMA bounds + * under mmap_lock in read mode, this can be changed to take the + * mmap_lock in read mode. + */ + if (mmap_write_lock_killable(mm)) + return -EINTR; + + gate_vma = get_gate_vma(mm); + *vma_count = mm->map_count + (gate_vma ? 1 : 0); + + *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); + if (!*vma_meta) { + mmap_write_unlock(mm); + return -ENOMEM; + } + + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma), i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + m->start = vma->vm_start; + m->end = vma->vm_end; + m->flags = vma->vm_flags; + m->dump_size = vma_dump_size(vma, cprm->mm_flags); + + vma_data_size += m->dump_size; + } + + mmap_write_unlock(mm); + + if (WARN_ON(i != *vma_count)) + return -EFAULT; + + *vma_data_size_ptr = vma_data_size; + return 0; +} diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index bbd5e7e0632b..5b7ba8f71153 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -349,6 +349,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -358,8 +359,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 9eb0dba851e8..054ec852b5ea 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -228,6 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -237,8 +238,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/inode.c b/fs/inode.c index 72c4c347afb7..9d78c37b00b8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -181,6 +181,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + if (sb->s_type->fs_flags & FS_THP_SUPPORT) + __set_bit(AS_THP_SUPPORT, &mapping->flags); mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index e516ae389ca5..5900879d5693 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -355,7 +355,7 @@ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, /** * nilfs_bmap_assign - assign a new block number to a block * @bmap: bmap - * @bhp: pointer to buffer head + * @bh: pointer to buffer head * @blocknr: block number * @binfo: block information * diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 86d4d850d130..025fb082575a 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -889,7 +889,7 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) * nilfs_cpfile_change_cpmode - change checkpoint mode * @cpfile: inode of checkpoint file * @cno: checkpoint number - * @status: mode of checkpoint + * @mode: mode of checkpoint * * Description: nilfs_change_cpmode() changes the mode of the checkpoint * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. @@ -930,12 +930,12 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) /** * nilfs_cpfile_get_stat - get checkpoint statistics * @cpfile: inode of checkpoint file - * @stat: pointer to a structure of checkpoint statistics + * @cpstat: pointer to a structure of checkpoint statistics * * Description: nilfs_cpfile_get_stat() returns information about checkpoints. * * Return Value: On success, 0 is returned, and checkpoints information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @cpstat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index b175f1330408..171fb5cd427f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -69,7 +69,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, /** * nilfs_forget_buffer - discard dirty state - * @inode: owner inode of the buffer * @bh: buffer head of the buffer to be discarded */ void nilfs_forget_buffer(struct buffer_head *bh) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 42ff67c0c14f..63722475e17e 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -546,13 +546,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, /** * nilfs_sufile_get_stat - get segment usage statistics * @sufile: inode of segment usage file - * @stat: pointer to a structure of segment usage statistics + * @sustat: pointer to a structure of segment usage statistics * * Description: nilfs_sufile_get_stat() returns information about segment * usage. * * Return Value: On success, 0 is returned, and segment usage information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @sustat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 846d43df3fdf..217aa2705d5d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1244,24 +1244,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } - /* - * Avoid to modify vma->vm_flags - * without locked ops while the - * coredump reads the vm_flags. - */ - if (!mmget_still_valid(mm)) { - /* - * Silently return "count" - * like if get_task_mm() - * failed. FIXME: should this - * function have returned - * -ESRCH if get_task_mm() - * failed like if - * get_proc_task() fails? - */ - mmap_write_unlock(mm); - goto out_mm; - } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 414695454956..355523f4a4bf 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -224,7 +224,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free; - nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); + nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */ diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e582d001f792..b1b7d3f5752f 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -356,6 +356,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) } i->i_mode = mode; + i->i_blocks = (i->i_size + 511) >> 9; unlock_new_inode(i); return i; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0e4a3837da52..000b457ad087 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -601,8 +601,6 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - /* no task can run (and in turn coredump) yet */ - VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -842,7 +840,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - bool still_valid; WRITE_ONCE(ctx->released, true); @@ -858,7 +855,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_lock for writing. */ mmap_write_lock(mm); - still_valid = mmget_still_valid(mm); prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -869,17 +865,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); - if (still_valid) { - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX); - if (prev) - vma = prev; - else - prev = vma; - } + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1309,8 +1303,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1511,8 +1503,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 99f2ac30b1d9..5b74bdf159d6 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -188,12 +188,10 @@ static inline unsigned fls_long(unsigned long l) static inline int get_count_order(unsigned int count) { - int order; + if (count == 0) + return -1; - order = fls(count) - 1; - if (count & (count - 1)) - order++; - return order; + return fls(--count); } /** @@ -206,10 +204,7 @@ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; - else if (l & (l - 1UL)) - return (int)fls_long(l); - else - return (int)fls_long(l) - 1; + return (int)fls_long(--l); } /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c09375e0a0eb..639cae2c158b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -8,6 +8,7 @@ #include <linux/genhd.h> #include <linux/list.h> #include <linux/llist.h> +#include <linux/minmax.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/pagemap.h> diff --git a/include/linux/bvec.h b/include/linux/bvec.h index dd74503f7e5e..2efec10bf792 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -7,10 +7,14 @@ #ifndef __LINUX_BVEC_ITER_H #define __LINUX_BVEC_ITER_H -#include <linux/kernel.h> #include <linux/bug.h> #include <linux/errno.h> +#include <linux/limits.h> +#include <linux/minmax.h> #include <linux/mm.h> +#include <linux/types.h> + +struct page; /** * struct bio_vec - a contiguous range of physical memory addresses diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 7a899e83835d..e58e8c207782 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -7,6 +7,12 @@ #include <linux/fs.h> #include <asm/siginfo.h> +struct core_vma_metadata { + unsigned long start, end; + unsigned long flags; + unsigned long dump_size; +}; + /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. @@ -16,6 +22,11 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); +int dump_user_range(struct coredump_params *cprm, unsigned long start, + unsigned long len); +int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, + struct core_vma_metadata **vma_meta, + size_t *vma_data_size_ptr); #ifdef CONFIG_COREDUMP extern void do_coredump(const kernel_siginfo_t *siginfo); #else diff --git a/include/linux/fault-inject-usercopy.h b/include/linux/fault-inject-usercopy.h new file mode 100644 index 000000000000..56c3a693fdd9 --- /dev/null +++ b/include/linux/fault-inject-usercopy.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_FAULT_INJECT_USERCOPY_H__ +#define __LINUX_FAULT_INJECT_USERCOPY_H__ + +/* + * This header provides a wrapper for injecting failures to user space memory + * access functions. + */ + +#include <linux/types.h> + +#ifdef CONFIG_FAULT_INJECTION_USERCOPY + +bool should_fail_usercopy(void); + +#else + +static inline bool should_fail_usercopy(void) { return false; } + +#endif /* CONFIG_FAULT_INJECTION_USERCOPY */ + +#endif /* __LINUX_FAULT_INJECT_USERCOPY_H__ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index ae97d87a00d2..d1d166b46131 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2209,6 +2209,7 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ +#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -2696,33 +2697,6 @@ static inline errseq_t file_sample_sb_err(struct file *file) return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); } -static inline int filemap_nr_thps(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - return atomic_read(&mapping->nr_thps); -#else - return 0; -#endif -} - -static inline void filemap_nr_thps_inc(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - atomic_inc(&mapping->nr_thps); -#else - WARN_ON_ONCE(1); -#endif -} - -static inline void filemap_nr_thps_dec(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - atomic_dec(&mapping->nr_thps); -#else - WARN_ON_ONCE(1); -#endif -} - extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); diff --git a/include/linux/idr.h b/include/linux/idr.h index 3ade03e5c7af..a0dce14090a9 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -263,7 +263,8 @@ void ida_destroy(struct ida *ida); * * Allocate an ID between 0 and %INT_MAX, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -280,7 +281,8 @@ static inline int ida_alloc(struct ida *ida, gfp_t gfp) * * Allocate an ID between @min and %INT_MAX, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -297,7 +299,8 @@ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) * * Allocate an ID between 0 and @max, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -311,6 +314,10 @@ static inline void ida_init(struct ida *ida) xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } +/* + * ida_simple_get() and ida_simple_remove() are deprecated. Use + * ida_alloc() and ida_free() instead respectively. + */ #define ida_simple_get(ida, start, end, gfp) \ ida_alloc_range(ida, start, (end) - 1, gfp) #define ida_simple_remove(ida, id) ida_free(ida, id) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 6c2b06fe8beb..5135d4b86cd6 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -58,6 +58,10 @@ struct resource { #define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */ #define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */ +/* IORESOURCE_SYSRAM specific bits. */ +#define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */ +#define IORESOURCE_SYSRAM_MERGEABLE 0x04000000 /* Resource can be merged. */ + #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 @@ -103,7 +107,6 @@ struct resource { #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) -#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) @@ -248,8 +251,10 @@ extern struct resource * __request_region(struct resource *, extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE -extern int release_mem_region_adjustable(struct resource *, resource_size_t, - resource_size_t); +extern void release_mem_region_adjustable(resource_size_t, resource_size_t); +#endif +#ifdef CONFIG_MEMORY_HOTPLUG +extern void merge_system_ram_resource(struct resource *res); #endif /* Wrappers for managed devices */ diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fed6ba96c527..5e13f801c902 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -3,8 +3,9 @@ #define _LINUX_JIFFIES_H #include <linux/cache.h> +#include <linux/limits.h> #include <linux/math64.h> -#include <linux/kernel.h> +#include <linux/minmax.h> #include <linux/types.h> #include <linux/time.h> #include <linux/timex.h> diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e4aa29b1ad62..c629215fdad9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -11,6 +11,7 @@ #include <linux/compiler.h> #include <linux/bitops.h> #include <linux/log2.h> +#include <linux/minmax.h> #include <linux/typecheck.h> #include <linux/printk.h> #include <linux/build_bug.h> @@ -833,155 +834,6 @@ ftrace_vprintk(const char *fmt, va_list ap) static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* CONFIG_TRACING */ -/* - * min()/max()/clamp() macros must accomplish three things: - * - * - avoid multiple evaluations of the arguments (so side-effects like - * "x++" happen only once) when non-constant. - * - perform strict type-checking (to generate warnings instead of - * nasty runtime surprises). See the "unnecessary" pointer comparison - * in __typecheck(). - * - retain result as a constant expressions when called with only - * constant expressions (to avoid tripping VLA warnings in stack - * allocation usage). - */ -#define __typecheck(x, y) \ - (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) - -/* - * This returns a constant expression while determining if an argument is - * a constant expression, most importantly without evaluating the argument. - * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de> - */ -#define __is_constexpr(x) \ - (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) - -#define __no_side_effects(x, y) \ - (__is_constexpr(x) && __is_constexpr(y)) - -#define __safe_cmp(x, y) \ - (__typecheck(x, y) && __no_side_effects(x, y)) - -#define __cmp(x, y, op) ((x) op (y) ? (x) : (y)) - -#define __cmp_once(x, y, unique_x, unique_y, op) ({ \ - typeof(x) unique_x = (x); \ - typeof(y) unique_y = (y); \ - __cmp(unique_x, unique_y, op); }) - -#define __careful_cmp(x, y, op) \ - __builtin_choose_expr(__safe_cmp(x, y), \ - __cmp(x, y, op), \ - __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) - -/** - * min - return minimum of two values of the same or compatible types - * @x: first value - * @y: second value - */ -#define min(x, y) __careful_cmp(x, y, <) - -/** - * max - return maximum of two values of the same or compatible types - * @x: first value - * @y: second value - */ -#define max(x, y) __careful_cmp(x, y, >) - -/** - * min3 - return minimum of three values - * @x: first value - * @y: second value - * @z: third value - */ -#define min3(x, y, z) min((typeof(x))min(x, y), z) - -/** - * max3 - return maximum of three values - * @x: first value - * @y: second value - * @z: third value - */ -#define max3(x, y, z) max((typeof(x))max(x, y), z) - -/** - * min_not_zero - return the minimum that is _not_ zero, unless both are zero - * @x: value1 - * @y: value2 - */ -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) - -/** - * clamp - return a value clamped to a given range with strict typechecking - * @val: current value - * @lo: lowest allowable value - * @hi: highest allowable value - * - * This macro does strict typechecking of @lo/@hi to make sure they are of the - * same type as @val. See the unnecessary pointer comparisons. - */ -#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) - -/* - * ..and if you can't take the strict - * types, you can specify one yourself. - * - * Or not use min/max/clamp at all, of course. - */ - -/** - * min_t - return minimum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value - */ -#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) - -/** - * max_t - return maximum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value - */ -#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) - -/** - * clamp_t - return a value clamped to a given range using a given type - * @type: the type of variable to use - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of type - * @type to make all the comparisons. - */ -#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) - -/** - * clamp_val - return a value clamped to a given range using val's type - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of whatever - * type the input argument @val is. This is useful when @val is an unsigned - * type and @lo and @hi are literals that will otherwise be assigned a signed - * integer type. - */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) - - -/** - * swap - swap values of @a and @b - * @a: first value - * @b: second value - */ -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - /* This counts to 12. Any more, it will return 13th argument. */ #define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n #define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) diff --git a/include/linux/list.h b/include/linux/list.h index 0d0d17a10d25..a18c87b63376 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -610,6 +610,15 @@ static inline void list_splice_tail_init(struct list_head *list, pos = n, n = pos->prev) /** + * list_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to cursor + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_entry_is_head(pos, head, member) \ + (&pos->member == (head)) + +/** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. @@ -617,7 +626,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -628,7 +637,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -653,7 +662,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -667,7 +676,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -679,7 +688,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -692,7 +701,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -705,7 +714,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -721,7 +730,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -736,7 +745,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -752,7 +761,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c0faa7a30c46..d65c6fdc5cfc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -57,6 +57,19 @@ enum { MMOP_ONLINE_MOVABLE, }; +/* Flags for add_memory() and friends to specify memory hotplug details. */ +typedef int __bitwise mhp_t; + +/* No special request */ +#define MHP_NONE ((__force mhp_t)0) +/* + * Allow merging of the added System RAM resource with adjacent, + * mergeable resources. After a successful call to add_memory_resource() + * with this flag set, the resource pointer must no longer be used as it + * might be stale, or the resource might have changed. + */ +#define MEMHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -103,8 +116,8 @@ extern int online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); -extern unsigned long __offline_isolated_pages(unsigned long start_pfn, - unsigned long end_pfn); +extern void __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); @@ -247,13 +260,6 @@ static inline void zone_span_writelock(struct zone *zone) {} static inline void zone_span_writeunlock(struct zone *zone) {} static inline void zone_seqlock_init(struct zone *zone) {} -static inline int mhp_notimplemented(const char *func) -{ - printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); - dump_stack(); - return -ENOSYS; -} - static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } @@ -344,14 +350,18 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {} extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); +#ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(int nid); -extern int __add_memory(int nid, u64 start, u64 size); -extern int add_memory(int nid, u64 start, u64 size); -extern int add_memory_resource(int nid, struct resource *resource); +extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); +extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); +extern int add_memory_resource(int nid, struct resource *resource, + mhp_t mhp_flags); extern int add_memory_driver_managed(int nid, u64 start, u64 size, - const char *resource_name); + const char *resource_name, + mhp_t mhp_flags); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype); extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); @@ -363,8 +373,8 @@ extern void sparse_remove_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, - int online_type); extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages); +#endif /* CONFIG_MEMORY_HOTPLUG */ + #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/minmax.h b/include/linux/minmax.h new file mode 100644 index 000000000000..c0f57b0c64d9 --- /dev/null +++ b/include/linux/minmax.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MINMAX_H +#define _LINUX_MINMAX_H + +/* + * min()/max()/clamp() macros must accomplish three things: + * + * - avoid multiple evaluations of the arguments (so side-effects like + * "x++" happen only once) when non-constant. + * - perform strict type-checking (to generate warnings instead of + * nasty runtime surprises). See the "unnecessary" pointer comparison + * in __typecheck(). + * - retain result as a constant expressions when called with only + * constant expressions (to avoid tripping VLA warnings in stack + * allocation usage). + */ +#define __typecheck(x, y) \ + (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) + +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de> + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) + +#define __no_side_effects(x, y) \ + (__is_constexpr(x) && __is_constexpr(y)) + +#define __safe_cmp(x, y) \ + (__typecheck(x, y) && __no_side_effects(x, y)) + +#define __cmp(x, y, op) ((x) op (y) ? (x) : (y)) + +#define __cmp_once(x, y, unique_x, unique_y, op) ({ \ + typeof(x) unique_x = (x); \ + typeof(y) unique_y = (y); \ + __cmp(unique_x, unique_y, op); }) + +#define __careful_cmp(x, y, op) \ + __builtin_choose_expr(__safe_cmp(x, y), \ + __cmp(x, y, op), \ + __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) + +/** + * min - return minimum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define min(x, y) __careful_cmp(x, y, <) + +/** + * max - return maximum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define max(x, y) __careful_cmp(x, y, >) + +/** + * min3 - return minimum of three values + * @x: first value + * @y: second value + * @z: third value + */ +#define min3(x, y, z) min((typeof(x))min(x, y), z) + +/** + * max3 - return maximum of three values + * @x: first value + * @y: second value + * @z: third value + */ +#define max3(x, y, z) max((typeof(x))max(x, y), z) + +/** + * min_not_zero - return the minimum that is _not_ zero, unless both are zero + * @x: value1 + * @y: value2 + */ +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + +/** + * clamp - return a value clamped to a given range with strict typechecking + * @val: current value + * @lo: lowest allowable value + * @hi: highest allowable value + * + * This macro does strict typechecking of @lo/@hi to make sure they are of the + * same type as @val. See the unnecessary pointer comparisons. + */ +#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) + +/* + * ..and if you can't take the strict + * types, you can specify one yourself. + * + * Or not use min/max/clamp at all, of course. + */ + +/** + * min_t - return minimum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) + +/** + * max_t - return maximum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) + +/** + * clamp_t - return a value clamped to a given range using a given type + * @type: the type of variable to use + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of type + * @type to make all the comparisons. + */ +#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) + +/** + * clamp_val - return a value clamped to a given range using val's type + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of whatever + * type the input argument @val is. This is useful when @val is an unsigned + * type and @lo and @hi are literals that will otherwise be assigned a signed + * integer type. + */ +#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) + +/** + * swap - swap values of @a and @b + * @a: first value + * @b: second value + */ +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#endif /* _LINUX_MINMAX_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 620961e4f32b..61a2633fcc7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2440,7 +2440,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum meminit_context, struct vmem_altmap *); + enum meminit_context, struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); @@ -3025,8 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern int get_hwpoison_page(struct page *page); -#define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); @@ -3066,6 +3064,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_BUDDY_2ND, MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, }; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c27fb1faffe5..fb3bf696c05e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -266,6 +266,8 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } +#define ANON_AND_FILE 2 + enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI @@ -283,8 +285,8 @@ struct lruvec { unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; - /* Refaults at the time of last reclaim cycle, anon=0, file=1 */ - unsigned long refaults[2]; + /* Refaults at the time of last reclaim cycle */ + unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_MEMCG @@ -441,6 +443,8 @@ enum zone_type { #ifndef __GENERATING_BOUNDS_H +#define ASYNC_AND_SYNC 2 + struct zone { /* Read-mostly fields */ @@ -560,8 +564,8 @@ struct zone { #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; - /* pfn where async and sync compaction migration scanner should start */ - unsigned long compact_cached_migrate_pfn[2]; + /* pfn where compaction migration scanner should start */ + unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif @@ -1416,7 +1420,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define pfn_to_nid(pfn) (0) #endif -#define early_pfn_valid(pfn) pfn_valid(pfn) void sparse_init(void); #else #define sparse_init() do {} while (0) @@ -1436,10 +1439,6 @@ struct mminit_pfnnid_cache { int last_nid; }; -#ifndef early_pfn_valid -#define early_pfn_valid(pfn) (1) -#endif - /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we * need to check pfn validity within that MAX_ORDER_NR_PAGES block. diff --git a/include/linux/node.h b/include/linux/node.h index 014ba3ab2efd..8e5a29897936 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -99,15 +99,14 @@ extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) -int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context); +void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context); #else -static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context) +static inline void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context) { - return 0; } #endif @@ -130,8 +129,7 @@ static inline int register_one_node(int nid) if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, start_pfn, end_pfn, - MEMINIT_EARLY); + link_mem_sections(nid, start_pfn, end_pfn, MEMINIT_EARLY); } return error; diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 3334ce056335..ac398e143c9a 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -90,9 +90,9 @@ * for such situations. See below and CPUMASK_ALLOC also. */ -#include <linux/kernel.h> #include <linux/threads.h> #include <linux/bitmap.h> +#include <linux/minmax.h> #include <linux/numa.h> typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 38ded408bd4c..4f6ba9379112 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -431,13 +431,9 @@ PAGEFLAG_FALSE(Uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) -extern bool set_hwpoison_free_buddy_page(struct page *page); +extern bool take_page_off_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison) -static inline bool set_hwpoison_free_buddy_page(struct page *page) -{ - return 0; -} #define __PG_HWPOISON 0 #endif diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 8679ccd722e8..3468794f83d2 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,7 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, unsigned int order); +extern void __split_page_owner(struct page *page, unsigned int nr); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(struct page *page); @@ -31,10 +31,10 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, unsigned int order) +static inline void split_page_owner(struct page *page, unsigned int nr) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, order); + __split_page_owner(page, nr); } static inline void copy_page_owner(struct page *oldpage, struct page *newpage) { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c3afd3242b54..c77b7c31b2e4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -29,6 +29,7 @@ enum mapping_flags { AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, + AS_THP_SUPPORT = 6, /* THPs supported */ }; /** @@ -120,6 +121,40 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +static inline bool mapping_thp_support(struct address_space *mapping) +{ + return test_bit(AS_THP_SUPPORT, &mapping->flags); +} + +static inline int filemap_nr_thps(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + return atomic_read(&mapping->nr_thps); +#else + return 0; +#endif +} + +static inline void filemap_nr_thps_inc(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + if (!mapping_thp_support(mapping)) + atomic_inc(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + +static inline void filemap_nr_thps_dec(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + if (!mapping_thp_support(mapping)) + atomic_dec(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + void release_pages(struct page **pages, int nr); /* @@ -726,17 +761,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) - -void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, - struct file *, pgoff_t index, unsigned long req_count); -void page_cache_async_readahead(struct address_space *, struct file_ra_state *, - struct file *, struct page *, pgoff_t index, - unsigned long req_count); -void page_cache_readahead_unbounded(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_count); - /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. @@ -777,6 +801,67 @@ struct readahead_control { unsigned int _batch_count; }; +#define DEFINE_READAHEAD(rac, f, m, i) \ + struct readahead_control rac = { \ + .file = f, \ + .mapping = m, \ + ._index = i, \ + } + +#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) + +void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); +void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *, + unsigned long req_count); +void page_cache_async_ra(struct readahead_control *, struct file_ra_state *, + struct page *, unsigned long req_count); + +/** + * page_cache_sync_readahead - generic file readahead + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @file: Used by the filesystem for authentication. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. + * + * page_cache_sync_readahead() should be called when a cache miss happened: + * it will submit the read. The readahead logic may decide to piggyback more + * pages onto the read request if access patterns suggest it will improve + * performance. + */ +static inline +void page_cache_sync_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, pgoff_t index, + unsigned long req_count) +{ + DEFINE_READAHEAD(ractl, file, mapping, index); + page_cache_sync_ra(&ractl, ra, req_count); +} + +/** + * page_cache_async_readahead - file readahead for marked pages + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @file: Used by the filesystem for authentication. + * @page: The page at @index which triggered the readahead call. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. + * + * page_cache_async_readahead() should be called when a page is used which + * is marked as PageReadahead; this is a marker to suggest that the application + * has used up enough of the readahead window that we should start pulling in + * more pages. + */ +static inline +void page_cache_async_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + struct page *page, pgoff_t index, unsigned long req_count) +{ + DEFINE_READAHEAD(ractl, file, mapping, index); + page_cache_async_ra(&ractl, ra, page, req_count); +} + /** * readahead_page - Get the next page to read. * @rac: The current readahead request. diff --git a/include/linux/sched.h b/include/linux/sched.h index 9030f3abd969..063cd120b459 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1013,7 +1013,7 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif -#ifdef CONFIG_UBSAN +#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 15bfb06f2884..981e34cb1409 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,31 +49,6 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } -/* - * This has to be called after a get_task_mm()/mmget_not_zero() - * followed by taking the mmap_lock for writing before modifying the - * vmas or anything the coredump pretends not to change from under it. - * - * It also has to be called when mmgrab() is used in the context of - * the process, but then the mm_count refcount is transferred outside - * the context of the process to run down_write() on that pinned mm. - * - * NOTE: find_extend_vma() called from GUP context is the only place - * that can modify the "mm" (notably the vm_start/end) under mmap_lock - * for reading and outside the context of the process, so it is also - * the only case that holds the mmap_lock for reading that must call - * this function. Generally if the mmap_lock is hold for reading - * there's no need of this check after get_task_mm()/mmget_not_zero(). - * - * This function can be obsoleted and the check can be removed, after - * the coredump code will hold the mmap_lock for writing before - * invoking the ->core_dump methods. - */ -static inline bool mmget_still_valid(struct mm_struct *mm) -{ - return likely(!mm->core_state); -} - /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1ae36bc8db35..1b8c9d6162bc 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,7 +2,9 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ +#include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> +#include <linux/minmax.h> #include <linux/sched.h> #include <linux/thread_info.h> @@ -83,6 +85,8 @@ static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; instrument_copy_from_user(to, from, n); check_object_size(to, n, false); return raw_copy_from_user(to, from, n); @@ -104,6 +108,8 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + if (should_fail_usercopy()) + return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); @@ -113,6 +119,8 @@ static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); @@ -124,7 +132,7 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); - if (likely(access_ok(from, n))) { + if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user(to, from, n); res = raw_copy_from_user(to, from, n); } @@ -142,6 +150,8 @@ static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 7557c1070fd7..322dcbfcc933 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -28,7 +28,7 @@ struct reclaim_stat { unsigned nr_writeback; unsigned nr_immediate; unsigned nr_pageout; - unsigned nr_activate[2]; + unsigned nr_activate[ANON_AND_FILE]; unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; diff --git a/include/linux/xarray.h b/include/linux/xarray.h index b4d70e7568b2..5cdf441f6377 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1505,6 +1505,28 @@ void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); +#ifdef CONFIG_XARRAY_MULTI +int xa_get_order(struct xarray *, unsigned long index); +void xas_split(struct xa_state *, void *entry, unsigned int order); +void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); +#else +static inline int xa_get_order(struct xarray *xa, unsigned long index) +{ + return 0; +} + +static inline void xas_split(struct xa_state *xas, void *entry, + unsigned int order) +{ + xas_store(xas, entry); +} + +static inline void xas_split_alloc(struct xa_state *xas, void *entry, + unsigned int order, gfp_t gfp) +{ +} +#endif + /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 36c5c5e38c1d..0bdbc0d17d2f 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -361,6 +361,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ + EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \ EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ @@ -373,6 +374,8 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ EM ( MF_MSG_BUDDY, "free buddy page" ) \ EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \ + EM ( MF_MSG_DAX, "dax page" ) \ + EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/kernel/acct.c b/kernel/acct.c index b0c5b3a9f5af..f175df8f6aa4 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -25,7 +25,7 @@ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. * - * Fixed a nasty interaction with with sys_umount(). If the accointing + * Fixed a nasty interaction with sys_umount(). If the accounting * was suspeneded we failed to stop it on umount(). Messy. * Another one: remount to readonly didn't stop accounting. * Question: what should we do if we have CAP_SYS_ADMIN but not @@ -263,12 +263,12 @@ static DEFINE_MUTEX(acct_on_mutex); * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting * - * Returns 0 for success or negative errno values for failure. - * * sys_acct() is the only system call needed to implement process * accounting. It takes the name of the file where accounting records * should be written. If the filename is NULL, accounting will be * shutdown. + * + * Returns: 0 for success or negative errno values for failure. */ SYSCALL_DEFINE1(acct, const char __user *, name) { @@ -586,9 +586,7 @@ static void slow_acct_process(struct pid_namespace *ns) } /** - * acct_process - * - * handles process accounting for an exiting task + * acct_process - handles process accounting for an exiting task */ void acct_process(void) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 642415b8c3c9..57b5b5d0a5fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -390,7 +390,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * The top cpuset doesn't have any online cpu as a * consequence of a race between cpuset_hotplug_work * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be + * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ cpumask_copy(pmask, cpu_online_mask); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index b92d08e65999..06c111544f61 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -16,7 +16,7 @@ #include "direct.h" /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ diff --git a/kernel/fork.c b/kernel/fork.c index 3ca8f1f83fb3..32083db7a2a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -556,7 +556,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); + put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); @@ -2189,7 +2189,7 @@ static __latent_entropy struct task_struct *copy_process( /* * Ensure that the cgroup subsystem policies allow the new process to be - * forked. It should be noted the the new process's css_set can be changed + * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ diff --git a/kernel/futex.c b/kernel/futex.c index a5876694a60e..680854dcf156 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -916,7 +916,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * [10] Found | Found | task | !=taskTID | 0/1 | Invalid * * [1] Indicates that the kernel can acquire the futex atomically. We - * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. * * [2] Valid, if TID does not belong to a kernel thread. If no matching * thread is found then it indicates that the owner TID has died. diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e960d7ce7bcc..773b6105c4ae 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -604,7 +604,7 @@ int irq_timings_alloc(int irq) /* * Some platforms can have the same private interrupt per cpu, - * so this function may be be called several times with the + * so this function may be called several times with the * same interrupt number. Just bail out in case the per cpu * stat structure is already allocated. */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index e661c61b3d6b..015ef903ce8c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -19,7 +19,7 @@ #include <linux/cpu.h> #include <asm/sections.h> -/* mutex to protect coming/going of the the jump_label table */ +/* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index f03562aaf2eb..1a6db2f797ac 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -32,7 +32,7 @@ * 1. different addresses but with the same encoded address race; * 2. and both map onto the same watchpoint slots; * - * Both these are assumed to be very unlikely. However, in case it still happens + * Both these are assumed to be very unlikely. However, in case it still * happens, the report logic will filter out the false positive (see report.c). */ #define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c5e5e5a11535..8798a8183974 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded); * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size + * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 84f7316792a7..e21f6b9234f7 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -521,7 +521,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) /* Returning 0 will take to next memory range */ /* Don't use memory that will be detected and handled by a driver. */ - if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) return 0; if (sz < kbuf->memsz) diff --git a/kernel/kthread.c b/kernel/kthread.c index 3edaa380dc7b..e29773c82b70 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -775,7 +775,7 @@ EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it - * it to a given CPU and the associated NUMA node. + * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c index 7ee19476de9d..2565d039ade0 100644 --- a/kernel/livepatch/state.c +++ b/kernel/livepatch/state.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state); * * The function can be called only during transition when a new * livepatch is being enabled or when such a transition is reverted. - * It is typically called only from from pre/post (un)patch + * It is typically called only from pre/post (un)patch * callbacks. * * Return: pointer to the latest struct klp_state from already diff --git a/kernel/panic.c b/kernel/panic.c index aef8872ba843..396142ee43fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -589,6 +589,11 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (args) vprintk(args->fmt, args->args); + print_modules(); + + if (regs) + show_regs(regs); + if (panic_on_warn) { /* * This thread may hit another WARN() in the panic path. @@ -600,12 +605,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - print_modules(); - - if (regs) - show_regs(regs); - else - dump_stack(); + dump_stack(); print_irqtrace_events(current); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ac135bd600eb..9de21803a8ae 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -233,7 +233,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. * - * The code relies on the the pid_ns->child_reaper ignoring + * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d25749bce7cf..46b1804c1ddf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -735,7 +735,7 @@ zone_found: */ /* - * If the zone we wish to scan is the the current zone and the + * If the zone we wish to scan is the current zone and the * pfn falls into the current node then we do not need to walk * the tree. */ diff --git a/kernel/range.c b/kernel/range.c index d84de6766472..56435f96da73 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -2,8 +2,9 @@ /* * Range add and subtract */ -#include <linux/kernel.h> #include <linux/init.h> +#include <linux/minmax.h> +#include <linux/printk.h> #include <linux/sort.h> #include <linux/string.h> #include <linux/range.h> diff --git a/kernel/relay.c b/kernel/relay.c index fb4e0c530c08..b08d936d5fa7 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1002,7 +1002,7 @@ static int relay_file_read_avail(struct rchan_buf *buf) size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; - size_t consumed = buf->subbufs_consumed; + size_t consumed; relay_file_read_consume(buf, 0, 0); diff --git a/kernel/resource.c b/kernel/resource.c index f1175ce93a1d..3ae2f56cc79d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1240,7 +1240,6 @@ EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE /** * release_mem_region_adjustable - release a previously reserved memory region - * @parent: parent resource descriptor * @start: resource start address * @size: resource region size * @@ -1258,21 +1257,28 @@ EXPORT_SYMBOL(__release_region); * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. */ -int release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) +void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { + struct resource *parent = &iomem_resource; + struct resource *new_res = NULL; + bool alloc_nofail = false; struct resource **p; struct resource *res; - struct resource *new_res; resource_size_t end; - int ret = -EINVAL; end = start + size - 1; - if ((start < parent->start) || (end > parent->end)) - return ret; + if (WARN_ON_ONCE((start < parent->start) || (end > parent->end))) + return; - /* The alloc_resource() result gets checked later */ - new_res = alloc_resource(GFP_KERNEL); + /* + * We free up quite a lot of memory on memory hotunplug (esp., memap), + * just before releasing the region. This is highly unlikely to + * fail - let's play save and make it never fail as the caller cannot + * perform any error handling (e.g., trying to re-add memory will fail + * similarly). + */ +retry: + new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0)); p = &parent->child; write_lock(&resource_lock); @@ -1298,7 +1304,6 @@ int release_mem_region_adjustable(struct resource *parent, * so if we are dealing with them, let us just back off here. */ if (!(res->flags & IORESOURCE_SYSRAM)) { - ret = 0; break; } @@ -1315,20 +1320,23 @@ int release_mem_region_adjustable(struct resource *parent, /* free the whole entry */ *p = res->sibling; free_resource(res); - ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ - ret = __adjust_resource(res, end + 1, - res->end - end); + WARN_ON_ONCE(__adjust_resource(res, end + 1, + res->end - end)); } else if (res->start != start && res->end == end) { /* adjust the end */ - ret = __adjust_resource(res, res->start, - start - res->start); + WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start)); } else { - /* split into two entries */ + /* split into two entries - we need a new resource */ if (!new_res) { - ret = -ENOMEM; - break; + new_res = alloc_resource(GFP_ATOMIC); + if (!new_res) { + alloc_nofail = true; + write_unlock(&resource_lock); + goto retry; + } } new_res->name = res->name; new_res->start = end + 1; @@ -1339,9 +1347,8 @@ int release_mem_region_adjustable(struct resource *parent, new_res->sibling = res->sibling; new_res->child = NULL; - ret = __adjust_resource(res, res->start, - start - res->start); - if (ret) + if (WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start))) break; res->sibling = new_res; new_res = NULL; @@ -1352,10 +1359,69 @@ int release_mem_region_adjustable(struct resource *parent, write_unlock(&resource_lock); free_resource(new_res); - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ +#ifdef CONFIG_MEMORY_HOTPLUG +static bool system_ram_resources_mergeable(struct resource *r1, + struct resource *r2) +{ + /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */ + return r1->flags == r2->flags && r1->end + 1 == r2->start && + r1->name == r2->name && r1->desc == r2->desc && + !r1->child && !r2->child; +} + +/* + * merge_system_ram_resource - mark the System RAM resource mergeable and try to + * merge it with adjacent, mergeable resources + * @res: resource descriptor + * + * This interface is intended for memory hotplug, whereby lots of contiguous + * system ram resources are added (e.g., via add_memory*()) by a driver, and + * the actual resource boundaries are not of interest (e.g., it might be + * relevant for DIMMs). Only resources that are marked mergeable, that have the + * same parent, and that don't have any children are considered. All mergeable + * resources must be immutable during the request. + * + * Note: + * - The caller has to make sure that no pointers to resources that are + * marked mergeable are used anymore after this call - the resource might + * be freed and the pointer might be stale! + * - release_mem_region_adjustable() will split on demand on memory hotunplug + */ +void merge_system_ram_resource(struct resource *res) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *cur; + + if (WARN_ON_ONCE((res->flags & flags) != flags)) + return; + + write_lock(&resource_lock); + res->flags |= IORESOURCE_SYSRAM_MERGEABLE; + + /* Try to merge with next item in the list. */ + cur = res->sibling; + if (cur && system_ram_resources_mergeable(res, cur)) { + res->end = cur->end; + res->sibling = cur->sibling; + free_resource(cur); + } + + /* Try to merge with previous item in the list. */ + cur = res->parent->child; + while (cur && cur->sibling != res) + cur = cur->sibling; + if (cur && system_ram_resources_mergeable(cur, res)) { + cur->end = res->end; + cur->sibling = res->sibling; + free_resource(res); + } + write_unlock(&resource_lock); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + /* * Managed region resource */ diff --git a/kernel/smp.c b/kernel/smp.c index d0ae8eb6bf8b..d9832a171046 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -741,7 +741,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and - * the the info parameter. The function is called + * the info parameter. The function is called * with preemption disabled. The function should * return a blooean value indicating whether to IPI * the specified CPU. diff --git a/kernel/sys.c b/kernel/sys.c index ab6c409b1159..6401880dff74 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2034,7 +2034,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * - * - @start_brk/@brk which are used in do_brk but kernel lookups + * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these memvers so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 87804e0371fe..e703d5d9cbe8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -515,7 +515,7 @@ EXPORT_SYMBOL(from_kgid_munged); * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test - * for and handle handle INVALID_PROJID being returned. INVALID_PROJID + * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 491789a793ae..ebe5ab111e65 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1768,6 +1768,13 @@ config FAIL_PAGE_ALLOC help Provide fault-injection capability for alloc_pages(). +config FAULT_INJECTION_USERCOPY + bool "Fault injection capability for usercopy functions" + depends on FAULT_INJECTION + help + Provides fault-injection capability to inject failures + in usercopy functions (copy_from_user(), get_user(), ...). + config FAIL_MAKE_REQUEST bool "Fault-injection capability for disk IO" depends on FAULT_INJECTION && BLOCK diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 774315de555a..58f8d03d037b 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -47,6 +47,20 @@ config UBSAN_BOUNDS to the {str,mem}*cpy() family of functions (that is addressed by CONFIG_FORTIFY_SOURCE). +config UBSAN_LOCAL_BOUNDS + bool "Perform array local bounds checking" + depends on UBSAN_TRAP + depends on CC_IS_CLANG + depends on !UBSAN_KCOV_BROKEN + help + This option enables -fsanitize=local-bounds which traps when an + exception/error is detected. Therefore, it should be enabled only + if trapping is expected. + Enabling this option detects errors due to accesses through a + pointer that is derived from an object of a statically-known size, + where an added offset (which may not be known statically) is + out-of-bounds. + config UBSAN_MISC bool "Enable all other Undefined Behavior sanity checks" default UBSAN diff --git a/lib/Makefile b/lib/Makefile index 49a2a9e36224..1c7577b2e86a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -210,6 +210,7 @@ obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o +obj-$(CONFIG_FAULT_INJECTION_USERCOPY) += fault-inject-usercopy.o obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o diff --git a/lib/bitmap.c b/lib/bitmap.c index 61394890788e..75006c4036e9 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -23,7 +23,7 @@ /** * DOC: bitmap introduction * - * bitmaps provide an array of bits, implemented using an an + * bitmaps provide an array of bits, implemented using an * array of unsigned longs. The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of * BITS_PER_LONG. diff --git a/lib/crc32.c b/lib/crc32.c index 35a03d03f973..2a68dfd3b96c 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -331,7 +331,7 @@ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, return crc; } -#if CRC_LE_BITS == 1 +#if CRC_BE_BITS == 1 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE); diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index f9628f3924ce..c72c865032fa 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -390,7 +390,7 @@ static int INIT get_next_block(struct bunzip_data *bd) j = (bd->inbufBits >> bd->inbufBitCount)& ((1 << hufGroup->maxLen)-1); got_huff_bits: - /* Figure how how many bits are in next symbol and + /* Figure how many bits are in next symbol and * unget extras */ i = hufGroup->minLen; while (j > limit[i]) diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index e659a027036e..fde0aa244148 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -60,8 +60,8 @@ void dql_completed(struct dql *dql, unsigned int count) * A decrease is only considered if the queue has been busy in * the whole interval (the check above). * - * If there is slack, the amount of execess data queued above - * the the amount needed to prevent starvation, the queue limit + * If there is slack, the amount of excess data queued above + * the amount needed to prevent starvation, the queue limit * can be decreased. To avoid hysteresis we consider the * minimum amount of slack found over several iterations of the * completion routine. diff --git a/lib/earlycpio.c b/lib/earlycpio.c index c001e084829e..e83628882001 100644 --- a/lib/earlycpio.c +++ b/lib/earlycpio.c @@ -42,7 +42,7 @@ enum cpio_fields { /** * cpio_data find_cpio_data - Search for files in an uncompressed cpio * @path: The directory to search for, including a slash at the end - * @data: Pointer to the the cpio archive or a header inside + * @data: Pointer to the cpio archive or a header inside * @len: Remaining length of the cpio based on data pointer * @nextoff: When a matching file is found, this is the offset from the * beginning of the cpio to the beginning of the next file, not the diff --git a/lib/fault-inject-usercopy.c b/lib/fault-inject-usercopy.c new file mode 100644 index 000000000000..77558b6c29ca --- /dev/null +++ b/lib/fault-inject-usercopy.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/fault-inject.h> +#include <linux/fault-inject-usercopy.h> + +static struct { + struct fault_attr attr; +} fail_usercopy = { + .attr = FAULT_ATTR_INITIALIZER, +}; + +static int __init setup_fail_usercopy(char *str) +{ + return setup_fault_attr(&fail_usercopy.attr, str); +} +__setup("fail_usercopy=", setup_fail_usercopy); + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_usercopy_debugfs(void) +{ + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_usercopy", NULL, + &fail_usercopy.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + return 0; +} + +late_initcall(fail_usercopy_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +bool should_fail_usercopy(void) +{ + return should_fail(&fail_usercopy.attr, 1); +} +EXPORT_SYMBOL_GPL(should_fail_usercopy); diff --git a/lib/find_bit.c b/lib/find_bit.c index 49f875f1baf7..4a8751010d59 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -16,6 +16,7 @@ #include <linux/bitmap.h> #include <linux/export.h> #include <linux/kernel.h> +#include <linux/minmax.h> #if !defined(find_next_bit) || !defined(find_next_zero_bit) || \ !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \ diff --git a/lib/hexdump.c b/lib/hexdump.c index 147133f8eb2f..9301578f98e8 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -7,6 +7,7 @@ #include <linux/ctype.h> #include <linux/errno.h> #include <linux/kernel.h> +#include <linux/minmax.h> #include <linux/export.h> #include <asm/unaligned.h> diff --git a/lib/idr.c b/lib/idr.c index c2cf2c52bbde..3fa8be43696f 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -372,7 +372,8 @@ EXPORT_SYMBOL(idr_replace); * Allocate an ID between @min and @max, inclusive. The allocated ID will * not exceed %INT_MAX, even if @max is larger. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -479,7 +480,8 @@ EXPORT_SYMBOL(ida_alloc_range); * @ida: IDA handle. * @id: Previously allocated ID. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. */ void ida_free(struct ida *ida, unsigned int id) { @@ -531,7 +533,8 @@ EXPORT_SYMBOL(ida_free); * or freed. If the IDA is already empty, there is no need to call this * function. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. */ void ida_destroy(struct ida *ida) { diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 14cae2584db4..1635111c5bd2 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -2,6 +2,7 @@ #include <crypto/hash.h> #include <linux/export.h> #include <linux/bvec.h> +#include <linux/fault-inject-usercopy.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/slab.h> @@ -140,6 +141,8 @@ static int copyout(void __user *to, const void *from, size_t n) { + if (should_fail_usercopy()) + return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); @@ -149,6 +152,8 @@ static int copyout(void __user *to, const void *from, size_t n) static int copyin(void *to, const void __user *from, size_t n) { + if (should_fail_usercopy()) + return n; if (access_ok(from, n)) { instrument_copy_from_user(to, from, n); n = raw_copy_from_user(to, from, n); diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 77ab839644c5..5ca0d815a95d 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -12,7 +12,7 @@ * pages = {}, * month = {June}, *} - * Used by the iSCSI driver, possibly others, and derived from the + * Used by the iSCSI driver, possibly others, and derived from * the iscsi-crc.c module of the linux-iscsi driver at * http://linux-iscsi.sourceforge.net. * diff --git a/lib/math/rational.c b/lib/math/rational.c index df75c8809693..9781d521963d 100644 --- a/lib/math/rational.c +++ b/lib/math/rational.c @@ -11,7 +11,7 @@ #include <linux/rational.h> #include <linux/compiler.h> #include <linux/export.h> -#include <linux/kernel.h> +#include <linux/minmax.h> /* * calculate best rational approximation for a given fraction diff --git a/lib/math/reciprocal_div.c b/lib/math/reciprocal_div.c index bf043258fa00..32436dd4171e 100644 --- a/lib/math/reciprocal_div.c +++ b/lib/math/reciprocal_div.c @@ -4,6 +4,7 @@ #include <asm/div64.h> #include <linux/reciprocal_div.h> #include <linux/export.h> +#include <linux/minmax.h> /* * For a description of the algorithm please have a look at diff --git a/lib/mpi/mpi-bit.c b/lib/mpi/mpi-bit.c index a5119a2bcdd4..142b680835df 100644 --- a/lib/mpi/mpi-bit.c +++ b/lib/mpi/mpi-bit.c @@ -1,4 +1,4 @@ -/* mpi-bit.c - MPI bit level fucntions +/* mpi-bit.c - MPI bit level functions * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index f61689a96e85..00f666d94486 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -85,7 +85,7 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) preempt_disable(); count = __this_cpu_read(*fbc->counters) + amount; - if (count >= batch || count <= -batch) { + if (abs(count) >= batch) { unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); fbc->count += count; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e4a3a4397f2..005ced92a4a9 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -325,7 +325,7 @@ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) int ret = -ENOMEM; /* - * Nodes preloaded by one cgroup can be be used by another cgroup, so + * Nodes preloaded by one cgroup can be used by another cgroup, so * they should never be accounted to any particular memory cgroup. */ gfp_mask &= ~__GFP_ACCOUNT; diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 5d63a8857f36..d94628fa3349 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -504,7 +504,7 @@ struct scatterlist *sgl_alloc_order(unsigned long long length, nalloc++; } sgl = kmalloc_array(nalloc, sizeof(struct scatterlist), - (gfp & ~GFP_DMA) | __GFP_ZERO); + gfp & ~GFP_DMA); if (!sgl) return NULL; diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 34696a348864..e6d5fcc2cdf3 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/export.h> +#include <linux/fault-inject-usercopy.h> #include <linux/kasan-checks.h> #include <linux/thread_info.h> #include <linux/uaccess.h> @@ -99,6 +100,8 @@ long strncpy_from_user(char *dst, const char __user *src, long count) unsigned long max_addr, src_addr; might_fault(); + if (should_fail_usercopy()) + return -EFAULT; if (unlikely(count <= 0)) return 0; diff --git a/lib/syscall.c b/lib/syscall.c index fb328e7ccb08..8533d2fea2d7 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -44,7 +44,7 @@ static int collect_syscall(struct task_struct *target, struct syscall_info *info * .data.instruction_pointer - filled with user PC * * If @target is blocked in a system call, returns zero with @info.data.nr - * set to the the call's number and @info.data.args filled in with its + * set to the call's number and @info.data.args filled in with its * arguments. Registers not used for system call arguments may not be available * and it is not kosher to use &struct user_regset calls while the system * call is still in progress. Note we may get this result if @target diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e151a7f10519..80a78877bd93 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -461,7 +461,7 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); if (!devmem) - return -ENOMEM; + return false; res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, "hmm_dmirror"); diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 98bc92a91662..3750323973f4 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -16,7 +16,7 @@ */ /* - * This module provides an interface to the the proc sysctl interfaces. This + * This module provides an interface to the proc sysctl interfaces. This * driver requires CONFIG_PROC_SYSCTL. It will not normally be loaded by the * system unless explicitly requested by name. You can also build this driver * into your kernel. diff --git a/lib/test_xarray.c b/lib/test_xarray.c index d4f97925dbd8..8262c3f05a5d 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1503,6 +1503,49 @@ static noinline void check_store_range(struct xarray *xa) } } +#ifdef CONFIG_XARRAY_MULTI +static void check_split_1(struct xarray *xa, unsigned long index, + unsigned int order) +{ + XA_STATE(xas, xa, index); + void *entry; + unsigned int i = 0; + + xa_store_order(xa, index, order, xa, GFP_KERNEL); + + xas_split_alloc(&xas, xa, order, GFP_KERNEL); + xas_lock(&xas); + xas_split(&xas, xa, order); + xas_unlock(&xas); + + xa_for_each(xa, index, entry) { + XA_BUG_ON(xa, entry != xa); + i++; + } + XA_BUG_ON(xa, i != 1 << order); + + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0)); + + xa_destroy(xa); +} + +static noinline void check_split(struct xarray *xa) +{ + unsigned int order; + + XA_BUG_ON(xa, !xa_empty(xa)); + + for (order = 1; order < 2 * XA_CHUNK_SHIFT; order++) { + check_split_1(xa, 0, order); + check_split_1(xa, 1UL << order, order); + check_split_1(xa, 3UL << order, order); + } +} +#else +static void check_split(struct xarray *xa) { } +#endif + static void check_align_1(struct xarray *xa, char *name) { int i; @@ -1649,6 +1692,26 @@ static noinline void check_account(struct xarray *xa) #endif } +static noinline void check_get_order(struct xarray *xa) +{ + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; + unsigned int order; + unsigned long i, j; + + for (i = 0; i < 3; i++) + XA_BUG_ON(xa, xa_get_order(xa, i) != 0); + + for (order = 0; order < max_order; order++) { + for (i = 0; i < 10; i++) { + xa_store_order(xa, i << order, order, + xa_mk_index(i << order), GFP_KERNEL); + for (j = i << order; j < (i + 1) << order; j++) + XA_BUG_ON(xa, xa_get_order(xa, j) != order); + xa_erase(xa, i << order); + } + } +} + static noinline void check_destroy(struct xarray *xa) { unsigned long index; @@ -1697,6 +1760,7 @@ static int xarray_checks(void) check_reserve(&array); check_reserve(&xa0); check_multi_store(&array); + check_get_order(&array); check_xa_alloc(); check_find(&array); check_find_entry(&array); @@ -1708,6 +1772,7 @@ static int xarray_checks(void) check_store_range(&array); check_store_iter(&array); check_align(&xa0); + check_split(&array); check_workingset(&array, 0); check_workingset(&array, 64); diff --git a/lib/usercopy.c b/lib/usercopy.c index b26509f112f9..7413dd300516 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> +#include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/uaccess.h> @@ -10,7 +11,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n { unsigned long res = n; might_fault(); - if (likely(access_ok(from, n))) { + if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user(to, from, n); res = raw_copy_from_user(to, from, n); } @@ -25,6 +26,8 @@ EXPORT_SYMBOL(_copy_from_user); unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; if (likely(access_ok(to, n))) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); diff --git a/lib/xarray.c b/lib/xarray.c index e9e641d3c0c3..b76eea7b314c 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -266,13 +266,14 @@ static void xa_node_free(struct xa_node *node) */ static void xas_destroy(struct xa_state *xas) { - struct xa_node *node = xas->xa_alloc; + struct xa_node *next, *node = xas->xa_alloc; - if (!node) - return; - XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); - kmem_cache_free(radix_tree_node_cachep, node); - xas->xa_alloc = NULL; + while (node) { + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + next = rcu_dereference_raw(node->parent); + radix_tree_node_rcu_free(&node->rcu_head); + xas->xa_alloc = node = next; + } } /** @@ -304,6 +305,7 @@ bool xas_nomem(struct xa_state *xas, gfp_t gfp) xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); if (!xas->xa_alloc) return false; + xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; @@ -339,6 +341,7 @@ static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) } if (!xas->xa_alloc) return false; + xas->xa_alloc->parent = NULL; XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); xas->xa_node = XAS_RESTART; return true; @@ -403,7 +406,7 @@ static unsigned long xas_size(const struct xa_state *xas) /* * Use this to calculate the maximum index that will need to be created * in order to add the entry described by @xas. Because we cannot store a - * multiple-index entry at index 0, the calculation is a little more complex + * multi-index entry at index 0, the calculation is a little more complex * than you might expect. */ static unsigned long xas_max(struct xa_state *xas) @@ -946,6 +949,153 @@ void xas_init_marks(const struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_init_marks); +#ifdef CONFIG_XARRAY_MULTI +static unsigned int node_get_marks(struct xa_node *node, unsigned int offset) +{ + unsigned int marks = 0; + xa_mark_t mark = XA_MARK_0; + + for (;;) { + if (node_get_mark(node, offset, mark)) + marks |= 1 << (__force unsigned int)mark; + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } + + return marks; +} + +static void node_set_marks(struct xa_node *node, unsigned int offset, + struct xa_node *child, unsigned int marks) +{ + xa_mark_t mark = XA_MARK_0; + + for (;;) { + if (marks & (1 << (__force unsigned int)mark)) { + node_set_mark(node, offset, mark); + if (child) + node_mark_all(child, mark); + } + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } +} + +/** + * xas_split_alloc() - Allocate memory for splitting an entry. + * @xas: XArray operation state. + * @entry: New entry which will be stored in the array. + * @order: New entry order. + * @gfp: Memory allocation flags. + * + * This function should be called before calling xas_split(). + * If necessary, it will allocate new nodes (and fill them with @entry) + * to prepare for the upcoming split of an entry of @order size into + * entries of the order stored in the @xas. + * + * Context: May sleep if @gfp flags permit. + */ +void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, + gfp_t gfp) +{ + unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; + unsigned int mask = xas->xa_sibs; + + /* XXX: no support for splitting really large entries yet */ + if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order)) + goto nomem; + if (xas->xa_shift + XA_CHUNK_SHIFT > order) + return; + + do { + unsigned int i; + void *sibling; + struct xa_node *node; + + node = kmem_cache_alloc(radix_tree_node_cachep, gfp); + if (!node) + goto nomem; + node->array = xas->xa; + for (i = 0; i < XA_CHUNK_SIZE; i++) { + if ((i & mask) == 0) { + RCU_INIT_POINTER(node->slots[i], entry); + sibling = xa_mk_sibling(0); + } else { + RCU_INIT_POINTER(node->slots[i], sibling); + } + } + RCU_INIT_POINTER(node->parent, xas->xa_alloc); + xas->xa_alloc = node; + } while (sibs-- > 0); + + return; +nomem: + xas_destroy(xas); + xas_set_err(xas, -ENOMEM); +} +EXPORT_SYMBOL_GPL(xas_split_alloc); + +/** + * xas_split() - Split a multi-index entry into smaller entries. + * @xas: XArray operation state. + * @entry: New entry to store in the array. + * @order: New entry order. + * + * The value in the entry is copied to all the replacement entries. + * + * Context: Any context. The caller should hold the xa_lock. + */ +void xas_split(struct xa_state *xas, void *entry, unsigned int order) +{ + unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; + unsigned int offset, marks; + struct xa_node *node; + void *curr = xas_load(xas); + int values = 0; + + node = xas->xa_node; + if (xas_top(node)) + return; + + marks = node_get_marks(node, xas->xa_offset); + + offset = xas->xa_offset + sibs; + do { + if (xas->xa_shift < node->shift) { + struct xa_node *child = xas->xa_alloc; + + xas->xa_alloc = rcu_dereference_raw(child->parent); + child->shift = node->shift - XA_CHUNK_SHIFT; + child->offset = offset; + child->count = XA_CHUNK_SIZE; + child->nr_values = xa_is_value(entry) ? + XA_CHUNK_SIZE : 0; + RCU_INIT_POINTER(child->parent, node); + node_set_marks(node, offset, child, marks); + rcu_assign_pointer(node->slots[offset], + xa_mk_node(child)); + if (xa_is_value(curr)) + values--; + } else { + unsigned int canon = offset - xas->xa_sibs; + + node_set_marks(node, canon, NULL, marks); + rcu_assign_pointer(node->slots[canon], entry); + while (offset > canon) + rcu_assign_pointer(node->slots[offset--], + xa_mk_sibling(canon)); + values += (xa_is_value(entry) - xa_is_value(curr)) * + (xas->xa_sibs + 1); + } + } while (offset-- > xas->xa_offset); + + node->nr_values += values; +} +EXPORT_SYMBOL_GPL(xas_split); +#endif + /** * xas_pause() - Pause a walk to drop a lock. * @xas: XArray operation state. @@ -1407,7 +1557,7 @@ EXPORT_SYMBOL(__xa_store); * @gfp: Memory allocation flags. * * After this function returns, loads from this index will return @entry. - * Storing into an existing multislot entry updates the entry of every index. + * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Any context. Takes and releases the xa_lock. @@ -1549,7 +1699,7 @@ static void xas_set_range(struct xa_state *xas, unsigned long first, * * After this function returns, loads from any index between @first and @last, * inclusive will return @entry. - * Storing into an existing multislot entry updates the entry of every index. + * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Process context. Takes and releases the xa_lock. May sleep @@ -1592,6 +1742,46 @@ unlock: return xas_result(&xas, NULL); } EXPORT_SYMBOL(xa_store_range); + +/** + * xa_get_order() - Get the order of an entry. + * @xa: XArray. + * @index: Index of the entry. + * + * Return: A number between 0 and 63 indicating the order of the entry. + */ +int xa_get_order(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *entry; + int order = 0; + + rcu_read_lock(); + entry = xas_load(&xas); + + if (!entry) + goto unlock; + + if (!xas.xa_node) + goto unlock; + + for (;;) { + unsigned int slot = xas.xa_offset + (1 << order); + + if (slot >= XA_CHUNK_SIZE) + break; + if (!xa_is_sibling(xas.xa_node->slots[slot])) + break; + order++; + } + + order += xas.xa_node->shift; +unlock: + rcu_read_unlock(); + + return order; +} +EXPORT_SYMBOL(xa_get_order); #endif /* CONFIG_XARRAY_MULTI */ /** diff --git a/mm/Kconfig b/mm/Kconfig index e72e61c1d62e..c7f30f8b282b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -152,6 +152,7 @@ config HAVE_BOOTMEM_INFO_NODE # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" + select MEMORY_ISOLATION depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG depends on 64BIT || BROKEN @@ -178,7 +179,6 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE config MEMORY_HOTREMOVE bool "Allow for memory hot remove" - select MEMORY_ISOLATION select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION diff --git a/mm/compaction.c b/mm/compaction.c index 6c63844fc061..6e0ee5641788 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -625,7 +625,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, } /* Found a free page, will break it into order-0 pages */ - order = page_order(page); + order = buddy_order(page); isolated = __isolate_free_page(page, order); if (!isolated) break; @@ -898,7 +898,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * potential isolation targets. */ if (PageBuddy(page)) { - unsigned long freepage_order = page_order_unsafe(page); + unsigned long freepage_order = buddy_order_unsafe(page); /* * Without lock, we cannot be sure that what we got is @@ -1172,7 +1172,7 @@ static bool suitable_migration_target(struct compact_control *cc, * the only small danger is that we skip a potentially suitable * pageblock, so it's not worth to check order for valid range. */ - if (page_order_unsafe(page) >= pageblock_order) + if (buddy_order_unsafe(page) >= pageblock_order) return false; } diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 086309fb9b6f..c05d9dcf7891 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -28,6 +28,7 @@ #include <linux/swapops.h> #include <linux/start_kernel.h> #include <linux/sched/mm.h> +#include <linux/io.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> @@ -44,10 +45,17 @@ * entry type. But these bits might affect the ability to clear entries with * pxx_clear() because of how dynamic page table folding works on s390. So * while loading up the entries do not change the lower 4 bits. It does not - * have affect any other platform. + * have affect any other platform. Also avoid the 62nd bit on ppc64 that is + * used to mark a pte entry. */ -#define S390_MASK_BITS 4 -#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS) +#define S390_SKIP_MASK GENMASK(3, 0) +#if __BITS_PER_LONG == 64 +#define PPC64_SKIP_MASK GENMASK(62, 62) +#else +#define PPC64_SKIP_MASK 0x0 +#endif +#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK) +#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) @@ -71,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm, { pte_t pte = pfn_pte(pfn, prot); + /* + * Architectures optimize set_pte_at by avoiding TLB flush. + * This requires set_pte_at to be not used to update an + * existing pte entry. Clear pte before we do set_pte_at + */ + pr_debug("Validating PTE advanced\n"); pte = pfn_pte(pfn, prot); set_pte_at(mm, vaddr, ptep, pte); ptep_set_wrprotect(mm, vaddr, ptep); pte = ptep_get(ptep); WARN_ON(pte_write(pte)); - - pte = pfn_pte(pfn, prot); - set_pte_at(mm, vaddr, ptep, pte); ptep_get_and_clear(mm, vaddr, ptep); pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); @@ -93,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm, ptep_set_access_flags(vma, vaddr, ptep, pte, 1); pte = ptep_get(ptep); WARN_ON(!(pte_write(pte) && pte_dirty(pte))); - - pte = pfn_pte(pfn, prot); - set_pte_at(mm, vaddr, ptep, pte); ptep_get_and_clear_full(mm, vaddr, ptep, 1); pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); + pte = pfn_pte(pfn, prot); pte = pte_mkyoung(pte); set_pte_at(mm, vaddr, ptep, pte); ptep_test_and_clear_young(vma, vaddr, ptep); @@ -111,10 +120,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) { pte_t pte = pfn_pte(pfn, prot); + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + pr_debug("Validating PTE saved write\n"); WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte)))); WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte)))); } + #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { @@ -141,7 +154,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, - pgprot_t prot) + pgprot_t prot, pgtable_t pgtable) { pmd_t pmd = pfn_pmd(pfn, prot); @@ -152,14 +165,13 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, /* Align the address wrt HPAGE_PMD_SIZE */ vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; + pgtable_trans_huge_deposit(mm, pmdp, pgtable); + pmd = pfn_pmd(pfn, prot); set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_set_wrprotect(mm, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_write(pmd)); - - pmd = pfn_pmd(pfn, prot); - set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_huge_get_and_clear(mm, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); @@ -173,18 +185,20 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1); pmd = READ_ONCE(*pmdp); WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd))); - - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); - set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1); pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); + pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); pmd = pmd_mkyoung(pmd); set_pmd_at(mm, vaddr, pmdp, pmd); pmdp_test_and_clear_young(vma, vaddr, pmdp); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_young(pmd)); + + /* Clear the pte entries */ + pmdp_huge_get_and_clear(mm, vaddr, pmdp); + pgtable = pgtable_trans_huge_withdraw(mm, pmdp); } static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) @@ -199,11 +213,12 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pmd_leaf(pmd)); } +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { pmd_t pmd; - if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + if (!arch_ioremap_pmd_supported()) return; pr_debug("Validating PMD huge\n"); @@ -217,11 +232,17 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) pmd = READ_ONCE(*pmdp); WARN_ON(!pmd_none(pmd)); } +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { pmd_t pmd = pfn_pmd(pfn, prot); + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + pr_debug("Validating PMD saved write\n"); WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); @@ -272,17 +293,9 @@ static void __init pud_advanced_tests(struct mm_struct *mm, WARN_ON(pud_write(pud)); #ifndef __PAGETABLE_PMD_FOLDED - pud = pfn_pud(pfn, prot); - set_pud_at(mm, vaddr, pudp, pud); pudp_huge_get_and_clear(mm, vaddr, pudp); pud = READ_ONCE(*pudp); WARN_ON(!pud_none(pud)); - - pud = pfn_pud(pfn, prot); - set_pud_at(mm, vaddr, pudp, pud); - pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); - pud = READ_ONCE(*pudp); - WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(pfn, prot); pud = pud_wrprotect(pud); @@ -294,11 +307,20 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pud = READ_ONCE(*pudp); WARN_ON(!(pud_write(pud) && pud_dirty(pud))); +#ifndef __PAGETABLE_PMD_FOLDED + pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); + pud = READ_ONCE(*pudp); + WARN_ON(!pud_none(pud)); +#endif /* __PAGETABLE_PMD_FOLDED */ + + pud = pfn_pud(pfn, prot); pud = pud_mkyoung(pud); set_pud_at(mm, vaddr, pudp, pud); pudp_test_and_clear_young(vma, vaddr, pudp); pud = READ_ONCE(*pudp); WARN_ON(pud_young(pud)); + + pudp_huge_get_and_clear(mm, vaddr, pudp); } static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) @@ -313,11 +335,12 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pud_leaf(pud)); } +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { pud_t pud; - if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + if (!arch_ioremap_pud_supported()) return; pr_debug("Validating PUD huge\n"); @@ -331,6 +354,10 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) pud = READ_ONCE(*pudp); WARN_ON(!pud_none(pud)); } +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } +#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ + #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } static void __init pud_advanced_tests(struct mm_struct *mm, @@ -350,7 +377,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, - pgprot_t prot) + pgprot_t prot, pgtable_t pgtable) { } static void __init pud_advanced_tests(struct mm_struct *mm, @@ -417,8 +444,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, * This entry points to next level page table page. * Hence this must not qualify as pud_bad(). */ - pmd_clear(pmdp); - pud_clear(pudp); pud_populate(mm, pudp, pmdp); pud = READ_ONCE(*pudp); WARN_ON(pud_bad(pud)); @@ -515,12 +540,15 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, #endif /* PAGETABLE_P4D_FOLDED */ static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, - unsigned long vaddr) + unsigned long pfn, unsigned long vaddr, + pgprot_t prot) { - pte_t pte = ptep_get(ptep); + pte_t pte = pfn_pte(pfn, prot); pr_debug("Validating PTE clear\n"); +#ifndef CONFIG_RISCV pte = __pte(pte_val(pte) | RANDOM_ORVALUE); +#endif set_pte_at(mm, vaddr, ptep, pte); barrier(); pte_clear(mm, vaddr, ptep); @@ -550,7 +578,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, * This entry points to next level page table page. * Hence this must not qualify as pmd_bad(). */ - pmd_clear(pmdp); pmd_populate(mm, pmdp, pgtable); pmd = READ_ONCE(*pmdp); WARN_ON(pmd_bad(pmd)); @@ -784,57 +811,8 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pte_huge(pte_mkhuge(pte))); #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } - -static void __init hugetlb_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, - pte_t *ptep, unsigned long pfn, - unsigned long vaddr, pgprot_t prot) -{ - struct page *page = pfn_to_page(pfn); - pte_t pte = ptep_get(ptep); - unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK; - - pr_debug("Validating HugeTLB advanced\n"); - pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot)); - set_huge_pte_at(mm, vaddr, ptep, pte); - barrier(); - WARN_ON(!pte_same(pte, huge_ptep_get(ptep))); - huge_pte_clear(mm, vaddr, ptep, PMD_SIZE); - pte = huge_ptep_get(ptep); - WARN_ON(!huge_pte_none(pte)); - - pte = mk_huge_pte(page, prot); - set_huge_pte_at(mm, vaddr, ptep, pte); - barrier(); - huge_ptep_set_wrprotect(mm, vaddr, ptep); - pte = huge_ptep_get(ptep); - WARN_ON(huge_pte_write(pte)); - - pte = mk_huge_pte(page, prot); - set_huge_pte_at(mm, vaddr, ptep, pte); - barrier(); - huge_ptep_get_and_clear(mm, vaddr, ptep); - pte = huge_ptep_get(ptep); - WARN_ON(!huge_pte_none(pte)); - - pte = mk_huge_pte(page, prot); - pte = huge_pte_wrprotect(pte); - set_huge_pte_at(mm, vaddr, ptep, pte); - barrier(); - pte = huge_pte_mkwrite(pte); - pte = huge_pte_mkdirty(pte); - huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1); - pte = huge_ptep_get(ptep); - WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte))); -} #else /* !CONFIG_HUGETLB_PAGE */ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { } -static void __init hugetlb_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, - pte_t *ptep, unsigned long pfn, - unsigned long vaddr, pgprot_t prot) -{ -} #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -965,7 +943,13 @@ static int __init debug_vm_pgtable(void) p4dp = p4d_alloc(mm, pgdp, vaddr); pudp = pud_alloc(mm, p4dp, vaddr); pmdp = pmd_alloc(mm, pudp, vaddr); - ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + /* + * Allocate pgtable_t + */ + if (pte_alloc(mm, pmdp)) { + pr_err("pgtable allocation failed\n"); + return 1; + } /* * Save all the page table page addresses as the page table @@ -985,32 +969,11 @@ static int __init debug_vm_pgtable(void) p4d_basic_tests(p4d_aligned, prot); pgd_basic_tests(pgd_aligned, prot); - pte_clear_tests(mm, ptep, vaddr); - pmd_clear_tests(mm, pmdp); - pud_clear_tests(mm, pudp); - p4d_clear_tests(mm, p4dp); - pgd_clear_tests(mm, pgdp); - - pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot); - pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); - hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pmd_leaf_tests(pmd_aligned, prot); pud_leaf_tests(pud_aligned, prot); - pmd_huge_tests(pmdp, pmd_aligned, prot); - pud_huge_tests(pudp, pud_aligned, prot); - - pte_savedwrite_tests(pte_aligned, prot); - pmd_savedwrite_tests(pmd_aligned, prot); - - pte_unmap_unlock(ptep, ptl); - - pmd_populate_tests(mm, pmdp, saved_ptep); - pud_populate_tests(mm, pudp, saved_pmdp); - p4d_populate_tests(mm, p4dp, saved_pudp); - pgd_populate_tests(mm, pgdp, saved_p4dp); + pte_savedwrite_tests(pte_aligned, protnone); + pmd_savedwrite_tests(pmd_aligned, protnone); pte_special_tests(pte_aligned, prot); pte_protnone_tests(pte_aligned, protnone); @@ -1029,11 +992,43 @@ static int __init debug_vm_pgtable(void) pmd_swap_tests(pmd_aligned, prot); swap_migration_tests(); - hugetlb_basic_tests(pte_aligned, prot); pmd_thp_tests(pmd_aligned, prot); pud_thp_tests(pud_aligned, prot); + hugetlb_basic_tests(pte_aligned, prot); + + /* + * Page table modifying tests. They need to hold + * proper page table lock. + */ + + ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl); + pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot); + pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); + pte_unmap_unlock(ptep, ptl); + + ptl = pmd_lock(mm, pmdp); + pmd_clear_tests(mm, pmdp); + pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); + pmd_huge_tests(pmdp, pmd_aligned, prot); + pmd_populate_tests(mm, pmdp, saved_ptep); + spin_unlock(ptl); + + ptl = pud_lock(mm, pudp); + pud_clear_tests(mm, pudp); + pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); + pud_huge_tests(pudp, pud_aligned, prot); + pud_populate_tests(mm, pudp, saved_pmdp); + spin_unlock(ptl); + + spin_lock(&mm->page_table_lock); + p4d_clear_tests(mm, p4dp); + pgd_clear_tests(mm, pgdp); + p4d_populate_tests(mm, p4dp, saved_pudp); + pgd_populate_tests(mm, pgdp, saved_p4dp); + spin_unlock(&mm->page_table_lock); + p4d_free(mm, saved_p4dp); pud_free(mm, saved_pudp); pmd_free(mm, saved_pmdp); diff --git a/mm/filemap.c b/mm/filemap.c index e3b8987153e6..1a6beaf69f49 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping, freepage(page); if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, HPAGE_PMD_NR); + page_ref_sub(page, thp_nr_pages(page)); VM_BUG_ON_PAGE(page_count(page) <= 0, page); } else { put_page(page); @@ -829,13 +829,12 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page); noinline int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask, + pgoff_t offset, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); int error; - void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); @@ -846,25 +845,46 @@ noinline int __add_to_page_cache_locked(struct page *page, page->index = offset; if (!huge) { - error = mem_cgroup_charge(page, current->mm, gfp_mask); + error = mem_cgroup_charge(page, current->mm, gfp); if (error) goto error; } + gfp &= GFP_RECLAIM_MASK; + do { + unsigned int order = xa_get_order(xas.xa, xas.xa_index); + void *entry, *old = NULL; + + if (order > thp_order(page)) + xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), + order, gfp); xas_lock_irq(&xas); - old = xas_load(&xas); - if (old && !xa_is_value(old)) - xas_set_err(&xas, -EEXIST); + xas_for_each_conflict(&xas, entry) { + old = entry; + if (!xa_is_value(entry)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + } + + if (old) { + if (shadowp) + *shadowp = old; + /* entry may have been split before we acquired lock */ + order = xa_get_order(xas.xa, xas.xa_index); + if (order > thp_order(page)) { + xas_split(&xas, old, order); + xas_reset(&xas); + } + } + xas_store(&xas, page); if (xas_error(&xas)) goto unlock; - if (xa_is_value(old)) { + if (old) mapping->nrexceptional--; - if (shadowp) - *shadowp = old; - } mapping->nrpages++; /* hugetlb pages do not participate in page cache accounting */ @@ -872,7 +892,7 @@ noinline int __add_to_page_cache_locked(struct page *page, __inc_lruvec_page_state(page, NR_FILE_PAGES); unlock: xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); + } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { error = xas_error(&xas); @@ -1425,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem * unlock_page - unlock a locked page * @page: the page * - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). + * Unlocks the page and wakes up sleepers in wait_on_page_locked(). * Also wakes sleepers in wait_on_page_writeback() because the wakeup * mechanism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. @@ -2568,8 +2588,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff); struct file *fpin = NULL; - pgoff_t offset = vmf->pgoff; unsigned int mmap_miss; /* If we don't want any read-ahead, don't bother */ @@ -2580,8 +2600,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (vmf->vma->vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_sync_readahead(mapping, ra, file, offset, - ra->ra_pages); + page_cache_sync_ra(&ractl, ra, ra->ra_pages); return fpin; } @@ -2601,10 +2620,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * mmap read-around */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); - ra->start = max_t(long, 0, offset - ra->ra_pages / 2); + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; - ra_submit(ra, mapping, file); + ractl._index = ra->start; + do_page_cache_ra(&ractl, ra->size, ra->async_size); return fpin; } @@ -2984,7 +3004,7 @@ filler: goto out; /* - * Page is not up to date and may be locked due one of the following + * Page is not up to date and may be locked due to one of the following * case a: Page is being filled and the page lock is held * case b: Read/write error clearing the page uptodate status * case c: Truncation in progress (page locked) @@ -1490,35 +1490,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ } - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_lock, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; -} -#endif /* CONFIG_ELF_CORE */ #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1564,6 +1535,38 @@ finish_or_fault: } #endif /* !CONFIG_MMU */ +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_lock (takes and releases the mmap_lock by itself). + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct mm_struct *mm = current->mm; + struct page *page; + int locked = 1; + int ret; + + if (mmap_read_lock_killable(mm)) + return NULL; + ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked, + FOLL_FORCE | FOLL_DUMP | FOLL_GET); + if (locked) + mmap_read_unlock(mm); + return (ret == 1) ? page : NULL; +} +#endif /* CONFIG_ELF_CORE */ + #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { diff --git a/mm/highmem.c b/mm/highmem.c index 64d8dea47dd1..1352a27951e3 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -369,7 +369,7 @@ void kunmap_high(struct page *page) } EXPORT_SYMBOL(kunmap_high); -#endif +#endif /* CONFIG_HIGHMEM */ #if defined(HASHED_PAGE_VIRTUAL) @@ -481,4 +481,4 @@ void __init page_address_init(void) } } -#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ +#endif /* defined(HASHED_PAGE_VIRTUAL) */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 65c289c13b58..9474dbc150ed 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2335,13 +2335,13 @@ static void unmap_page(struct page *page) VM_BUG_ON_PAGE(!unmap_success, page); } -static void remap_page(struct page *page) +static void remap_page(struct page *page, unsigned int nr) { int i; if (PageTransHuge(page)) { remove_migration_ptes(page, page, true); } else { - for (i = 0; i < HPAGE_PMD_NR; i++) + for (i = 0; i < nr; i++) remove_migration_ptes(page + i, page + i, true); } } @@ -2419,6 +2419,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; + unsigned int nr = thp_nr_pages(head); int i; lruvec = mem_cgroup_page_lruvec(head, pgdat); @@ -2434,7 +2435,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_lock(&swap_cache->i_pages); } - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { + for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ if (head[i].index >= end) { @@ -2454,7 +2455,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); - split_page_owner(head, HPAGE_PMD_ORDER); + split_page_owner(head, nr); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { @@ -2473,9 +2474,15 @@ static void __split_huge_page(struct page *page, struct list_head *list, spin_unlock_irqrestore(&pgdat->lru_lock, flags); - remap_page(head); + remap_page(head, nr); - for (i = 0; i < HPAGE_PMD_NR; i++) { + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + split_swap_cluster(entry); + } + + for (i = 0; i < nr; i++) { struct page *subpage = head + i; if (subpage == page) continue; @@ -2494,7 +2501,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, int total_mapcount(struct page *page) { - int i, compound, ret; + int i, compound, nr, ret; VM_BUG_ON_PAGE(PageTail(page), page); @@ -2502,16 +2509,17 @@ int total_mapcount(struct page *page) return atomic_read(&page->_mapcount) + 1; compound = compound_mapcount(page); + nr = compound_nr(page); if (PageHuge(page)) return compound; ret = compound; - for (i = 0; i < HPAGE_PMD_NR; i++) + for (i = 0; i < nr; i++) ret += atomic_read(&page[i]._mapcount) + 1; /* File pages has compound_mapcount included in _mapcount */ if (!PageAnon(page)) - return ret - compound * HPAGE_PMD_NR; + return ret - compound * nr; if (PageDoubleMap(page)) - ret -= HPAGE_PMD_NR; + ret -= nr; return ret; } @@ -2556,14 +2564,14 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) page = compound_head(page); _total_mapcount = ret = 0; - for (i = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0; i < thp_nr_pages(page); i++) { mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); _total_mapcount += mapcount; } if (PageDoubleMap(page)) { ret -= 1; - _total_mapcount -= HPAGE_PMD_NR; + _total_mapcount -= thp_nr_pages(page); } mapcount = compound_mapcount(page); ret += mapcount; @@ -2580,9 +2588,9 @@ bool can_split_huge_page(struct page *page, int *pextra_pins) /* Additional pins from page cache */ if (PageAnon(page)) - extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; + extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0; else - extra_pins = HPAGE_PMD_NR; + extra_pins = thp_nr_pages(page); if (pextra_pins) *pextra_pins = extra_pins; return total_mapcount(page) == page_count(page) - extra_pins - 1; @@ -2709,12 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } __split_huge_page(page, list, end, flags); - if (PageSwapCache(head)) { - swp_entry_t entry = { .val = page_private(head) }; - - ret = split_swap_cluster(entry); - } else - ret = 0; + ret = 0; } else { if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { pr_alert("total_mapcount: %u, page_count(): %u\n", @@ -2728,7 +2731,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) fail: if (mapping) xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(&pgdata->lru_lock, flags); - remap_page(head); + remap_page(head, thp_nr_pages(head)); ret = -EBUSY; } diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e488876b168a..1ae1ebc2b9b1 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -26,11 +26,6 @@ static int hwpoison_inject(void *data, u64 val) p = pfn_to_page(pfn); hpage = compound_head(p); - /* - * This implies unable to support free buddy pages. - */ - if (!get_hwpoison_page(p)) - return 0; if (!hwpoison_filter_enable) goto inject; @@ -40,23 +35,20 @@ static int hwpoison_inject(void *data, u64 val) * This implies unable to support non-LRU pages. */ if (!PageLRU(hpage) && !PageHuge(p)) - goto put_out; + return 0; /* - * do a racy check with elevated page count, to make sure PG_hwpoison - * will only be set for the targeted owner (or on a free page). + * do a racy check to make sure PG_hwpoison will only be set for + * the targeted owner (or on a free page). * memory_failure() will redo the check reliably inside page lock. */ err = hwpoison_filter(hpage); if (err) - goto put_out; + return 0; inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); - return memory_failure(pfn, MF_COUNT_INCREASED); -put_out: - put_hwpoison_page(p); - return 0; + return memory_failure(pfn, 0); } static int hwpoison_unpoison(void *data, u64 val) diff --git a/mm/internal.h b/mm/internal.h index a801a4d51f26..c43ccdddb0f6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,20 +49,15 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); -void force_page_cache_readahead(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read); -void __do_page_cache_readahead(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read, +void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); - -/* - * Submit IO for the read-ahead request in file_ra_state. - */ -static inline void ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *filp) +void force_page_cache_ra(struct readahead_control *, struct file_ra_state *, + unsigned long nr); +static inline void force_page_cache_readahead(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read) { - __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); + DEFINE_READAHEAD(ractl, file, mapping, index); + force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); } struct page *find_get_entry(struct address_space *mapping, pgoff_t index); @@ -275,16 +270,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the * page cannot be allocated or merged in parallel. Alternatively, it must - * handle invalid values gracefully, and use page_order_unsafe() below. + * handle invalid values gracefully, and use buddy_order_unsafe() below. */ -static inline unsigned int page_order(struct page *page) +static inline unsigned int buddy_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); } /* - * Like page_order(), but for callers who cannot afford to hold the zone lock. + * Like buddy_order(), but for callers who cannot afford to hold the zone lock. * PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. * @@ -294,7 +289,7 @@ static inline unsigned int page_order(struct page *page) * times, potentially observing different values in the tests and the actual * use of the result. */ -#define page_order_unsafe(page) READ_ONCE(page_private(page)) +#define buddy_order_unsafe(page) READ_ONCE(page_private(page)) static inline bool is_cow_mapping(vm_flags_t flags) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 58b0d9c502a1..4e3dff13eb70 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -434,7 +434,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, static inline int khugepaged_test_exit(struct mm_struct *mm) { - return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm); + return atomic_read(&mm->mm_users) == 0; } static bool hugepage_vma_check(struct vm_area_struct *vma, diff --git a/mm/madvise.c b/mm/madvise.c index 9b065d412e5f..fd1f448b4e1d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -872,7 +872,6 @@ static long madvise_remove(struct vm_area_struct *vma, static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { - struct page *page; struct zone *zone; unsigned long size; @@ -882,6 +881,7 @@ static int madvise_inject_error(int behavior, for (; start < end; start += size) { unsigned long pfn; + struct page *page; int ret; ret = get_user_pages_fast(start, 1, 0, &page); @@ -896,32 +896,23 @@ static int madvise_inject_error(int behavior, */ size = page_size(compound_head(page)); - if (PageHWPoison(page)) { - put_page(page); - continue; - } - if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", - pfn, start); - + pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); - if (ret) - return ret; - continue; + } else { + pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", + pfn, start); + /* + * Drop the page reference taken by get_user_pages_fast(). In + * the absence of MF_COUNT_INCREASED the memory_failure() + * routine is responsible for pinning the page to prevent it + * from being released back to the page allocator. + */ + put_page(page); + ret = memory_failure(pfn, 0); } - pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", - pfn, start); - - /* - * Drop the page reference taken by get_user_pages_fast(). In - * the absence of MF_COUNT_INCREASED the memory_failure() - * routine is responsible for pinning the page to prevent it - * from being released back to the page allocator. - */ - put_page(page); - ret = memory_failure(pfn, 0); if (ret) return ret; } @@ -1094,23 +1085,6 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) if (write) { if (mmap_write_lock_killable(current->mm)) return -EINTR; - - /* - * We may have stolen the mm from another process - * that is undergoing core dumping. - * - * Right now that's io_ring, in the future it may - * be remote process management and not "current" - * at all. - * - * We need to fix core dumping to not do this, - * but for now we have the mmget_still_valid() - * model. - */ - if (!mmget_still_valid(current->mm)) { - mmap_write_unlock(current->mm); - return -EINTR; - } } else { mmap_read_lock(current->mm); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 990e3b2e37d5..a2184b721fbf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -65,6 +65,33 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); +static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) +{ + if (hugepage_or_freepage) { + /* + * Doing this check for free pages is also fine since dissolve_free_huge_page + * returns 0 for non-hugetlb pages as well. + */ + if (dissolve_free_huge_page(page) || !take_page_off_buddy(page)) + /* + * We could fail to take off the target page from buddy + * for example due to racy page allocaiton, but that's + * acceptable because soft-offlined page is not broken + * and if someone really want to use it, they should + * take it. + */ + return false; + } + + SetPageHWPoison(page); + if (release) + put_page(page); + page_ref_inc(page); + num_poisoned_pages_inc(); + + return true; +} + #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) u32 hwpoison_filter_enable = 0; @@ -555,6 +582,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", + [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -924,7 +952,7 @@ static int page_action(struct page_state *ps, struct page *p, * Return: return 0 if failed to grab the refcount, otherwise true (some * non-zero value.) */ -int get_hwpoison_page(struct page *page) +static int get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); @@ -953,7 +981,6 @@ int get_hwpoison_page(struct page *page) return 0; } -EXPORT_SYMBOL_GPL(get_hwpoison_page); /* * Do all that is necessary to remove user space mappings. Unmap @@ -1103,6 +1130,25 @@ static int identify_page_state(unsigned long pfn, struct page *p, return page_action(ps, p, pfn); } +static int try_to_split_thp_page(struct page *page, const char *msg) +{ + lock_page(page); + if (!PageAnon(page) || unlikely(split_huge_page(page))) { + unsigned long pfn = page_to_pfn(page); + + unlock_page(page); + if (!PageAnon(page)) + pr_info("%s: %#lx: non anonymous thp\n", msg, pfn); + else + pr_info("%s: %#lx: thp split failed\n", msg, pfn); + put_page(page); + return -EBUSY; + } + unlock_page(page); + + return 0; +} + static int memory_failure_hugetlb(unsigned long pfn, int flags) { struct page *p = pfn_to_page(pfn); @@ -1144,7 +1190,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); num_poisoned_pages_dec(); unlock_page(head); - put_hwpoison_page(head); + put_page(head); return 0; } @@ -1325,23 +1371,11 @@ int memory_failure(unsigned long pfn, int flags) } if (PageTransHuge(hpage)) { - lock_page(p); - if (!PageAnon(p) || unlikely(split_huge_page(p))) { - unlock_page(p); - if (!PageAnon(p)) - pr_err("Memory failure: %#lx: non anonymous thp\n", - pfn); - else - pr_err("Memory failure: %#lx: thp split failed\n", - pfn); - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - put_hwpoison_page(p); + if (try_to_split_thp_page(p, "Memory Failure") < 0) { + action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); return -EBUSY; } - unlock_page(p); VM_BUG_ON_PAGE(!page_count(p), p); - hpage = compound_head(p); } /* @@ -1381,10 +1415,7 @@ int memory_failure(unsigned long pfn, int flags) * page_remove_rmap() in try_to_unmap_one(). So to determine page status * correctly, we save a copy of the page flags at this time. */ - if (PageHuge(p)) - page_flags = hpage->flags; - else - page_flags = p->flags; + page_flags = p->flags; /* * unpoison always clear PG_hwpoison inside page lock @@ -1393,14 +1424,14 @@ int memory_failure(unsigned long pfn, int flags) pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); num_poisoned_pages_dec(); unlock_page(p); - put_hwpoison_page(p); + put_page(p); return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unlock_page(p); - put_hwpoison_page(p); + put_page(p); return 0; } @@ -1416,11 +1447,8 @@ int memory_failure(unsigned long pfn, int flags) /* * Now take care of user space mappings. * Abort on fail: __delete_from_page_cache() assumes unmapped page. - * - * When the raw error page is thp tail page, hpage points to the raw - * page after thp split. */ - if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) { + if (!hwpoison_user_mappings(p, pfn, flags, &p)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; @@ -1637,9 +1665,9 @@ int unpoison_memory(unsigned long pfn) } unlock_page(page); - put_hwpoison_page(page); + put_page(page); if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) - put_hwpoison_page(page); + put_page(page); return 0; } @@ -1679,6 +1707,9 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) } else if (is_free_buddy_page(p)) { pr_info("%s: %#lx free buddy page\n", __func__, pfn); ret = 0; + } else if (page_count(p)) { + /* raced with allocation */ + ret = -EBUSY; } else { pr_info("%s: %#lx: unknown zero refcount page type %lx\n", __func__, pfn, p->flags); @@ -1695,12 +1726,15 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) { int ret = __get_any_page(page, pfn, flags); + if (ret == -EBUSY) + ret = __get_any_page(page, pfn, flags); + if (ret == 1 && !PageHuge(page) && !PageLRU(page) && !__PageMovable(page)) { /* * Try to free it. */ - put_hwpoison_page(page); + put_page(page); shake_page(page, 1); /* @@ -1709,7 +1743,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) ret = __get_any_page(page, pfn, 0); if (ret == 1 && !PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ - put_hwpoison_page(page); + put_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n", pfn, page->flags, &page->flags); return -EIO; @@ -1718,69 +1752,51 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) return ret; } -static int soft_offline_huge_page(struct page *page, int flags) +static bool isolate_page(struct page *page, struct list_head *pagelist) { - int ret; - unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_head(page); - LIST_HEAD(pagelist); + bool isolated = false; + bool lru = PageLRU(page); - /* - * This double-check of PageHWPoison is to avoid the race with - * memory_failure(). See also comment in __soft_offline_page(). - */ - lock_page(hpage); - if (PageHWPoison(hpage)) { - unlock_page(hpage); - put_hwpoison_page(hpage); - pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); - return -EBUSY; + if (PageHuge(page)) { + isolated = isolate_huge_page(page, pagelist); + } else { + if (lru) + isolated = !isolate_lru_page(page); + else + isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE); + + if (isolated) + list_add(&page->lru, pagelist); } - unlock_page(hpage); - ret = isolate_huge_page(hpage, &pagelist); + if (isolated && lru) + inc_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_lru(page)); + /* - * get_any_page() and isolate_huge_page() takes a refcount each, - * so need to drop one here. + * If we succeed to isolate the page, we grabbed another refcount on + * the page, so we can safely drop the one we got from get_any_pages(). + * If we failed to isolate the page, it means that we cannot go further + * and we will return an error, so drop the reference we got from + * get_any_pages() as well. */ - put_hwpoison_page(hpage); - if (!ret) { - pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); - return -EBUSY; - } - - ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, - MIGRATE_SYNC, MR_MEMORY_FAILURE); - if (ret) { - pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n", - pfn, ret, page->flags, &page->flags); - if (!list_empty(&pagelist)) - putback_movable_pages(&pagelist); - if (ret > 0) - ret = -EIO; - } else { - /* - * We set PG_hwpoison only when the migration source hugepage - * was successfully dissolved, because otherwise hwpoisoned - * hugepage remains on free hugepage list, then userspace will - * find it as SIGBUS by allocation failure. That's not expected - * in soft-offlining. - */ - ret = dissolve_free_huge_page(page); - if (!ret) { - if (set_hwpoison_free_buddy_page(page)) - num_poisoned_pages_inc(); - else - ret = -EBUSY; - } - } - return ret; + put_page(page); + return isolated; } -static int __soft_offline_page(struct page *page, int flags) +/* + * __soft_offline_page handles hugetlb-pages and non-hugetlb pages. + * If the page is a non-dirty unmapped page-cache page, it simply invalidates. + * If the page is mapped, it migrates the contents over. + */ +static int __soft_offline_page(struct page *page) { - int ret; + int ret = 0; unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + char const *msg_page[] = {"page", "hugepage"}; + bool huge = PageHuge(page); + LIST_HEAD(pagelist); /* * Check PageHWPoison again inside page lock because PageHWPoison @@ -1789,121 +1805,75 @@ static int __soft_offline_page(struct page *page, int flags) * so there's no race between soft_offline_page() and memory_failure(). */ lock_page(page); - wait_on_page_writeback(page); + if (!PageHuge(page)) + wait_on_page_writeback(page); if (PageHWPoison(page)) { unlock_page(page); - put_hwpoison_page(page); + put_page(page); pr_info("soft offline: %#lx page already poisoned\n", pfn); - return -EBUSY; + return 0; } - /* - * Try to invalidate first. This should work for - * non dirty unmapped page cache pages. - */ - ret = invalidate_inode_page(page); + + if (!PageHuge(page)) + /* + * Try to invalidate first. This should work for + * non dirty unmapped page cache pages. + */ + ret = invalidate_inode_page(page); unlock_page(page); + /* * RED-PEN would be better to keep it isolated here, but we * would need to fix isolation locking first. */ - if (ret == 1) { - put_hwpoison_page(page); + if (ret) { pr_info("soft_offline: %#lx: invalidated\n", pfn); - SetPageHWPoison(page); - num_poisoned_pages_inc(); + page_handle_poison(page, false, true); return 0; } - /* - * Simple invalidation didn't work. - * Try to migrate to a new page instead. migrate.c - * handles a large number of cases for us. - */ - if (PageLRU(page)) - ret = isolate_lru_page(page); - else - ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); - /* - * Drop page reference which is came from get_any_page() - * successful isolate_lru_page() already took another one. - */ - put_hwpoison_page(page); - if (!ret) { - LIST_HEAD(pagelist); - /* - * After isolated lru page, the PageLRU will be cleared, - * so use !__PageMovable instead for LRU page's mapping - * cannot have PAGE_MAPPING_MOVABLE. - */ - if (!__PageMovable(page)) - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_lru(page)); - list_add(&page->lru, &pagelist); + if (isolate_page(hpage, &pagelist)) { ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); - if (ret) { + if (!ret) { + bool release = !huge; + + if (!page_handle_poison(page, huge, release)) + ret = -EBUSY; + } else { if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", - pfn, ret, page->flags, &page->flags); + pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n", + pfn, msg_page[huge], ret, page->flags, &page->flags); if (ret > 0) ret = -EIO; } } else { - pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n", - pfn, ret, page_count(page), page->flags, &page->flags); + pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n", + pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags); + ret = -EBUSY; } return ret; } -static int soft_offline_in_use_page(struct page *page, int flags) +static int soft_offline_in_use_page(struct page *page) { - int ret; - int mt; struct page *hpage = compound_head(page); - if (!PageHuge(page) && PageTransHuge(hpage)) { - lock_page(page); - if (!PageAnon(page) || unlikely(split_huge_page(page))) { - unlock_page(page); - if (!PageAnon(page)) - pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); - else - pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); - put_hwpoison_page(page); + if (!PageHuge(page) && PageTransHuge(hpage)) + if (try_to_split_thp_page(page, "soft offline") < 0) return -EBUSY; - } - unlock_page(page); - } - - /* - * Setting MIGRATE_ISOLATE here ensures that the page will be linked - * to free list immediately (not via pcplist) when released after - * successful page migration. Otherwise we can't guarantee that the - * page is really free after put_page() returns, so - * set_hwpoison_free_buddy_page() highly likely fails. - */ - mt = get_pageblock_migratetype(page); - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - if (PageHuge(page)) - ret = soft_offline_huge_page(page, flags); - else - ret = __soft_offline_page(page, flags); - set_pageblock_migratetype(page, mt); - return ret; + return __soft_offline_page(page); } static int soft_offline_free_page(struct page *page) { - int rc = dissolve_free_huge_page(page); + int rc = 0; + + if (!page_handle_poison(page, true, false)) + rc = -EBUSY; - if (!rc) { - if (set_hwpoison_free_buddy_page(page)) - num_poisoned_pages_inc(); - else - rc = -EBUSY; - } return rc; } @@ -1933,6 +1903,7 @@ int soft_offline_page(unsigned long pfn, int flags) { int ret; struct page *page; + bool try_again = true; if (!pfn_valid(pfn)) return -ENXIO; @@ -1944,18 +1915,22 @@ int soft_offline_page(unsigned long pfn, int flags) if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); if (flags & MF_COUNT_INCREASED) - put_hwpoison_page(page); - return -EBUSY; + put_page(page); + return 0; } +retry: get_online_mems(); ret = get_any_page(page, pfn, flags); put_online_mems(); if (ret > 0) - ret = soft_offline_in_use_page(page, flags); + ret = soft_offline_in_use_page(page); else if (ret == 0) - ret = soft_offline_free_page(page); + if (soft_offline_free_page(page) && try_again) { + try_again = false; + goto retry; + } return ret; } diff --git a/mm/memory.c b/mm/memory.c index 2afb01ea1307..589afe45d0b3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3709,13 +3709,14 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i; - vm_fault_t ret; + vm_fault_t ret = VM_FAULT_FALLBACK; if (!transhuge_vma_suitable(vma, haddr)) - return VM_FAULT_FALLBACK; + return ret; - ret = VM_FAULT_FALLBACK; page = compound_head(page); + if (compound_order(page) != HPAGE_PMD_ORDER) + return ret; /* * Archs like ppc64 need additonal space to store information diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8e9e2d44cdad..6f203574ca1d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -105,7 +105,7 @@ static struct resource *register_memory_resource(u64 start, u64 size, unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; if (strcmp(resource_name, "System RAM")) - flags |= IORESOURCE_MEM_DRIVER_MANAGED; + flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; /* * Make sure value parsed from 'mem=' only restricts memory adding @@ -625,31 +625,22 @@ void generic_online_page(struct page *page, unsigned int order) } EXPORT_SYMBOL_GPL(generic_online_page); -static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg) +static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn; - int order; /* - * Online the pages. The callback might decide to keep some pages - * PG_reserved (to add them to the buddy later), but we still account - * them as being online/belonging to this zone ("present"). + * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might + * decide to not expose all pages to the buddy (e.g., expose them + * later). We account all pages as being online and belonging to this + * zone ("present"). */ - for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) { - order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn))); - /* __free_pages_core() wants pfns to be aligned to the order */ - if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order))) - order = 0; - (*online_page_callback)(pfn_to_page(pfn), order); - } + for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) + (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1); /* mark all involved sections as online */ online_mem_sections(start_pfn, end_pfn); - - *(unsigned long *)arg += nr_pages; - return 0; } /* check which state of node_states will be changed when online memory */ @@ -710,9 +701,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon * Associate the pfn range with the given zone, initializing the memmaps * and resizing the pgdat/zone data to span the added pages. After this * call, all affected pages are PG_reserved. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. */ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; @@ -737,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * are reserved so nobody should be touching them so we should be safe */ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, - MEMINIT_HOTPLUG, altmap); + MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); } @@ -803,17 +799,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid) { unsigned long flags; - unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; int ret; struct memory_notify arg; + /* We can only online full sections (e.g., SECTION_IS_ONLINE) */ + if (WARN_ON_ONCE(!nr_pages || + !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION))) + return -EINVAL; + mem_hotplug_begin(); /* associate pfn range with the zone */ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -825,6 +825,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, goto failed_addition; /* + * Fixup the number of isolated pageblocks before marking the sections + * onlining, such that undo_isolate_page_range() works correctly. + */ + spin_lock_irqsave(&zone->lock, flags); + zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages; + spin_unlock_irqrestore(&zone->lock, flags); + + /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. * So, zonelist must be updated after online. @@ -834,36 +842,29 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, setup_zone_pageset(zone); } - ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, - online_pages_range); - if (ret) { - /* not a single memory resource was applicable */ - if (need_zonelists_rebuild) - zone_pcp_reset(zone); - goto failed_addition; - } - - zone->present_pages += onlined_pages; + online_pages_range(pfn, nr_pages); + zone->present_pages += nr_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += onlined_pages; + zone->zone_pgdat->node_present_pages += nr_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); - /* - * When exposing larger, physically contiguous memory areas to the - * buddy, shuffling in the buddy (when freeing onlined pages, putting - * them either to the head or the tail of the freelist) is only helpful - * for maintaining the shuffle, but not for creating the initial - * shuffle. Shuffle the whole zone to make sure the just onlined pages - * are properly distributed across the whole freelist. - */ - shuffle_zone(zone); - node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL); zone_pcp_update(zone); + /* Basic onlining is complete, allow allocation of onlined pages. */ + undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); + + /* + * Freshly onlined pages aren't shuffled (e.g., all pages are placed to + * the tail of the freelist when undoing isolation). Shuffle the whole + * zone to make sure the just onlined pages are properly distributed + * across the whole freelist - to create an initial shuffle. + */ + shuffle_zone(zone); + init_per_zone_wmark_min(); kswapd_run(nid); @@ -1035,7 +1036,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res) +int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = PAGE_KERNEL }; u64 start, size; @@ -1088,9 +1089,8 @@ int __ref add_memory_resource(int nid, struct resource *res) } /* link memory sections under this node.*/ - ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), - MEMINIT_HOTPLUG); - BUG_ON(ret); + link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); /* create new memmap entry */ if (!strcmp(res->name, "System RAM")) @@ -1099,6 +1099,13 @@ int __ref add_memory_resource(int nid, struct resource *res) /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); + /* + * In case we're allowed to merge the resource, flag it and trigger + * merging now that adding succeeded. + */ + if (mhp_flags & MEMHP_MERGE_RESOURCE) + merge_system_ram_resource(res); + /* online pages if requested */ if (memhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); @@ -1115,7 +1122,7 @@ error: } /* requires device_hotplug_lock, see add_memory_resource() */ -int __ref __add_memory(int nid, u64 start, u64 size) +int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { struct resource *res; int ret; @@ -1124,18 +1131,18 @@ int __ref __add_memory(int nid, u64 start, u64 size) if (IS_ERR(res)) return PTR_ERR(res); - ret = add_memory_resource(nid, res); + ret = add_memory_resource(nid, res, mhp_flags); if (ret < 0) release_memory_resource(res); return ret; } -int add_memory(int nid, u64 start, u64 size) +int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { int rc; lock_device_hotplug(); - rc = __add_memory(nid, start, size); + rc = __add_memory(nid, start, size, mhp_flags); unlock_device_hotplug(); return rc; @@ -1157,14 +1164,14 @@ EXPORT_SYMBOL_GPL(add_memory); * * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided * memory map") are created. Also, the created memory resource is flagged - * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case + * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case * this memory as well (esp., not place kexec images onto it). * * The resource_name (visible via /proc/iomem) has to have the format * "System RAM ($DRIVER)". */ int add_memory_driver_managed(int nid, u64 start, u64 size, - const char *resource_name) + const char *resource_name, mhp_t mhp_flags) { struct resource *res; int rc; @@ -1182,7 +1189,7 @@ int add_memory_driver_managed(int nid, u64 start, u64 size, goto out_unlock; } - rc = add_memory_resource(nid, res); + rc = add_memory_resource(nid, res, mhp_flags); if (rc < 0) release_memory_resource(res); @@ -1379,28 +1386,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) return ret; } -/* Mark all sections offline and remove all free pages from the buddy. */ -static int -offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, - void *data) -{ - unsigned long *offlined_pages = (unsigned long *)data; - - *offlined_pages += __offline_isolated_pages(start, start + nr_pages); - return 0; -} - -/* - * Check all pages in range, recorded as memory resource, are isolated. - */ -static int -check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, - void *data) -{ - return test_pages_isolated(start_pfn, start_pfn + nr_pages, - MEMORY_OFFLINE); -} - static int __init cmdline_parse_movable_node(char *p) { movable_node_enabled = true; @@ -1484,17 +1469,21 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, return 0; } -static int __ref __offline_pages(unsigned long start_pfn, - unsigned long end_pfn) +int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) { - unsigned long pfn, nr_pages = 0; - unsigned long offlined_pages = 0; - int ret, node, nr_isolate_pageblock; + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn, system_ram_pages = 0; unsigned long flags; struct zone *zone; struct memory_notify arg; + int ret, node; char *reason; + /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ + if (WARN_ON_ONCE(!nr_pages || + !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION))) + return -EINVAL; + mem_hotplug_begin(); /* @@ -1505,9 +1494,9 @@ static int __ref __offline_pages(unsigned long start_pfn, * memory holes PG_reserved, don't need pfn_valid() checks, and can * avoid using walk_system_ram_range() later. */ - walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages, + walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages, count_system_ram_pages_cb); - if (nr_pages != end_pfn - start_pfn) { + if (system_ram_pages != nr_pages) { ret = -EINVAL; reason = "memory holes"; goto failed_removal; @@ -1527,11 +1516,10 @@ static int __ref __offline_pages(unsigned long start_pfn, ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, MEMORY_OFFLINE | REPORT_FAILURE); - if (ret < 0) { + if (ret) { reason = "failure to isolate range"; goto failed_removal; } - nr_isolate_pageblock = ret; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1581,9 +1569,7 @@ static int __ref __offline_pages(unsigned long start_pfn, reason = "failure to dissolve huge pages"; goto failed_removal_isolated; } - /* check again */ - ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, - NULL, check_pages_isolated_cb); + /* * per-cpu pages are drained in start_isolate_page_range, but if * there are still pages that are not free, make sure that we @@ -1596,30 +1582,30 @@ static int __ref __offline_pages(unsigned long start_pfn, * because has_unmovable_pages explicitly checks for * PageBuddy on freed pages on other zones. */ + ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); if (ret) drain_all_pages(zone); } while (ret); - /* Ok, all of our target is isolated. - We cannot do rollback at this point. */ - walk_system_ram_range(start_pfn, end_pfn - start_pfn, - &offlined_pages, offline_isolated_pages_cb); - pr_info("Offlined Pages %ld\n", offlined_pages); + /* Mark all sections offline and remove free pages from the buddy. */ + __offline_isolated_pages(start_pfn, end_pfn); + pr_info("Offlined Pages %ld\n", nr_pages); + /* - * Onlining will reset pagetype flags and makes migrate type - * MOVABLE, so just need to decrease the number of isolated - * pageblocks zone counter here. + * The memory sections are marked offline, and the pageblock flags + * effectively stale; nobody should be touching them. Fixup the number + * of isolated pageblocks, memory onlining will properly revert this. */ spin_lock_irqsave(&zone->lock, flags); - zone->nr_isolate_pageblock -= nr_isolate_pageblock; + zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); /* removal success */ - adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); - zone->present_pages -= offlined_pages; + adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); + zone->present_pages -= nr_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= offlined_pages; + zone->zone_pgdat->node_present_pages -= nr_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); init_per_zone_wmark_min(); @@ -1656,11 +1642,6 @@ failed_removal: return ret; } -int offline_pages(unsigned long start_pfn, unsigned long nr_pages) -{ - return __offline_pages(start_pfn, start_pfn + nr_pages); -} - static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); @@ -1749,26 +1730,6 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); -static void __release_memory_resource(resource_size_t start, - resource_size_t size) -{ - int ret; - - /* - * When removing memory in the same granularity as it was added, - * this function never fails. It might only fail if resources - * have to be adjusted or split. We'll ignore the error, as - * removing of memory cannot fail. - */ - ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } -} - static int __ref try_remove_memory(int nid, u64 start, u64 size) { int rc = 0; @@ -1802,7 +1763,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) memblock_remove(start, size); } - __release_memory_resource(start, size); + release_mem_region_adjustable(start, size); try_offline_node(nid); diff --git a/mm/memremap.c b/mm/memremap.c index 198083453182..73a206d0f645 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -266,7 +266,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, PHYS_PFN(range->start), - PHYS_PFN(range_len(range)), params->altmap); + PHYS_PFN(range_len(range)), params->altmap, + MIGRATE_MOVABLE); } mem_hotplug_done(); diff --git a/mm/migrate.c b/mm/migrate.c index f94d7c7eeddf..4cf1af88c1dd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1223,16 +1223,11 @@ out: * we want to retry. */ if (rc == MIGRATEPAGE_SUCCESS) { - put_page(page); - if (reason == MR_MEMORY_FAILURE) { + if (reason != MR_MEMORY_FAILURE) /* - * Set PG_HWPoison on just freed page - * intentionally. Although it's rather weird, - * it's how HWPoison flag works at the moment. + * We release the page in page_handle_poison. */ - if (set_hwpoison_free_buddy_page(page)) - num_poisoned_pages_inc(); - } + put_page(page); } else { if (rc != -EAGAIN) { if (likely(!__PageMovable(page))) { diff --git a/mm/mmap.c b/mm/mmap.c index 67d11ad6df24..ebb92f5515a1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -619,7 +619,7 @@ static void __vma_link_file(struct vm_area_struct *vma) struct address_space *mapping = file->f_mapping; if (vma->vm_flags & VM_DENYWRITE) - atomic_dec(&file_inode(file)->i_writecount); + put_write_access(file_inode(file)); if (vma->vm_flags & VM_SHARED) mapping_allow_writable(mapping); @@ -2562,7 +2562,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (vma && (vma->vm_start <= addr)) return vma; /* don't alter vm_end if the coredump is running */ - if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) + if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2588,9 +2588,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; - /* don't alter vm_start if the coredump is running */ - if (!mmget_still_valid(mm)) - return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 4fc918163dd3..5654dd19addc 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -913,7 +913,7 @@ static int __mmu_interval_notifier_insert( return -EOVERFLOW; /* Must call with a mmget() held */ - if (WARN_ON(atomic_read(&mm->mm_count) <= 0)) + if (WARN_ON(atomic_read(&mm->mm_users) <= 0)) return -EINVAL; /* pairs with mmdrop in mmu_interval_notifier_remove() */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 358d6f28c627..7709f0e223f5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2849,6 +2849,7 @@ EXPORT_SYMBOL_GPL(wait_on_page_writeback); */ void wait_for_stable_page(struct page *page) { + page = thp_head(page); if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) wait_on_page_writeback(page); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0ff3a811ec5..23f5066bd4a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -78,6 +78,34 @@ #include "shuffle.h" #include "page_reporting.h" +/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ +typedef int __bitwise fpi_t; + +/* No special request */ +#define FPI_NONE ((__force fpi_t)0) + +/* + * Skip free page reporting notification for the (possibly merged) page. + * This does not hinder free page reporting from grabbing the page, + * reporting it and marking it "reported" - it only skips notifying + * the free page reporting infrastructure about a newly freed page. For + * example, used when temporarily pulling a page from a freelist and + * putting it back unmodified. + */ +#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) + +/* + * Place the (possibly merged) page to the tail of the freelist. Will ignore + * page shuffling (relevant code - e.g., memory onlining - is expected to + * shuffle the whole zone). + * + * Note: No code should rely on this flag for correctness - it's purely + * to allow for optimizations when handing back either fresh pages + * (memory onlining) or untouched pages (page isolation, free page + * reporting). + */ +#define FPI_TO_TAIL ((__force fpi_t)BIT(1)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -247,7 +275,8 @@ bool pm_suspended_storage(void) unsigned int pageblock_order __read_mostly; #endif -static void __free_pages_ok(struct page *page, unsigned int order); +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags); /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -659,7 +688,7 @@ out: void free_compound_page(struct page *page) { mem_cgroup_uncharge(page); - __free_pages_ok(page, compound_order(page)); + __free_pages_ok(page, compound_order(page), FPI_NONE); } void prep_compound_page(struct page *page, unsigned int order) @@ -763,7 +792,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif -static inline void set_page_order(struct page *page, unsigned int order) +static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -788,7 +817,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy, if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; - if (page_order(buddy) != order) + if (buddy_order(buddy) != order) return false; /* @@ -873,13 +902,17 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, area->nr_free++; } -/* Used for pages which are on another list */ +/* + * Used for pages which are on another list. Move the pages to the tail + * of the list - so the moved pages won't immediately be considered for + * allocation again (e.g., optimization for memory onlining). + */ static inline void move_to_free_list(struct page *page, struct zone *zone, unsigned int order, int migratetype) { struct free_area *area = &zone->free_area[order]; - list_move(&page->lru, &area->free_list[migratetype]); + list_move_tail(&page->lru, &area->free_list[migratetype]); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, @@ -952,7 +985,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, - int migratetype, bool report) + int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); unsigned long buddy_pfn; @@ -1026,9 +1059,11 @@ continue_merging: } done_merging: - set_page_order(page, order); + set_buddy_order(page, order); - if (is_shuffle_order(order)) + if (fpi_flags & FPI_TO_TAIL) + to_tail = true; + else if (is_shuffle_order(order)) to_tail = shuffle_pick_tail(); else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); @@ -1039,7 +1074,7 @@ done_merging: add_to_free_list(page, zone, order, migratetype); /* Notify page reporting subsystem of freed page */ - if (report) + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) page_reporting_notify_free(order); } @@ -1174,6 +1209,17 @@ static __always_inline bool free_pages_prepare(struct page *page, trace_mm_page_free(page, order); + if (unlikely(PageHWPoison(page)) && !order) { + /* + * Do not let hwpoison pages hit pcplists/buddy + * Untie memcg state and reset page's owner + */ + if (memcg_kmem_enabled() && PageKmemcg(page)) + __memcg_kmem_uncharge_page(page, order); + reset_page_owner(page, order); + return false; + } + /* * Check tail pages before head page information is cleared to * avoid checking PageCompound for order-0 pages. @@ -1369,7 +1415,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt, true); + __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); trace_mm_page_pcpu_drain(page, 0, mt); } spin_unlock(&zone->lock); @@ -1378,14 +1424,14 @@ static void free_pcppages_bulk(struct zone *zone, int count, static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, - int migratetype) + int migratetype, fpi_t fpi_flags) { spin_lock(&zone->lock); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } - __free_one_page(page, pfn, zone, order, migratetype, true); + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock(&zone->lock); } @@ -1463,7 +1509,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) } } -static void __free_pages_ok(struct page *page, unsigned int order) +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) { unsigned long flags; int migratetype; @@ -1475,7 +1522,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, pfn, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype, + fpi_flags); local_irq_restore(flags); } @@ -1485,6 +1533,11 @@ void __free_pages_core(struct page *page, unsigned int order) struct page *p = page; unsigned int loop; + /* + * When initializing the memmap, __init_single_page() sets the refcount + * of all pages to 1 ("allocated"/"not free"). We have to set the + * refcount of all involved pages to 0. + */ prefetchw(p); for (loop = 0; loop < (nr_pages - 1); loop++, p++) { prefetchw(p + 1); @@ -1495,8 +1548,12 @@ void __free_pages_core(struct page *page, unsigned int order) set_page_count(p, 0); atomic_long_add(nr_pages, &page_zone(page)->managed_pages); - set_page_refcounted(page); - __free_pages(page, order); + + /* + * Bypass PCP and place fresh pages right to the tail, primarily + * relevant for memory onlining. + */ + __free_pages_ok(page, order, FPI_TO_TAIL); } #ifdef CONFIG_NEED_MULTIPLE_NODES @@ -2121,7 +2178,7 @@ static inline void expand(struct zone *zone, struct page *page, continue; add_to_free_list(&page[size], zone, high, migratetype); - set_page_order(&page[size], high); + set_buddy_order(&page[size], high); } } @@ -2299,7 +2356,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, #endif /* - * Move the free pages in a range to the free lists of the requested type. + * Move the free pages in a range to the freelist tail of the requested type. * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ @@ -2335,7 +2392,7 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); VM_BUG_ON_PAGE(page_zone(page) != zone, page); - order = page_order(page); + order = buddy_order(page); move_to_free_list(page, zone, order, migratetype); page += 1 << order; pages_moved += 1 << order; @@ -2459,7 +2516,7 @@ static inline void boost_watermark(struct zone *zone) static void steal_suitable_fallback(struct zone *zone, struct page *page, unsigned int alloc_flags, int start_type, bool whole_block) { - unsigned int current_order = page_order(page); + unsigned int current_order = buddy_order(page); int free_pages, movable_pages, alike_pages; int old_block_type; @@ -3123,7 +3180,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, 0, migratetype); + free_one_page(zone, page, pfn, 0, migratetype, + FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; @@ -3209,7 +3267,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, order); + split_page_owner(page, 1 << order); } EXPORT_SYMBOL_GPL(split_page); @@ -3278,7 +3336,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) lockdep_assert_held(&zone->lock); /* Return isolated page to tail of freelist. */ - __free_one_page(page, page_to_pfn(page), zone, order, mt, false); + __free_one_page(page, page_to_pfn(page), zone, order, mt, + FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); } /* @@ -4945,7 +5004,7 @@ static inline void free_the_page(struct page *page, unsigned int order) if (order == 0) /* Via pcp? */ free_unref_page(page); else - __free_pages_ok(page, order); + __free_pages_ok(page, order, FPI_NONE); } void __free_pages(struct page *page, unsigned int order) @@ -5979,10 +6038,15 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is * done. Non-atomic initialization, single-pass. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum meminit_context context, - struct vmem_altmap *altmap) + unsigned long start_pfn, + enum meminit_context context, + struct vmem_altmap *altmap, int migratetype) { unsigned long pfn, end_pfn = start_pfn + size; struct page *page; @@ -6026,19 +6090,12 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, __SetPageReserved(page); /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. - * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. + * Usually, we want to mark the pageblock MIGRATE_MOVABLE, + * such that unmovable allocations won't be scattered all + * over the place during system boot. */ - if (!(pfn & (pageblock_nr_pages - 1))) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + set_pageblock_migratetype(page, migratetype); cond_resched(); } pfn++; @@ -6100,15 +6157,10 @@ void __ref memmap_init_zone_device(struct zone *zone, * the address space during boot when many long-lived * kernel allocations are made. * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. - * * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ - if (!(pfn & (pageblock_nr_pages - 1))) { + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } @@ -6143,7 +6195,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; memmap_init_zone(size, nid, zone, start_pfn, - MEMINIT_EARLY, NULL); + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } } @@ -8292,7 +8344,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, */ if (!page_ref_count(page)) { if (PageBuddy(page)) - iter += (1 << page_order(page)) - 1; + iter += (1 << buddy_order(page)) - 1; continue; } @@ -8457,7 +8509,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = start_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype, 0); - if (ret < 0) + if (ret) return ret; /* @@ -8505,7 +8557,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, } if (outer_start != start) { - order = page_order(pfn_to_page(outer_start)); + order = buddy_order(pfn_to_page(outer_start)); /* * outer_start page could be small order buddy page and @@ -8693,35 +8745,21 @@ void zone_pcp_reset(struct zone *zone) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * All pages in the range must be in a single zone and isolated - * before calling this. + * All pages in the range must be in a single zone, must not contain holes, + * must span full sections, and must be isolated before calling this function. */ -unsigned long -__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) +void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { + unsigned long pfn = start_pfn; struct page *page; struct zone *zone; unsigned int order; - unsigned long pfn; unsigned long flags; - unsigned long offlined_pages = 0; - - /* find the first valid pfn */ - for (pfn = start_pfn; pfn < end_pfn; pfn++) - if (pfn_valid(pfn)) - break; - if (pfn == end_pfn) - return offlined_pages; offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); - pfn = start_pfn; while (pfn < end_pfn) { - if (!pfn_valid(pfn)) { - pfn++; - continue; - } page = pfn_to_page(pfn); /* * The HWPoisoned page may be not in buddy system, and @@ -8729,7 +8767,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) */ if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { pfn++; - offlined_pages++; continue; } /* @@ -8740,20 +8777,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(PageBuddy(page)); pfn++; - offlined_pages++; continue; } BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); - order = page_order(page); - offlined_pages += 1 << order; + order = buddy_order(page); del_page_from_free_list(page, zone, order); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); - - return offlined_pages; } #endif @@ -8768,7 +8801,7 @@ bool is_free_buddy_page(struct page *page) for (order = 0; order < MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); - if (PageBuddy(page_head) && page_order(page_head) >= order) + if (PageBuddy(page_head) && buddy_order(page_head) >= order) break; } spin_unlock_irqrestore(&zone->lock, flags); @@ -8778,30 +8811,70 @@ bool is_free_buddy_page(struct page *page) #ifdef CONFIG_MEMORY_FAILURE /* - * Set PG_hwpoison flag if a given page is confirmed to be a free page. This - * test is performed under the zone lock to prevent a race against page - * allocation. + * Break down a higher-order page in sub-pages, and keep our target out of + * buddy allocator. */ -bool set_hwpoison_free_buddy_page(struct page *page) +static void break_down_buddy_pages(struct zone *zone, struct page *page, + struct page *target, int low, int high, + int migratetype) +{ + unsigned long size = 1 << high; + struct page *current_buddy, *next_page; + + while (high > low) { + high--; + size >>= 1; + + if (target >= &page[size]) { + next_page = page + size; + current_buddy = page; + } else { + next_page = page; + current_buddy = page + size; + } + + if (set_page_guard(zone, current_buddy, high, migratetype)) + continue; + + if (current_buddy != target) { + add_to_free_list(current_buddy, zone, high, migratetype); + set_buddy_order(current_buddy, high); + page = next_page; + } + } +} + +/* + * Take a page that will be marked as poisoned off the buddy allocator. + */ +bool take_page_off_buddy(struct page *page) { struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); unsigned long flags; unsigned int order; - bool hwpoisoned = false; + bool ret = false; spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); + int page_order = buddy_order(page_head); - if (PageBuddy(page_head) && page_order(page_head) >= order) { - if (!TestSetPageHWPoison(page)) - hwpoisoned = true; + if (PageBuddy(page_head) && page_order >= order) { + unsigned long pfn_head = page_to_pfn(page_head); + int migratetype = get_pfnblock_migratetype(page_head, + pfn_head); + + del_page_from_free_list(page_head, zone, page_order); + break_down_buddy_pages(zone, page_head, page, 0, + page_order, migratetype); + ret = true; break; } + if (page_count(page_head) > 0) + break; } spin_unlock_irqrestore(&zone->lock, flags); - - return hwpoisoned; + return ret; } #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index aa94afb63823..abbf42214485 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * these pages to be merged. */ if (PageBuddy(page)) { - order = page_order(page); + order = buddy_order(page); if (order >= pageblock_order) { pfn = page_to_pfn(page); buddy_pfn = __find_buddy_pfn(pfn, order); @@ -106,6 +106,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * If we isolate freepage with more than pageblock_order, there * should be no freepage in the range, so we could avoid costly * pageblock scanning for freepage moving. + * + * We didn't actually touch any of the isolated pages, so place them + * to the tail of the freelist. This is an optimization for memory + * onlining - just onlined memory won't immediately be considered for + * allocation. */ if (!isolated_page) { nr_pages = move_freepages_block(zone, page, migratetype, NULL); @@ -173,8 +178,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * (e.g. __offline_pages will need to call it after check for isolated range for * a next retry). * - * Return: the number of isolated pageblocks on success and -EBUSY if any part - * of range cannot be isolated. + * Return: 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype, int flags) @@ -182,7 +186,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; unsigned long undo_pfn; struct page *page; - int nr_isolate_pageblock = 0; BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); @@ -196,10 +199,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, undo_pfn = pfn; goto undo; } - nr_isolate_pageblock++; } } - return nr_isolate_pageblock; + return 0; undo: for (pfn = start_pfn; pfn < undo_pfn; @@ -259,7 +261,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * the correct MIGRATE_ISOLATE freelist. There is no * simple way to verify that as VM_BUG_ON(), though. */ - pfn += 1 << page_order(page); + pfn += 1 << buddy_order(page); else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) /* A HWPoisoned page cannot be also PageBuddy */ pfn++; diff --git a/mm/page_owner.c b/mm/page_owner.c index 360461509423..b735a8eafcdb 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -204,7 +204,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_owner->last_migrate_reason = reason; } -void __split_page_owner(struct page *page, unsigned int order) +void __split_page_owner(struct page *page, unsigned int nr) { int i; struct page_ext *page_ext = lookup_page_ext(page); @@ -213,7 +213,7 @@ void __split_page_owner(struct page *page, unsigned int order) if (unlikely(!page_ext)) return; - for (i = 0; i < (1 << order); i++) { + for (i = 0; i < nr; i++) { page_owner = get_page_owner(page_ext); page_owner->order = 0; page_ext = page_ext_next(page_ext); @@ -295,7 +295,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (PageBuddy(page)) { unsigned long freepage_order; - freepage_order = page_order_unsafe(page); + freepage_order = buddy_order_unsafe(page); if (freepage_order < MAX_ORDER) pfn += (1UL << freepage_order) - 1; continue; @@ -490,7 +490,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) page = pfn_to_page(pfn); if (PageBuddy(page)) { - unsigned long freepage_order = page_order_unsafe(page); + unsigned long freepage_order = buddy_order_unsafe(page); if (freepage_order < MAX_ORDER) pfn += (1UL << freepage_order) - 1; @@ -584,7 +584,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) * heavy lock contention. */ if (PageBuddy(page)) { - unsigned long order = page_order_unsafe(page); + unsigned long order = buddy_order_unsafe(page); if (order > 0 && order < MAX_ORDER) pfn += (1UL << order) - 1; diff --git a/mm/page_poison.c b/mm/page_poison.c index 34b9181ee5d1..ae0482cded87 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -8,13 +8,23 @@ #include <linux/ratelimit.h> #include <linux/kasan.h> -static bool want_page_poisoning __read_mostly; +static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning); static int __init early_page_poison_param(char *buf) { - if (!buf) - return -EINVAL; - return strtobool(buf, &want_page_poisoning); + int ret; + bool tmp; + + ret = strtobool(buf, &tmp); + if (ret) + return ret; + + if (tmp) + static_branch_enable(&want_page_poisoning); + else + static_branch_disable(&want_page_poisoning); + + return 0; } early_param("page_poison", early_page_poison_param); @@ -31,7 +41,7 @@ bool page_poisoning_enabled(void) * Page poisoning is debug page alloc for some arches. If * either of those options are enabled, enable poisoning. */ - return (want_page_poisoning || + return (static_branch_unlikely(&want_page_poisoning) || (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && debug_pagealloc_enabled())); } diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 3bbd471cfc81..cd8e13d41df4 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -92,7 +92,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev, * report on the new larger page when we make our way * up to that higher order. */ - if (PageBuddy(page) && page_order(page) == order) + if (PageBuddy(page) && buddy_order(page) == order) __SetPageReported(page); } while ((sg = sg_next(sg))); @@ -178,7 +178,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, * the new head of the free list before we release the * zone lock. */ - if (&page->lru != list && !list_is_first(&page->lru, list)) + if (!list_is_first(&page->lru, list)) list_rotate_to_front(&page->lru, list); /* release lock before waiting on report processing */ diff --git a/mm/readahead.c b/mm/readahead.c index 3c9a8dd7c56c..c6ffb76827da 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,10 +158,8 @@ out: } /** - * page_cache_readahead_unbounded - Start unchecked readahead. - * @mapping: File address space. - * @file: This instance of the open file; used for authentication. - * @index: First page index to read. + * page_cache_ra_unbounded - Start unchecked readahead. + * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * @@ -173,17 +171,13 @@ out: * Context: File is referenced by caller. Mutexes may be held by caller. * May sleep, but will not reenter filesystem to reclaim memory. */ -void page_cache_readahead_unbounded(struct address_space *mapping, - struct file *file, pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_size) +void page_cache_ra_unbounded(struct readahead_control *ractl, + unsigned long nr_to_read, unsigned long lookahead_size) { + struct address_space *mapping = ractl->mapping; + unsigned long index = readahead_index(ractl); LIST_HEAD(page_pool); gfp_t gfp_mask = readahead_gfp_mask(mapping); - struct readahead_control rac = { - .mapping = mapping, - .file = file, - ._index = index, - }; unsigned long i; /* @@ -204,7 +198,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping, for (i = 0; i < nr_to_read; i++) { struct page *page = xa_load(&mapping->i_pages, index + i); - BUG_ON(index + i != rac._index + rac._nr_pages); + BUG_ON(index + i != ractl->_index + ractl->_nr_pages); if (page && !xa_is_value(page)) { /* @@ -215,7 +209,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping, * have a stable reference to this page, and it's * not worth getting one just for that. */ - read_pages(&rac, &page_pool, true); + read_pages(ractl, &page_pool, true); continue; } @@ -228,12 +222,12 @@ void page_cache_readahead_unbounded(struct address_space *mapping, } else if (add_to_page_cache_lru(page, mapping, index + i, gfp_mask) < 0) { put_page(page); - read_pages(&rac, &page_pool, true); + read_pages(ractl, &page_pool, true); continue; } if (i == nr_to_read - lookahead_size) SetPageReadahead(page); - rac._nr_pages++; + ractl->_nr_pages++; } /* @@ -241,22 +235,22 @@ void page_cache_readahead_unbounded(struct address_space *mapping, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - read_pages(&rac, &page_pool, false); + read_pages(ractl, &page_pool, false); memalloc_nofs_restore(nofs); } -EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded); +EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* - * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ -void __do_page_cache_readahead(struct address_space *mapping, - struct file *file, pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_size) +void do_page_cache_ra(struct readahead_control *ractl, + unsigned long nr_to_read, unsigned long lookahead_size) { - struct inode *inode = mapping->host; + struct inode *inode = ractl->mapping->host; + unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ @@ -270,20 +264,19 @@ void __do_page_cache_readahead(struct address_space *mapping, if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; - page_cache_readahead_unbounded(mapping, file, index, nr_to_read, - lookahead_size); + page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ -void force_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t index, unsigned long nr_to_read) +void force_page_cache_ra(struct readahead_control *ractl, + struct file_ra_state *ra, unsigned long nr_to_read) { + struct address_space *mapping = ractl->mapping; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); - struct file_ra_state *ra = &filp->f_ra; - unsigned long max_pages; + unsigned long max_pages, index; if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && !mapping->a_ops->readahead)) @@ -293,14 +286,16 @@ void force_page_cache_readahead(struct address_space *mapping, * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ + index = readahead_index(ractl); max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); - nr_to_read = min(nr_to_read, max_pages); + nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; - __do_page_cache_readahead(mapping, filp, index, this_chunk, 0); + ractl->_index = index; + do_page_cache_ra(ractl, this_chunk, 0); index += this_chunk; nr_to_read -= this_chunk; @@ -437,14 +432,14 @@ static int try_context_readahead(struct address_space *mapping, /* * A minimal readahead algorithm for trivial sequential/random reads. */ -static void ondemand_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - bool hit_readahead_marker, pgoff_t index, +static void ondemand_readahead(struct readahead_control *ractl, + struct file_ra_state *ra, bool hit_readahead_marker, unsigned long req_size) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); unsigned long max_pages = ra->ra_pages; unsigned long add_pages; + unsigned long index = readahead_index(ractl); pgoff_t prev_index; /* @@ -482,7 +477,8 @@ static void ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_miss(mapping, index + 1, max_pages); + start = page_cache_next_miss(ractl->mapping, index + 1, + max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) @@ -515,14 +511,15 @@ static void ondemand_readahead(struct address_space *mapping, * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, index, req_size, max_pages)) + if (try_context_readahead(ractl->mapping, ra, index, req_size, + max_pages)) goto readit; /* * standalone, small random read * Read as is, and do not pollute the readahead state. */ - __do_page_cache_readahead(mapping, filp, index, req_size, 0); + do_page_cache_ra(ractl, req_size, 0); return; initial_readahead: @@ -548,25 +545,12 @@ readit: } } - ra_submit(ra, mapping, filp); + ractl->_index = ra->start; + do_page_cache_ra(ractl, ra->size, ra->async_size); } -/** - * page_cache_sync_readahead - generic file readahead - * @mapping: address_space which holds the pagecache and I/O vectors - * @ra: file_ra_state which holds the readahead state - * @filp: passed on to ->readpage() and ->readpages() - * @index: Index of first page to be read. - * @req_count: Total number of pages being read by the caller. - * - * page_cache_sync_readahead() should be called when a cache miss happened: - * it will submit the read. The readahead logic may decide to piggyback more - * pages onto the read request if access patterns suggest it will improve - * performance. - */ -void page_cache_sync_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - pgoff_t index, unsigned long req_count) +void page_cache_sync_ra(struct readahead_control *ractl, + struct file_ra_state *ra, unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -576,35 +560,19 @@ void page_cache_sync_readahead(struct address_space *mapping, return; /* be dumb */ - if (filp && (filp->f_mode & FMODE_RANDOM)) { - force_page_cache_readahead(mapping, filp, index, req_count); + if (ractl->file && (ractl->file->f_mode & FMODE_RANDOM)) { + force_page_cache_ra(ractl, ra, req_count); return; } /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, false, index, req_count); + ondemand_readahead(ractl, ra, false, req_count); } -EXPORT_SYMBOL_GPL(page_cache_sync_readahead); +EXPORT_SYMBOL_GPL(page_cache_sync_ra); -/** - * page_cache_async_readahead - file readahead for marked pages - * @mapping: address_space which holds the pagecache and I/O vectors - * @ra: file_ra_state which holds the readahead state - * @filp: passed on to ->readpage() and ->readpages() - * @page: The page at @index which triggered the readahead call. - * @index: Index of first page to be read. - * @req_count: Total number of pages being read by the caller. - * - * page_cache_async_readahead() should be called when a page is used which - * is marked as PageReadahead; this is a marker to suggest that the application - * has used up enough of the readahead window that we should start pulling in - * more pages. - */ -void -page_cache_async_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - struct page *page, pgoff_t index, - unsigned long req_count) +void page_cache_async_ra(struct readahead_control *ractl, + struct file_ra_state *ra, struct page *page, + unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -621,16 +589,16 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. */ - if (inode_read_congested(mapping->host)) + if (inode_read_congested(ractl->mapping->host)) return; if (blk_cgroup_congested()) return; /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, true, index, req_count); + ondemand_readahead(ractl, ra, true, req_count); } -EXPORT_SYMBOL_GPL(page_cache_async_readahead); +EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { diff --git a/mm/rmap.c b/mm/rmap.c index 9425260774a1..1b84945d655c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1205,7 +1205,7 @@ void page_add_file_rmap(struct page *page, bool compound) VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { if (atomic_inc_and_test(&page[i]._mapcount)) nr++; } @@ -1246,7 +1246,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) /* page still mapped by someone else? */ if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } @@ -1293,7 +1293,7 @@ static void page_remove_anon_compound_rmap(struct page *page) * Subpages can be mapped with PTEs too. Check how many of * them are still mapped. */ - for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } @@ -1303,10 +1303,10 @@ static void page_remove_anon_compound_rmap(struct page *page) * page of the compound page is unmapped, but at least one * small page is still mapped. */ - if (nr && nr < HPAGE_PMD_NR) + if (nr && nr < thp_nr_pages(page)) deferred_split_huge_page(page); } else { - nr = HPAGE_PMD_NR; + nr = thp_nr_pages(page); } if (unlikely(PageMlocked(page))) diff --git a/mm/shmem.c b/mm/shmem.c index 6d4ddef4a24f..537c137698f8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3984,7 +3984,7 @@ static struct file_system_type shmem_fs_type = { .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT, }; int __init shmem_init(void) diff --git a/mm/shuffle.c b/mm/shuffle.c index 9b5cd4b004b0..9c2e145a747a 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -60,7 +60,7 @@ static struct page * __meminit shuffle_valid_page(struct zone *zone, * ...is the page on the same list as the page we will * shuffle it with? */ - if (page_order(page) != order) + if (buddy_order(page) != order) return NULL; return page; diff --git a/mm/slab.c b/mm/slab.c index 399a9d185b0f..b1113561b98b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1062,7 +1062,7 @@ int slab_prepare_cpu(unsigned int cpu) * Even if all the cpus of a node are down, we don't free the * kmem_cache_node of any cache. This to avoid a race between cpu_down, and * a kmalloc allocation from another cpu for memory from the node of - * the cpu going down. The list3 structure is usually allocated from + * the cpu going down. The kmem_cache_node structure is usually allocated from * kmem_cache_create() and gets destroyed at kmem_cache_destroy(). */ int slab_dead_cpu(unsigned int cpu) diff --git a/mm/slab.h b/mm/slab.h index 6dd4b702888a..06c6587765a3 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -46,7 +46,6 @@ struct kmem_cache { #include <linux/kmemleak.h> #include <linux/random.h> #include <linux/sched/mm.h> -#include <linux/kmemleak.h> /* * State of the slab allocator. diff --git a/mm/slub.c b/mm/slub.c index 61d0d2968413..b30be2385d1c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1956,7 +1956,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, /* * Racy check. If we mistakenly see no partial slabs then we * just allocate an empty slab. If we mistakenly try to get a - * partial slab and there is none available then get_partials() + * partial slab and there is none available then get_partial() * will return NULL. */ if (!n || !n->nr_partial) diff --git a/mm/sparse.c b/mm/sparse.c index b25ad8e64839..7bd23f9d6cef 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -312,6 +312,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p return coded_mem_map; } +#ifdef CONFIG_MEMORY_HOTPLUG /* * Decode mem_map from the coded memmap */ @@ -321,6 +322,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn coded_mem_map &= SECTION_MAP_MASK; return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); } +#endif /* CONFIG_MEMORY_HOTPLUG */ static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, diff --git a/mm/swap_state.c b/mm/swap_state.c index aa40e706604c..ee465827420e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -246,7 +246,7 @@ int add_to_swap(struct page *page) goto fail; /* * Normally the page will be dirtied in unmap because its pte should be - * dirty. A special case is MADV_FREE page. The page'e pte could have + * dirty. A special case is MADV_FREE page. The page's pte could have * dirty bit cleared but the page's SwapBacked bit is still set because * clearing the dirty bit and SwapBacked bit has no lock protected. For * such page, unmap will not set dirty bit for it, so page reclaim will diff --git a/mm/truncate.c b/mm/truncate.c index 6bbe0f0b3ce9..18cec39a9f53 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -168,7 +168,7 @@ void do_invalidatepage(struct page *page, unsigned int offset, * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). * - * We need to bale out if page->mapping is no longer equal to the original + * We need to bail out if page->mapping is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. @@ -177,12 +177,12 @@ static void truncate_cleanup_page(struct address_space *mapping, struct page *page) { if (page_mapped(page)) { - pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1; + unsigned int nr = thp_nr_pages(page); unmap_mapping_pages(mapping, page->index, nr, false); } if (page_has_private(page)) - do_invalidatepage(page, 0, PAGE_SIZE); + do_invalidatepage(page, 0, thp_size(page)); /* * Some filesystems seem to re-dirty the page even after diff --git a/mm/util.c b/mm/util.c index 4e21fe7eae27..4ddb6e186dd5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -69,7 +69,8 @@ EXPORT_SYMBOL(kstrdup); * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * - * Note: Strings allocated by kstrdup_const should be freed by kfree_const. + * Note: Strings allocated by kstrdup_const should be freed by kfree_const and + * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. diff --git a/mm/vmscan.c b/mm/vmscan.c index 879fb57c5045..1b8f0e059767 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -725,8 +725,7 @@ static inline int is_page_cache_freeable(struct page *page) * that isolated the page, the page cache and optional buffer * heads at page->private. */ - int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ? - HPAGE_PMD_NR : 1; + int page_cache_pins = thp_nr_pages(page); return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } @@ -2240,7 +2239,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); - u64 fraction[2]; + u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; unsigned long ap, fp; diff --git a/mm/vmstat.c b/mm/vmstat.c index 4f7b4ee6aa12..698bc0bc18d1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -325,7 +325,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, t = __this_cpu_read(pcp->stat_threshold); - if (unlikely(x > t || x < -t)) { + if (unlikely(abs(x) > t)) { zone_page_state_add(x, zone, item); x = 0; } @@ -350,7 +350,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, t = __this_cpu_read(pcp->stat_threshold); - if (unlikely(x > t || x < -t)) { + if (unlikely(abs(x) > t)) { node_page_state_add(x, pgdat, item); x = 0; } @@ -511,7 +511,7 @@ static inline void mod_zone_state(struct zone *zone, o = this_cpu_read(*p); n = delta + o; - if (n > t || n < -t) { + if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; /* Overflow must be added to zone counters */ @@ -573,7 +573,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, o = this_cpu_read(*p); n = delta + o; - if (n > t || n < -t) { + if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; /* Overflow must be added to node counters */ diff --git a/mm/workingset.c b/mm/workingset.c index 92e66113a577..8ed8e6296d8c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -216,7 +216,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, /** * workingset_age_nonresident - age non-resident entries as LRU ages - * @memcg: the lruvec that was aged + * @lruvec: the lruvec that was aged * @nr_pages: the number of pages to count * * As in-memory pages are aged, non-resident pages need to be aged as diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 27348029b2b8..4e3fff0745e8 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -4,7 +4,15 @@ ifdef CONFIG_UBSAN_ALIGNMENT endif ifdef CONFIG_UBSAN_BOUNDS - CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds) + ifdef CONFIG_CC_IS_CLANG + CFLAGS_UBSAN += -fsanitize=array-bounds + else + CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds) + endif +endif + +ifdef CONFIG_UBSAN_LOCAL_BOUNDS + CFLAGS_UBSAN += -fsanitize=local-bounds endif ifdef CONFIG_UBSAN_MISC diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 504d2e431c60..4223a9ac7059 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -65,6 +65,7 @@ my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANC # git output parsing needs US English output, so first set backtick child process LANGUAGE my $git_command ='export LANGUAGE=en_US.UTF-8; git'; my $tabsize = 8; +my ${CONFIG_} = "CONFIG_"; sub help { my ($exitcode) = @_; @@ -127,6 +128,8 @@ Options: --typedefsfile Read additional types from this file --color[=WHEN] Use colors 'always', 'never', or only when output is a terminal ('auto'). Default is 'auto'. + --kconfig-prefix=WORD use WORD as a prefix for Kconfig symbols (default + ${CONFIG_}) -h, --help, --version display this help and exit When FILE is - read standard input. @@ -235,6 +238,7 @@ GetOptions( 'color=s' => \$color, 'no-color' => \$color, #keep old behaviors of -nocolor 'nocolor' => \$color, #keep old behaviors of -nocolor + 'kconfig-prefix=s' => \${CONFIG_}, 'h|help' => \$help, 'version' => \$help ) or help(1); @@ -970,6 +974,16 @@ sub seed_camelcase_includes { } } +sub git_is_single_file { + my ($filename) = @_; + + return 0 if ((which("git") eq "") || !(-e "$gitroot")); + + my $output = `${git_command} ls-files -- $filename 2>/dev/null`; + my $count = $output =~ tr/\n//; + return $count eq 1 && $output =~ m{^${filename}$}; +} + sub git_commit_info { my ($commit, $id, $desc) = @_; @@ -1043,6 +1057,9 @@ my $vname; $allow_c99_comments = !defined $ignore_type{"C99_COMMENT_TOLERANCE"}; for my $filename (@ARGV) { my $FILE; + my $is_git_file = git_is_single_file($filename); + my $oldfile = $file; + $file = 1 if ($is_git_file); if ($git) { open($FILE, '-|', "git format-patch -M --stdout -1 $filename") || die "$P: $filename: git format-patch failed - $!\n"; @@ -1087,6 +1104,7 @@ for my $filename (@ARGV) { @modifierListFile = (); @typeListFile = (); build_types(); + $file = $oldfile if ($is_git_file); } if (!$quiet) { @@ -1163,10 +1181,10 @@ sub parse_email { } } + $comment = trim($comment); $name = trim($name); $name =~ s/^\"|\"$//g; - $name =~ s/(\s*\([^\)]+\))\s*//; - if (defined($1)) { + if ($name =~ s/(\s*\([^\)]+\))\s*//) { $name_comment = trim($1); } $address = trim($address); @@ -1181,10 +1199,12 @@ sub parse_email { } sub format_email { - my ($name, $address) = @_; + my ($name, $name_comment, $address, $comment) = @_; my $formatted_email; + $name_comment = trim($name_comment); + $comment = trim($comment); $name = trim($name); $name =~ s/^\"|\"$//g; $address = trim($address); @@ -1197,9 +1217,9 @@ sub format_email { if ("$name" eq "") { $formatted_email = "$address"; } else { - $formatted_email = "$name <$address>"; + $formatted_email = "$name$name_comment <$address>"; } - + $formatted_email .= "$comment"; return $formatted_email; } @@ -1207,17 +1227,23 @@ sub reformat_email { my ($email) = @_; my ($email_name, $name_comment, $email_address, $comment) = parse_email($email); - return format_email($email_name, $email_address); + return format_email($email_name, $name_comment, $email_address, $comment); } sub same_email_addresses { - my ($email1, $email2) = @_; + my ($email1, $email2, $match_comment) = @_; my ($email1_name, $name1_comment, $email1_address, $comment1) = parse_email($email1); my ($email2_name, $name2_comment, $email2_address, $comment2) = parse_email($email2); + if ($match_comment != 1) { + return $email1_name eq $email2_name && + $email1_address eq $email2_address; + } return $email1_name eq $email2_name && - $email1_address eq $email2_address; + $email1_address eq $email2_address && + $name1_comment eq $name2_comment && + $comment1 eq $comment2; } sub which { @@ -2347,6 +2373,7 @@ sub process { my $signoff = 0; my $author = ''; my $authorsignoff = 0; + my $author_sob = ''; my $is_patch = 0; my $is_binding_patch = -1; my $in_header_lines = $file ? 0 : 1; @@ -2661,6 +2688,10 @@ sub process { # Check the patch for a From: if (decode("MIME-Header", $line) =~ /^From:\s*(.*)/) { $author = $1; + my $curline = $linenr; + while(defined($rawlines[$curline]) && ($rawlines[$curline++] =~ /^[ \t]\s*(.*)/)) { + $author .= $1; + } $author = encode("utf8", $author) if ($line =~ /=\?utf-8\?/i); $author =~ s/"//g; $author = reformat_email($author); @@ -2670,9 +2701,37 @@ sub process { if ($line =~ /^\s*signed-off-by:\s*(.*)/i) { $signoff++; $in_commit_log = 0; - if ($author ne '') { - if (same_email_addresses($1, $author)) { + if ($author ne '' && $authorsignoff != 1) { + if (same_email_addresses($1, $author, 1)) { $authorsignoff = 1; + } else { + my $ctx = $1; + my ($email_name, $email_comment, $email_address, $comment1) = parse_email($ctx); + my ($author_name, $author_comment, $author_address, $comment2) = parse_email($author); + + if ($email_address eq $author_address && $email_name eq $author_name) { + $author_sob = $ctx; + $authorsignoff = 2; + } elsif ($email_address eq $author_address) { + $author_sob = $ctx; + $authorsignoff = 3; + } elsif ($email_name eq $author_name) { + $author_sob = $ctx; + $authorsignoff = 4; + + my $address1 = $email_address; + my $address2 = $author_address; + + if ($address1 =~ /(\S+)\+\S+(\@.*)/) { + $address1 = "$1$2"; + } + if ($address2 =~ /(\S+)\+\S+(\@.*)/) { + $address2 = "$1$2"; + } + if ($address1 eq $address2) { + $authorsignoff = 5; + } + } } } } @@ -2729,7 +2788,7 @@ sub process { } my ($email_name, $name_comment, $email_address, $comment) = parse_email($email); - my $suggested_email = format_email(($email_name, $email_address)); + my $suggested_email = format_email(($email_name, $name_comment, $email_address, $comment)); if ($suggested_email eq "") { ERROR("BAD_SIGN_OFF", "Unrecognized email address: '$email'\n" . $herecurr); @@ -2739,9 +2798,9 @@ sub process { $dequoted =~ s/" </ </; # Don't force email to have quotes # Allow just an angle bracketed address - if (!same_email_addresses($email, $suggested_email)) { + if (!same_email_addresses($email, $suggested_email, 0)) { WARN("BAD_SIGN_OFF", - "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr); + "email address '$email' might be better as '$suggested_email'\n" . $herecurr); } } @@ -2987,6 +3046,42 @@ sub process { } } +# check for repeated words separated by a single space + if ($rawline =~ /^\+/ || $in_commit_log) { + while ($rawline =~ /\b($word_pattern) (?=($word_pattern))/g) { + + my $first = $1; + my $second = $2; + + if ($first =~ /(?:struct|union|enum)/) { + pos($rawline) += length($first) + length($second) + 1; + next; + } + + next if ($first ne $second); + next if ($first eq 'long'); + + if (WARN("REPEATED_WORD", + "Possible repeated word: '$first'\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\b$first $second\b/$first/; + } + } + + # if it's a repeated word on consecutive lines in a comment block + if ($prevline =~ /$;+\s*$/ && + $prevrawline =~ /($word_pattern)\s*$/) { + my $last_word = $1; + if ($rawline =~ /^\+\s*\*\s*$last_word /) { + if (WARN("REPEATED_WORD", + "Possible repeated word: '$last_word'\n" . $hereprev) && + $fix) { + $fixed[$fixlinenr] =~ s/(\+\s*\*\s*)$last_word /$1/; + } + } + } + } + # ignore non-hunk lines and lines being removed next if (!$hunk_line || $line =~ /^-/); @@ -3213,6 +3308,12 @@ sub process { } } +# check for embedded filenames + if ($rawline =~ /^\+.*\Q$realfile\E/) { + WARN("EMBEDDED_FILENAME", + "It's generally not useful to have the filename in the file\n" . $herecurr); + } + # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/); @@ -3310,42 +3411,6 @@ sub process { } } -# check for repeated words separated by a single space - if ($rawline =~ /^\+/) { - while ($rawline =~ /\b($word_pattern) (?=($word_pattern))/g) { - - my $first = $1; - my $second = $2; - - if ($first =~ /(?:struct|union|enum)/) { - pos($rawline) += length($first) + length($second) + 1; - next; - } - - next if ($first ne $second); - next if ($first eq 'long'); - - if (WARN("REPEATED_WORD", - "Possible repeated word: '$first'\n" . $herecurr) && - $fix) { - $fixed[$fixlinenr] =~ s/\b$first $second\b/$first/; - } - } - - # if it's a repeated word on consecutive lines in a comment block - if ($prevline =~ /$;+\s*$/ && - $prevrawline =~ /($word_pattern)\s*$/) { - my $last_word = $1; - if ($rawline =~ /^\+\s*\*\s*$last_word /) { - if (WARN("REPEATED_WORD", - "Possible repeated word: '$last_word'\n" . $hereprev) && - $fix) { - $fixed[$fixlinenr] =~ s/(\+\s*\*\s*)$last_word /$1/; - } - } - } - } - # check for space before tabs. if ($rawline =~ /^\+/ && $rawline =~ / \t/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; @@ -3436,7 +3501,7 @@ sub process { if ($realfile =~ m@^(drivers/net/|net/)@ && $prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ && $rawline =~ /^\+[ \t]*\*/ && - $realline > 2) { + $realline > 3) { # Do not warn about the initial copyright comment block after SPDX-License-Identifier WARN("NETWORKING_BLOCK_COMMENT_STYLE", "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev); } @@ -3895,6 +3960,17 @@ sub process { #ignore lines not being added next if ($line =~ /^[^\+]/); +# check for self assignments used to avoid compiler warnings +# e.g.: int foo = foo, *bar = NULL; +# struct foo bar = *(&(bar)); + if ($line =~ /^\+\s*(?:$Declare)?([A-Za-z_][A-Za-z\d_]*)\s*=/) { + my $var = $1; + if ($line =~ /^\+\s*(?:$Declare)?$var\s*=\s*(?:$var|\*\s*\(?\s*&\s*\(?\s*$var\s*\)?\s*\)?)\s*[;,]/) { + WARN("SELF_ASSIGNMENT", + "Do not use self-assignments to avoid compiler warnings\n" . $herecurr); + } + } + # check for dereferences that span multiple lines if ($prevline =~ /^\+.*$Lval\s*(?:\.|->)\s*$/ && $line =~ /^\+\s*(?!\#\s*(?!define\s+|if))\s*$Lval/) { @@ -4270,6 +4346,12 @@ sub process { "Prefer dev_$level(... to dev_printk(KERN_$orig, ...\n" . $herecurr); } +# trace_printk should not be used in production code. + if ($line =~ /\b(trace_printk|trace_puts|ftrace_vprintk)\s*\(/) { + WARN("TRACE_PRINTK", + "Do not use $1() in production code (this can be ignored if built only with a debug config option)\n" . $herecurr); + } + # ENOSYS means "bad syscall nr" and nothing else. This will have a small # number of false positives, but assembly files are not checked, so at # least the arch entry code will not trigger this warning. @@ -4936,6 +5018,17 @@ sub process { } } +# check if a statement with a comma should be two statements like: +# foo = bar(), /* comma should be semicolon */ +# bar = baz(); + if (defined($stat) && + $stat =~ /^\+\s*(?:$Lval\s*$Assignment\s*)?$FuncArg\s*,\s*(?:$Lval\s*$Assignment\s*)?$FuncArg\s*;\s*$/) { + my $cnt = statement_rawlines($stat); + my $herectx = get_stat_here($linenr, $cnt, $here); + WARN("SUSPECT_COMMA_SEMICOLON", + "Possible comma where semicolon could be used\n" . $herectx); + } + # return is not a function if (defined($stat) && $stat =~ /^.\s*return(\s*)\(/s) { my $spacing = $1; @@ -5295,9 +5388,9 @@ sub process { $dstat =~ s/\s*$//s; # Flatten any parentheses and braces - while ($dstat =~ s/\([^\(\)]*\)/1/ || - $dstat =~ s/\{[^\{\}]*\}/1/ || - $dstat =~ s/.\[[^\[\]]*\]/1/) + while ($dstat =~ s/\([^\(\)]*\)/1u/ || + $dstat =~ s/\{[^\{\}]*\}/1u/ || + $dstat =~ s/.\[[^\[\]]*\]/1u/) { } @@ -5338,6 +5431,7 @@ sub process { $dstat !~ /^\.$Ident\s*=/ && # .foo = $dstat !~ /^(?:\#\s*$Ident|\#\s*$Constant)\s*$/ && # stringification #foo $dstat !~ /^do\s*$Constant\s*while\s*$Constant;?$/ && # do {...} while (...); // do {...} while (...) + $dstat !~ /^while\s*$Constant\s*$Constant\s*$/ && # while (...) {...} $dstat !~ /^for\s*$Constant$/ && # for (...) $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() $dstat !~ /^do\s*{/ && # do {... @@ -6524,16 +6618,16 @@ sub process { } # check for IS_ENABLED() without CONFIG_<FOO> ($rawline for comments too) - if ($rawline =~ /\bIS_ENABLED\s*\(\s*(\w+)\s*\)/ && $1 !~ /^CONFIG_/) { + if ($rawline =~ /\bIS_ENABLED\s*\(\s*(\w+)\s*\)/ && $1 !~ /^${CONFIG_}/) { WARN("IS_ENABLED_CONFIG", - "IS_ENABLED($1) is normally used as IS_ENABLED(CONFIG_$1)\n" . $herecurr); + "IS_ENABLED($1) is normally used as IS_ENABLED(${CONFIG_}$1)\n" . $herecurr); } # check for #if defined CONFIG_<FOO> || defined CONFIG_<FOO>_MODULE - if ($line =~ /^\+\s*#\s*if\s+defined(?:\s*\(?\s*|\s+)(CONFIG_[A-Z_]+)\s*\)?\s*\|\|\s*defined(?:\s*\(?\s*|\s+)\1_MODULE\s*\)?\s*$/) { + if ($line =~ /^\+\s*#\s*if\s+defined(?:\s*\(?\s*|\s+)(${CONFIG_}[A-Z_]+)\s*\)?\s*\|\|\s*defined(?:\s*\(?\s*|\s+)\1_MODULE\s*\)?\s*$/) { my $config = $1; if (WARN("PREFER_IS_ENABLED", - "Prefer IS_ENABLED(<FOO>) to CONFIG_<FOO> || CONFIG_<FOO>_MODULE\n" . $herecurr) && + "Prefer IS_ENABLED(<FOO>) to ${CONFIG_}<FOO> || ${CONFIG_}<FOO>_MODULE\n" . $herecurr) && $fix) { $fixed[$fixlinenr] = "\+#if IS_ENABLED($config)"; } @@ -6886,9 +6980,33 @@ sub process { if ($signoff == 0) { ERROR("MISSING_SIGN_OFF", "Missing Signed-off-by: line(s)\n"); - } elsif (!$authorsignoff) { - WARN("NO_AUTHOR_SIGN_OFF", - "Missing Signed-off-by: line by nominal patch author '$author'\n"); + } elsif ($authorsignoff != 1) { + # authorsignoff values: + # 0 -> missing sign off + # 1 -> sign off identical + # 2 -> names and addresses match, comments mismatch + # 3 -> addresses match, names different + # 4 -> names match, addresses different + # 5 -> names match, addresses excluding subaddress details (refer RFC 5233) match + + my $sob_msg = "'From: $author' != 'Signed-off-by: $author_sob'"; + + if ($authorsignoff == 0) { + ERROR("NO_AUTHOR_SIGN_OFF", + "Missing Signed-off-by: line by nominal patch author '$author'\n"); + } elsif ($authorsignoff == 2) { + CHK("FROM_SIGN_OFF_MISMATCH", + "From:/Signed-off-by: email comments mismatch: $sob_msg\n"); + } elsif ($authorsignoff == 3) { + WARN("FROM_SIGN_OFF_MISMATCH", + "From:/Signed-off-by: email name mismatch: $sob_msg\n"); + } elsif ($authorsignoff == 4) { + WARN("FROM_SIGN_OFF_MISMATCH", + "From:/Signed-off-by: email address mismatch: $sob_msg\n"); + } elsif ($authorsignoff == 5) { + WARN("FROM_SIGN_OFF_MISMATCH", + "From:/Signed-off-by: email subaddress mismatch: $sob_msg\n"); + } } } diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch index e9df9cc28a85..1aae4f4fdacc 100644 --- a/scripts/const_structs.checkpatch +++ b/scripts/const_structs.checkpatch @@ -39,6 +39,9 @@ nlmsvc_binding nvkm_device_chip of_device_id pci_raw_ops +phy_ops +pinctrl_ops +pinmux_ops pipe_buf_operations platform_hibernation_ops platform_suspend_ops diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 6a56bba233a9..09cd871925a5 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -167,6 +167,9 @@ values of that process namespace""" if not namespace: raise gdb.GdbError("No namespace for current process") + gdb.write("{:^18} {:^15} {:>9} {} {} options\n".format( + "mount", "super_block", "devname", "pathname", "fstype")) + for vfs in lists.list_for_each_entry(namespace['list'], mount_ptr_type, "mnt_list"): devname = vfs['mnt_devname'].string() @@ -190,14 +193,10 @@ values of that process namespace""" m_flags = int(vfs['mnt']['mnt_flags']) rd = "ro" if (s_flags & constants.LX_SB_RDONLY) else "rw" - gdb.write( - "{} {} {} {}{}{} 0 0\n" - .format(devname, - pathname, - fstype, - rd, - info_opts(FS_INFO, s_flags), - info_opts(MNT_INFO, m_flags))) + gdb.write("{} {} {} {} {} {}{}{} 0 0\n".format( + vfs.format_string(), superblock.format_string(), devname, + pathname, fstype, rd, info_opts(FS_INFO, s_flags), + info_opts(MNT_INFO, m_flags))) LxMounts() diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py index 0301dc1e0138..17ec19e9b5bf 100644 --- a/scripts/gdb/linux/tasks.py +++ b/scripts/gdb/linux/tasks.py @@ -73,11 +73,12 @@ class LxPs(gdb.Command): super(LxPs, self).__init__("lx-ps", gdb.COMMAND_DATA) def invoke(self, arg, from_tty): + gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM")) for task in task_lists(): - gdb.write("{address} {pid} {comm}\n".format( - address=task, - pid=task["pid"], - comm=task["comm"].string())) + gdb.write("{} {:^5} {}\n".format( + task.format_string().split()[0], + task["pid"].format_string(), + task["comm"].string())) LxPs() diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 484d2fbf5921..2075db0c08b8 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -541,6 +541,9 @@ foreach my $file (@ARGV) { die "$P: file '${file}' not found\n"; } } + if ($from_filename && (vcs_exists() && !vcs_file_exists($file))) { + warn "$P: file '$file' not found in version control $!\n"; + } if ($from_filename || ($file ne "&STDIN" && vcs_file_exists($file))) { $file =~ s/^\Q${cur_path}\E//; #strip any absolute path $file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree @@ -954,8 +957,10 @@ sub get_maintainers { foreach my $file (@files) { if ($email && - ($email_git || ($email_git_fallback && - !$exact_pattern_match_hash{$file}))) { + ($email_git || + ($email_git_fallback && + $file !~ /MAINTAINERS$/ && + !$exact_pattern_match_hash{$file}))) { vcs_file_signoffs($file); } if ($email && $email_git_blame) { diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore index 344a99c6da1b..9e2f00343f15 100644 --- a/tools/testing/selftests/exec/.gitignore +++ b/tools/testing/selftests/exec/.gitignore @@ -7,6 +7,7 @@ execveat.moved execveat.path.ephemeral execveat.ephemeral execveat.denatured +/load_address_* /recursion-depth xxxxxxxx* pipe diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 0a13b110c1e6..cf69b2fcce59 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -4,7 +4,7 @@ CFLAGS += -Wno-nonnull CFLAGS += -D_GNU_SOURCE TEST_PROGS := binfmt_script non-regular -TEST_GEN_PROGS := execveat +TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216 TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir pipe # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile @@ -27,4 +27,9 @@ $(OUTPUT)/execveat.symlink: $(OUTPUT)/execveat $(OUTPUT)/execveat.denatured: $(OUTPUT)/execveat cp $< $@ chmod -x $@ - +$(OUTPUT)/load_address_4096: load_address.c + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000 -pie $< -o $@ +$(OUTPUT)/load_address_2097152: load_address.c + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x200000 -pie $< -o $@ +$(OUTPUT)/load_address_16777216: load_address.c + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000000 -pie $< -o $@ diff --git a/tools/testing/selftests/exec/load_address.c b/tools/testing/selftests/exec/load_address.c new file mode 100644 index 000000000000..d487c2f6a615 --- /dev/null +++ b/tools/testing/selftests/exec/load_address.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <link.h> +#include <stdio.h> +#include <stdlib.h> + +struct Statistics { + unsigned long long load_address; + unsigned long long alignment; +}; + +int ExtractStatistics(struct dl_phdr_info *info, size_t size, void *data) +{ + struct Statistics *stats = (struct Statistics *) data; + int i; + + if (info->dlpi_name != NULL && info->dlpi_name[0] != '\0') { + // Ignore headers from other than the executable. + return 2; + } + + stats->load_address = (unsigned long long) info->dlpi_addr; + stats->alignment = 0; + + for (i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type != PT_LOAD) + continue; + + if (info->dlpi_phdr[i].p_align > stats->alignment) + stats->alignment = info->dlpi_phdr[i].p_align; + } + + return 1; // Terminate dl_iterate_phdr. +} + +int main(int argc, char **argv) +{ + struct Statistics extracted; + unsigned long long misalign; + int ret; + + ret = dl_iterate_phdr(ExtractStatistics, &extracted); + if (ret != 1) { + fprintf(stderr, "FAILED\n"); + return 1; + } + + if (extracted.alignment == 0) { + fprintf(stderr, "No alignment found\n"); + return 1; + } else if (extracted.alignment & (extracted.alignment - 1)) { + fprintf(stderr, "Alignment is not a power of 2\n"); + return 1; + } + + misalign = extracted.load_address & (extracted.alignment - 1); + if (misalign) { + printf("alignment = %llu, load_address = %llu\n", + extracted.alignment, extracted.load_address); + fprintf(stderr, "FAILED\n"); + return 1; + } + + fprintf(stderr, "PASS\n"); + return 0; +} |