diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-06-07 15:13:32 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-06-07 15:23:33 +1000 |
commit | 27a38b1355d79eadbe88bb2567f333acb93347f3 (patch) | |
tree | 1bb5f219d62e43949e37025de8231a2466740f3f | |
parent | b512ca221d38c9a1e734ec3593dfd73164553adf (diff) | |
parent | 3ba56b246344e6361cc5a00f88fd472451c57ad0 (diff) |
Merge branch 'akpm-current/current'
231 files changed, 6502 insertions, 3165 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 451b6d882b2c..3c1945f4d5f2 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -90,3 +90,13 @@ Description: device's debugging info useful for kernel developers. Its format is not documented intentionally and may change anytime without any notice. + +What: /sys/block/zram<id>/use_dedup +Date: March 2017 +Contact: Joonsoo Kim <iamjoonsoo.kim@lge.com> +Description: + The use_dedup file is read-write and specifies deduplication + feature is used or not. If enabled, duplicated data is + managed by reference count and will not be stored in memory + twice. Benefit of this feature largely depends on the workload + so keep attention when use. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bc098de4e20a..d76c4b31c392 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2295,8 +2295,11 @@ that the amount of memory usable for all allocations is not too small. - movable_node [KNL] Boot-time switch to enable the effects - of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details. + movable_node [KNL] Boot-time switch to make hotplugable memory + NUMA nodes to be movable. This means that the memory + of such nodes will be usable only for movable + allocations which rules out almost all kernel + allocations. Use with caution! MTD_Partition= [MTD] Format: <name>,<region-number>,<size>,<offset> diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 4fced8a21307..cbbe39b39953 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations comp_algorithm RW show and change the compression algorithm compact WO trigger memory compaction debug_stat RO this file is used for zram debugging purposes +use_dedup RW show and set deduplication feature User space is advised to use the following files to read the device statistics. @@ -217,6 +218,8 @@ line of text and contains the following stats separated by whitespace: same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. pages_compacted the number of pages freed during compaction + dup_data_size deduplicated data size + meta_data_size the amount of metadata allocated for deduplication feature 9) Deactivate: swapoff /dev/zram0 diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt index 946e69103cdd..cefb63639070 100644 --- a/Documentation/cgroup-v1/memory.txt +++ b/Documentation/cgroup-v1/memory.txt @@ -789,23 +789,46 @@ way to trigger. Applications should do whatever they can to help the system. It might be too late to consult with vmstat or any other statistics, so it's advisable to take an immediate action. -The events are propagated upward until the event is handled, i.e. the -events are not pass-through. Here is what this means: for example you have -three cgroups: A->B->C. Now you set up an event listener on cgroups A, B -and C, and suppose group C experiences some pressure. In this situation, -only group C will receive the notification, i.e. groups A and B will not -receive it. This is done to avoid excessive "broadcasting" of messages, -which disturbs the system and which is especially bad if we are low on -memory or thrashing. So, organize the cgroups wisely, or propagate the -events manually (or, ask us to implement the pass-through events, -explaining why would you need them.) +By default, events are propagated upward until the event is handled, i.e. the +events are not pass-through. For example, you have three cgroups: A->B->C. Now +you set up an event listener on cgroups A, B and C, and suppose group C +experiences some pressure. In this situation, only group C will receive the +notification, i.e. groups A and B will not receive it. This is done to avoid +excessive "broadcasting" of messages, which disturbs the system and which is +especially bad if we are low on memory or thrashing. Group B, will receive +notification only if there are no event listers for group C. + +There are three optional modes that specify different propagation behavior: + + - "default": this is the default behavior specified above. This mode is the + same as omitting the optional mode parameter, preserved by backwards + compatibility. + + - "hierarchy": events always propagate up to the root, similar to the default + behavior, except that propagation continues regardless of whether there are + event listeners at each level, with the "hierarchy" mode. In the above + example, groups A, B, and C will receive notification of memory pressure. + + - "local": events are pass-through, i.e. they only receive notifications when + memory pressure is experienced in the memcg for which the notification is + registered. In the above example, group C will receive notification if + registered for "local" notification and the group experiences memory + pressure. However, group B will never receive notification, regardless if + there is an event listener for group C or not, if group B is registered for + local notification. + +The level and event notification mode ("hierarchy" or "local", if necessary) are +specified by a comma-delimited string, i.e. "low,hierarchy" specifies +hierarchical, pass-through, notification for all ancestor memcgs. Notification +that is the default, non pass-through behavior, does not specify a mode. +"medium,local" specifies pass-through notification for the medium level. The file memory.pressure_level is only used to setup an eventfd. To register a notification, an application must: - create an eventfd using eventfd(2); - open memory.pressure_level; -- write string like "<event_fd> <fd of memory.pressure_level> <level>" +- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>" to cgroup.event_control. Application will be notified through eventfd when memory pressure is at @@ -821,7 +844,7 @@ Test: # cd /sys/fs/cgroup/memory/ # mkdir foo # cd foo - # cgroup_event_listener memory.pressure_level low & + # cgroup_event_listener memory.pressure_level low,hierarchy & # echo 8000000 > memory.limit_in_bytes # echo 8000000 > memory.memsw.limit_in_bytes # echo $$ > tasks diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index dc5e2dcdbef4..a86f3cb88125 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -826,13 +826,25 @@ PAGE_SIZE multiple when read back. The number of times the cgroup's memory usage was about to go over the max boundary. If direct reclaim - fails to bring it down, the OOM killer is invoked. + fails to bring it down, the cgroup goes to OOM state. oom - The number of times the OOM killer has been invoked in - the cgroup. This may not exactly match the number of - processes killed but should generally be close. + The number of time the cgroup's memory usage was + reached the limit and allocation was about to fail. + + Depending on context result could be invocation of OOM + killer and retrying allocation or failing alloction. + + Failed allocation in its turn could be returned into + userspace as -ENOMEM or siletly ignored in cases like + disk readahead. For now OOM in memory cgroup kills + tasks iff shortage has happened inside page fault. + + oom_kill + + The number of processes belonging to this cgroup + killed by any kind of OOM killer. memory.stat @@ -930,6 +942,34 @@ PAGE_SIZE multiple when read back. Number of times a shadow node has been reclaimed + pgrefill + + Amount of scanned pages (in an active LRU list) + + pgscan + + Amount of scanned pages (in an inactive LRU list) + + pgsteal + + Amount of reclaimed pages + + pgactivate + + Amount of pages moved to the active LRU list + + pgdeactivate + + Amount of pages moved to the inactive LRU lis + + pglazyfree + + Amount of pages postponed to be freed under memory pressure + + pglazyfreed + + Amount of reclaimed lazyfree pages + memory.swap.current A read-only single value file which exists on non-root diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index b2391b829169..cb8862659178 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -150,6 +150,7 @@ See the include/linux/kmemleak.h header for the functions prototype. - ``kmemleak_init`` - initialize kmemleak - ``kmemleak_alloc`` - notify of a memory block allocation - ``kmemleak_alloc_percpu`` - notify of a percpu memory block allocation +- ``kmemleak_vmalloc`` - notify of a vmalloc() memory allocation - ``kmemleak_free`` - notify of a memory block freeing - ``kmemleak_free_part`` - notify of a partial memory block freeing - ``kmemleak_free_percpu`` - notify of a percpu memory block freeing diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 415484f3d59a..918972babcd8 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -134,6 +134,23 @@ use the boot option: fail_futex= mmc_core.fail_request=<interval>,<probability>,<space>,<times> +o proc entries + +- /proc/<pid>/fail-nth: +- /proc/self/task/<tid>/fail-nth: + + Write to this file of integer N makes N-th call in the task fail. + Read from this file returns a integer value. A value of '0' indicates + that the fault setup with a previous write to this file was injected. + A positive integer N indicates that the fault wasn't yet injected. + Note that this file enables all types of faults (slab, futex, etc). + This setting takes precedence over all other generic debugfs settings + like probability, interval, times, etc. But per-capability settings + (e.g. fail_futex/ignore-private) take precedence over it. + + This feature is intended for systematic testing of faults in a single + system call. See an example below. + How to add new fault injection capability ----------------------------------------- @@ -278,3 +295,65 @@ allocation failure. # env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ -- make -C tools/testing/selftests/ run_tests + +Systematic faults using fail-nth +--------------------------------- + +The following code systematically faults 0-th, 1-st, 2-nd and so on +capabilities in the socketpair() system call. + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/syscall.h> +#include <fcntl.h> +#include <unistd.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> + +int main() +{ + int i, err, res, fail_nth, fds[2]; + char buf[128]; + + system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); + sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); + fail_nth = open(buf, O_RDWR); + for (i = 1;; i++) { + sprintf(buf, "%d", i); + write(fail_nth, buf, strlen(buf)); + res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); + err = errno; + pread(fail_nth, buf, sizeof(buf), 0); + if (res == 0) { + close(fds[0]); + close(fds[1]); + } + printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y', + res, err); + if (atoi(buf)) + break; + } + return 0; +} + +An example output: + +1-th fault Y: res=-1/23 +2-th fault Y: res=-1/23 +3-th fault Y: res=-1/12 +4-th fault Y: res=-1/12 +5-th fault Y: res=-1/23 +6-th fault Y: res=-1/23 +7-th fault Y: res=-1/23 +8-th fault Y: res=-1/12 +9-th fault Y: res=-1/12 +10-th fault Y: res=-1/12 +11-th fault Y: res=-1/12 +12-th fault Y: res=-1/12 +13-th fault Y: res=-1/12 +14-th fault Y: res=-1/12 +15-th fault Y: res=-1/12 +16-th fault N: res=0/12 diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 4cddbce85ac9..adba21b5ada7 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1786,12 +1786,16 @@ pair provide additional information particular to the objects they represent. pos: 0 flags: 02 mnt_id: 9 - tfd: 5 events: 1d data: ffffffffffffffff + tfd: 5 events: 1d data: ffffffffffffffff pos:0 ino:61af sdev:7 where 'tfd' is a target file descriptor number in decimal form, 'events' is events mask being watched and the 'data' is data associated with a target [see epoll(7) for more details]. + The 'pos' is current offset of the target file in decimal form + [see lseek(2)], 'ino' and 'sdev' are inode and device numbers + where target file resides, all in hex format. + Fsnotify files ~~~~~~~~~~~~~~ For inotify files the format is the following diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index 615434d81108..51814450a7f8 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt @@ -112,8 +112,8 @@ There are two possible methods of using Kdump. 2) Or use the system kernel binary itself as dump-capture kernel and there is no need to build a separate dump-capture kernel. This is possible only with the architectures which support a relocatable kernel. As - of today, i386, x86_64, ppc64, ia64 and arm architectures support relocatable - kernel. + of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support + relocatable kernel. Building a relocatable kernel is advantageous from the point of view that one does not have to build a second kernel for capturing the dump. But @@ -339,7 +339,7 @@ For arm: For arm64: - Use vmlinux or Image -If you are using a uncompressed vmlinux image then use following command +If you are using an uncompressed vmlinux image then use following command to load dump-capture kernel. kexec -p <dump-capture-kernel-vmlinux-image> \ @@ -361,6 +361,12 @@ to load dump-capture kernel. --dtb=<dtb-for-dump-capture-kernel> \ --append="root=<root-dev> <arch-specific-options>" +If you are using an uncompressed Image, then use following command +to load dump-capture kernel. + + kexec -p <dump-capture-kernel-Image> \ + --initrd=<initrd-for-dump-capture-kernel> \ + --append="root=<root-dev> <arch-specific-options>" Please note, that --args-linux does not need to be specified for ia64. It is planned to make this a no-op on that architecture, but for now diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 6b0ca7feb135..6686bd267dc9 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -98,6 +98,50 @@ use_zero_pages - specifies whether empty pages (i.e. allocated pages it is only effective for pages merged after the change. Default: 0 (normal KSM behaviour as in earlier releases) +max_page_sharing - Maximum sharing allowed for each KSM page. This + enforces a deduplication limit to avoid the virtual + memory rmap lists to grow too large. The minimum + value is 2 as a newly created KSM page will have at + least two sharers. The rmap walk has O(N) + complexity where N is the number of rmap_items + (i.e. virtual mappings) that are sharing the page, + which is in turn capped by max_page_sharing. So + this effectively spread the the linear O(N) + computational complexity from rmap walk context + over different KSM pages. The ksmd walk over the + stable_node "chains" is also O(N), but N is the + number of stable_node "dups", not the number of + rmap_items, so it has not a significant impact on + ksmd performance. In practice the best stable_node + "dup" candidate will be kept and found at the head + of the "dups" list. The higher this value the + faster KSM will merge the memory (because there + will be fewer stable_node dups queued into the + stable_node chain->hlist to check for pruning) and + the higher the deduplication factor will be, but + the slowest the worst case rmap walk could be for + any given KSM page. Slowing down the rmap_walk + means there will be higher latency for certain + virtual memory operations happening during + swapping, compaction, NUMA balancing and page + migration, in turn decreasing responsiveness for + the caller of those virtual memory operations. The + scheduler latency of other tasks not involved with + the VM operations doing the rmap walk is not + affected by this parameter as the rmap walks are + always schedule friendly themselves. + +stable_node_chains_prune_millisecs - How frequently to walk the whole + list of stable_node "dups" linked in the + stable_node "chains" in order to prune stale + stable_nodes. Smaller milllisecs values will free + up the KSM metadata with lower latency, but they + will make ksmd use more CPU during the scan. This + only applies to the stable_node chains so it's a + noop if not a single KSM page hit the + max_page_sharing yet (there would be no stable_node + chains in such case). + The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: pages_shared - how many shared pages are being used @@ -106,10 +150,29 @@ pages_unshared - how many pages unique but repeatedly checked for merging pages_volatile - how many pages changing too fast to be placed in a tree full_scans - how many times all mergeable areas have been scanned +stable_node_chains - number of stable node chains allocated, this is + effectively the number of KSM pages that hit the + max_page_sharing limit +stable_node_dups - number of stable node dups queued into the + stable_node chains + A high ratio of pages_sharing to pages_shared indicates good sharing, but a high ratio of pages_unshared to pages_sharing indicates wasted effort. pages_volatile embraces several different kinds of activity, but a high proportion there would also indicate poor use of madvise MADV_MERGEABLE. +The maximum possible page_sharing/page_shared ratio is limited by the +max_page_sharing tunable. To increase the ratio max_page_sharing must +be increased accordingly. + +The stable_node_dups/stable_node_chains ratio is also affected by the +max_page_sharing tunable, and an high ratio may indicate fragmentation +in the stable_node dups, which could be solved by introducing +fragmentation algorithms in ksmd which would refile rmap_items from +one stable_node dup to another stable_node dup, in order to freeup +stable_node "dups" with few rmap_items in them, but that may increase +the ksmd CPU usage and possibly slowdown the readonly computations on +the KSM pages of the applications. + Izik Eidus, Hugh Dickins, 17 Nov 2009 diff --git a/MAINTAINERS b/MAINTAINERS index 9aafae1c18c5..75cd0dc8e93f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10383,6 +10383,17 @@ W: http://wireless.kernel.org/en/users/Drivers/p54 S: Obsolete F: drivers/net/wireless/intersil/prism54/ +PROC SYSCTL +M: "Luis R. Rodriguez" <mcgrof@kernel.org> +M: Kees Cook <keescook@chromium.org> +L: linux-kernel@vger.kernel.org +L: linux-fsdevel@vger.kernel.org +S: Maintained +F: fs/proc/proc_sysctl.c +F: include/linux/sysctl.h +F: kernel/sysctl.c +F: tools/testing/selftests/sysctl/ + PS3 NETWORK SUPPORT M: Geoff Levand <geoff@infradead.org> L: netdev@vger.kernel.org diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include <linux/personality.h> /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include <asm/page-nommu.h> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 064dddd37936..e3360a3b016b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -12,7 +12,7 @@ config ARM64 select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index bbc1e35aa601..793bd73b0d07 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -83,4 +83,8 @@ extern void huge_ptep_set_wrprotect(struct mm_struct *mm, extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/stackprotector.h b/arch/arm64/include/asm/stackprotector.h index fe5e287dc56b..b86a0865ddf1 100644 --- a/arch/arm64/include/asm/stackprotector.h +++ b/arch/arm64/include/asm/stackprotector.h @@ -30,6 +30,7 @@ static __always_inline void boot_init_stack_canary(void) /* Try to get a semi random initial value. */ get_random_bytes(&canary, sizeof(canary)); canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; current->stack_canary = canary; __stack_chk_guard = current->stack_canary; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 7514a000e361..0ecc921b5719 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -42,15 +42,13 @@ int pud_huge(pud_t pud) } static int find_num_contig(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, size_t *pgsize) + pte_t *ptep, size_t *pgsize) { pgd_t *pgd = pgd_offset(mm, addr); pud_t *pud; pmd_t *pmd; *pgsize = PAGE_SIZE; - if (!pte_cont(pte)) - return 1; pud = pud_offset(pgd, addr); pmd = pmd_offset(pud, addr); if ((pte_t *)pmd == ptep) { @@ -65,15 +63,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, { size_t pgsize; int i; - int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize); + int ncontig; unsigned long pfn; pgprot_t hugeprot; - if (ncontig == 1) { + if (!pte_cont(pte)) { set_pte_at(mm, addr, ptep, pte); return; } + ncontig = find_num_contig(mm, addr, ptep, &pgsize); pfn = pte_pfn(pte); hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); for (i = 0; i < ncontig; i++) { @@ -132,7 +131,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; @@ -193,21 +193,19 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, if (pte_cont(*ptep)) { int ncontig, i; size_t pgsize; - pte_t *cpte; bool is_dirty = false; - cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); /* save the 1st pte to return */ - pte = ptep_get_and_clear(mm, addr, cpte); + pte = ptep_get_and_clear(mm, addr, ptep); for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { /* * If HW_AFDBM is enabled, then the HW could * turn on the dirty bit for any of the page * in the set, so check them all. */ - ++cpte; - if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) + ++ptep; + if (pte_dirty(ptep_get_and_clear(mm, addr, ptep))) is_dirty = true; } if (is_dirty) @@ -223,8 +221,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - pte_t *cpte; - if (pte_cont(pte)) { int ncontig, i, changed = 0; size_t pgsize = 0; @@ -234,12 +230,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); - cpte = huge_pte_offset(vma->vm_mm, addr); - pfn = pte_pfn(*cpte); - ncontig = find_num_contig(vma->vm_mm, addr, cpte, - *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { - changed |= ptep_set_access_flags(vma, addr, cpte, + pfn = pte_pfn(pte); + ncontig = find_num_contig(vma->vm_mm, addr, ptep, + &pgsize); + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) { + changed |= ptep_set_access_flags(vma, addr, ptep, pfn_pte(pfn, hugeprot), dirty); @@ -256,13 +251,11 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, { if (pte_cont(*ptep)) { int ncontig, i; - pte_t *cpte; size_t pgsize = 0; - cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) - ptep_set_wrprotect(mm, addr, cpte); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) + ptep_set_wrprotect(mm, addr, ptep); } else { ptep_set_wrprotect(mm, addr, ptep); } @@ -273,14 +266,12 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, { if (pte_cont(*ptep)) { int ncontig, i; - pte_t *cpte; size_t pgsize = 0; - cpte = huge_pte_offset(vma->vm_mm, addr); - ncontig = find_num_contig(vma->vm_mm, addr, cpte, - *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) - ptep_clear_flush(vma, addr, cpte); + ncontig = find_num_contig(vma->vm_mm, addr, ptep, + &pgsize); + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) + ptep_clear_flush(vma, addr, ptep); } else { ptep_clear_flush(vma, addr, ptep); } diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 687a358a3733..81f03959a4ab 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -191,14 +191,8 @@ void __init kasan_init(void) if (start >= end) break; - /* - * end + 1 here is intentional. We check several shadow bytes in - * advance to slightly speed up fastpath. In some rare cases - * we could cross boundary of mapped shadow, so we just map - * some more here. - */ vmemmap_populate((unsigned long)kasan_mem_to_shadow(start), - (unsigned long)kasan_mem_to_shadow(end) + 1, + (unsigned long)kasan_mem_to_shadow(end), pfn_to_nid(virt_to_pfn(start))); } diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index cce3bc3603ea..2cf7648787b2 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild @@ -1,7 +1,9 @@ generic-y += clkdev.h +generic-y += device.h generic-y += exec.h generic-y += extable.h +generic-y += fb.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/frv/include/asm/device.h b/arch/frv/include/asm/device.h deleted file mode 100644 index d8f9872b0e2d..000000000000 --- a/arch/frv/include/asm/device.h +++ /dev/null @@ -1,7 +0,0 @@ -/* - * Arch specific extensions to struct device - * - * This file is released under the GPLv2 - */ -#include <asm-generic/device.h> - diff --git a/arch/frv/include/asm/fb.h b/arch/frv/include/asm/fb.h deleted file mode 100644 index c7df38030992..000000000000 --- a/arch/frv/include/asm/fb.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_FB_H_ -#define _ASM_FB_H_ -#include <linux/fb.h> - -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H_ */ diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 599507bcec91..c14815dca747 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void) #endif } -phys_addr_t paddr_vmcoreinfo_note(void) -{ - return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); -} - diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 85de86d36fdf..ae35140332f7 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -44,7 +44,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) } pte_t * -huge_pte_offset (struct mm_struct *mm, unsigned long addr) +huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; @@ -92,7 +92,7 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int writ if (REGION_NUMBER(addr) != RGN_HPAGE) return ERR_PTR(-EINVAL); - ptep = huge_pte_offset(mm, addr); + ptep = huge_pte_offset(mm, addr, HPAGE_SIZE); if (!ptep || pte_none(*ptep)) return NULL; page = pte_page(*ptep); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 8f3efa682ee8..a4e8d6bd9cfa 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -646,20 +646,13 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { - pg_data_t *pgdat; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - - zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages); - + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index db1b7da91e4f..67fd53e2935a 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -74,7 +74,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 74aa6f62468f..cef152234312 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -36,7 +36,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, + unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index ed810e7206e8..db5b57829a81 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -1,8 +1,10 @@ generic-y += barrier.h generic-y += clkdev.h +generic-y += device.h generic-y += exec.h generic-y += extable.h +generic-y += fb.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/mn10300/include/asm/device.h b/arch/mn10300/include/asm/device.h deleted file mode 100644 index f0a4c256403b..000000000000 --- a/arch/mn10300/include/asm/device.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/device.h> diff --git a/arch/mn10300/include/asm/fb.h b/arch/mn10300/include/asm/fb.h deleted file mode 100644 index 697b24a91e1a..000000000000 --- a/arch/mn10300/include/asm/fb.h +++ /dev/null @@ -1,23 +0,0 @@ -/* MN10300 Frame buffer stuff - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#ifndef _ASM_FB_H -#define _ASM_FB_H - -#include <linux/fb.h> - -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H */ diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index aa50ac090e9b..5eb8f633b282 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -69,7 +69,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 6666cd366596..5c28bd6f2ae1 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -50,4 +50,14 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, else return entry; } + +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) +{ + if (radix_enabled()) + return true; + return false; +} +#endif + #endif diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 12837d52e84a..436aedf195ab 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -940,8 +940,7 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); phdr->p_offset = phdr->p_paddr; - phdr->p_memsz = vmcoreinfo_max_size; - phdr->p_filesz = vmcoreinfo_max_size; + phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE; /* Increment number of program headers. */ (elf->e_phnum)++; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a4f33de4008e..c41dc44472c5 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -17,6 +17,8 @@ #include <linux/memblock.h> #include <linux/bootmem.h> #include <linux/moduleparam.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -55,7 +57,7 @@ static unsigned nr_gpages; #define hugepd_none(hpd) (hpd_val(hpd) == 0) -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { /* Only called for hugetlbfs pages, hence can ignore THP */ return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); @@ -617,62 +619,39 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } -/* - * We are holding mmap_sem, so a parallel huge page collapse cannot run. - * To prevent hugepage split, disable irq. - */ -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) +struct page *follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, + int flags, int pdshift) { - bool is_thp; - pte_t *ptep, pte; - unsigned shift; - unsigned long mask, flags; - struct page *page = ERR_PTR(-EINVAL); - - local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift); - if (!ptep) - goto no_page; - pte = READ_ONCE(*ptep); - /* - * Verify it is a huge page else bail. - * Transparent hugepages are handled by generic code. We can skip them - * here. - */ - if (!shift || is_thp) - goto no_page; - - if (!pte_present(pte)) { - page = NULL; - goto no_page; + pte_t *ptep; + spinlock_t *ptl; + struct page *page = NULL; + unsigned long mask; + int shift = hugepd_shift(hpd); + struct mm_struct *mm = vma->vm_mm; + +retry: + ptl = &mm->page_table_lock; + spin_lock(ptl); + + ptep = hugepte_offset(hpd, address, pdshift); + if (pte_present(*ptep)) { + mask = (1UL << shift) - 1; + page = pte_page(*ptep); + page += ((address & mask) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { + if (is_hugetlb_entry_migration(*ptep)) { + spin_unlock(ptl); + __migration_entry_wait(mm, ptep, ptl); + goto retry; + } } - mask = (1UL << shift) - 1; - page = pte_page(pte); - if (page) - page += (address & mask) / PAGE_SIZE; - -no_page: - local_irq_restore(flags); + spin_unlock(ptl); return page; } -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - BUG(); - return NULL; -} - -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - BUG(); - return NULL; -} - static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { @@ -763,8 +742,11 @@ static int __init add_huge_page_size(unsigned long long size) * Hash: 16M and 16G */ if (radix_enabled()) { - if (mmu_psize != MMU_PAGE_2M) - return -EINVAL; + if (mmu_psize != MMU_PAGE_2M) { + if (cpu_has_feature(CPU_FTR_POWER9_DD1) || + (mmu_psize != MMU_PAGE_1G)) + return -EINVAL; + } } else { if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) return -EINVAL; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9ee536ec0739..de5a90e1ceaa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -126,18 +126,14 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { - struct pglist_data *pgdata; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int rc; resize_hpt_for_hotplug(memblock_phys_mem_size()); - pgdata = NODE_DATA(nid); - start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size); if (rc) { @@ -147,11 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0, for_device); - - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 684e886eaae4..2f629e0551e9 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -344,12 +344,18 @@ config PPC_STD_MMU_64 config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this is only implemented by IBM Power9 CPUs, if you don't have one of them you can probably disable this. +config ARCH_ENABLE_HUGEPAGE_MIGRATION + def_bool y + depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION + + config PPC_MMU_NOHASH def_bool y depends on !PPC_STD_MMU diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 0521934c1e25..f602c25f44c2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -67,7 +67,7 @@ config S390 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index cd546a245c68..d95869ce3ca2 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -39,7 +39,7 @@ static inline int prepare_hugepage_range(struct file *file, #define arch_clear_hugepage_flags(page) do { } while (0) static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) pte_val(*ptep) = _REGION3_ENTRY_EMPTY; @@ -112,4 +112,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 49a6bd45957b..3d0b14afa232 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -246,6 +246,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); } void machine_shutdown(void) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3ae756c0db3d..3d1d808ea8a9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -496,11 +496,6 @@ static void __init setup_memory_end(void) pr_notice("The maximum memory size is %luMB\n", memory_end >> 20); } -static void __init setup_vmcoreinfo(void) -{ - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); -} - #ifdef CONFIG_CRASH_DUMP /* @@ -939,7 +934,6 @@ void __init setup_arch(char **cmdline_p) #endif setup_resources(); - setup_vmcoreinfo(); setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d3a5e39756f6..44a8e6f0391e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -180,7 +180,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return (pte_t *) pmdp; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgdp; p4d_t *p4dp; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ee6a1d3d4983..f4fb5d191562 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -162,43 +162,17 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { - unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *zone; - int rc, i; + int rc; rc = vmem_add_mapping(start, size); if (rc) return rc; - for (i = 0; i < MAX_NR_ZONES; i++) { - zone = pgdat->node_zones + i; - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits, if possible */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; - } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; - } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? - zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages); - if (rc) - break; - start_pfn += nr_pages; - size_pages -= nr_pages; - if (!size_pages) - break; - } + rc = __add_pages(nid, start_pfn, size_pages, want_memblock); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/include/asm/stackprotector.h b/arch/sh/include/asm/stackprotector.h index d9df3a76847c..141515a43b78 100644 --- a/arch/sh/include/asm/stackprotector.h +++ b/arch/sh/include/asm/stackprotector.h @@ -19,6 +19,7 @@ static __always_inline void boot_init_stack_canary(void) /* Try to get a semi random initial value. */ get_random_bytes(&canary, sizeof(canary)); canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; current->stack_canary = canary; __stack_chk_guard = current->stack_canary; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index cc948db74878..d2412d2d6462 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -42,7 +42,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 75491862d900..bf726af5f1a5 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,20 +485,14 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { - pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, - for_device), - start_pfn, nr_pages); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 7c29d38e6b99..8989c5e155b3 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -277,7 +277,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index cb10153b5c9f..1f0993945521 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -102,7 +102,8 @@ static pte_t *get_pte(pte_t *base, int index, int level) return ptep; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 492a7361e58e..ec5576fd3a86 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -503,6 +503,17 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, } EXPORT_SYMBOL(ioremap_prot); +#if !defined(CONFIG_PCI) || !defined(CONFIG_TILEGX) +/* ioremap is conditionally declared in pci_gx.c */ + +void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) +{ + return NULL; +} +EXPORT_SYMBOL(ioremap); + +#endif + /* Unmap an MMIO VA mapping. */ void iounmap(volatile void __iomem *addr_in) { diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4616fd45811f..3086c2ae1cb3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY @@ -72,6 +72,7 @@ config X86 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 3a106165e03a..535af0f2d8ac 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -85,4 +85,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif + #endif /* _ASM_X86_HUGETLB_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index dcbd9bcce714..8abedf1d650e 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -74,6 +74,7 @@ static __always_inline void boot_init_stack_canary(void) get_random_bytes(&canary, sizeof(canary)); tsc = rdtsc(); canary += tsc + (tsc << 32UL); + canary &= CANARY_MASK; current->stack_canary = canary; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 22217ece26c8..44404e2307bb 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -457,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, bufp += sizeof(Elf64_Phdr); phdr->p_type = PT_NOTE; phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; (ehdr->e_phnum)++; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6f5ca4ebe6e5..794e7a9a98a4 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -356,6 +356,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 302f43fd9c28..ccf509063dfd 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -33,7 +33,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) if (!vma || !is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); /* hugetlb should be locked, and hence, prefaulted */ WARN_ON(!pte || pte_none(*pte)); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 99fb83819a5f..8a64a6f2848d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,15 +823,12 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { - struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 95651dc58e09..73329300192b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -682,22 +682,15 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. - */ -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0c7d8129bed6..39231a9c981a 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -23,12 +23,7 @@ static int __init map_range(struct range *range) start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); - /* - * end + 1 here is intentional. We check several shadow bytes in advance - * to slightly speed up fastpath. In some rare cases we could cross - * boundary of mapped shadow, so we just map some more here. - */ - return vmemmap_populate(start, end + 1, NUMA_NO_NODE); + return vmemmap_populate(start, end, NUMA_NO_NODE); } static void __init clear_pgds(unsigned long start, diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 21beb37114b7..6603394268c1 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2693,8 +2693,8 @@ EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) - return virt_to_machine(&vmcoreinfo_note).maddr; + return virt_to_machine(vmcoreinfo_note).maddr; else - return __pa_symbol(&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } #endif /* CONFIG_KEXEC_CORE */ diff --git a/block/genhd.c b/block/genhd.c index d252d29fe837..9355ac27d5c4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -890,7 +890,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index cc4f1d0cbffe..c7c4e0325cdb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -128,6 +128,9 @@ static ssize_t show_mem_removable(struct device *dev, int ret = 1; struct memory_block *mem = to_memory_block(dev); + if (mem->state != MEM_ONLINE) + goto out; + for (i = 0; i < sections_per_block; i++) { if (!present_section_nr(mem->start_section_nr + i)) continue; @@ -135,6 +138,7 @@ static ssize_t show_mem_removable(struct device *dev, ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); } +out: return sprintf(buf, "%d\n", ret); } @@ -388,39 +392,43 @@ static ssize_t show_valid_zones(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long start_pfn, end_pfn; - unsigned long valid_start, valid_end, valid_pages; + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - struct zone *zone; - int zone_shift = 0; + unsigned long valid_start_pfn, valid_end_pfn; + bool append = false; + int nid; - start_pfn = section_nr_to_pfn(mem->start_section_nr); - end_pfn = start_pfn + nr_pages; - - /* The block contains more than one zone can not be offlined. */ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) + /* + * The block contains more than one zone can not be offlined. + * This can happen e.g. for ZONE_DMA and ZONE_DMA32 + */ + if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn)) return sprintf(buf, "none\n"); - zone = page_zone(pfn_to_page(valid_start)); - valid_pages = valid_end - valid_start; + start_pfn = valid_start_pfn; + nr_pages = valid_end_pfn - start_pfn; - /* MMOP_ONLINE_KEEP */ - sprintf(buf, "%s", zone->name); - - /* MMOP_ONLINE_KERNEL */ - zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + /* + * Check the existing zone. Make sure that we do that only on the + * online nodes otherwise the page_zone is not reliable + */ + if (mem->state == MEM_ONLINE) { + strcat(buf, page_zone(pfn_to_page(start_pfn))->name); + goto out; } - /* MMOP_ONLINE_MOVABLE */ - zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + nid = pfn_to_nid(start_pfn); + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { + strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name); + append = true; } + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { + if (append) + strcat(buf, " "); + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); + } +out: strcat(buf, "\n"); return strlen(buf); @@ -685,14 +693,6 @@ static int add_memory_block(int base_section_nr) return 0; } -static bool is_zone_device_section(struct mem_section *ms) -{ - struct page *page; - - page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms)); - return is_zone_device_page(page); -} - /* * need an interface for the VM to add new memory regions, * but without onlining it. @@ -702,9 +702,6 @@ int register_new_memory(int nid, struct mem_section *section) int ret = 0; struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); mem = find_memory_block(section); @@ -741,11 +738,16 @@ static int remove_memory_section(unsigned long node_id, { struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); + + /* + * Some users of the memory hotplug do not want/need memblock to + * track all sections. Skip over those. + */ mem = find_memory_block(section); + if (!mem) + goto out_unlock; + unregister_mem_sect_under_nodes(mem, __section_nr(section)); mem->section_count--; @@ -754,6 +756,7 @@ static int remove_memory_section(unsigned long node_id, else put_device(&mem->dev); +out_unlock: mutex_unlock(&mem_sysfs_mutex); return 0; } @@ -820,6 +823,10 @@ int __init memory_dev_init(void) */ mutex_lock(&mem_sysfs_mutex); for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) { + /* Don't iterate over sections we know are !present: */ + if (i > __highest_present_section_nr) + break; + err = add_memory_block(i); if (!ret) ret = err; diff --git a/drivers/base/node.c b/drivers/base/node.c index 0440d95c9b5b..73d39bc58c42 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -129,11 +129,11 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE) + - sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), - nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) + + node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * @@ -141,7 +141,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)); #else - nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); + nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE))); #endif n += hugetlb_report_node_meminfo(nid, buf + n); return n; @@ -368,21 +368,14 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -#define page_initialized(page) (page->lru.next) - static int __ref get_nid_for_pfn(unsigned long pfn) { - struct page *page; - if (!pfn_valid_within(pfn)) return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); #endif - page = pfn_to_page(pfn); - if (!page_initialized(page)) - return -1; return pfn_to_nid(pfn); } @@ -468,10 +461,9 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, return 0; } -static int link_mem_sections(int nid) +int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages) { - unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn; - unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages; + unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn; struct memory_block *mem_blk = NULL; int err = 0; @@ -559,10 +551,7 @@ static int node_memory_callback(struct notifier_block *self, return NOTIFY_OK; } #endif /* CONFIG_HUGETLBFS */ -#else /* !CONFIG_MEMORY_HOTPLUG_SPARSE */ - -static int link_mem_sections(int nid) { return 0; } -#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ +#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \ !defined(CONFIG_HUGETLBFS) @@ -576,39 +565,32 @@ static void init_node_hugetlb_work(int nid) { } #endif -int register_one_node(int nid) +int __register_one_node(int nid) { - int error = 0; + int p_node = parent_node(nid); + struct node *parent = NULL; + int error; int cpu; - if (node_online(nid)) { - int p_node = parent_node(nid); - struct node *parent = NULL; - - if (p_node != nid) - parent = node_devices[p_node]; - - node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL); - if (!node_devices[nid]) - return -ENOMEM; - - error = register_node(node_devices[nid], nid, parent); + if (p_node != nid) + parent = node_devices[p_node]; - /* link cpu under this node */ - for_each_present_cpu(cpu) { - if (cpu_to_node(cpu) == nid) - register_cpu_under_node(cpu, nid); - } + node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL); + if (!node_devices[nid]) + return -ENOMEM; - /* link memory sections under this node */ - error = link_mem_sections(nid); + error = register_node(node_devices[nid], nid, parent); - /* initialize work queue for memory hot plug */ - init_node_hugetlb_work(nid); + /* link cpu under this node */ + for_each_present_cpu(cpu) { + if (cpu_to_node(cpu) == nid) + register_cpu_under_node(cpu, nid); } - return error; + /* initialize work queue for memory hot plug */ + init_node_hugetlb_work(nid); + return error; } void unregister_one_node(int nid) @@ -657,9 +639,7 @@ static struct node_attr node_state_attr[] = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), #endif -#ifdef CONFIG_MOVABLE_NODE [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY), -#endif [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), }; @@ -670,9 +650,7 @@ static struct attribute *node_state_attrs[] = { #ifdef CONFIG_HIGHMEM &node_state_attr[N_HIGH_MEMORY].attr.attr, #endif -#ifdef CONFIG_MOVABLE_NODE &node_state_attr[N_MEMORY].attr.attr, -#endif &node_state_attr[N_CPU].attr.attr, NULL }; diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index b8ecba6dcd3b..2f3dd1f79ee1 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -13,3 +13,17 @@ config ZRAM disks and maybe many more. See zram.txt for more information. + +config ZRAM_DEDUP + bool "Deduplication support for ZRAM data" + depends on ZRAM + default n + help + Deduplicate ZRAM data to reduce amount of memory consumption. + Advantage largely depends on the workload. In some cases, this + option reduces memory usage to the half. However, if there is no + duplicated data, the amount of memory consumption would be + increased due to additional metadata usage. And, there is + computation time trade-off. Please check the benefit before + enabling this option. Experiment shows the positive effect when + the zram is used as blockdev and is used to store build output. diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index 9e2b79e9a990..d7204ef6ee53 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,3 +1,4 @@ -zram-y := zcomp.o zram_drv.o +zram-y := zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_DEDUP) += zram_dedup.o obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c new file mode 100644 index 000000000000..c15848cc1b31 --- /dev/null +++ b/drivers/block/zram/zram_dedup.c @@ -0,0 +1,250 @@ +/* + * Copyright (C) 2017 Joonsoo Kim. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/vmalloc.h> +#include <linux/jhash.h> +#include <linux/highmem.h> + +#include "zram_drv.h" + +/* One slot will contain 128 pages theoretically */ +#define ZRAM_HASH_SHIFT 7 +#define ZRAM_HASH_SIZE_MIN (1 << 10) +#define ZRAM_HASH_SIZE_MAX (1 << 31) + +u64 zram_dedup_dup_size(struct zram *zram) +{ + return (u64)atomic64_read(&zram->stats.dup_data_size); +} + +u64 zram_dedup_meta_size(struct zram *zram) +{ + return (u64)atomic64_read(&zram->stats.meta_data_size); +} + +static u32 zram_dedup_checksum(unsigned char *mem) +{ + return jhash(mem, PAGE_SIZE, 0); +} + +void zram_dedup_insert(struct zram *zram, struct zram_entry *new, + u32 checksum) +{ + struct zram_hash *hash; + struct rb_root *rb_root; + struct rb_node **rb_node, *parent = NULL; + struct zram_entry *entry; + + if (!zram_dedup_enabled(zram)) + return; + + new->checksum = checksum; + hash = &zram->hash[checksum % zram->hash_size]; + rb_root = &hash->rb_root; + + spin_lock(&hash->lock); + rb_node = &rb_root->rb_node; + while (*rb_node) { + parent = *rb_node; + entry = rb_entry(parent, struct zram_entry, rb_node); + if (checksum < entry->checksum) + rb_node = &parent->rb_left; + else if (checksum > entry->checksum) + rb_node = &parent->rb_right; + else + rb_node = &parent->rb_left; + } + + rb_link_node(&new->rb_node, parent, rb_node); + rb_insert_color(&new->rb_node, rb_root); + spin_unlock(&hash->lock); +} + +static bool zram_dedup_match(struct zram *zram, struct zram_entry *entry, + unsigned char *mem) +{ + bool match = false; + unsigned char *cmem; + struct zcomp_strm *zstrm; + + cmem = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO); + if (entry->len == PAGE_SIZE) { + match = !memcmp(mem, cmem, PAGE_SIZE); + } else { + zstrm = zcomp_stream_get(zram->comp); + if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer)) + match = !memcmp(mem, zstrm->buffer, PAGE_SIZE); + zcomp_stream_put(zram->comp); + } + zs_unmap_object(zram->mem_pool, entry->handle); + + return match; +} + +static unsigned long zram_dedup_put(struct zram *zram, + struct zram_entry *entry) +{ + struct zram_hash *hash; + u32 checksum; + + checksum = entry->checksum; + hash = &zram->hash[checksum % zram->hash_size]; + + spin_lock(&hash->lock); + + entry->refcount--; + if (!entry->refcount) + rb_erase(&entry->rb_node, &hash->rb_root); + spin_unlock(&hash->lock); + + return entry->refcount; +} + +static struct zram_entry *__zram_dedup_get(struct zram *zram, + struct zram_hash *hash, unsigned char *mem, + struct zram_entry *entry) +{ + struct zram_entry *tmp, *prev = NULL; + struct rb_node *rb_node; + + /* find left-most entry with same checksum */ + while ((rb_node = rb_prev(&entry->rb_node))) { + tmp = rb_entry(rb_node, struct zram_entry, rb_node); + if (tmp->checksum != entry->checksum) + break; + + entry = tmp; + } + +again: + entry->refcount++; + spin_unlock(&hash->lock); + + if (prev) + zram_entry_free(zram, prev); + + if (zram_dedup_match(zram, entry, mem)) + return entry; + + spin_lock(&hash->lock); + tmp = NULL; + rb_node = rb_next(&entry->rb_node); + if (rb_node) + tmp = rb_entry(rb_node, struct zram_entry, rb_node); + + if (tmp && (tmp->checksum == entry->checksum)) { + prev = entry; + entry = tmp; + goto again; + } + + spin_unlock(&hash->lock); + zram_entry_free(zram, entry); + + return NULL; +} + +static struct zram_entry *zram_dedup_get(struct zram *zram, + unsigned char *mem, u32 checksum) +{ + struct zram_hash *hash; + struct zram_entry *entry; + struct rb_node *rb_node; + + hash = &zram->hash[checksum % zram->hash_size]; + + spin_lock(&hash->lock); + rb_node = hash->rb_root.rb_node; + while (rb_node) { + entry = rb_entry(rb_node, struct zram_entry, rb_node); + if (checksum == entry->checksum) + return __zram_dedup_get(zram, hash, mem, entry); + + if (checksum < entry->checksum) + rb_node = rb_node->rb_left; + else + rb_node = rb_node->rb_right; + } + spin_unlock(&hash->lock); + + return NULL; +} + +struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page, + u32 *checksum) +{ + void *mem; + struct zram_entry *entry; + + if (!zram_dedup_enabled(zram)) + return NULL; + + mem = kmap_atomic(page); + *checksum = zram_dedup_checksum(mem); + + entry = zram_dedup_get(zram, mem, *checksum); + kunmap_atomic(mem); + + return entry; +} + +void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry, + unsigned long handle, unsigned int len) +{ + if (!zram_dedup_enabled(zram)) + return; + + entry->handle = handle; + entry->refcount = 1; + entry->len = len; +} + +bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry) +{ + if (!zram_dedup_enabled(zram)) + return true; + + if (zram_dedup_put(zram, entry)) + return false; + + return true; +} + +int zram_dedup_init(struct zram *zram, size_t num_pages) +{ + int i; + struct zram_hash *hash; + + if (!zram_dedup_enabled(zram)) + return 0; + + zram->hash_size = num_pages >> ZRAM_HASH_SHIFT; + zram->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, zram->hash_size); + zram->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, zram->hash_size); + zram->hash = vzalloc(zram->hash_size * sizeof(struct zram_hash)); + if (!zram->hash) { + pr_err("Error allocating zram entry hash\n"); + return -ENOMEM; + } + + for (i = 0; i < zram->hash_size; i++) { + hash = &zram->hash[i]; + spin_lock_init(&hash->lock); + hash->rb_root = RB_ROOT; + } + + return 0; +} + +void zram_dedup_fini(struct zram *zram) +{ + vfree(zram->hash); + zram->hash = NULL; + zram->hash_size = 0; +} diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h new file mode 100644 index 000000000000..8ab267b0b956 --- /dev/null +++ b/drivers/block/zram/zram_dedup.h @@ -0,0 +1,45 @@ +#ifndef _ZRAM_DEDUP_H_ +#define _ZRAM_DEDUP_H_ + +struct zram; +struct zram_entry; + +#ifdef CONFIG_ZRAM_DEDUP + +u64 zram_dedup_dup_size(struct zram *zram); +u64 zram_dedup_meta_size(struct zram *zram); + +void zram_dedup_insert(struct zram *zram, struct zram_entry *new, + u32 checksum); +struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page, + u32 *checksum); + +void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry, + unsigned long handle, unsigned int len); +bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry); + +int zram_dedup_init(struct zram *zram, size_t num_pages); +void zram_dedup_fini(struct zram *zram); +#else + +static inline u64 zram_dedup_dup_size(struct zram *zram) { return 0; } +static inline u64 zram_dedup_meta_size(struct zram *zram) { return 0; } + +static inline void zram_dedup_insert(struct zram *zram, struct zram_entry *new, + u32 checksum) { } +static inline struct zram_entry *zram_dedup_find(struct zram *zram, + struct page *page, u32 *checksum) { return NULL; } + +static inline void zram_dedup_init_entry(struct zram *zram, + struct zram_entry *entry, unsigned long handle, + unsigned int len) { } +static inline bool zram_dedup_put_entry(struct zram *zram, + struct zram_entry *entry) { return true; } + +static inline int zram_dedup_init(struct zram *zram, + size_t num_pages) { return 0; } +static inline void zram_dedup_fini(struct zram *zram) { } + +#endif + +#endif /* _ZRAM_DEDUP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index debee952dcc1..5440d1a8b882 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -57,14 +57,15 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static unsigned long zram_get_handle(struct zram *zram, u32 index) +static struct zram_entry *zram_get_entry(struct zram *zram, u32 index) { - return zram->table[index].handle; + return zram->table[index].entry; } -static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +static void zram_set_entry(struct zram *zram, u32 index, + struct zram_entry *entry) { - zram->table[index].handle = handle; + zram->table[index].entry = entry; } /* flag operations require table entry bit_spin_lock() being held */ @@ -332,6 +333,41 @@ static ssize_t comp_algorithm_store(struct device *dev, return len; } +static ssize_t use_dedup_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + bool val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->use_dedup; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (int)val); +} + +#ifdef CONFIG_ZRAM_DEDUP +static ssize_t use_dedup_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + if (kstrtoint(buf, 10, &val) || (val != 0 && val != 1)) + return -EINVAL; + + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change dedup usage for initialized device\n"); + return -EBUSY; + } + zram->use_dedup = val; + up_write(&zram->init_lock); + return len; +} +#endif + static ssize_t compact_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -388,14 +424,16 @@ static ssize_t mm_stat_show(struct device *dev, max_used = atomic_long_read(&zram->stats.max_used_pages); ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n", orig_size << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.compr_data_size), mem_used << PAGE_SHIFT, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.same_pages), - pool_stats.pages_compacted); + pool_stats.pages_compacted, + zram_dedup_dup_size(zram), + zram_dedup_meta_size(zram)); up_read(&zram->init_lock); return ret; @@ -422,6 +460,15 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); +static unsigned long zram_entry_handle(struct zram *zram, + struct zram_entry *entry) +{ + if (zram_dedup_enabled(zram)) + return entry->handle; + else + return (unsigned long)entry; +} + static void zram_slot_lock(struct zram *zram, u32 index) { bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); @@ -437,7 +484,7 @@ static bool zram_same_page_read(struct zram *zram, u32 index, unsigned int offset, unsigned int len) { zram_slot_lock(zram, index); - if (unlikely(!zram_get_handle(zram, index) || + if (unlikely(!zram_get_entry(zram, index) || zram_test_flag(zram, index, ZRAM_SAME))) { void *mem; @@ -453,27 +500,45 @@ static bool zram_same_page_read(struct zram *zram, u32 index, return false; } -static bool zram_same_page_write(struct zram *zram, u32 index, - struct page *page) +static struct zram_entry *zram_entry_alloc(struct zram *zram, + unsigned int len, gfp_t flags) { - unsigned long element; - void *mem = kmap_atomic(page); + struct zram_entry *entry; + unsigned long handle; - if (page_same_filled(mem, &element)) { - kunmap_atomic(mem); - /* Free memory associated with this sector now. */ - zram_slot_lock(zram, index); - zram_free_page(zram, index); - zram_set_flag(zram, index, ZRAM_SAME); - zram_set_element(zram, index, element); - zram_slot_unlock(zram, index); + handle = zs_malloc(zram->mem_pool, len, flags); + if (!handle) + return NULL; - atomic64_inc(&zram->stats.same_pages); - return true; + if (!zram_dedup_enabled(zram)) + return (struct zram_entry *)handle; + + entry = kzalloc(sizeof(*entry), + flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); + if (!entry) { + zs_free(zram->mem_pool, handle); + return NULL; } - kunmap_atomic(mem); - return false; + zram_dedup_init_entry(zram, entry, handle, len); + atomic64_add(sizeof(*entry), &zram->stats.meta_data_size); + + return entry; +} + +void zram_entry_free(struct zram *zram, struct zram_entry *entry) +{ + if (!zram_dedup_put_entry(zram, entry)) + return; + + zs_free(zram->mem_pool, zram_entry_handle(zram, entry)); + + if (!zram_dedup_enabled(zram)) + return; + + kfree(entry); + + atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size); } static void zram_meta_free(struct zram *zram, u64 disksize) @@ -486,6 +551,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize) zram_free_page(zram, index); zs_destroy_pool(zram->mem_pool); + zram_dedup_fini(zram); vfree(zram->table); } @@ -504,6 +570,12 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) return false; } + if (zram_dedup_init(zram, num_pages)) { + vfree(zram->table); + zs_destroy_pool(zram->mem_pool); + return false; + } + return true; } @@ -514,7 +586,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) */ static void zram_free_page(struct zram *zram, size_t index) { - unsigned long handle = zram_get_handle(zram, index); + struct zram_entry *entry = zram_get_entry(zram, index); /* * No memory is allocated for same element filled pages. @@ -524,26 +596,34 @@ static void zram_free_page(struct zram *zram, size_t index) zram_clear_flag(zram, index, ZRAM_SAME); zram_set_element(zram, index, 0); atomic64_dec(&zram->stats.same_pages); + atomic64_dec(&zram->stats.pages_stored); return; } - if (!handle) - return; + if (zram_dedup_enabled(zram) && + zram_test_flag(zram, index, ZRAM_DUP)) { + zram_clear_flag(zram, index, ZRAM_DUP); + atomic64_sub(entry->len, &zram->stats.dup_data_size); + goto out; + } - zs_free(zram->mem_pool, handle); + if (!entry) + return; atomic64_sub(zram_get_obj_size(zram, index), &zram->stats.compr_data_size); - atomic64_dec(&zram->stats.pages_stored); +out: + zram_entry_free(zram, entry); - zram_set_handle(zram, index, 0); + atomic64_dec(&zram->stats.pages_stored); + zram_set_entry(zram, index, NULL); zram_set_obj_size(zram, index, 0); } static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) { int ret; - unsigned long handle; + struct zram_entry *entry; unsigned int size; void *src, *dst; @@ -551,10 +631,11 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) return 0; zram_slot_lock(zram, index); - handle = zram_get_handle(zram, index); + entry = zram_get_entry(zram, index); size = zram_get_obj_size(zram, index); - src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + src = zs_map_object(zram->mem_pool, + zram_entry_handle(zram, entry), ZS_MM_RO); if (size == PAGE_SIZE) { dst = kmap_atomic(page); memcpy(dst, src, PAGE_SIZE); @@ -568,7 +649,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) kunmap_atomic(dst); zcomp_stream_put(zram->comp); } - zs_unmap_object(zram->mem_pool, handle); + zs_unmap_object(zram->mem_pool, zram_entry_handle(zram, entry)); zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. */ @@ -612,14 +693,14 @@ out: } static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, - struct page *page, - unsigned long *out_handle, unsigned int *out_comp_len) + struct page *page, struct zram_entry **out_entry, + unsigned int *out_comp_len) { int ret; unsigned int comp_len; void *src; unsigned long alloced_pages; - unsigned long handle = 0; + struct zram_entry *entry = NULL; compress_again: src = kmap_atomic(page); @@ -628,8 +709,8 @@ compress_again: if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); - if (handle) - zs_free(zram->mem_pool, handle); + if (entry) + zram_entry_free(zram, entry); return ret; } @@ -637,32 +718,32 @@ compress_again: comp_len = PAGE_SIZE; /* - * handle allocation has 2 paths: + * entry allocation has 2 paths: * a) fast path is executed with preemption disabled (for * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear, * since we can't sleep; * b) slow path enables preemption and attempts to allocate * the page with __GFP_DIRECT_RECLAIM bit set. we have to * put per-cpu compression stream and, thus, to re-do - * the compression once handle is allocated. + * the compression once entry is allocated. * - * if we have a 'non-null' handle here then we are coming - * from the slow path and handle has already been allocated. + * if we have a 'non-null' entry here then we are coming + * from the slow path and entry has already been allocated. */ - if (!handle) - handle = zs_malloc(zram->mem_pool, comp_len, + if (!entry) + entry = zram_entry_alloc(zram, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_MOVABLE); - if (!handle) { + if (!entry) { zcomp_stream_put(zram->comp); atomic64_inc(&zram->stats.writestall); - handle = zs_malloc(zram->mem_pool, comp_len, + entry = zram_entry_alloc(zram, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); *zstrm = zcomp_stream_get(zram->comp); - if (handle) + if (entry) goto compress_again; return -ENOMEM; } @@ -671,11 +752,11 @@ compress_again: update_used_max(zram, alloced_pages); if (zram->limit_pages && alloced_pages > zram->limit_pages) { - zs_free(zram->mem_pool, handle); + zram_entry_free(zram, entry); return -ENOMEM; } - *out_handle = handle; + *out_entry = entry; *out_comp_len = comp_len; return 0; } @@ -683,23 +764,42 @@ compress_again: static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) { int ret; - unsigned long handle; - unsigned int comp_len; - void *src, *dst; + struct zram_entry *uninitialized_var(entry); + unsigned int uninitialized_var(comp_len); + void *src, *dst, *mem; struct zcomp_strm *zstrm; struct page *page = bvec->bv_page; + u32 checksum; + enum zram_pageflags flags = 0; + unsigned long uninitialized_var(element); - if (zram_same_page_write(zram, index, page)) - return 0; + mem = kmap_atomic(page); + if (page_same_filled(mem, &element)) { + kunmap_atomic(mem); + /* Free memory associated with this sector now. */ + flags = ZRAM_SAME; + atomic64_inc(&zram->stats.same_pages); + goto out; + } + kunmap_atomic(mem); + + entry = zram_dedup_find(zram, page, &checksum); + if (entry) { + comp_len = entry->len; + flags = ZRAM_DUP; + atomic64_add(comp_len, &zram->stats.dup_data_size); + goto out; + } zstrm = zcomp_stream_get(zram->comp); - ret = zram_compress(zram, &zstrm, page, &handle, &comp_len); + ret = zram_compress(zram, &zstrm, page, &entry, &comp_len); if (ret) { zcomp_stream_put(zram->comp); return ret; } - dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); + dst = zs_map_object(zram->mem_pool, + zram_entry_handle(zram, entry), ZS_MM_WO); src = zstrm->buffer; if (comp_len == PAGE_SIZE) @@ -709,21 +809,27 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index) kunmap_atomic(src); zcomp_stream_put(zram->comp); - zs_unmap_object(zram->mem_pool, handle); + zs_unmap_object(zram->mem_pool, zram_entry_handle(zram, entry)); + zram_dedup_insert(zram, entry, checksum); +out: + zram_slot_lock(zram, index); /* * Free memory associated with this sector * before overwriting unused sectors. */ - zram_slot_lock(zram, index); zram_free_page(zram, index); - zram_set_handle(zram, index, handle); - zram_set_obj_size(zram, index, comp_len); + if (flags) + zram_set_flag(zram, index, flags); + if (flags != ZRAM_SAME) { + zram_set_obj_size(zram, index, comp_len); + zram_set_entry(zram, index, entry); + } else { + zram_set_element(zram, index, element); + } zram_slot_unlock(zram, index); - - /* Update stats */ - atomic64_add(comp_len, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.pages_stored); + return 0; } @@ -1106,6 +1212,11 @@ static DEVICE_ATTR_WO(mem_limit); static DEVICE_ATTR_WO(mem_used_max); static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); +#ifdef CONFIG_ZRAM_DEDUP +static DEVICE_ATTR_RW(use_dedup); +#else +static DEVICE_ATTR_RO(use_dedup); +#endif static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1116,6 +1227,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_mem_used_max.attr, &dev_attr_max_comp_streams.attr, &dev_attr_comp_algorithm.attr, + &dev_attr_use_dedup.attr, &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, &dev_attr_debug_stat.attr, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e34e44d02e3e..8ccfdcd8f674 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -18,8 +18,10 @@ #include <linux/rwsem.h> #include <linux/zsmalloc.h> #include <linux/crypto.h> +#include <linux/spinlock.h> #include "zcomp.h" +#include "zram_dedup.h" /*-- Configurable parameters */ @@ -62,6 +64,7 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; enum zram_pageflags { /* Page consists entirely of zeros */ ZRAM_SAME = ZRAM_FLAG_SHIFT, + ZRAM_DUP, ZRAM_ACCESS, /* page is now accessed */ __NR_ZRAM_PAGEFLAGS, @@ -69,10 +72,18 @@ enum zram_pageflags { /*-- Data structures */ +struct zram_entry { + struct rb_node rb_node; + u32 len; + u32 checksum; + unsigned long refcount; + unsigned long handle; +}; + /* Allocated for each disk page */ struct zram_table_entry { union { - unsigned long handle; + struct zram_entry *entry; unsigned long element; }; unsigned long value; @@ -90,6 +101,13 @@ struct zram_stats { atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. of write slow paths */ + atomic64_t dup_data_size; /* compressed size of pages duplicated */ + atomic64_t meta_data_size; /* size of zram_entries */ +}; + +struct zram_hash { + spinlock_t lock; + struct rb_root rb_root; }; struct zram { @@ -97,6 +115,8 @@ struct zram { struct zs_pool *mem_pool; struct zcomp *comp; struct gendisk *disk; + struct zram_hash *hash; + size_t hash_size; /* Prevent concurrent execution of device init */ struct rw_semaphore init_lock; /* @@ -115,5 +135,16 @@ struct zram { * zram is claimed so open request will be failed */ bool claim; /* Protected by bdev->bd_mutex */ + bool use_dedup; }; + +static inline bool zram_dedup_enabled(struct zram *zram) +{ +#ifdef CONFIG_ZRAM_DEDUP + return zram->use_dedup; +#else + return false; +#endif +} +void zram_entry_free(struct zram *zram, struct zram_entry *entry); #endif diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c index 35bbe288ddb4..a638c3048207 100644 --- a/drivers/sh/intc/virq.c +++ b/drivers/sh/intc/virq.c @@ -94,10 +94,8 @@ static int add_virq_to_pirq(unsigned int irq, unsigned int virq) } entry = kzalloc(sizeof(struct intc_virq_list), GFP_ATOMIC); - if (!entry) { - pr_err("can't allocate VIRQ mapping for %d\n", virq); + if (!entry) return -ENOMEM; - } entry->irq = virq; diff --git a/fs/Kconfig b/fs/Kconfig index b0e42b6a96b9..7aee6d699fd6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -80,7 +80,6 @@ config EXPORTFS_BLOCK_OPS config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT default y - select PERCPU_RWSEM help This option enables standard file locking support, required for filesystems like NFS and for the flock() system diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 25e312cb6071..9a69392f1fb3 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -419,7 +419,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) if (i_sblock > info->si_blocks || i_eblock > info->si_blocks || i_sblock > i_eblock || - i_eoff > s_size || + (i_eoff != le32_to_cpu(-1) && i_eoff > s_size) || i_sblock * BFS_BSIZE > i_eoff) { printf("Inode 0x%08x corrupted\n", i); diff --git a/fs/buffer.c b/fs/buffer.c index 161be58c5cb0..0de5340c913c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1273,44 +1273,31 @@ static inline void check_irqs_on(void) } /* - * The LRU management algorithm is dopey-but-simple. Sorry. + * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is + * inserted at the front, and the buffer_head at the back if any is evicted. + * Or, if already in the LRU it is moved to the front. */ static void bh_lru_install(struct buffer_head *bh) { - struct buffer_head *evictee = NULL; + struct buffer_head *evictee = bh; + struct bh_lru *b; + int i; check_irqs_on(); bh_lru_lock(); - if (__this_cpu_read(bh_lrus.bhs[0]) != bh) { - struct buffer_head *bhs[BH_LRU_SIZE]; - int in; - int out = 0; - - get_bh(bh); - bhs[out++] = bh; - for (in = 0; in < BH_LRU_SIZE; in++) { - struct buffer_head *bh2 = - __this_cpu_read(bh_lrus.bhs[in]); - if (bh2 == bh) { - __brelse(bh2); - } else { - if (out >= BH_LRU_SIZE) { - BUG_ON(evictee != NULL); - evictee = bh2; - } else { - bhs[out++] = bh2; - } - } + b = this_cpu_ptr(&bh_lrus); + for (i = 0; i < BH_LRU_SIZE; i++) { + swap(evictee, b->bhs[i]); + if (evictee == bh) { + bh_lru_unlock(); + return; } - while (out < BH_LRU_SIZE) - bhs[out++] = NULL; - memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); } - bh_lru_unlock(); - if (evictee) - __brelse(evictee); + get_bh(bh); + bh_lru_unlock(); + brelse(evictee); } /* @@ -1212,7 +1212,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, diff --git a/fs/dcache.c b/fs/dcache.c index cddf39777835..53a2b91b9152 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3548,8 +3548,6 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -3561,24 +3559,19 @@ static void __init dcache_init_early(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - unsigned int loop; - - /* + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature - * of the dcache. + * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); @@ -3592,14 +3585,11 @@ static void __init dcache_init(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - 0, + HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5420767c9b68..1e2cc37cef40 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -960,10 +960,14 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f) mutex_lock(&ep->mtx); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); + struct inode *inode = file_inode(epi->ffd.file); - seq_printf(m, "tfd: %8d events: %8x data: %16llx\n", + seq_printf(m, "tfd: %8d events: %8x data: %16llx " + " pos:%lli ino:%lx sdev:%x\n", epi->ffd.fd, epi->event.events, - (long long)epi->event.data); + (long long)epi->event.data, + (long long)epi->ffd.file->f_pos, + inode->i_ino, inode->i_sb->s_dev); if (seq_has_overflowed(m)) break; } @@ -1073,6 +1077,50 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) return epir; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) +{ + struct rb_node *rbp; + struct epitem *epi; + + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (epi->ffd.fd == tfd) { + if (toff == 0) + return epi; + else + toff--; + } + cond_resched(); + } + + return NULL; +} + +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, + unsigned long toff) +{ + struct file *file_raw; + struct eventpoll *ep; + struct epitem *epi; + + if (!is_file_epoll(file)) + return ERR_PTR(-EINVAL); + + ep = file->private_data; + + mutex_lock(&ep->mtx); + epi = ep_find_tfd(ep, tfd, toff); + if (epi) + file_raw = epi->ffd.file; + else + file_raw = ERR_PTR(-ENOENT); + mutex_unlock(&ep->mtx); + + return file_raw; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they @@ -1748,6 +1796,16 @@ fetch_events: * to TASK_INTERRUPTIBLE before doing the checks. */ set_current_state(TASK_INTERRUPTIBLE); + /* + * Always short-circuit for fatal signals to allow + * threads to make a timely exit without the chance of + * finding more events available and fetching + * repeatedly. + */ + if (fatal_signal_pending(current)) { + res = -EINTR; + break; + } if (ep_events_available(ep) || timed_out) break; if (signal_pending(current)) { diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 8eeb694332fe..98233a97b7b8 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -72,7 +72,7 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) set_page_dirty(page); if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index d9650c9508e4..e2709695b177 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -100,7 +100,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) } if (IS_DIRSYNC(dir)) { - err = write_one_page(page, 1); + err = write_one_page(page); if (!err) err = sync_inode_metadata(dir, 1); } else { diff --git a/fs/file.c b/fs/file.c index 1c2972e3a405..1fc7fbbb4510 100644 --- a/fs/file.c +++ b/fs/file.c @@ -30,21 +30,6 @@ unsigned int sysctl_nr_open_min = BITS_PER_LONG; unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; -static void *alloc_fdmem(size_t size) -{ - /* - * Very large allocations can stress page reclaim, so fall back to - * vmalloc() if the allocation size will be considered "large" by the VM. - */ - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | - __GFP_NOWARN | __GFP_NORETRY); - if (data != NULL) - return data; - } - return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); -} - static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); @@ -131,13 +116,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (!fdt) goto out; fdt->max_fds = nr; - data = alloc_fdmem(nr * sizeof(struct file *)); + data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; - data = alloc_fdmem(max_t(size_t, - 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES)); + data = kvmalloc(max_t(size_t, + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), + GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dde861387a40..33b33ec2a090 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -851,6 +851,16 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } +static int hugetlbfs_error_remove_page(struct address_space *mapping, + struct page *page) +{ + struct inode *inode = mapping->host; + + remove_huge_page(page); + hugetlb_fix_reserve_counts(inode); + return 0; +} + static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); @@ -966,6 +976,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_end = hugetlbfs_write_end, .set_page_dirty = hugetlbfs_set_page_dirty, .migratepage = hugetlbfs_migrate_page, + .error_remove_page = hugetlbfs_error_remove_page, }; diff --git a/fs/inode.c b/fs/inode.c index db5914783a71..a25b9d5fb52e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1914,8 +1914,6 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -1927,20 +1925,15 @@ void __init inode_init_early(void) sizeof(struct hlist_head), ihash_entries, 14, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - unsigned int loop; - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), @@ -1958,14 +1951,11 @@ void __init inode_init(void) sizeof(struct hlist_head), ihash_entries, 14, - 0, + HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 489aaa1403e5..7cc1c85f4508 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -664,6 +664,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, INCREMENT(mpStat.pagealloc); mp = alloc_metapage(GFP_NOFS); mp->page = page; + mp->sb = inode->i_sb; mp->flag = 0; mp->xflag = COMMIT_PAGE; mp->count = 1; @@ -711,7 +712,8 @@ void force_metapage(struct metapage *mp) get_page(page); lock_page(page); set_page_dirty(page); - write_one_page(page, 1); + if (write_one_page(page)) + jfs_error(mp->sb, "write_one_page() failed\n"); clear_bit(META_forcewrite, &mp->flag); put_page(page); } @@ -756,7 +758,8 @@ void release_metapage(struct metapage * mp) set_page_dirty(page); if (test_bit(META_sync, &mp->flag)) { clear_bit(META_sync, &mp->flag); - write_one_page(page, 1); + if (write_one_page(page)) + jfs_error(mp->sb, "write_one_page() failed\n"); lock_page(page); /* write_one_page unlocks the page */ } } else if (mp->lsn) /* discard_metapage doesn't remove it */ diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h index a869fb4a20d6..8b0ee514eb84 100644 --- a/fs/jfs/jfs_metapage.h +++ b/fs/jfs/jfs_metapage.h @@ -38,6 +38,7 @@ struct metapage { /* implementation */ struct page *page; + struct super_block *sb; unsigned int logical_size; /* Journal management */ diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 7edc9b395700..baa9721f1299 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -57,7 +57,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) mark_inode_dirty(dir); } if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); return err; diff --git a/fs/namespace.c b/fs/namespace.c index 5e3dcbeb1de5..3c06e334deb7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3239,7 +3239,6 @@ static void __init init_mount_tree(void) void __init mnt_init(void) { - unsigned u; int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), @@ -3248,22 +3247,17 @@ void __init mnt_init(void) mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, - 0, + HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, - 0, + HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - for (u = 0; u <= m_hash_mask; u++) - INIT_HLIST_HEAD(&mount_hashtable[u]); - for (u = 0; u <= mp_hash_mask; u++) - INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - kernfs_init(); err = sysfs_init(); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 0c3905e0542e..6719c0be674d 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_fault *vmf) * -- nyc */ count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..f0072145eead 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2612,20 +2612,48 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + if (ret == -EEXIST) { + if (oldmle) + __dlm_put_mle(oldmle); + + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + mlog(0, "another process is already migrating it\n"); + goto fail; + } + + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again. Because it has been unhasded from the map + * in the function dlm_add_migration_mle. + * Otherwise the memory will be leaked. It will not be found again from + * the hash map. + */ + if (oldmle) { + /* master is known, detach if not already detached */ + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + + /* + * If the type of the mle is BLOCK, it should be put once for + * release. Otherwise a memory leak may be caused because + * oldmle has been unhashed from the hash map and it will not + * be found any more. + */ + if (oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + /* get an extra reference on the mle. * otherwise the assert_master from the new * master will destroy this. */ dlm_get_mle_inuse(mle); + mle_added = 1; + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); - if (ret == -EEXIST) { - mlog(0, "another process is already migrating it\n"); - goto fail; - } - mle_added = 1; - /* * set the MIGRATING flag and flush asts * if we fail after this we need to re-dirty the lockres @@ -2642,12 +2670,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (ret != -EEXIST && oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (ret < 0) { if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); @@ -3182,16 +3204,24 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, if (ret < 0) kmem_cache_free(dlm_mle_cache, mle); + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again because it has been unhashed from the map + * in the dlm_add_migration_mle(). + * Otherwise the memory will be leaked. It will not be found again from + * the hash map. + */ + if (oldmle) { + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); - if (oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (res) dlm_lockres_put(res); leave: diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..908b05942282 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2268,6 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct dlm_lock *lock, *next; unsigned int freed = 0; + struct list_head *queue = NULL; + int i; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2280,31 +2282,19 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ /* TODO: check pending_asts, pending_basts here */ - list_for_each_entry_safe(lock, next, &res->granted, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->converting, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->blocked, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* + * Can't schedule DLM_UNLOCK_FREE_LOCK: do + * manually + */ + dlm_lock_put(lock); + freed++; + } } } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3b7c937a36b5..a54196ac75ae 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3409,7 +3409,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, * we can recover correctly from node failure. Otherwise, we may get * invalid LVB in LKB, but without DLM_SBF_VALNOTVALIDÂ being set. */ - if (!ocfs2_is_o2cb_active() && + if (ocfs2_userspace_stack(osb) && lockres->l_ops->flags & LOCK_TYPE_USES_LVB) lvb = 1; diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 2cabbcf2f28e..f65f2b2f594d 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -53,36 +53,6 @@ static const char * const ocfs2_filecheck_errs[] = { "UNSUPPORTED" }; -static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); -static LIST_HEAD(ocfs2_filecheck_sysfs_list); - -struct ocfs2_filecheck { - struct list_head fc_head; /* File check entry list head */ - spinlock_t fc_lock; - unsigned int fc_max; /* Maximum number of entry in list */ - unsigned int fc_size; /* Current entry count in list */ - unsigned int fc_done; /* Finished entry count in list */ -}; - -struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ - struct list_head fs_list; - atomic_t fs_count; - struct super_block *fs_sb; - struct kset *fs_devicekset; - struct kset *fs_fcheckkset; - struct ocfs2_filecheck *fs_fcheck; -}; - -#define OCFS2_FILECHECK_MAXSIZE 100 -#define OCFS2_FILECHECK_MINSIZE 10 - -/* File check operation type */ -enum { - OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ - OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ - OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ -}; - struct ocfs2_filecheck_entry { struct list_head fe_list; unsigned long fe_ino; @@ -110,40 +80,84 @@ ocfs2_filecheck_error(int errno) return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf); -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count); -static struct kobj_attribute ocfs2_attr_filecheck_chk = +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_filecheck_attr_chk = __ATTR(check, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_fix = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_fix = __ATTR(fix, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_set = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_set = __ATTR(set, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct attribute *ocfs2_filecheck_attrs[] = { + &ocfs2_filecheck_attr_chk.attr, + &ocfs2_filecheck_attr_fix.attr, + &ocfs2_filecheck_attr_set.attr, + NULL +}; -static int ocfs2_filecheck_sysfs_wait(atomic_t *p) +static void ocfs2_filecheck_release(struct kobject *kobj) { - schedule(); - return 0; + struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); + + complete(&entry->fs_kobj_unregister); } +static ssize_t +ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + kobject_put(kobj); + return ret; +} + +static ssize_t +ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + kobject_put(kobj); + return ret; +} + +static const struct sysfs_ops ocfs2_filecheck_ops = { + .show = ocfs2_filecheck_show, + .store = ocfs2_filecheck_store, +}; + +static struct kobj_type ocfs2_ktype_filecheck = { + .default_attrs = ocfs2_filecheck_attrs, + .sysfs_ops = &ocfs2_filecheck_ops, + .release = ocfs2_filecheck_release, +}; + static void ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; - if (!atomic_dec_and_test(&entry->fs_count)) - wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait, - TASK_UNINTERRUPTIBLE); - spin_lock(&entry->fs_fcheck->fc_lock); while (!list_empty(&entry->fs_fcheck->fc_head)) { p = list_first_entry(&entry->fs_fcheck->fc_head, @@ -154,151 +168,48 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) } spin_unlock(&entry->fs_fcheck->fc_lock); - kset_unregister(entry->fs_fcheckkset); - kset_unregister(entry->fs_devicekset); kfree(entry->fs_fcheck); - kfree(entry); -} - -static void -ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) -{ - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); + entry->fs_fcheck = NULL; } -static int ocfs2_filecheck_sysfs_del(const char *devname) -{ - struct ocfs2_filecheck_sysfs_entry *p; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - list_del(&p->fs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - ocfs2_filecheck_sysfs_free(p); - return 0; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return 1; -} - -static void -ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb) { - if (atomic_dec_and_test(&entry->fs_count)) - wake_up_atomic_t(&entry->fs_count); -} - -static struct ocfs2_filecheck_sysfs_entry * -ocfs2_filecheck_sysfs_get(const char *devname) -{ - struct ocfs2_filecheck_sysfs_entry *p = NULL; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - atomic_inc(&p->fs_count); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return p; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return NULL; -} - -int ocfs2_filecheck_create_sysfs(struct super_block *sb) -{ - int ret = 0; - struct kset *device_kset = NULL; - struct kset *fcheck_kset = NULL; - struct ocfs2_filecheck *fcheck = NULL; - struct ocfs2_filecheck_sysfs_entry *entry = NULL; - struct attribute **attrs = NULL; - struct attribute_group attrgp; - - if (!ocfs2_kset) - return -ENOMEM; - - attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); - if (!attrs) { - ret = -ENOMEM; - goto error; - } else { - attrs[0] = &ocfs2_attr_filecheck_chk.attr; - attrs[1] = &ocfs2_attr_filecheck_fix.attr; - attrs[2] = &ocfs2_attr_filecheck_set.attr; - attrs[3] = NULL; - memset(&attrgp, 0, sizeof(attrgp)); - attrgp.attrs = attrs; - } + int ret; + struct ocfs2_filecheck *fcheck; + struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent; fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); - if (!fcheck) { - ret = -ENOMEM; - goto error; - } else { - INIT_LIST_HEAD(&fcheck->fc_head); - spin_lock_init(&fcheck->fc_lock); - fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; - fcheck->fc_size = 0; - fcheck->fc_done = 0; - } - - if (strlen(sb->s_id) <= 0) { - mlog(ML_ERROR, - "Cannot get device basename when create filecheck sysfs\n"); - ret = -ENODEV; - goto error; - } - - device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); - if (!device_kset) { - ret = -ENOMEM; - goto error; - } - - fcheck_kset = kset_create_and_add("filecheck", NULL, - &device_kset->kobj); - if (!fcheck_kset) { - ret = -ENOMEM; - goto error; - } - - ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp); - if (ret) - goto error; + if (!fcheck) + return -ENOMEM; - entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); - if (!entry) { - ret = -ENOMEM; - goto error; - } else { - atomic_set(&entry->fs_count, 1); - entry->fs_sb = sb; - entry->fs_devicekset = device_kset; - entry->fs_fcheckkset = fcheck_kset; - entry->fs_fcheck = fcheck; - ocfs2_filecheck_sysfs_add(entry); + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + + entry->fs_kobj.kset = osb->osb_dev_kset; + init_completion(&entry->fs_kobj_unregister); + ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck, + NULL, "filecheck"); + if (ret) { + kfree(fcheck); + return ret; } - kfree(attrs); + entry->fs_fcheck = fcheck; return 0; - -error: - kfree(attrs); - kfree(entry); - kfree(fcheck); - kset_unregister(fcheck_kset); - kset_unregister(device_kset); - return ret; } -int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb) { - return ocfs2_filecheck_sysfs_del(sb->s_id); + if (!osb->osb_fc_ent.fs_fcheck) + return; + + kobject_del(&osb->osb_fc_ent.fs_kobj); + kobject_put(&osb->osb_fc_ent.fs_kobj); + wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister); + ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent); } static int @@ -315,7 +226,7 @@ ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, spin_lock(&ent->fs_fcheck->fc_lock); if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { - mlog(ML_ERROR, + mlog(ML_NOTICE, "Cannot set online file check maximum entry number " "to %u due to too many pending entries(%u)\n", len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); @@ -392,7 +303,7 @@ ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, return 0; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -400,19 +311,12 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, ssize_t ret = 0, total = 0, remain = PAGE_SIZE; unsigned int type; struct ocfs2_filecheck_entry *p; - struct ocfs2_filecheck_sysfs_entry *ent; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) return -EINVAL; - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->name); - return -ENODEV; - } - if (type == OCFS2_FILECHECK_TYPE_SET) { spin_lock(&ent->fs_fcheck->fc_lock); total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); @@ -446,11 +350,26 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, spin_unlock(&ent->fs_fcheck->fc_lock); exit: - ocfs2_filecheck_sysfs_put(ent); return total; } -static int +static inline int +ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned long ino) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (!p->fe_done) { + if (p->fe_ino == ino) + return 1; + } + } + + return 0; +} + +static inline int ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) { struct ocfs2_filecheck_entry *p; @@ -489,21 +408,21 @@ static void ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { - entry->fe_done = 1; spin_lock(&ent->fs_fcheck->fc_lock); + entry->fe_done = 1; ent->fs_fcheck->fc_done++; spin_unlock(&ent->fs_fcheck->fc_lock); } static unsigned int -ocfs2_filecheck_handle(struct super_block *sb, +ocfs2_filecheck_handle(struct ocfs2_super *osb, unsigned long ino, unsigned int flags) { unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; struct inode *inode = NULL; int rc; - inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + inode = ocfs2_iget(osb, ino, flags, 0); if (IS_ERR(inode)) { rc = (int)(-(long)inode); if (rc >= OCFS2_FILECHECK_ERR_START && @@ -521,11 +440,14 @@ static void ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { + struct ocfs2_super *osb = container_of(ent, struct ocfs2_super, + osb_fc_ent); + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); else entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; @@ -533,30 +455,21 @@ ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, ocfs2_filecheck_done_entry(ent, entry); } -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + ssize_t ret = 0; struct ocfs2_filecheck_args args; struct ocfs2_filecheck_entry *entry; - struct ocfs2_filecheck_sysfs_entry *ent; - ssize_t ret = 0; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (count == 0) return count; - if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { - mlog(ML_ERROR, "Invalid arguments for online file check\n"); + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) return -EINVAL; - } - - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->parent->name); - return -ENODEV; - } if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); @@ -570,13 +483,16 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, } spin_lock(&ent->fs_fcheck->fc_lock); - if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && - (ent->fs_fcheck->fc_done == 0)) { - mlog(ML_ERROR, + if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) { + ret = -EEXIST; + kfree(entry); + } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done == 0)) { + mlog(ML_NOTICE, "Cannot do more file check " "since file check queue(%u) is full now\n", ent->fs_fcheck->fc_max); - ret = -EBUSY; + ret = -EAGAIN; kfree(entry); } else { if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && @@ -601,6 +517,5 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - ocfs2_filecheck_sysfs_put(ent); return (!ret ? count : ret); } diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h index e5cd002a2c09..6a22ee79e8d0 100644 --- a/fs/ocfs2/filecheck.h +++ b/fs/ocfs2/filecheck.h @@ -43,7 +43,32 @@ enum { #define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED #define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED -int ocfs2_filecheck_create_sysfs(struct super_block *sb); -int ocfs2_filecheck_remove_sysfs(struct super_block *sb); +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */ + struct kobject fs_kobj; + struct completion fs_kobj_unregister; + struct ocfs2_filecheck *fs_fcheck; +}; + + +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb); +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb); #endif /* FILECHECK_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 382401d3e88f..1a1e0078ab38 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -136,7 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { - int rc = 0; + int rc = -ESTALE; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 0c39d71c67a1..3bb169157a01 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -50,6 +50,8 @@ #include "reservations.h" +#include "filecheck.h" + /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of @@ -473,6 +475,12 @@ struct ocfs2_super * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; + + /* sysfs directory per partition */ + struct kset *osb_dev_kset; + + /* file check related stuff */ + struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 44d178b8d1aa..5bb4a89f9045 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -25,6 +25,8 @@ #ifndef _OCFS2_FS_H #define _OCFS2_FS_H +#include <linux/magic.h> + /* Version */ #define OCFS2_MAJOR_REV_LEVEL 0 #define OCFS2_MINOR_REV_LEVEL 90 @@ -56,9 +58,6 @@ #define OCFS2_MIN_BLOCKSIZE 512 #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE -/* Filesystem magic number */ -#define OCFS2_SUPER_MAGIC 0x7461636f - /* Object signatures */ #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" #define OCFS2_INODE_SIGNATURE "INODE01" diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 820359096c7a..52c07346bea3 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,12 +48,6 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; -inline int ocfs2_is_o2cb_active(void) -{ - return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); -} -EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); - static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index e3036e1790e8..f2dce10fae54 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,9 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); -/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ -int ocfs2_is_o2cb_active(void); - extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 83005f486451..3aa927c681d2 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1162,6 +1162,23 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_complete_mount_recovery(osb); + osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, + &ocfs2_kset->kobj); + if (!osb->osb_dev_kset) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); + goto read_super_error; + } + + /* Create filecheck sysfs related directories/files at + * /sys/fs/ocfs2/<devname>/filecheck */ + if (ocfs2_filecheck_create_sysfs(osb)) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " + "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); + goto read_super_error; + } + if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else @@ -1200,22 +1217,20 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); - /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */ - ocfs2_filecheck_create_sysfs(sb); - return status; read_super_error: brelse(bh); + if (status) + mlog_errno(status); + if (osb) { atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); ocfs2_dismount_volume(sb, 1); } - if (status) - mlog_errno(status); return status; } @@ -1653,7 +1668,6 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); - ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1843,6 +1857,9 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); + if (status == -EBADR && ocfs2_userspace_stack(osb)) + mlog(ML_ERROR, "couldn't mount because cluster name on" + " disk does not match the running cluster name.\n"); goto leave; } @@ -1896,6 +1913,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + /* Remove file check sysfs related directores/files, + * and wait for the pending file check operations */ + ocfs2_filecheck_remove_sysfs(osb); + + kset_unregister(osb->osb_dev_kset); + debugfs_remove(osb->osb_ctxt); /* Orphan scan should be stopped as early as possible */ diff --git a/fs/proc/base.c b/fs/proc/base.c index f1e1927ccd48..ecc8a25b0847 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1355,6 +1355,48 @@ static const struct file_operations proc_fault_inject_operations = { .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; + +static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err; + unsigned int n; + + err = kstrtouint_from_user(buf, count, 0, &n); + if (err) + return err; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + task->fail_nth = n; + put_task_struct(task); + + return count; +} + +static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char numbuf[PROC_NUMBUF]; + ssize_t len; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); + len = simple_read_from_buffer(buf, count, ppos, numbuf, len); + put_task_struct(task); + + return len; +} + +static const struct file_operations proc_fail_nth_operations = { + .read = proc_fail_nth_read, + .write = proc_fail_nth_write, +}; #endif @@ -2919,6 +2961,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_ELF_CORE REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), @@ -3311,6 +3354,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9425c0d97262..e3cda0b5968f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -180,7 +180,6 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, } static DEFINE_IDA(proc_inum_ida); -static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ #define PROC_DYNAMIC_FIRST 0xF0000000U @@ -190,37 +189,20 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ */ int proc_alloc_inum(unsigned int *inum) { - unsigned int i; - int error; + int i; -retry: - if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) - return -ENOMEM; + i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1, + GFP_KERNEL); + if (i < 0) + return i; - spin_lock_irq(&proc_inum_lock); - error = ida_get_new(&proc_inum_ida, &i); - spin_unlock_irq(&proc_inum_lock); - if (error == -EAGAIN) - goto retry; - else if (error) - return error; - - if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { - spin_lock_irq(&proc_inum_lock); - ida_remove(&proc_inum_ida, i); - spin_unlock_irq(&proc_inum_lock); - return -ENOSPC; - } - *inum = PROC_DYNAMIC_FIRST + i; + *inum = PROC_DYNAMIC_FIRST + (unsigned int)i; return 0; } void proc_free_inum(unsigned int inum) { - unsigned long flags; - spin_lock_irqsave(&proc_inum_lock, flags); - ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); - spin_unlock_irqrestore(&proc_inum_lock, flags); + ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } /* diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 67985a7233c2..8f9d564d0969 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1061,16 +1061,30 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) return -EINVAL; } +static int sysctl_check_table_array(const char *path, struct ctl_table *table) +{ + int err = 0; + + if ((table->proc_handler == proc_douintvec) || + (table->proc_handler == proc_douintvec_minmax)) { + if (table->maxlen != sizeof(unsigned int)) + err |= sysctl_err(path, table, "array now allowed"); + } + + return err; +} + static int sysctl_check_table(const char *path, struct ctl_table *table) { int err = 0; for (; table->procname; table++) { if (table->child) - err = sysctl_err(path, table, "Not a file"); + err |= sysctl_err(path, table, "Not a file"); if ((table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || (table->proc_handler == proc_douintvec) || + (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) || @@ -1078,15 +1092,17 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) (table->proc_handler == proc_doulongvec_minmax) || (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { if (!table->data) - err = sysctl_err(path, table, "No data"); + err |= sysctl_err(path, table, "No data"); if (!table->maxlen) - err = sysctl_err(path, table, "No maxlen"); + err |= sysctl_err(path, table, "No maxlen"); + else + err |= sysctl_check_table_array(path, table); } if (!table->proc_handler) - err = sysctl_err(path, table, "No proc_handler"); + err |= sysctl_err(path, table, "No proc_handler"); if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) - err = sysctl_err(path, table, "bogus .mode 0%o", + err |= sysctl_err(path, table, "bogus .mode 0%o", table->mode); } return err; diff --git a/fs/seq_file.c b/fs/seq_file.c index dc7c2be963ed..13e8c092d4d2 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -694,11 +694,6 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, if (m->count + 1 >= m->size) goto overflow; - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; @@ -733,11 +728,6 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num num = -num; } - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 5bdae85ceef7..f5191cb2c947 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -45,7 +45,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) mark_inode_dirty(dir); } if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); return err; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index de01b8f2aa78..48609f1d9580 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -53,7 +53,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) mark_inode_dirty(dir); } if (IS_DIRSYNC(dir)) - err = write_one_page(page, 1); + err = write_one_page(page); else unlock_page(page); return err; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f7555fc25877..a6c2b94a42c3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -214,6 +214,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -224,7 +225,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); if (!pte) goto out; @@ -243,6 +244,7 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -435,7 +437,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vmf->address, + must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, + vmf->address, vmf->flags, reason); up_read(&mm->mmap_sem); @@ -1101,11 +1104,6 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, static void __wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { - unsigned long start, end; - - start = range->start; - end = range->start + range->len; - spin_lock(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index d6f4aed479a1..87191357d303 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -97,6 +97,7 @@ extern void warn_slowpath_null(const char *file, const int line); /* used internally by panic.c */ struct warn_args; +struct pt_regs; void __warn(const char *file, int line, void *caller, unsigned taint, struct pt_regs *regs, struct warn_args *args); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 99b490b4d05a..540354f94f83 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -31,10 +31,12 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +#ifndef huge_pte_clear static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_clear(mm, addr, ptep); } +#endif #endif /* _ASM_GENERIC_HUGETLB_H */ diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 962164d36506..e223d91b6439 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -358,6 +358,7 @@ extern void *alloc_large_system_hash(const char *tablename, #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ #define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min * shift passed via *_hash_shift */ +#define HASH_ZERO 0x00000004 /* Zero allocated hash table */ /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. diff --git a/include/linux/bug.h b/include/linux/bug.h index 687b557fc5eb..5d5554c874fd 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -3,6 +3,7 @@ #include <asm/bug.h> #include <linux/compiler.h> +#include <linux/build_bug.h> enum bug_trap_type { BUG_TRAP_TYPE_NONE = 0, @@ -13,80 +14,9 @@ enum bug_trap_type { struct pt_regs; #ifdef __CHECKER__ -#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) -#define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) -#define BUILD_BUG_ON_ZERO(e) (0) -#define BUILD_BUG_ON_NULL(e) ((void*)0) -#define BUILD_BUG_ON_INVALID(e) (0) -#define BUILD_BUG_ON_MSG(cond, msg) (0) -#define BUILD_BUG_ON(condition) (0) -#define BUILD_BUG() (0) #define MAYBE_BUILD_BUG_ON(cond) (0) #else /* __CHECKER__ */ -/* Force a compilation error if a constant expression is not a power of 2 */ -#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ - BUILD_BUG_ON(((n) & ((n) - 1)) != 0) -#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ - BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) - -/* Force a compilation error if condition is true, but also produce a - result (of value 0 and type size_t), so the expression can be used - e.g. in a structure initializer (or where-ever else comma expressions - aren't permitted). */ -#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) -#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) - -/* - * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the - * expression but avoids the generation of any code, even if that expression - * has side-effects. - */ -#define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e)))) - -/** - * BUILD_BUG_ON_MSG - break compile if a condition is true & emit supplied - * error message. - * @condition: the condition which the compiler should know is false. - * - * See BUILD_BUG_ON for description. - */ -#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) - -/** - * BUILD_BUG_ON - break compile if a condition is true. - * @condition: the condition which the compiler should know is false. - * - * If you have some code which relies on certain constants being equal, or - * some other compile-time-evaluated condition, you should use BUILD_BUG_ON to - * detect if someone changes it. - * - * The implementation uses gcc's reluctance to create a negative array, but gcc - * (as of 4.4) only emits that error for obvious cases (e.g. not arguments to - * inline functions). Luckily, in 4.3 they added the "error" function - * attribute just for this type of case. Thus, we use a negative sized array - * (should always create an error on gcc versions older than 4.4) and then call - * an undefined function with the error attribute (should always create an - * error on gcc 4.3 and later). If for some reason, neither creates a - * compile-time error, we'll still have a link-time error, which is harder to - * track down. - */ -#ifndef __OPTIMIZE__ -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) -#else -#define BUILD_BUG_ON(condition) \ - BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition) -#endif - -/** - * BUILD_BUG - break compile if used. - * - * If you have some code that you expect the compiler to eliminate at - * build time, you should use BUILD_BUG to detect if it is - * unexpectedly used. - */ -#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed") - #define MAYBE_BUILD_BUG_ON(cond) \ do { \ if (__builtin_constant_p((cond))) \ diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h new file mode 100644 index 000000000000..b7d22d60008a --- /dev/null +++ b/include/linux/build_bug.h @@ -0,0 +1,84 @@ +#ifndef _LINUX_BUILD_BUG_H +#define _LINUX_BUILD_BUG_H + +#include <linux/compiler.h> + +#ifdef __CHECKER__ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) +#define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) +#define BUILD_BUG_ON_ZERO(e) (0) +#define BUILD_BUG_ON_NULL(e) ((void *)0) +#define BUILD_BUG_ON_INVALID(e) (0) +#define BUILD_BUG_ON_MSG(cond, msg) (0) +#define BUILD_BUG_ON(condition) (0) +#define BUILD_BUG() (0) +#else /* __CHECKER__ */ + +/* Force a compilation error if a constant expression is not a power of 2 */ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON(((n) & ((n) - 1)) != 0) +#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) + +/* + * Force a compilation error if condition is true, but also produce a + * result (of value 0 and type size_t), so the expression can be used + * e.g. in a structure initializer (or where-ever else comma expressions + * aren't permitted). + */ +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); })) +#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:(-!!(e)); })) + +/* + * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the + * expression but avoids the generation of any code, even if that expression + * has side-effects. + */ +#define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e)))) + +/** + * BUILD_BUG_ON_MSG - break compile if a condition is true & emit supplied + * error message. + * @condition: the condition which the compiler should know is false. + * + * See BUILD_BUG_ON for description. + */ +#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) + +/** + * BUILD_BUG_ON - break compile if a condition is true. + * @condition: the condition which the compiler should know is false. + * + * If you have some code which relies on certain constants being equal, or + * some other compile-time-evaluated condition, you should use BUILD_BUG_ON to + * detect if someone changes it. + * + * The implementation uses gcc's reluctance to create a negative array, but gcc + * (as of 4.4) only emits that error for obvious cases (e.g. not arguments to + * inline functions). Luckily, in 4.3 they added the "error" function + * attribute just for this type of case. Thus, we use a negative sized array + * (should always create an error on gcc versions older than 4.4) and then call + * an undefined function with the error attribute (should always create an + * error on gcc 4.3 and later). If for some reason, neither creates a + * compile-time error, we'll still have a link-time error, which is harder to + * track down. + */ +#ifndef __OPTIMIZE__ +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#else +#define BUILD_BUG_ON(condition) \ + BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition) +#endif + +/** + * BUILD_BUG - break compile if used. + * + * If you have some code that you expect the compiler to eliminate at + * build time, you should use BUILD_BUG to detect if it is + * unexpectedly used. + */ +#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed") + +#endif /* __CHECKER__ */ + +#endif /* _LINUX_BUILD_BUG_H */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 541a197ba4a2..e9de6b4ef405 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -14,7 +14,7 @@ CRASH_CORE_NOTE_NAME_BYTES + \ CRASH_CORE_NOTE_DESC_BYTES) -#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_BYTES PAGE_SIZE #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) #define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ @@ -23,6 +23,7 @@ typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; +void crash_update_vmcoreinfo_safecopy(void *ptr); void crash_save_vmcoreinfo(void); void arch_crash_save_vmcoreinfo(void); __printf(1, 2) @@ -50,10 +51,10 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -extern size_t vmcoreinfo_size; -extern size_t vmcoreinfo_max_size; +extern u32 *vmcoreinfo_note; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 6daf6d4971f6..2f14ac73d01d 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -14,6 +14,7 @@ #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> +#include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ @@ -22,6 +23,10 @@ struct file; #ifdef CONFIG_EPOLL +#ifdef CONFIG_CHECKPOINT_RESTORE +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); +#endif + /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) { diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a89d37e8b387..4c6656f1fee7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -432,14 +432,13 @@ static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask); +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask); static inline struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist) +__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) { - return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); + return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); } /* @@ -452,7 +451,7 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); VM_WARN_ON(!node_online(nid)); - return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); + return __alloc_pages(gfp_mask, order, nid); } /* diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a3762d49ba39..40d7b7dd2653 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -92,6 +92,7 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) && \ ((__vma)->vm_flags & VM_HUGEPAGE))) && \ !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ + !test_bit(MMF_DISABLE_THP, &(__vma)->vm_mm->flags) && \ !is_vma_temporary_stack(__vma)) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -113,6 +114,7 @@ extern unsigned long thp_get_unmapped_area(struct file *filp, extern void prep_transhuge_page(struct page *page); extern void free_transhuge_page(struct page *page); +bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { @@ -231,6 +233,12 @@ static inline void prep_transhuge_page(struct page *page) {} #define thp_get_unmapped_area NULL +static inline bool +can_split_huge_page(struct page *page, int *pextra_pins) +{ + BUILD_BUG(); + return false; +} static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b857fc8cc2ec..dbb118c566cd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -14,6 +14,30 @@ struct ctl_table; struct user_struct; struct mmu_gather; +#ifndef is_hugepd +/* + * Some architectures requires a hugepage directory format that is + * required to support multiple hugepage sizes. For example + * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" + * introduced the same on powerpc. This allows for a more flexible hugepage + * pagetable layout. + */ +typedef struct { unsigned long pd; } hugepd_t; +#define is_hugepd(hugepd) (0) +#define __hugepd(x) ((hugepd_t) { (x) }) +static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned pdshift, unsigned long end, + int write, struct page **pages, int *nr) +{ + return 0; +} +#else +extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned pdshift, unsigned long end, + int write, struct page **pages, int *nr); +#endif + + #ifdef CONFIG_HUGETLB_PAGE #include <linux/mempolicy.h> @@ -92,7 +116,6 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); void free_huge_page(struct page *page); @@ -113,19 +136,27 @@ extern struct list_head huge_boot_pages; pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz); -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); +struct page *follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, + int flags, int pdshift); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags); +struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, + pgd_t *pgd, int flags); + int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); +bool is_hugetlb_entry_migration(pte_t pte); #else /* !CONFIG_HUGETLB_PAGE */ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) @@ -147,8 +178,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) static inline void hugetlb_show_meminfo(void) { } +#define follow_huge_pd(vma, addr, hpd, flags, pdshift) NULL #define follow_huge_pmd(mm, addr, pmd, flags) NULL #define follow_huge_pud(mm, addr, pud, flags) NULL +#define follow_huge_pgd(mm, addr, pgd, flags) NULL #define prepare_hugepage_range(file, addr, len) (-EINVAL) #define pmd_huge(x) 0 #define pud_huge(x) 0 @@ -157,11 +190,7 @@ static inline void hugetlb_show_meminfo(void) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ src_addr, pagep) ({ BUG(); 0; }) -#define huge_pte_offset(mm, address) 0 -static inline int dequeue_hwpoisoned_huge_page(struct page *page) -{ - return 0; -} +#define huge_pte_offset(mm, address, sz) 0 static inline bool isolate_huge_page(struct page *page, struct list_head *list) { @@ -217,29 +246,6 @@ static inline int pud_write(pud_t pud) } #endif -#ifndef is_hugepd -/* - * Some architectures requires a hugepage directory format that is - * required to support multiple hugepage sizes. For example - * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" - * introduced the same on powerpc. This allows for a more flexible hugepage - * pagetable layout. - */ -typedef struct { unsigned long pd; } hugepd_t; -#define is_hugepd(hugepd) (0) -#define __hugepd(x) ((hugepd_t) { (x) }) -static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr) -{ - return 0; -} -#else -extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr); -#endif - #define HUGETLB_ANON_FILE "anon_hugepage" enum { @@ -461,12 +467,17 @@ static inline pgoff_t basepage_index(struct page *page) return __basepage_index(page); } +extern int dissolve_free_huge_page(struct page *page); extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); static inline bool hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION - return huge_page_shift(h) == PMD_SHIFT; + if ((huge_page_shift(h) == PMD_SHIFT) || + (huge_page_shift(h) == PGDIR_SHIFT)) + return true; + else + return false; #else return false; #endif @@ -501,6 +512,14 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { atomic_long_sub(l, &mm->hugetlb_usage); } + +#ifndef set_huge_swap_pte_at +static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + set_huge_pte_at(mm, addr, ptep, pte); +} +#endif #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page(v, a, r) NULL @@ -518,6 +537,11 @@ struct hstate {}; #define vma_mmu_pagesize(v) PAGE_SIZE #define huge_page_order(h) 0 #define huge_page_shift(h) PAGE_SHIFT +static inline bool hstate_is_gigantic(struct hstate *h) +{ + return false; +} + static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; @@ -529,6 +553,7 @@ static inline pgoff_t basepage_index(struct page *page) { return page->index; } +#define dissolve_free_huge_page(p) 0 #define dissolve_free_huge_pages(s, e) 0 #define hugepage_migration_supported(h) false @@ -545,6 +570,11 @@ static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } + +static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 71fd92d81b26..5591f055e13f 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -20,6 +20,9 @@ struct kern_ipc_perm { umode_t mode; unsigned long seq; void *security; + + struct rcu_head rcu; + atomic_t refcount; } ____cacheline_aligned_in_smp; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 1c91f26e2996..d043adadcf33 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -11,6 +11,7 @@ #include <linux/log2.h> #include <linux/typecheck.h> #include <linux/printk.h> +#include <linux/build_bug.h> #include <asm/byteorder.h> #include <uapi/linux/kernel.h> @@ -854,9 +855,11 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } * @member: the name of the member within the struct. * */ -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) +#define container_of(ptr, type, member) ({ \ + BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ + !__same_type(*(ptr), void), \ + "pointer type mismatch in container_of()"); \ + ((type *)((char *)(ptr) - offsetof(type, member))); }) /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c9481ebcbc0c..3ea8275c419c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -181,6 +181,7 @@ struct kimage { unsigned long start; struct page *control_code_page; struct page *swap_page; + void *vmcoreinfo_data_copy; /* locates in the crash memory */ unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; @@ -250,6 +251,7 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 5d9a400af509..f0d7335336cd 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -48,7 +48,8 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && - !(vm_flags & VM_NOHUGEPAGE)) + !(vm_flags & VM_NOHUGEPAGE) && + !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 1c2a32829620..590343f6c1b1 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -22,6 +22,7 @@ #define __KMEMLEAK_H #include <linux/slab.h> +#include <linux/vmalloc.h> #ifdef CONFIG_DEBUG_KMEMLEAK @@ -30,6 +31,8 @@ extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) __ref; +extern void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; @@ -81,6 +84,10 @@ static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) { } +static inline void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) +{ +} static inline void kmemleak_free(const void *ptr) { } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8098695e5d8d..77d427974f57 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -57,10 +57,6 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -#ifdef CONFIG_MOVABLE_NODE -/* If movable_node boot option specified */ -extern bool movable_node_enabled; -#endif /* CONFIG_MOVABLE_NODE */ #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit @@ -169,27 +165,11 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, i != (u64)ULLONG_MAX; \ __next_reserved_mem_region(&i, p_start, p_end)) -#ifdef CONFIG_MOVABLE_NODE static inline bool memblock_is_hotpluggable(struct memblock_region *m) { return m->flags & MEMBLOCK_HOTPLUG; } -static inline bool __init_memblock movable_node_is_enabled(void) -{ - return movable_node_enabled; -} -#else -static inline bool memblock_is_hotpluggable(struct memblock_region *m) -{ - return false; -} -static inline bool movable_node_is_enabled(void) -{ - return false; -} -#endif - static inline bool memblock_is_mirror(struct memblock_region *m) { return m->flags & MEMBLOCK_MIRROR; @@ -296,7 +276,6 @@ phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); -#ifdef CONFIG_MOVABLE_NODE /* * Set the allocation direction to bottom-up or top-down. */ @@ -314,10 +293,6 @@ static inline bool memblock_bottom_up(void) { return memblock.bottom_up; } -#else -static inline void __init memblock_set_bottom_up(bool enable) {} -static inline bool memblock_bottom_up(void) { return false; } -#endif /* Flags for memblock_alloc_base() amd __memblock_alloc_base() */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 899949bbb2f9..3914e3dd6168 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -26,7 +26,8 @@ #include <linux/page_counter.h> #include <linux/vmpressure.h> #include <linux/eventfd.h> -#include <linux/mmzone.h> +#include <linux/mm.h> +#include <linux/vmstat.h> #include <linux/writeback.h> #include <linux/page-flags.h> @@ -44,8 +45,6 @@ enum memcg_stat_item { MEMCG_SOCK, /* XXX: why are these zone and not node counters? */ MEMCG_KERNEL_STACK_KB, - MEMCG_SLAB_RECLAIMABLE, - MEMCG_SLAB_UNRECLAIMABLE, MEMCG_NR_STAT, }; @@ -100,11 +99,16 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; +struct lruvec_stat { + long count[NR_VM_NODE_STAT_ITEMS]; +}; + /* * per-zone information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; + struct lruvec_stat __percpu *lruvec_stat; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -357,6 +361,17 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) +{ + struct mem_cgroup_per_node *mz; + + if (mem_cgroup_disabled()) + return NULL; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + return mz->memcg; +} + /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find @@ -487,23 +502,18 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, return val; } -static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) +static inline void __mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val) { if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->count[idx], val); + __this_cpu_add(memcg->stat->count[idx], val); } -static inline void inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) -{ - mod_memcg_state(memcg, idx, 1); -} - -static inline void dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val) { - mod_memcg_state(memcg, idx, -1); + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->count[idx], val); } /** @@ -523,6 +533,13 @@ static inline void dec_memcg_state(struct mem_cgroup *memcg, * * Kernel pages are an exception to this, since they'll never move. */ +static inline void __mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, int val) +{ + if (page->mem_cgroup) + __mod_memcg_state(page->mem_cgroup, idx, val); +} + static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { @@ -530,24 +547,99 @@ static inline void mod_memcg_page_state(struct page *page, mod_memcg_state(page->mem_cgroup, idx, val); } -static inline void inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) { - mod_memcg_page_state(page, idx, 1); + struct mem_cgroup_per_node *pn; + long val = 0; + int cpu; + + if (mem_cgroup_disabled()) + return node_page_state(lruvec_pgdat(lruvec), idx); + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + for_each_possible_cpu(cpu) + val += per_cpu(pn->lruvec_stat->count[idx], cpu); + + if (val < 0) + val = 0; + + return val; } -static inline void dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline void __mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) { - mod_memcg_page_state(page, idx, -1); + struct mem_cgroup_per_node *pn; + + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) + return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + __mod_memcg_state(pn->memcg, idx, val); + __this_cpu_add(pn->lruvec_stat->count[idx], val); +} + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + struct mem_cgroup_per_node *pn; + + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) + return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + mod_memcg_state(pn->memcg, idx, val); + this_cpu_add(pn->lruvec_stat->count[idx], val); +} + +static inline void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + struct mem_cgroup_per_node *pn; + + __mod_node_page_state(page_pgdat(page), idx, val); + if (mem_cgroup_disabled() || !page->mem_cgroup) + return; + __mod_memcg_state(page->mem_cgroup, idx, val); + pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; + __this_cpu_add(pn->lruvec_stat->count[idx], val); +} + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + struct mem_cgroup_per_node *pn; + + mod_node_page_state(page_pgdat(page), idx, val); + if (mem_cgroup_disabled() || !page->mem_cgroup) + return; + mod_memcg_state(page->mem_cgroup, idx, val); + pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; + this_cpu_add(pn->lruvec_stat->count[idx], val); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); -static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, - enum vm_event_item idx) +static inline void count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->events[idx], count); +} + +static inline void count_memcg_page_event(struct page *page, + enum memcg_stat_item idx) +{ + if (page->mem_cgroup) + count_memcg_events(page->mem_cgroup, idx, 1); +} + +static inline void count_memcg_event_mm(struct mm_struct *mm, + enum vm_event_item idx) { struct mem_cgroup *memcg; @@ -556,8 +648,11 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (likely(memcg)) + if (likely(memcg)) { this_cpu_inc(memcg->stat->events[idx]); + if (idx == OOM_KILL) + cgroup_file_notify(&memcg->events_file); + } rcu_read_unlock(); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -675,6 +770,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) +{ + return NULL; +} + static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; @@ -745,19 +845,21 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, return 0; } -static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, - int nr) +static inline void __mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, + int nr) { } -static inline void inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, + int nr) { } -static inline void dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void __mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, + int nr) { } @@ -767,14 +869,34 @@ static inline void mod_memcg_page_state(struct page *page, { } -static inline void inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) { + return node_page_state(lruvec_pgdat(lruvec), idx); } -static inline void dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline void __mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) { + __mod_node_page_state(page_pgdat(page), idx, val); +} + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + mod_node_page_state(page_pgdat(page), idx, val); } static inline @@ -789,12 +911,119 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head) { } +static inline void count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ +} + +static inline void count_memcg_page_event(struct page *page, + enum memcg_stat_item idx) +{ +} + static inline -void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } #endif /* CONFIG_MEMCG */ +static inline void __inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + __mod_memcg_state(memcg, idx, 1); +} + +static inline void __dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + __mod_memcg_state(memcg, idx, -1); +} + +static inline void __inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + __mod_memcg_page_state(page, idx, 1); +} + +static inline void __dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + __mod_memcg_page_state(page, idx, -1); +} + +static inline void __inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, 1); +} + +static inline void __dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, -1); +} + +static inline void __inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, 1); +} + +static inline void __dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, -1); +} + +static inline void inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, 1); +} + +static inline void dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, -1); +} + +static inline void inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + mod_memcg_page_state(page, idx, 1); +} + +static inline void dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + mod_memcg_page_state(page, idx, -1); +} + +static inline void inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, 1); +} + +static inline void dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, -1); +} + +static inline void inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, 1); +} + +static inline void dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, -1); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); @@ -886,19 +1115,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -/** - * memcg_kmem_update_page_stat - update kmem page state statistics - * @page: the page - * @idx: page state item to account - * @val: number of pages (positive or negative) - */ -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ - if (memcg_kmem_enabled() && page->mem_cgroup) - this_cpu_add(page->mem_cgroup->stat->count[idx], val); -} - #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -921,10 +1137,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ -} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 134a2f69c21a..c8a5056a5ae0 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -14,6 +14,20 @@ struct memory_block; struct resource; #ifdef CONFIG_MEMORY_HOTPLUG +/* + * Return page for the valid pfn only if the page is online. All pfn + * walkers which rely on the fully initialized page->flags and others + * should use this rather than pfn_valid && pfn_to_page + */ +#define pfn_to_online_page(pfn) \ +({ \ + struct page *___page = NULL; \ + unsigned long ___nr = pfn_to_section_nr(pfn); \ + \ + if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr))\ + ___page = pfn_to_page(pfn); \ + ___page; \ +}) /* * Types for free bootmem stored in page->lru.next. These have to be in @@ -101,6 +115,12 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid); extern bool memhp_auto_online; +/* If movable_node boot option specified */ +extern bool movable_node_enabled; +static inline bool movable_node_is_enabled(void) +{ + return movable_node_enabled; +} #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); @@ -109,9 +129,9 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ -/* reasonably generic interface to expand the physical pages in a zone */ -extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); +/* reasonably generic interface to expand the physical pages */ +extern int __add_pages(int nid, unsigned long start_pfn, + unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA extern int memory_add_physaddr_to_nid(u64 start); @@ -203,6 +223,14 @@ extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); #else /* ! CONFIG_MEMORY_HOTPLUG */ +#define pfn_to_online_page(pfn) \ +({ \ + struct page *___page = NULL; \ + if (pfn_valid(pfn)) \ + ___page = pfn_to_page(pfn); \ + ___page; \ + }) + /* * Stub functions for when hotplug is off */ @@ -244,6 +272,10 @@ static inline void put_online_mems(void) {} static inline void mem_hotplug_begin(void) {} static inline void mem_hotplug_done(void) {} +static inline bool movable_node_is_enabled(void) +{ + return false; +} #endif /* ! CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTREMOVE @@ -274,18 +306,19 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); -extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, - bool for_device); -extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); +extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); +extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift); - +extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, + int online_type); +extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, + unsigned long nr_pages); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5f4d8281832b..3a58b4be1b0c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -142,11 +142,10 @@ bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step); +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); -extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, +extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); @@ -260,8 +259,7 @@ static inline void numa_default_policy(void) } static inline void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new, - enum mpol_rebind_step step) + const nodemask_t *new) { } @@ -269,13 +267,13 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } -static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, +static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; - return node_zonelist(0, gfp_flags); + return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) diff --git a/include/linux/mm.h b/include/linux/mm.h index b892e95d4929..c0b1759304ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2199,7 +2199,7 @@ extern void filemap_map_pages(struct vm_fault *vmf, extern int filemap_page_mkwrite(struct vm_fault *vmf); /* mm/page-writeback.c */ -int write_one_page(struct page *page, int wait); +int __must_check write_one_page(struct page *page); void task_dirty_inc(struct task_struct *tsk); /* readahead.c */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ef6a13b7bd3e..0176a2933c61 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -125,8 +125,6 @@ enum zone_stat_item { NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - NR_SLAB_RECLAIMABLE, - NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_KERNEL_STACK_KB, /* measured in KiB */ /* Second 128 byte cacheline */ @@ -152,6 +150,8 @@ enum node_stat_item { NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ + NR_SLAB_RECLAIMABLE, + NR_SLAB_UNRECLAIMABLE, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_REFAULT, @@ -533,6 +533,26 @@ static inline bool zone_is_empty(struct zone *zone) } /* + * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty + * intersection with the given zone + */ +static inline bool zone_intersects(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + if (zone_is_empty(zone)) + return false; + if (start_pfn >= zone_end_pfn(zone)) + return false; + + if (zone->zone_start_pfn <= start_pfn) + return true; + if (start_pfn + nr_pages > zone->zone_start_pfn) + return true; + + return false; +} + +/* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. @@ -772,7 +792,7 @@ enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, }; -extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, +extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); @@ -1144,9 +1164,10 @@ extern unsigned long usemap_size(void); */ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) -#define SECTION_MAP_LAST_BIT (1UL<<2) +#define SECTION_IS_ONLINE (1UL<<2) +#define SECTION_MAP_LAST_BIT (1UL<<3) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 2 +#define SECTION_NID_SHIFT 3 static inline struct page *__section_mem_map_addr(struct mem_section *section) { @@ -1175,11 +1196,30 @@ static inline int valid_section_nr(unsigned long nr) return valid_section(__nr_to_section(nr)); } +static inline int online_section(struct mem_section *section) +{ + return (section && (section->section_mem_map & SECTION_IS_ONLINE)); +} + +static inline int online_section_nr(unsigned long nr) +{ + return online_section(__nr_to_section(nr)); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); +#ifdef CONFIG_MEMORY_HOTREMOVE +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); +#endif +#endif + static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } +extern int __highest_present_section_nr; + #ifndef CONFIG_HAVE_ARCH_PFN_VALID static inline int pfn_valid(unsigned long pfn) { @@ -1251,10 +1291,15 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL /* * pfn_valid() is meant to be able to tell if a given PFN has valid memmap - * associated with it or not. In FLATMEM, it is expected that holes always - * have valid memmap as long as there is valid PFNs either side of the hole. - * In SPARSEMEM, it is assumed that a valid section has a memmap for the - * entire section. + * associated with it or not. This means that a struct page exists for this + * pfn. The caller cannot assume the page is fully initialized in general. + * Hotplugable pages might not have been onlined yet. pfn_to_online_page() + * will ensure the struct page is fully online and initialized. Special pages + * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly. + * + * In FLATMEM, it is expected that holes always have valid memmap as long as + * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed + * that a valid section has a memmap for the entire section. * * However, an ARM, and maybe other embedded architectures in the future * free memmap backing holes to save memory on the assumption the memmap is diff --git a/include/linux/node.h b/include/linux/node.h index 2115ad5d6f19..d1751beb462c 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -30,9 +30,38 @@ struct memory_block; extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) +extern int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages); +#else +static inline int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages) +{ + return 0; +} +#endif + extern void unregister_node(struct node *node); #ifdef CONFIG_NUMA -extern int register_one_node(int nid); +/* Core of the node registration - only memory hotplug should use this */ +extern int __register_one_node(int nid); + +/* Registers an online node */ +static inline int register_one_node(int nid) +{ + int error = 0; + + if (node_online(nid)) { + struct pglist_data *pgdat = NODE_DATA(nid); + + error = __register_one_node(nid); + if (error) + return error; + /* link memory sections under this node */ + error = link_mem_sections(nid, pgdat->node_start_pfn, pgdat->node_spanned_pages); + } + + return error; +} + extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); @@ -46,6 +75,10 @@ extern void register_hugetlbfs_with_node(node_registration_func_t doregister, node_registration_func_t unregister); #endif #else +static inline int __register_one_node(int nid) +{ + return 0; +} static inline int register_one_node(int nid) { return 0; diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index f746e44d4046..cf0b91c3ec12 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -387,11 +387,7 @@ enum node_states { #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif -#ifdef CONFIG_MOVABLE_NODE N_MEMORY, /* The node has memory(regular, high, movable) */ -#else - N_MEMORY = N_HIGH_MEMORY, -#endif N_CPU, /* The node has one or more cpus */ NR_NODE_STATES }; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b5818d6de32..d33e3280c8ad 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -326,11 +326,14 @@ PAGEFLAG_FALSE(HighMem) #ifdef CONFIG_SWAP static __always_inline int PageSwapCache(struct page *page) { +#ifdef CONFIG_THP_SWAP + page = compound_head(page); +#endif return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags); } -SETPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) -CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) +SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) +CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) #else PAGEFLAG_FALSE(SwapCache) #endif diff --git a/include/linux/random.h b/include/linux/random.h index ed5c3838780d..1fa0dc880bd7 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -57,6 +57,27 @@ static inline unsigned long get_random_long(void) #endif } +/* + * On 64-bit architectures, protect against non-terminated C string overflows + * by zeroing out the first byte of the canary; this leaves 56 bits of entropy. + */ +#ifdef CONFIG_64BIT +# ifdef __LITTLE_ENDIAN +# define CANARY_MASK 0xffffffffffffff00UL +# else /* big endian, 64 bits: */ +# define CANARY_MASK 0x00ffffffffffffffUL +# endif +#else /* 32 bits: */ +# define CANARY_MASK 0xffffffffUL +#endif + +static inline unsigned long get_random_canary(void) +{ + unsigned long val = get_random_long(); + + return val & CANARY_MASK; +} + unsigned long randomize_page(unsigned long start, unsigned long range); u32 prandom_u32(void); diff --git a/include/linux/reboot.h b/include/linux/reboot.h index a7ff409f386d..ecbf7b56b9db 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -5,6 +5,8 @@ #include <linux/notifier.h> #include <uapi/linux/reboot.h> +struct device; + #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN #define SYS_HALT 0x0002 /* Notify of system halt */ @@ -38,6 +40,8 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); +extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); + extern int register_restart_handler(struct notifier_block *); extern int unregister_restart_handler(struct notifier_block *); extern void do_kernel_restart(char *cmd); diff --git a/include/linux/sched.h b/include/linux/sched.h index 559fa1d7ca92..74862911a54c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -887,7 +887,7 @@ struct task_struct { #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; - short il_next; + short il_prev; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING @@ -948,6 +948,7 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; + unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 69eedcef8f03..98ae0d05aa32 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -68,7 +68,10 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sem.h b/include/linux/sem.h index 9edec926e9d9..be5cf2ea14ad 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -8,11 +8,29 @@ struct task_struct; +/* One semaphore structure for each semaphore in the system. */ +struct sem { + int semval; /* current value */ + /* + * PID of the process that last modified the semaphore. For + * Linux, specifically these are: + * - semop + * - semctl, via SETVAL and SETALL. + * - at task exit when performing undo adjustments (see exit_sem). + */ + int sempid; + spinlock_t lock; /* spinlock for fine-grained semtimedop */ + struct list_head pending_alter; /* pending single-sop operations */ + /* that alter the semaphore */ + struct list_head pending_const; /* pending single-sop operations */ + /* that do not alter the semaphore*/ + time_t sem_otime; /* candidate for sem_otime */ +} ____cacheline_aligned_in_smp; + /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ - time_t sem_ctime; /* last change time */ - struct sem *sem_base; /* ptr to first semaphore in array */ + time_t sem_ctime; /* create/last semctl() time */ struct list_head pending_alter; /* pending operations */ /* that alter the array */ struct list_head pending_const; /* pending complex operations */ @@ -21,6 +39,8 @@ struct sem_array { int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ unsigned int use_global_lock;/* >0: global lock required */ + + struct sem sems[]; }; #ifdef CONFIG_SYSVIPC diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 07ef550c6627..d808e8e6293b 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -41,12 +41,31 @@ struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ +#ifdef CONFIG_SLUB_CPU_PARTIAL struct page *partial; /* Partially allocated frozen slabs */ +#endif #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif }; +#ifdef CONFIG_SLUB_CPU_PARTIAL +#define slub_percpu_partial(c) ((c)->partial) + +#define slub_set_percpu_partial(c, p) \ +({ \ + slub_percpu_partial(c) = (p)->next; \ +}) + +#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) +#else +#define slub_percpu_partial(c) NULL + +#define slub_set_percpu_partial(c, p) + +#define slub_percpu_partial_read_once(c) NULL +#endif // CONFIG_SLUB_CPU_PARTIAL + /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the @@ -67,7 +86,9 @@ struct kmem_cache { int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ int offset; /* Free pointer offset. */ +#ifdef CONFIG_SLUB_CPU_PARTIAL int cpu_partial; /* Number of per cpu partial objects to keep around */ +#endif struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ @@ -79,9 +100,9 @@ struct kmem_cache { int inuse; /* Offset to metadata */ int align; /* Alignment */ int reserved; /* Reserved bytes at the end of slabs */ + int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ - int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif @@ -111,6 +132,17 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; +#ifdef CONFIG_SLUB_CPU_PARTIAL +#define slub_cpu_partial(s) ((s)->cpu_partial) +#define slub_set_cpu_partial(s, n) \ +({ \ + slub_cpu_partial(s) = (n); \ +}) +#else +#define slub_cpu_partial(s) (0) +#define slub_set_cpu_partial(s, n) +#endif // CONFIG_SLUB_CPU_PARTIAL + #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS void sysfs_slab_release(struct kmem_cache *); diff --git a/include/linux/swap.h b/include/linux/swap.h index ba5882419a7d..61e7180cee21 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -331,7 +331,7 @@ extern void kswapd_stop(int nid); #include <linux/blk_types.h> /* for bio_end_io_t */ /* linux/mm/page_io.c */ -extern int swap_readpage(struct page *); +extern int swap_readpage(struct page *page, bool do_poll); extern int swap_writepage(struct page *page, struct writeback_control *wbc); extern void end_swap_bio_write(struct bio *bio); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, @@ -353,7 +353,7 @@ extern struct address_space *swapper_spaces[]; >> SWAP_ADDRESS_SPACE_SHIFT]) extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *, struct list_head *list); +extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); extern void __delete_from_swap_cache(struct page *); @@ -362,7 +362,8 @@ extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); extern struct page *lookup_swap_cache(swp_entry_t); extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, - struct vm_area_struct *vma, unsigned long addr); + struct vm_area_struct *vma, unsigned long addr, + bool do_poll); extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated); @@ -386,15 +387,15 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page(struct page *page); +extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[]); +extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); -extern void swapcache_free(swp_entry_t); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); @@ -453,7 +454,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void swapcache_free(swp_entry_t swp) +static inline void put_swap_page(struct page *page, swp_entry_t swp) { } @@ -473,7 +474,7 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp) return NULL; } -static inline int add_to_swap(struct page *page, struct list_head *list) +static inline int add_to_swap(struct page *page) { return 0; } @@ -515,7 +516,7 @@ static inline int try_to_free_swap(struct page *page) return 0; } -static inline swp_entry_t get_swap_page(void) +static inline swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry; entry.val = 0; @@ -548,7 +549,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry); +extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct page *page); #else @@ -562,7 +563,8 @@ static inline int mem_cgroup_try_charge_swap(struct page *page, return 0; } -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, + unsigned int nr_pages) { } diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index 145306bdc92f..b2b8ec7bda3f 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -7,7 +7,8 @@ extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); +extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); @@ -15,7 +16,8 @@ extern void swap_cgroup_swapoff(int type); #else static inline -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents) { return 0; } diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 5c3a5f3e7eec..c5ff7b217ee6 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -196,15 +196,6 @@ static inline void num_poisoned_pages_dec(void) atomic_long_dec(&num_poisoned_pages); } -static inline void num_poisoned_pages_add(long num) -{ - atomic_long_add(num, &num_poisoned_pages); -} - -static inline void num_poisoned_pages_sub(long num) -{ - atomic_long_sub(num, &num_poisoned_pages); -} #else static inline swp_entry_t make_hwpoison_entry(struct page *page) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80d07816def0..225001d437ae 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -47,6 +47,9 @@ extern int proc_douintvec(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index d84ae90ccd5c..1707e0a7d943 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,6 +41,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, PAGEOUTRUN, PGROTATED, DROP_PAGECACHE, DROP_SLAB, + OOM_KILL, #ifdef CONFIG_NUMA_BALANCING NUMA_PTE_UPDATES, NUMA_HUGE_PTE_UPDATES, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 613771909b6e..b3d85f30d424 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -3,7 +3,6 @@ #include <linux/types.h> #include <linux/percpu.h> -#include <linux/mm.h> #include <linux/mmzone.h> #include <linux/vm_event_item.h> #include <linux/atomic.h> diff --git a/include/uapi/linux/kcmp.h b/include/uapi/linux/kcmp.h index 84df14b37360..481e103da78e 100644 --- a/include/uapi/linux/kcmp.h +++ b/include/uapi/linux/kcmp.h @@ -1,6 +1,8 @@ #ifndef _UAPI_LINUX_KCMP_H #define _UAPI_LINUX_KCMP_H +#include <linux/types.h> + /* Comparison type */ enum kcmp_type { KCMP_FILE, @@ -10,8 +12,16 @@ enum kcmp_type { KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, + KCMP_EPOLL_TFD, KCMP_TYPES, }; +/* Slot for KCMP_EPOLL_TFD */ +struct kcmp_epoll_slot { + __u32 efd; /* epoll file descriptor */ + __u32 tfd; /* target file number */ + __u32 toff; /* target offset within same numbered sequence */ +}; + #endif /* _UAPI_LINUX_KCMP_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index e230af2e6855..e7343d2a68c5 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -42,6 +42,7 @@ #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ #define NFS_SUPER_MAGIC 0x6969 +#define OCFS2_SUPER_MAGIC 0x7461636f #define OPENPROM_SUPER_MAGIC 0x9fa1 #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */ #define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */ diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 9cd8b21dddbe..2a4d89508fec 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -24,13 +24,6 @@ enum { MPOL_MAX, /* always last member of enum */ }; -enum mpol_rebind_step { - MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */ - MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */ - MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/ - MPOL_REBIND_NSTEP, -}; - /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) @@ -65,7 +58,6 @@ enum mpol_rebind_step { */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ -#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h index dd73b908b2f3..67eb90361692 100644 --- a/include/uapi/linux/sem.h +++ b/include/uapi/linux/sem.h @@ -23,7 +23,7 @@ struct semid_ds { struct ipc_perm sem_perm; /* permissions .. see ipc.h */ __kernel_time_t sem_otime; /* last semop time */ - __kernel_time_t sem_ctime; /* last change time */ + __kernel_time_t sem_ctime; /* create/last semctl() time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct sem_queue *sem_pending; /* pending operations to be processed */ struct sem_queue **sem_pending_last; /* last pending operation */ diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index e13d48058b8d..c6602bdd2a10 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -26,6 +26,10 @@ #include <linux/types.h> #include <linux/compiler.h> +#ifndef __KERNEL__ +#include <stddef.h> /* For size_t. */ +#endif + #define CTL_MAXNAME 10 /* how many path components do we allow in a call to sysctl? In other words, what is the largest acceptable value for the nlen diff --git a/ipc/msg.c b/ipc/msg.c index 104926dc72be..5b25e0755656 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -97,11 +97,11 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) static void msg_rcu_free(struct rcu_head *head) { - struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); - struct msg_queue *msq = ipc_rcu_to_struct(p); + struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu); + struct msg_queue *msq = container_of(p, struct msg_queue, q_perm); security_msg_queue_free(msq); - ipc_rcu_free(head); + kvfree(msq); } /** @@ -114,12 +114,12 @@ static void msg_rcu_free(struct rcu_head *head) static int newque(struct ipc_namespace *ns, struct ipc_params *params) { struct msg_queue *msq; - int id, retval; + int retval; key_t key = params->key; int msgflg = params->flg; - msq = ipc_rcu_alloc(sizeof(*msq)); - if (!msq) + msq = kvmalloc(sizeof(*msq), GFP_KERNEL); + if (unlikely(!msq)) return -ENOMEM; msq->q_perm.mode = msgflg & S_IRWXUGO; @@ -128,7 +128,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) msq->q_perm.security = NULL; retval = security_msg_queue_alloc(msq); if (retval) { - ipc_rcu_putref(msq, ipc_rcu_free); + kvfree(msq); return retval; } @@ -142,10 +142,10 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) INIT_LIST_HEAD(&msq->q_senders); /* ipc_addid() locks msq upon success. */ - id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); - if (id < 0) { - ipc_rcu_putref(msq, msg_rcu_free); - return id; + retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); + if (retval < 0) { + call_rcu(&msq->q_perm.rcu, msg_rcu_free); + return retval; } ipc_unlock_object(&msq->q_perm); @@ -249,7 +249,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) free_msg(msg); } atomic_sub(msq->q_cbytes, &ns->msg_bytes); - ipc_rcu_putref(msq, msg_rcu_free); + ipc_rcu_putref(&msq->q_perm, msg_rcu_free); } /* @@ -688,7 +688,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, /* enqueue the sender and prepare to block */ ss_add(msq, &s, msgsz); - if (!ipc_rcu_getref(msq)) { + if (!ipc_rcu_getref(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } @@ -700,7 +700,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, rcu_read_lock(); ipc_lock_object(&msq->q_perm); - ipc_rcu_putref(msq, msg_rcu_free); + ipc_rcu_putref(&msq->q_perm, msg_rcu_free); /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { err = -EIDRM; diff --git a/ipc/sem.c b/ipc/sem.c index 947dc2348271..9e70cd7a17da 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -87,24 +87,6 @@ #include <linux/uaccess.h> #include "util.h" -/* One semaphore structure for each semaphore in the system. */ -struct sem { - int semval; /* current value */ - /* - * PID of the process that last modified the semaphore. For - * Linux, specifically these are: - * - semop - * - semctl, via SETVAL and SETALL. - * - at task exit when performing undo adjustments (see exit_sem). - */ - int sempid; - spinlock_t lock; /* spinlock for fine-grained semtimedop */ - struct list_head pending_alter; /* pending single-sop operations */ - /* that alter the semaphore */ - struct list_head pending_const; /* pending single-sop operations */ - /* that do not alter the semaphore*/ - time_t sem_otime; /* candidate for sem_otime */ -} ____cacheline_aligned_in_smp; /* One queue for each sleeping process in the system. */ struct sem_queue { @@ -175,7 +157,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * sem_array.sem_undo * * b) global or semaphore sem_lock() for read/write: - * sem_array.sem_base[i].pending_{const,alter}: + * sem_array.sems[i].pending_{const,alter}: * * c) special: * sem_undo_list.list_proc: @@ -250,7 +232,7 @@ static void unmerge_queues(struct sem_array *sma) */ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { struct sem *curr; - curr = &sma->sem_base[q->sops[0].sem_num]; + curr = &sma->sems[q->sops[0].sem_num]; list_add_tail(&q->list, &curr->pending_alter); } @@ -270,7 +252,7 @@ static void merge_queues(struct sem_array *sma) { int i; for (i = 0; i < sma->sem_nsems; i++) { - struct sem *sem = sma->sem_base + i; + struct sem *sem = &sma->sems[i]; list_splice_init(&sem->pending_alter, &sma->pending_alter); } @@ -278,11 +260,11 @@ static void merge_queues(struct sem_array *sma) static void sem_rcu_free(struct rcu_head *head) { - struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); - struct sem_array *sma = ipc_rcu_to_struct(p); + struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu); + struct sem_array *sma = container_of(p, struct sem_array, sem_perm); security_sem_free(sma); - ipc_rcu_free(head); + kvfree(sma); } /* @@ -306,7 +288,7 @@ static void complexmode_enter(struct sem_array *sma) sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; for (i = 0; i < sma->sem_nsems; i++) { - sem = sma->sem_base + i; + sem = &sma->sems[i]; spin_lock(&sem->lock); spin_unlock(&sem->lock); } @@ -366,7 +348,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, * * Both facts are tracked by use_global_mode. */ - sem = sma->sem_base + sops->sem_num; + sem = &sma->sems[sops->sem_num]; /* * Initial check for use_global_lock. Just an optimization, @@ -421,7 +403,7 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) complexmode_tryleave(sma); ipc_unlock_object(&sma->sem_perm); } else { - struct sem *sem = sma->sem_base + locknum; + struct sem *sem = &sma->sems[locknum]; spin_unlock(&sem->lock); } } @@ -456,7 +438,7 @@ static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns static inline void sem_lock_and_putref(struct sem_array *sma) { sem_lock(sma, NULL, -1); - ipc_rcu_putref(sma, sem_rcu_free); + ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) @@ -464,6 +446,24 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) ipc_rmid(&sem_ids(ns), &s->sem_perm); } +static struct sem_array *sem_alloc(size_t nsems) +{ + struct sem_array *sma; + size_t size; + + if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0])) + return NULL; + + size = sizeof(*sma) + nsems * sizeof(sma->sems[0]); + sma = kvmalloc(size, GFP_KERNEL); + if (unlikely(!sma)) + return NULL; + + memset(sma, 0, size); + + return sma; +} + /** * newary - Create a new semaphore set * @ns: namespace @@ -473,10 +473,8 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) */ static int newary(struct ipc_namespace *ns, struct ipc_params *params) { - int id; int retval; struct sem_array *sma; - int size; key_t key = params->key; int nsems = params->u.nsems; int semflg = params->flg; @@ -487,29 +485,24 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) if (ns->used_sems + nsems > ns->sc_semmns) return -ENOSPC; - size = sizeof(*sma) + nsems * sizeof(struct sem); - sma = ipc_rcu_alloc(size); + sma = sem_alloc(nsems); if (!sma) return -ENOMEM; - memset(sma, 0, size); - sma->sem_perm.mode = (semflg & S_IRWXUGO); sma->sem_perm.key = key; sma->sem_perm.security = NULL; retval = security_sem_alloc(sma); if (retval) { - ipc_rcu_putref(sma, ipc_rcu_free); + kvfree(sma); return retval; } - sma->sem_base = (struct sem *) &sma[1]; - for (i = 0; i < nsems; i++) { - INIT_LIST_HEAD(&sma->sem_base[i].pending_alter); - INIT_LIST_HEAD(&sma->sem_base[i].pending_const); - spin_lock_init(&sma->sem_base[i].lock); + INIT_LIST_HEAD(&sma->sems[i].pending_alter); + INIT_LIST_HEAD(&sma->sems[i].pending_const); + spin_lock_init(&sma->sems[i].lock); } sma->complex_count = 0; @@ -520,10 +513,10 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_nsems = nsems; sma->sem_ctime = get_seconds(); - id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); - if (id < 0) { - ipc_rcu_putref(sma, sem_rcu_free); - return id; + retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); + if (retval < 0) { + call_rcu(&sma->sem_perm.rcu, sem_rcu_free); + return retval; } ns->used_sems += nsems; @@ -612,7 +605,7 @@ static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q) un = q->undo; for (sop = sops; sop < sops + nsops; sop++) { - curr = sma->sem_base + sop->sem_num; + curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; result = curr->semval; @@ -639,7 +632,7 @@ static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q) sop--; pid = q->pid; while (sop >= sops) { - sma->sem_base[sop->sem_num].sempid = pid; + sma->sems[sop->sem_num].sempid = pid; sop--; } @@ -661,7 +654,7 @@ undo: sop--; while (sop >= sops) { sem_op = sop->sem_op; - sma->sem_base[sop->sem_num].semval -= sem_op; + sma->sems[sop->sem_num].semval -= sem_op; if (sop->sem_flg & SEM_UNDO) un->semadj[sop->sem_num] += sem_op; sop--; @@ -692,7 +685,7 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) * until the operations can go through. */ for (sop = sops; sop < sops + nsops; sop++) { - curr = sma->sem_base + sop->sem_num; + curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; result = curr->semval; @@ -716,7 +709,7 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) } for (sop = sops; sop < sops + nsops; sop++) { - curr = sma->sem_base + sop->sem_num; + curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; result = curr->semval; @@ -815,7 +808,7 @@ static int wake_const_ops(struct sem_array *sma, int semnum, if (semnum == -1) pending_list = &sma->pending_const; else - pending_list = &sma->sem_base[semnum].pending_const; + pending_list = &sma->sems[semnum].pending_const; list_for_each_entry_safe(q, tmp, pending_list, list) { int error = perform_atomic_semop(sma, q); @@ -856,7 +849,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, for (i = 0; i < nsops; i++) { int num = sops[i].sem_num; - if (sma->sem_base[num].semval == 0) { + if (sma->sems[num].semval == 0) { got_zero = 1; semop_completed |= wake_const_ops(sma, num, wake_q); } @@ -867,7 +860,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, * Assume all were changed. */ for (i = 0; i < sma->sem_nsems; i++) { - if (sma->sem_base[i].semval == 0) { + if (sma->sems[i].semval == 0) { got_zero = 1; semop_completed |= wake_const_ops(sma, i, wake_q); } @@ -909,7 +902,7 @@ static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *w if (semnum == -1) pending_list = &sma->pending_alter; else - pending_list = &sma->sem_base[semnum].pending_alter; + pending_list = &sma->sems[semnum].pending_alter; again: list_for_each_entry_safe(q, tmp, pending_list, list) { @@ -922,7 +915,7 @@ again: * be in the per semaphore pending queue, and decrements * cannot be successful if the value is already 0. */ - if (semnum != -1 && sma->sem_base[semnum].semval == 0) + if (semnum != -1 && sma->sems[semnum].semval == 0) break; error = perform_atomic_semop(sma, q); @@ -959,9 +952,9 @@ again: static void set_semotime(struct sem_array *sma, struct sembuf *sops) { if (sops == NULL) { - sma->sem_base[0].sem_otime = get_seconds(); + sma->sems[0].sem_otime = get_seconds(); } else { - sma->sem_base[sops[0].sem_num].sem_otime = + sma->sems[sops[0].sem_num].sem_otime = get_seconds(); } } @@ -1067,9 +1060,9 @@ static int count_semcnt(struct sem_array *sma, ushort semnum, semcnt = 0; /* First: check the simple operations. They are easy to evaluate */ if (count_zero) - l = &sma->sem_base[semnum].pending_const; + l = &sma->sems[semnum].pending_const; else - l = &sma->sem_base[semnum].pending_alter; + l = &sma->sems[semnum].pending_alter; list_for_each_entry(q, l, list) { /* all task on a per-semaphore list sleep on exactly @@ -1124,7 +1117,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); } for (i = 0; i < sma->sem_nsems; i++) { - struct sem *sem = sma->sem_base + i; + struct sem *sem = &sma->sems[i]; list_for_each_entry_safe(q, tq, &sem->pending_const, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); @@ -1142,7 +1135,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) wake_up_q(&wake_q); ns->used_sems -= sma->sem_nsems; - ipc_rcu_putref(sma, sem_rcu_free); + ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); } static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version) @@ -1174,9 +1167,9 @@ static time_t get_semotime(struct sem_array *sma) int i; time_t res; - res = sma->sem_base[0].sem_otime; + res = sma->sems[0].sem_otime; for (i = 1; i < sma->sem_nsems; i++) { - time_t to = sma->sem_base[i].sem_otime; + time_t to = sma->sems[i].sem_otime; if (to > res) res = to; @@ -1325,7 +1318,7 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, return -EIDRM; } - curr = &sma->sem_base[semnum]; + curr = &sma->sems[semnum]; ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) @@ -1382,15 +1375,16 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, goto out_unlock; } if (nsems > SEMMSL_FAST) { - if (!ipc_rcu_getref(sma)) { + if (!ipc_rcu_getref(&sma->sem_perm)) { err = -EIDRM; goto out_unlock; } sem_unlock(sma, -1); rcu_read_unlock(); - sem_io = ipc_alloc(sizeof(ushort)*nsems); + sem_io = kvmalloc_array(nsems, sizeof(ushort), + GFP_KERNEL); if (sem_io == NULL) { - ipc_rcu_putref(sma, sem_rcu_free); + ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return -ENOMEM; } @@ -1402,7 +1396,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, } } for (i = 0; i < sma->sem_nsems; i++) - sem_io[i] = sma->sem_base[i].semval; + sem_io[i] = sma->sems[i].semval; sem_unlock(sma, -1); rcu_read_unlock(); err = 0; @@ -1415,29 +1409,30 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int i; struct sem_undo *un; - if (!ipc_rcu_getref(sma)) { + if (!ipc_rcu_getref(&sma->sem_perm)) { err = -EIDRM; goto out_rcu_wakeup; } rcu_read_unlock(); if (nsems > SEMMSL_FAST) { - sem_io = ipc_alloc(sizeof(ushort)*nsems); + sem_io = kvmalloc_array(nsems, sizeof(ushort), + GFP_KERNEL); if (sem_io == NULL) { - ipc_rcu_putref(sma, sem_rcu_free); + ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return -ENOMEM; } } if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) { - ipc_rcu_putref(sma, sem_rcu_free); + ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); err = -EFAULT; goto out_free; } for (i = 0; i < nsems; i++) { if (sem_io[i] > SEMVMX) { - ipc_rcu_putref(sma, sem_rcu_free); + ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); err = -ERANGE; goto out_free; } @@ -1450,8 +1445,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, } for (i = 0; i < nsems; i++) { - sma->sem_base[i].semval = sem_io[i]; - sma->sem_base[i].sempid = task_tgid_vnr(current); + sma->sems[i].semval = sem_io[i]; + sma->sems[i].sempid = task_tgid_vnr(current); } ipc_assert_locked_object(&sma->sem_perm); @@ -1476,7 +1471,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, err = -EIDRM; goto out_unlock; } - curr = &sma->sem_base[semnum]; + curr = &sma->sems[semnum]; switch (cmd) { case GETVAL: @@ -1500,7 +1495,7 @@ out_rcu_wakeup: wake_up_q(&wake_q); out_free: if (sem_io != fast_sem_io) - ipc_free(sem_io); + kvfree(sem_io); return err; } @@ -1719,7 +1714,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) } nsems = sma->sem_nsems; - if (!ipc_rcu_getref(sma)) { + if (!ipc_rcu_getref(&sma->sem_perm)) { rcu_read_unlock(); un = ERR_PTR(-EIDRM); goto out; @@ -1729,7 +1724,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) /* step 2: allocate new undo structure */ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); if (!new) { - ipc_rcu_putref(sma, sem_rcu_free); + ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return ERR_PTR(-ENOMEM); } @@ -1932,7 +1927,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, */ if (nsops == 1) { struct sem *curr; - curr = &sma->sem_base[sops->sem_num]; + curr = &sma->sems[sops->sem_num]; if (alter) { if (sma->complex_count) { @@ -2146,7 +2141,7 @@ void exit_sem(struct task_struct *tsk) /* perform adjustments registered in un */ for (i = 0; i < sma->sem_nsems; i++) { - struct sem *semaphore = &sma->sem_base[i]; + struct sem *semaphore = &sma->sems[i]; if (un->semadj[i]) { semaphore->semval += un->semadj[i]; /* diff --git a/ipc/shm.c b/ipc/shm.c index 34c4344e8d4b..ec5688e98f25 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -174,11 +174,12 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) static void shm_rcu_free(struct rcu_head *head) { - struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); - struct shmid_kernel *shp = ipc_rcu_to_struct(p); - + struct kern_ipc_perm *ptr = container_of(head, struct kern_ipc_perm, + rcu); + struct shmid_kernel *shp = container_of(ptr, struct shmid_kernel, + shm_perm); security_shm_free(shp); - ipc_rcu_free(head); + kvfree(shp); } static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) @@ -241,7 +242,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) user_shm_unlock(i_size_read(file_inode(shm_file)), shp->mlock_user); fput(shm_file); - ipc_rcu_putref(shp, shm_rcu_free); + ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); } /* @@ -529,7 +530,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; struct file *file; char name[13]; - int id; vm_flags_t acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) @@ -542,8 +542,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; - shp = ipc_rcu_alloc(sizeof(*shp)); - if (!shp) + shp = kvmalloc(sizeof(*shp), GFP_KERNEL); + if (unlikely(!shp)) return -ENOMEM; shp->shm_perm.key = key; @@ -553,7 +553,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_perm.security = NULL; error = security_shm_alloc(shp); if (error) { - ipc_rcu_putref(shp, ipc_rcu_free); + kvfree(shp); return error; } @@ -598,11 +598,9 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_file = file; shp->shm_creator = current; - id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); - if (id < 0) { - error = id; + error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); + if (error < 0) goto no_id; - } list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); @@ -624,7 +622,7 @@ no_id: user_shm_unlock(size, shp->mlock_user); fput(file); no_file: - ipc_rcu_putref(shp, shm_rcu_free); + call_rcu(&shp->shm_perm.rcu, shm_rcu_free); return error; } diff --git a/ipc/util.c b/ipc/util.c index caec7b1bfaa3..1a2cb02467ab 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -232,6 +232,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) idr_preload(GFP_KERNEL); + atomic_set(&new->refcount, 1); spin_lock_init(&new->lock); new->deleted = false; rcu_read_lock(); @@ -394,70 +395,18 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) ipcp->deleted = true; } -/** - * ipc_alloc - allocate ipc space - * @size: size desired - * - * Allocate memory from the appropriate pools and return a pointer to it. - * NULL is returned if the allocation fails - */ -void *ipc_alloc(int size) -{ - return kvmalloc(size, GFP_KERNEL); -} - -/** - * ipc_free - free ipc space - * @ptr: pointer returned by ipc_alloc - * - * Free a block created with ipc_alloc(). - */ -void ipc_free(void *ptr) +int ipc_rcu_getref(struct kern_ipc_perm *ptr) { - kvfree(ptr); + return atomic_inc_not_zero(&ptr->refcount); } -/** - * ipc_rcu_alloc - allocate ipc and rcu space - * @size: size desired - * - * Allocate memory for the rcu header structure + the object. - * Returns the pointer to the object or NULL upon failure. - */ -void *ipc_rcu_alloc(int size) +void ipc_rcu_putref(struct kern_ipc_perm *ptr, + void (*func)(struct rcu_head *head)) { - /* - * We prepend the allocation with the rcu struct - */ - struct ipc_rcu *out = ipc_alloc(sizeof(struct ipc_rcu) + size); - if (unlikely(!out)) - return NULL; - atomic_set(&out->refcount, 1); - return out + 1; -} - -int ipc_rcu_getref(void *ptr) -{ - struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1; - - return atomic_inc_not_zero(&p->refcount); -} - -void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head)) -{ - struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1; - - if (!atomic_dec_and_test(&p->refcount)) + if (!atomic_dec_and_test(&ptr->refcount)) return; - call_rcu(&p->rcu, func); -} - -void ipc_rcu_free(struct rcu_head *head) -{ - struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); - - kvfree(p); + call_rcu(&ptr->rcu, func); } /** diff --git a/ipc/util.h b/ipc/util.h index 60ddccca464d..c692010e6f0a 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -47,13 +47,6 @@ static inline void msg_exit_ns(struct ipc_namespace *ns) { } static inline void shm_exit_ns(struct ipc_namespace *ns) { } #endif -struct ipc_rcu { - struct rcu_head rcu; - atomic_t refcount; -} ____cacheline_aligned_in_smp; - -#define ipc_rcu_to_struct(p) ((void *)(p+1)) - /* * Structure that holds the parameters needed by the ipc operations * (see after) @@ -114,22 +107,18 @@ void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *); /* must be called with ipcp locked */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); -/* for rare, potentially huge allocations. - * both function can sleep - */ -void *ipc_alloc(int size); -void ipc_free(void *ptr); - /* * For allocation that need to be freed by RCU. * Objects are reference counted, they start with reference count 1. * getref increases the refcount, the putref call that reduces the recount * to 0 schedules the rcu destruction. Caller must guarantee locking. + * + * refcount is initialized by ipc_addid(), before that point call_rcu() + * must be used. */ -void *ipc_rcu_alloc(int size); -int ipc_rcu_getref(void *ptr); -void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head)); -void ipc_rcu_free(struct rcu_head *head); +int ipc_rcu_getref(struct kern_ipc_perm *ptr); +void ipc_rcu_putref(struct kern_ipc_perm *ptr, + void (*func)(struct rcu_head *head)); struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ae643412948a..ca8376e5008c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1038,40 +1038,25 @@ static void cpuset_post_attach(void) * @tsk: the task to change * @newmems: new nodes that the task will be set * - * In order to avoid seeing no nodes if the old and new nodes are disjoint, - * we structure updates as setting all new allowed nodes, then clearing newly - * disallowed ones. + * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed + * and rebind an eventual tasks' mempolicy. If the task is allocating in + * parallel, it might temporarily see an empty intersection, which results in + * a seqlock check and retry before OOM or allocation failure. */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool need_loop; - task_lock(tsk); - /* - * Determine if a loop is necessary if another thread is doing - * read_mems_allowed_begin(). If at least one node remains unchanged and - * tsk does not have a mempolicy, then an empty nodemask will not be - * possible when mems_allowed is larger than a word. - */ - need_loop = task_has_mempolicy(tsk) || - !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) { - local_irq_disable(); - write_seqcount_begin(&tsk->mems_allowed_seq); - } + local_irq_disable(); + write_seqcount_begin(&tsk->mems_allowed_seq); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); + mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; - if (need_loop) { - write_seqcount_end(&tsk->mems_allowed_seq); - local_irq_enable(); - } + write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); task_unlock(tsk); } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fcbd568f1e95..6db80fc0810b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,10 +14,12 @@ #include <asm/sections.h> /* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +static unsigned char *vmcoreinfo_data; +static size_t vmcoreinfo_size; +u32 *vmcoreinfo_note; + +/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; /* * parsing the "crashkernel" commandline @@ -324,8 +326,23 @@ static void update_vmcoreinfo_note(void) final_note(buf); } +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + void crash_save_vmcoreinfo(void) { + if (!vmcoreinfo_note) + return; + + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } @@ -340,7 +357,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); @@ -356,11 +373,26 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) { + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); VMCOREINFO_PAGESIZE(PAGE_SIZE); diff --git a/kernel/exit.c b/kernel/exit.c index 2e52b8f30430..d0a34e1e4ec1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,7 +51,6 @@ #include <linux/task_io_accounting_ops.h> #include <linux/tracehook.h> #include <linux/fs_struct.h> -#include <linux/userfaultfd_k.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..05a4984fc044 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) void *stack; int i; - local_irq_disable(); for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s = this_cpu_read(cached_stacks[i]); + struct vm_struct *s; + + s = this_cpu_xchg(cached_stacks[i], NULL); if (!s) continue; - this_cpu_write(cached_stacks[i], NULL); tsk->stack_vm_area = s; - local_irq_enable(); return s->addr; } - local_irq_enable(); stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, @@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { - unsigned long flags; int i; - local_irq_save(flags); for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_read(cached_stacks[i])) + if (this_cpu_cmpxchg(cached_stacks[i], + NULL, tsk->stack_vm_area) != NULL) continue; - this_cpu_write(cached_stacks[i], tsk->stack_vm_area); - local_irq_restore(flags); return; } - local_irq_restore(flags); vfree_atomic(tsk->stack); return; @@ -326,8 +320,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } /* All stack pages belong to the same memcg. */ - memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -338,8 +332,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } @@ -560,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_long(); + tsk->stack_canary = get_random_canary(); #endif /* @@ -579,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 3a47fa998fe0..ea34ed8bb952 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -11,6 +11,10 @@ #include <linux/bug.h> #include <linux/err.h> #include <linux/kcmp.h> +#include <linux/capability.h> +#include <linux/list.h> +#include <linux/eventpoll.h> +#include <linux/file.h> #include <asm/unistd.h> @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2) return err; } +#ifdef CONFIG_EPOLL +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + struct file *filp, *filp_epoll, *filp_tgt; + struct kcmp_epoll_slot slot; + struct files_struct *files; + + if (copy_from_user(&slot, uslot, sizeof(slot))) + return -EFAULT; + + filp = get_file_raw_ptr(task1, idx1); + if (!filp) + return -EBADF; + + files = get_files_struct(task2); + if (!files) + return -EBADF; + + spin_lock(&files->file_lock); + filp_epoll = fcheck_files(files, slot.efd); + if (filp_epoll) + get_file(filp_epoll); + else + filp_tgt = ERR_PTR(-EBADF); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (filp_epoll) { + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); + fput(filp_epoll); + } else + + if (IS_ERR(filp_tgt)) + return PTR_ERR(filp_tgt); + + return kcmp_ptr(filp, filp_tgt, KCMP_FILE); +} +#else +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + return -EOPNOTSUPP; +} +#endif + SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2) { @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, ret = -EOPNOTSUPP; #endif break; + case KCMP_EPOLL_TFD: + ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); + break; default: ret = -EINVAL; break; diff --git a/kernel/kexec.c b/kernel/kexec.c index 980936a90ee6..e62ec4dc6620 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + for (i = 0; i < nr_segments; i++) { ret = kimage_load_segment(image, &image->segment[i]); if (ret) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index ae1a3ba24df5..a5f0d9b3e0ca 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -481,6 +481,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image, return pages; } +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. + */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -568,6 +602,11 @@ void kimage_free(struct kimage *image) if (!image) return; + if (image->vmcoreinfo_data_copy) { + crash_update_vmcoreinfo_safecopy(NULL); + vunmap(image->vmcoreinfo_data_copy); + } + kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b118735fea9d..f78f719dae36 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -304,6 +304,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + ret = kexec_calculate_store_digests(image); if (ret) goto out; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 23cd70651238..c40a4e5a4304 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sprintf(buf, "%pa %x\n", &vmcore_base, - (unsigned int)sizeof(vmcoreinfo_note)); + (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e6b2f7ad3e51..4ccfcaae5b89 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void) */ pv_lock_hash = alloc_large_system_hash("PV qspinlock", sizeof(struct pv_hash_entry), - pv_hash_size, 0, HASH_EARLY, + pv_hash_size, 0, + HASH_EARLY | HASH_ZERO, &pv_lock_hash_bits, NULL, pv_hash_size, pv_hash_size); } diff --git a/kernel/memremap.c b/kernel/memremap.c index 23a6483c3666..124bed776532 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -358,7 +358,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, true); + error = arch_add_memory(nid, align_start, align_size, false); + if (!error) + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); mem_hotplug_done(); if (error) goto err_add_memory; diff --git a/kernel/pid.c b/kernel/pid.c index fd1cde1e4576..731c4e528f4e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int i, pidhash_size; + unsigned int pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL, + HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); pidhash_size = 1U << pidhash_shift; - - for (i = 0; i < pidhash_size; i++) - INIT_HLIST_HEAD(&pid_hash[i]); } void __init pidmap_init(void) diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe94..e4ced883d8de 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. diff --git a/kernel/signal.c b/kernel/signal.c index d031cd24f8a9..a212491355bc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1394,6 +1394,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) return ret; } + /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ + if (pid == INT_MIN) + return -ESRCH; + read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..e48f0636c7fd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2266,7 +2266,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) @@ -2274,9 +2274,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (down_write_killable(&me->mm->mmap_sem)) return -EINTR; if (arg2) - me->mm->def_flags |= VM_NOHUGEPAGE; + set_bit(MMF_DISABLE_THP, &me->mm->flags); else - me->mm->def_flags &= ~VM_NOHUGEPAGE; + clear_bit(MMF_DISABLE_THP, &me->mm->flags); up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dfba1a76cc3..df9f2a367882 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -174,11 +174,32 @@ extern int no_unaligned_warning; #ifdef CONFIG_PROC_SYSCTL -#define SYSCTL_WRITES_LEGACY -1 -#define SYSCTL_WRITES_WARN 0 -#define SYSCTL_WRITES_STRICT 1 +/** + * enum sysctl_writes_mode - supported sysctl write modes + * + * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. + * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is + * not 0. + * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max lenght will be ignored. Multiple writes will append + * to the buffer. + * + * These write modes control how current file position affects the behavior of + * updating sysctl values through the proc interface on each write. + */ +enum sysctl_writes_mode { + SYSCTL_WRITES_LEGACY = -1, + SYSCTL_WRITES_WARN = 0, + SYSCTL_WRITES_STRICT = 1, +}; -static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; +static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -1950,6 +1971,32 @@ static void warn_sysctl_write(struct ctl_table *table) } /** + * proc_first_pos_non_zero_ignore - check if firs position is allowed + * @ppos: file position + * @table: the sysctl table + * + * Returns true if the first position is non-zero and the sysctl_writes_strict + * mode indicates this is not allowed for numeric input types. String proc + * hadlers can ignore the return value. + */ +static bool proc_first_pos_non_zero_ignore(loff_t *ppos, + struct ctl_table *table) +{ + if (!*ppos) + return false; + + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + return true; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + return false; + default: + return false; + } +} + +/** * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file @@ -1969,8 +2016,8 @@ static void warn_sysctl_write(struct ctl_table *table) int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) - warn_sysctl_write(table); + if (write) + proc_first_pos_non_zero_ignore(ppos, table); return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); @@ -2128,19 +2175,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } -static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) +static int do_proc_douintvec_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) { if (write) { - if (*negp) + if (*lvalp > UINT_MAX) return -EINVAL; if (*lvalp > UINT_MAX) return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; - *negp = false; *lvalp = (unsigned long)val; } return 0; @@ -2172,17 +2218,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; @@ -2249,6 +2286,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +static int do_proc_douintvec_w(unsigned int *tbl_data, + struct ctl_table *table, + void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + bool neg; + char *kbuf = NULL, *p; + + left = *lenp; + + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto bail_early; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return -EINVAL; + + left -= proc_skip_spaces(&p); + if (!left) { + err = -EINVAL; + goto out_free; + } + + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err || neg) { + err = -EINVAL; + goto out_free; + } + + if (conv(&lval, tbl_data, 1, data)) { + err = -EINVAL; + goto out_free; + } + + if (!err && left) + left -= proc_skip_spaces(&p); + +out_free: + kfree(kbuf); + if (err) + return -EINVAL; + + return 0; + + /* This is in keeping with old __do_proc_dointvec() */ +bail_early: + *ppos += *lenp; + return err; +} + +static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + + left = *lenp; + + if (conv(&lval, tbl_data, 0, data)) { + err = -EINVAL; + goto out; + } + + err = proc_put_long(&buffer, &left, lval, false); + if (err || !left) + goto out; + + err = proc_put_char(&buffer, &left, '\n'); + +out: + *lenp -= left; + *ppos += *lenp; + + return err; +} + +static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned int *i, vleft; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned int *) tbl_data; + vleft = table->maxlen / sizeof(*i); + + /* + * Arrays are not supported, keep this simple. *Do not* add + * support for them. + */ + if (vleft != 1) { + *lenp = 0; + return -EINVAL; + } + + if (!conv) + conv = do_proc_douintvec_conv; + + if (write) + return do_proc_douintvec_w(i, table, buffer, lenp, ppos, + conv, data); + return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + return __do_proc_douintvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -2284,8 +2461,8 @@ int proc_dointvec(struct ctl_table *table, int write, int proc_douintvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* @@ -2390,6 +2567,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +struct do_proc_douintvec_minmax_conv_param { + unsigned int *min; + unsigned int *max; +}; + +static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_douintvec_minmax_conv_param *param = data; + + if (write) { + unsigned int val = *lvalp; + + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +/** + * proc_douintvec_minmax - read a vector of unsigned ints with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). There is a final sanity + * check for UINT_MAX to avoid having to support wrap around uses from + * userspace. + * + * Returns 0 on success. + */ +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_douintvec_minmax_conv_param param = { + .min = (unsigned int *) table->extra1, + .max = (unsigned int *) table->extra2, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, ¶m); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -2447,17 +2683,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; @@ -2898,6 +3125,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2940,6 +3173,7 @@ EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL_GPL(proc_douintvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 939a158eab11..02e1859f2ca8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen) * CTL_KERN/KERN_VERSION is used by older glibc and cannot * ever go away. */ - if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) return; if (printk_ratelimit()) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4eac3fa71898..650250aec2d0 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1591,7 +1591,7 @@ config RBTREE_TEST config INTERVAL_TREE_TEST tristate "Interval tree test" - depends on m && DEBUG_KERNEL + depends on DEBUG_KERNEL select INTERVAL_TREE help A benchmark measuring the performance of the interval tree library @@ -1782,6 +1782,17 @@ config TEST_FIRMWARE If unsure, say N. +config TEST_SYSCTL + tristate "sysctl test driver" + default n + depends on PROC_SYSCTL + help + This builds the "test_sysctl" module. This driver enables to test the + proc sysctl interfaces available to drivers safely without affecting + production knobs which might alter system functionality. + + If unsure, say N. + config TEST_UDELAY tristate "udelay test driver" default n diff --git a/lib/Makefile b/lib/Makefile index 07fbe6a75692..1935a97171db 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o +obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 4ff157159a0d..09ac73c177fd 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + if (in_task() && current->fail_nth) { + if (--current->fail_nth == 0) + goto fail; + return false; + } + /* No need to check any other properties if the probability is 0 */ if (attr->probability == 0) return false; @@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (!fail_stacktrace(attr)) return false; +fail: fail_dump(attr); if (atomic_read(&attr->times) != -1) diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c index 245900b98c8e..df495fe81421 100644 --- a/lib/interval_tree_test.c +++ b/lib/interval_tree_test.c @@ -1,27 +1,38 @@ #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/interval_tree.h> #include <linux/random.h> +#include <linux/slab.h> #include <asm/timex.h> -#define NODES 100 -#define PERF_LOOPS 100000 -#define SEARCHES 100 -#define SEARCH_LOOPS 10000 +#define __param(type, name, init, msg) \ + static type name = init; \ + module_param(name, type, 0444); \ + MODULE_PARM_DESC(name, msg); + +__param(int, nnodes, 100, "Number of nodes in the interval tree"); +__param(int, perf_loops, 100000, "Number of iterations modifying the tree"); + +__param(int, nsearches, 100, "Number of searches to the interval tree"); +__param(int, search_loops, 10000, "Number of iterations searching the tree"); +__param(bool, search_all, false, "Searches will iterate all nodes in the tree"); + +__param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint"); static struct rb_root root = RB_ROOT; -static struct interval_tree_node nodes[NODES]; -static u32 queries[SEARCHES]; +static struct interval_tree_node *nodes = NULL; +static u32 *queries = NULL; static struct rnd_state rnd; static inline unsigned long -search(unsigned long query, struct rb_root *root) +search(struct rb_root *root, unsigned long start, unsigned long last) { struct interval_tree_node *node; unsigned long results = 0; - for (node = interval_tree_iter_first(root, query, query); node; - node = interval_tree_iter_next(node, query, query)) + for (node = interval_tree_iter_first(root, start, last); node; + node = interval_tree_iter_next(node, start, last)) results++; return results; } @@ -29,19 +40,22 @@ search(unsigned long query, struct rb_root *root) static void init(void) { int i; - for (i = 0; i < NODES; i++) { - u32 a = prandom_u32_state(&rnd); - u32 b = prandom_u32_state(&rnd); - if (a <= b) { - nodes[i].start = a; - nodes[i].last = b; - } else { - nodes[i].start = b; - nodes[i].last = a; - } + + for (i = 0; i < nnodes; i++) { + u32 b = (prandom_u32_state(&rnd) >> 4) % max_endpoint; + u32 a = (prandom_u32_state(&rnd) >> 4) % b; + + nodes[i].start = a; + nodes[i].last = b; } - for (i = 0; i < SEARCHES; i++) - queries[i] = prandom_u32_state(&rnd); + + /* + * Limit the search scope to what the user defined. + * Otherwise we are merely measuring empty walks, + * which is pointless. + */ + for (i = 0; i < nsearches; i++) + queries[i] = (prandom_u32_state(&rnd) >> 4) % max_endpoint; } static int interval_tree_test_init(void) @@ -50,6 +64,16 @@ static int interval_tree_test_init(void) unsigned long results; cycles_t time1, time2, time; + nodes = kmalloc(nnodes * sizeof(struct interval_tree_node), GFP_KERNEL); + if (!nodes) + return -ENOMEM; + + queries = kmalloc(nsearches * sizeof(int), GFP_KERNEL); + if (!queries) { + kfree(nodes); + return -ENOMEM; + } + printk(KERN_ALERT "interval tree insert/remove"); prandom_seed_state(&rnd, 3141592653589793238ULL); @@ -57,39 +81,46 @@ static int interval_tree_test_init(void) time1 = get_cycles(); - for (i = 0; i < PERF_LOOPS; i++) { - for (j = 0; j < NODES; j++) + for (i = 0; i < perf_loops; i++) { + for (j = 0; j < nnodes; j++) interval_tree_insert(nodes + j, &root); - for (j = 0; j < NODES; j++) + for (j = 0; j < nnodes; j++) interval_tree_remove(nodes + j, &root); } time2 = get_cycles(); time = time2 - time1; - time = div_u64(time, PERF_LOOPS); + time = div_u64(time, perf_loops); printk(" -> %llu cycles\n", (unsigned long long)time); printk(KERN_ALERT "interval tree search"); - for (j = 0; j < NODES; j++) + for (j = 0; j < nnodes; j++) interval_tree_insert(nodes + j, &root); time1 = get_cycles(); results = 0; - for (i = 0; i < SEARCH_LOOPS; i++) - for (j = 0; j < SEARCHES; j++) - results += search(queries[j], &root); + for (i = 0; i < search_loops; i++) + for (j = 0; j < nsearches; j++) { + unsigned long start = search_all ? 0 : queries[j]; + unsigned long last = search_all ? max_endpoint : queries[j]; + + results += search(&root, start, last); + } time2 = get_cycles(); time = time2 - time1; - time = div_u64(time, SEARCH_LOOPS); - results = div_u64(results, SEARCH_LOOPS); + time = div_u64(time, search_loops); + results = div_u64(results, search_loops); printk(" -> %llu cycles (%lu results)\n", (unsigned long long)time, results); + kfree(queries); + kfree(nodes); + return -EAGAIN; /* Fail will directly unload the module */ } diff --git a/lib/kstrtox.c b/lib/kstrtox.c index bf85e05ce858..720144075c1e 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -51,13 +51,15 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long res = 0; rv = 0; - while (*s) { + while (1) { + unsigned int c = *s; + unsigned int lc = c | 0x20; /* don't tolower() this line */ unsigned int val; - if ('0' <= *s && *s <= '9') - val = *s - '0'; - else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f') - val = _tolower(*s) - 'a' + 10; + if ('0' <= c && c <= '9') + val = c - '0'; + else if ('a' <= lc && lc <= 'f') + val = lc - 'a' + 10; else break; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index d9e7274a04cd..42466c167257 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -211,11 +211,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, int i; size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || - gfp != GFP_KERNEL) + if (gfp != GFP_KERNEL) tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); - if (tbl == NULL && gfp == GFP_KERNEL) - tbl = vzalloc(size); + else + tbl = kvzalloc(size, gfp); size = nbuckets; diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c new file mode 100644 index 000000000000..3dd801c1c85b --- /dev/null +++ b/lib/test_sysctl.c @@ -0,0 +1,148 @@ +/* + * proc sysctl test driver + * + * Copyright (C) 2017 Luis R. Rodriguez <mcgrof@kernel.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or at your option any + * later version; or, when distributed separately from the Linux kernel or + * when incorporated into other software packages, subject to the following + * license: + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of copyleft-next (version 0.3.1 or later) as published + * at http://copyleft-next.org/. + */ + +/* + * This module provides an interface to the the proc sysctl interfaces. This + * driver requires CONFIG_PROC_SYSCTL. It will not normally be loaded by the + * system unless explicitly requested by name. You can also build this driver + * into your kernel. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/async.h> +#include <linux/delay.h> +#include <linux/vmalloc.h> + +static int i_zero; +static int i_one_hundred = 100; + +struct test_sysctl_data { + int int_0001; + int int_0002; + int int_0003[4]; + + unsigned int uint_0001; + + char string_0001[65]; +}; + +static struct test_sysctl_data test_data = { + .int_0001 = 60, + .int_0002 = 1, + + .int_0003[0] = 0, + .int_0003[1] = 1, + .int_0003[2] = 2, + .int_0003[3] = 3, + + .uint_0001 = 314, + + .string_0001 = "(none)", +}; + +/* These are all under /proc/sys/debug/test_sysctl/ */ +static struct ctl_table test_table[] = { + { + .procname = "int_0001", + .data = &test_data.int_0001, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }, + { + .procname = "int_0002", + .data = &test_data.int_0002, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "int_0003", + .data = &test_data.int_0003, + .maxlen = sizeof(test_data.int_0003), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "uint_0001", + .data = &test_data.uint_0001, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { + .procname = "string_0001", + .data = &test_data.string_0001, + .maxlen = sizeof(test_data.string_0001), + .mode = 0644, + .proc_handler = proc_dostring, + }, + { } +}; + +static struct ctl_table test_sysctl_table[] = { + { + .procname = "test_sysctl", + .maxlen = 0, + .mode = 0555, + .child = test_table, + }, + { } +}; + +static struct ctl_table test_sysctl_root_table[] = { + { + .procname = "debug", + .maxlen = 0, + .mode = 0555, + .child = test_sysctl_table, + }, + { } +}; + +static struct ctl_table_header *test_sysctl_header; + +static int __init test_sysctl_init(void) +{ + test_sysctl_header = register_sysctl_table(test_sysctl_root_table); + if (!test_sysctl_header) + return -ENOMEM; + return 0; +} +late_initcall(test_sysctl_init); + +static void __exit test_sysctl_exit(void) +{ + if (test_sysctl_header) + unregister_sysctl_table(test_sysctl_header); +} + +module_exit(test_sysctl_exit); + +MODULE_AUTHOR("Luis R. Rodriguez <mcgrof@kernel.org>"); +MODULE_LICENSE("GPL"); diff --git a/mm/Kconfig b/mm/Kconfig index beb7a455915d..c8df94059974 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -149,32 +149,6 @@ config NO_BOOTMEM config MEMORY_ISOLATION bool -config MOVABLE_NODE - bool "Enable to assign a node which has only movable memory" - depends on HAVE_MEMBLOCK - depends on NO_BOOTMEM - depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG - depends on NUMA - default n - help - Allow a node to have only movable memory. Pages used by the kernel, - such as direct mapping pages cannot be migrated. So the corresponding - memory device cannot be hotplugged. This option allows the following - two things: - - When the system is booting, node full of hotpluggable memory can - be arranged to have only movable memory so that the whole node can - be hot-removed. (need movable_node boot option specified). - - After the system is up, the option allows users to online all the - memory of a node as movable memory so that the whole node can be - hot-removed. - - Users who don't use the memory hotplug feature are fine with this - option on since they don't specify movable_node boot option or they - don't online memory as movable. - - Say Y here if you want to hotplug a whole node. - Say N here if you want kernel to use memory on all nodes evenly. - # # Only be set on architectures that have completely implemented memory hotplug # feature. If you are not sure, don't touch it. @@ -187,7 +161,6 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on COMPILE_TEST || !KASAN config MEMORY_HOTPLUG_SPARSE def_bool y @@ -446,6 +419,18 @@ choice benefit. endchoice +config ARCH_WANTS_THP_SWAP + def_bool n + +config THP_SWAP + def_bool y + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP + help + Swap transparent huge pages in one piece, without splitting. + XXX: For now this only does clustered swap space allocation. + + For selection by architectures with reasonable THP sizes. + config TRANSPARENT_HUGE_PAGECACHE def_bool y depends on TRANSPARENT_HUGEPAGE @@ -127,7 +127,7 @@ static int __init cma_activate_area(struct cma *cma) * to be in the same zone. */ if (page_zone(pfn_to_page(pfn)) != zone) - goto err; + goto not_in_zone; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); @@ -141,7 +141,8 @@ static int __init cma_activate_area(struct cma *cma) return 0; -err: +not_in_zone: + pr_err("CMA area %s could not be activated\n", cma->name); kfree(cma->bitmap); cma->count = 0; return -EINVAL; diff --git a/mm/compaction.c b/mm/compaction.c index 613c59e928cb..fb548e4c7bd4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,10 +236,9 @@ static void __reset_isolation_suitable(struct zone *zone) cond_resched(); - if (!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - - page = pfn_to_page(pfn); if (zone != page_zone(page)) continue; diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be573a5e6..b9e870600572 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -239,14 +239,16 @@ void __delete_from_page_cache(struct page *page, void *shadow) /* Leave page->index set: truncation lookup relies upon it */ /* hugetlb pages do not participate in page cache accounting. */ - if (!PageHuge(page)) - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + if (PageHuge(page)) + return; + + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else { - VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); + VM_BUG_ON_PAGE(PageTransHuge(page), page); } /* @@ -2226,7 +2228,7 @@ int filemap_fault(struct vm_fault *vmf) /* No page in the page cache at all */ do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); @@ -208,72 +208,28 @@ no_page: return no_page_table(vma, flags); } -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in <linux/mm.h> - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). - */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +static struct page *follow_pmd_mask(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + unsigned int flags, unsigned int *page_mask) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; pmd_t *pmd; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - return page; - } - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return no_page_table(vma, flags); - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d)) - return no_page_table(vma, flags); - BUILD_BUG_ON(p4d_huge(*p4d)); - if (unlikely(p4d_bad(*p4d))) - return no_page_table(vma, flags); - pud = pud_offset(p4d, address); - if (pud_none(*pud)) + pmd = pmd_offset(pudp, address); + if (pmd_none(*pmd)) return no_page_table(vma, flags); - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pud(mm, address, pud, flags); + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } - if (pud_devmap(*pud)) { - ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); - spin_unlock(ptl); - if (page) - return page; - } - if (unlikely(pud_bad(*pud))) - return no_page_table(vma, flags); - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return no_page_table(vma, flags); - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags); + if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pmd_val(*pmd)), flags, + PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); @@ -319,13 +275,131 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return ret ? ERR_PTR(ret) : follow_page_pte(vma, address, pmd, flags); } - page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; return page; } + +static struct page *follow_pud_mask(struct vm_area_struct *vma, + unsigned long address, p4d_t *p4dp, + unsigned int flags, unsigned int *page_mask) +{ + pud_t *pud; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + pud = pud_offset(p4dp, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (is_hugepd(__hugepd(pud_val(*pud)))) { + page = follow_huge_pd(vma, address, + __hugepd(pud_val(*pud)), flags, + PUD_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags); + spin_unlock(ptl); + if (page) + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + return follow_pmd_mask(vma, address, pud, flags, page_mask); +} + + +static struct page *follow_p4d_mask(struct vm_area_struct *vma, + unsigned long address, pgd_t *pgdp, + unsigned int flags, unsigned int *page_mask) +{ + p4d_t *p4d; + struct page *page; + + p4d = p4d_offset(pgdp, address); + if (p4d_none(*p4d)) + return no_page_table(vma, flags); + BUILD_BUG_ON(p4d_huge(*p4d)); + if (unlikely(p4d_bad(*p4d))) + return no_page_table(vma, flags); + + if (is_hugepd(__hugepd(p4d_val(*p4d)))) { + page = follow_huge_pd(vma, address, + __hugepd(p4d_val(*p4d)), flags, + P4D_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + return follow_pud_mask(vma, address, p4d, flags, page_mask); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + /* make this handle hugepd */ + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + return page; + } + + pgd = pgd_offset(mm, address); + + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + if (pgd_huge(*pgd)) { + page = follow_huge_pgd(mm, address, pgd, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (is_hugepd(__hugepd(pgd_val(*pgd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pgd_val(*pgd)), flags, + PGDIR_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + + return follow_p4d_mask(vma, address, pgd, flags, page_mask); +} + static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) @@ -1354,16 +1428,15 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return __gup_device_huge_pmd(orig, addr, end, pages, nr); refs = 0; - head = pmd_page(orig); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(page); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1393,16 +1466,15 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return __gup_device_huge_pud(orig, addr, end, pages, nr); refs = 0; - head = pud_page(orig); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(page); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1431,16 +1503,15 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, BUILD_BUG_ON(pgd_devmap(orig)); refs = 0; - head = pgd_page(orig); - page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); + page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(page); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a84909cf20d3..3a14c77fcce7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2197,7 +2197,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * atomic_set() here would be safe on all archs (and not only on x86), * it's safer to use atomic_inc()/atomic_add(). */ - if (PageAnon(head)) { + if (PageAnon(head) && !PageSwapCache(head)) { page_ref_inc(page_tail); } else { /* Additional pin to radix tree */ @@ -2208,6 +2208,7 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->flags |= (head->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | + (1L << PG_swapcache) | (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | @@ -2270,7 +2271,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - page_ref_inc(head); + /* Additional pin to radix tree of swap cache */ + if (PageSwapCache(head)) + page_ref_add(head, 2); + else + page_ref_inc(head); } else { /* Additional pin to radix tree */ page_ref_add(head, 2); @@ -2379,6 +2384,21 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) return ret; } +/* Racy check whether the huge page can be split */ +bool can_split_huge_page(struct page *page, int *pextra_pins) +{ + int extra_pins; + + /* Additional pins from radix tree */ + if (PageAnon(page)) + extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; + else + extra_pins = HPAGE_PMD_NR; + if (pextra_pins) + *pextra_pins = extra_pins; + return total_mapcount(page) == page_count(page) - extra_pins - 1; +} + /* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. @@ -2426,7 +2446,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } - extra_pins = 0; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2438,8 +2457,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - /* Addidional pins from radix tree */ - extra_pins = HPAGE_PMD_NR; anon_vma = NULL; i_mmap_lock_read(mapping); } @@ -2448,7 +2465,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * Racy check if we can split the page, before freeze_page() will * split PMDs */ - if (total_mapcount(head) != page_count(head) - extra_pins - 1) { + if (!can_split_huge_page(head, &extra_pins)) { ret = -EBUSY; goto out_unlock; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3eedb187e549..761a669d0b62 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,7 +22,6 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> -#include <linux/page-isolation.h> #include <linux/jhash.h> #include <asm/page.h> @@ -71,6 +70,17 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); +static char * __init memfmt(char *buf, unsigned long n) +{ + if (n >= (1UL << 30)) + sprintf(buf, "%lu GB", n >> 30); + else if (n >= (1UL << 20)) + sprintf(buf, "%lu MB", n >> 20); + else + sprintf(buf, "%lu KB", n >> 10); + return buf; +} + static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { bool free = (spool->count == 0) && (spool->used_hpages == 0); @@ -867,12 +877,12 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; list_for_each_entry(page, &h->hugepage_freelists[nid], lru) - if (!is_migrate_isolate_page(page)) + if (!PageHWPoison(page)) break; /* * if 'non-isolated free hugepage' not found on the list, @@ -887,6 +897,22 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + int node; + + if (nid != NUMA_NO_NODE) + return dequeue_huge_page_node_exact(h, nid); + + for_each_online_node(node) { + page = dequeue_huge_page_node_exact(h, node); + if (page) + return page; + } + return NULL; +} + /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { @@ -904,6 +930,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; + gfp_t gfp_mask; + int nid; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; @@ -924,12 +952,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask(h), &mpol, &nodemask); + gfp_mask = htlb_alloc_mask(h); + nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + zonelist = node_zonelist(nid, gfp_mask); for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { + if (cpuset_zone_allowed(zone, gfp_mask)) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { if (avoid_reserve) @@ -1024,9 +1053,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ - ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ - defined(CONFIG_CMA)) +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { @@ -1158,8 +1185,7 @@ static int alloc_fresh_gigantic_page(struct hstate *h, return 0; } -static inline bool gigantic_page_supported(void) { return true; } -#else +#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, @@ -1444,7 +1470,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, * number of free hugepages would be reduced below the number of reserved * hugepages. */ -static int dissolve_free_huge_page(struct page *page) +int dissolve_free_huge_page(struct page *page) { int rc = 0; @@ -1457,6 +1483,14 @@ static int dissolve_free_huge_page(struct page *page) rc = -EBUSY; goto out; } + /* + * Move PageHWPoison flag from head page to the raw error page, + * which makes any subpages rather than the error page reusable. + */ + if (PageHWPoison(head) && page != head) { + SetPageHWPoison(page); + ClearPageHWPoison(head); + } list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; @@ -1545,13 +1579,13 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, do { struct page *page; struct mempolicy *mpol; - struct zonelist *zl; + int nid; nodemask_t *nodemask; cpuset_mems_cookie = read_mems_allowed_begin(); - zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); + nid = huge_node(vma, addr, gfp, &mpol, &nodemask); mpol_cond_put(mpol); - page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + page = __alloc_pages_nodemask(gfp, order, nid, nodemask); if (page) return page; } while (read_mems_allowed_retry(cpuset_mems_cookie)); @@ -2189,7 +2223,14 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) &node_states[N_MEMORY])) break; } - h->max_huge_pages = i; + if (i < h->max_huge_pages) { + char buf[32]; + + memfmt(buf, huge_page_size(h)), + pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, i); + h->max_huge_pages = i; + } } static void __init hugetlb_init_hstates(void) @@ -2207,17 +2248,6 @@ static void __init hugetlb_init_hstates(void) VM_BUG_ON(minimum_order == UINT_MAX); } -static char * __init memfmt(char *buf, unsigned long n) -{ - if (n >= (1UL << 30)) - sprintf(buf, "%lu GB", n >> 30); - else if (n >= (1UL << 20)) - sprintf(buf, "%lu MB", n >> 20); - else - sprintf(buf, "%lu KB", n >> 10); - return buf; -} - static void __init report_hugepages(void) { struct hstate *h; @@ -2785,6 +2815,11 @@ static int __init hugetlb_init(void) return 0; if (!size_to_hstate(default_hstate_size)) { + if (default_hstate_size != 0) { + pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n", + default_hstate_size, HPAGE_SIZE); + } + default_hstate_size = HPAGE_SIZE; if (!size_to_hstate(default_hstate_size)) hugetlb_add_hstate(HUGETLB_PAGE_ORDER); @@ -3185,17 +3220,17 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } -static int is_hugetlb_entry_migration(pte_t pte) +bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) - return 0; + return false; swp = pte_to_swp_entry(pte); if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; + return true; else - return 0; + return false; } static int is_hugetlb_entry_hwpoisoned(pte_t pte) @@ -3233,7 +3268,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr); + src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); @@ -3263,9 +3298,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ make_migration_entry_read(&swp_entry); entry = swp_entry_to_pte(swp_entry); - set_huge_pte_at(src, addr, src_pte, entry); + set_huge_swap_pte_at(src, addr, src_pte, + entry, sz); } - set_huge_pte_at(dst, addr, dst_pte, entry); + set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { huge_ptep_set_wrprotect(src, addr, src_pte); @@ -3317,7 +3353,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, sz); if (!ptep) continue; @@ -3338,7 +3374,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { - huge_pte_clear(mm, address, ptep); + huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } @@ -3535,7 +3571,8 @@ retry_avoidcopy: unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -3574,7 +3611,8 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -3861,7 +3899,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, address &= huge_page_mask(h); - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -4118,7 +4156,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * * Note that page table lock is not held when pte is null. */ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), + huge_page_size(h)); if (pte) ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -4257,7 +4296,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); @@ -4279,7 +4318,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, make_migration_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_huge_pte_at(mm, address, ptep, newpte); + set_huge_swap_pte_at(mm, address, ptep, + newpte, huge_page_size(h)); pages++; } spin_unlock(ptl); @@ -4521,7 +4561,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr); + spte = huge_pte_offset(svma->vm_mm, saddr, + vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; @@ -4617,7 +4658,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; @@ -4653,6 +4695,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, } struct page * __weak +follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, int flags, int pdshift) +{ + WARN(1, "hugepd follow called with no support for hugepage directory format\n"); + return NULL; +} + +struct page * __weak follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags) { @@ -4699,39 +4749,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } -#ifdef CONFIG_MEMORY_FAILURE - -/* - * This function is called from memory failure code. - */ -int dequeue_hwpoisoned_huge_page(struct page *hpage) +struct page * __weak +follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) { - struct hstate *h = page_hstate(hpage); - int nid = page_to_nid(hpage); - int ret = -EBUSY; + if (flags & FOLL_GET) + return NULL; - spin_lock(&hugetlb_lock); - /* - * Just checking !page_huge_active is not enough, because that could be - * an isolated/hwpoisoned hugepage (which have >0 refcount). - */ - if (!page_huge_active(hpage) && !page_count(hpage)) { - /* - * Hwpoisoned hugepage isn't linked to activelist or freelist, - * but dangling hpage->lru can trigger list-debug warnings - * (this happens when we call unpoison_memory() on it), - * so let it point to itself with list_del_init(). - */ - list_del_init(&hpage->lru); - set_page_refcounted(hpage); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - ret = 0; - } - spin_unlock(&hugetlb_lock); - return ret; + return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); } -#endif bool isolate_huge_page(struct page *page, struct list_head *list) { diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index c81549d5c833..ca11bc4ce205 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -134,97 +134,33 @@ static __always_inline bool memory_is_poisoned_1(unsigned long addr) return false; } -static __always_inline bool memory_is_poisoned_2(unsigned long addr) +static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, + unsigned long size) { - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(*shadow_addr)) { - if (memory_is_poisoned_1(addr + 1)) - return true; - - /* - * If single shadow byte covers 2-byte access, we don't - * need to do anything more. Otherwise, test the first - * shadow byte. - */ - if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) - return false; - - return unlikely(*(u8 *)shadow_addr); - } - - return false; -} - -static __always_inline bool memory_is_poisoned_4(unsigned long addr) -{ - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); - if (unlikely(*shadow_addr)) { - if (memory_is_poisoned_1(addr + 3)) - return true; - - /* - * If single shadow byte covers 4-byte access, we don't - * need to do anything more. Otherwise, test the first - * shadow byte. - */ - if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) - return false; - - return unlikely(*(u8 *)shadow_addr); - } - - return false; -} - -static __always_inline bool memory_is_poisoned_8(unsigned long addr) -{ - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(*shadow_addr)) { - if (memory_is_poisoned_1(addr + 7)) - return true; - - /* - * If single shadow byte covers 8-byte access, we don't - * need to do anything more. Otherwise, test the first - * shadow byte. - */ - if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) - return false; - - return unlikely(*(u8 *)shadow_addr); - } + /* + * Access crosses 8(shadow size)-byte boundary. Such access maps + * into 2 shadow bytes, so we need to check them both. + */ + if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) + return *shadow_addr || memory_is_poisoned_1(addr + size - 1); - return false; + return memory_is_poisoned_1(addr + size - 1); } static __always_inline bool memory_is_poisoned_16(unsigned long addr) { - u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(*shadow_addr)) { - u16 shadow_first_bytes = *(u16 *)shadow_addr; - - if (unlikely(shadow_first_bytes)) - return true; - - /* - * If two shadow bytes covers 16-byte access, we don't - * need to do anything more. Otherwise, test the last - * shadow byte. - */ - if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) - return false; + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - return memory_is_poisoned_1(addr + 15); - } + /* Unaligned 16-bytes access maps into 3 shadow bytes. */ + if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) + return *shadow_addr || memory_is_poisoned_1(addr + 15); - return false; + return *shadow_addr; } -static __always_inline unsigned long bytes_is_zero(const u8 *start, +static __always_inline unsigned long bytes_is_nonzero(const u8 *start, size_t size) { while (size) { @@ -237,7 +173,7 @@ static __always_inline unsigned long bytes_is_zero(const u8 *start, return 0; } -static __always_inline unsigned long memory_is_zero(const void *start, +static __always_inline unsigned long memory_is_nonzero(const void *start, const void *end) { unsigned int words; @@ -245,11 +181,11 @@ static __always_inline unsigned long memory_is_zero(const void *start, unsigned int prefix = (unsigned long)start % 8; if (end - start <= 16) - return bytes_is_zero(start, end - start); + return bytes_is_nonzero(start, end - start); if (prefix) { prefix = 8 - prefix; - ret = bytes_is_zero(start, prefix); + ret = bytes_is_nonzero(start, prefix); if (unlikely(ret)) return ret; start += prefix; @@ -258,12 +194,12 @@ static __always_inline unsigned long memory_is_zero(const void *start, words = (end - start) / 8; while (words) { if (unlikely(*(u64 *)start)) - return bytes_is_zero(start, 8); + return bytes_is_nonzero(start, 8); start += 8; words--; } - return bytes_is_zero(start, (end - start) % 8); + return bytes_is_nonzero(start, (end - start) % 8); } static __always_inline bool memory_is_poisoned_n(unsigned long addr, @@ -271,7 +207,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr, { unsigned long ret; - ret = memory_is_zero(kasan_mem_to_shadow((void *)addr), + ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), kasan_mem_to_shadow((void *)addr + size - 1) + 1); if (unlikely(ret)) { @@ -292,11 +228,9 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) case 1: return memory_is_poisoned_1(addr); case 2: - return memory_is_poisoned_2(addr); case 4: - return memory_is_poisoned_4(addr); case 8: - return memory_is_poisoned_8(addr); + return memory_is_poisoned_2_4_8(addr, size); case 16: return memory_is_poisoned_16(addr); default: @@ -803,17 +737,47 @@ void __asan_unpoison_stack_memory(const void *addr, size_t size) EXPORT_SYMBOL(__asan_unpoison_stack_memory); #ifdef CONFIG_MEMORY_HOTPLUG -static int kasan_mem_notifier(struct notifier_block *nb, +static int __meminit kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { - return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK; + struct memory_notify *mem_data = data; + unsigned long nr_shadow_pages, start_kaddr, shadow_start; + unsigned long shadow_end, shadow_size; + + nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT; + start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn); + shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr); + shadow_size = nr_shadow_pages << PAGE_SHIFT; + shadow_end = shadow_start + shadow_size; + + if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) || + WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT))) + return NOTIFY_BAD; + + switch (action) { + case MEM_GOING_ONLINE: { + void *ret; + + ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start, + shadow_end, GFP_KERNEL, + PAGE_KERNEL, VM_NO_GUARD, + pfn_to_nid(mem_data->start_pfn), + __builtin_return_address(0)); + if (!ret) + return NOTIFY_BAD; + + kmemleak_ignore(ret); + return NOTIFY_OK; + } + case MEM_OFFLINE: + vfree((void *)shadow_start); + } + + return NOTIFY_OK; } static int __init kasan_memhotplug_init(void) { - pr_info("WARNING: KASAN doesn't support memory hot-add\n"); - pr_info("Memory hot-add will be disabled\n"); - hotplug_memory_notifier(kasan_mem_notifier, 0); return 0; diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index b96a5f773d88..554e4c0f23a2 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -118,6 +118,18 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); + if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { + pud_t *pud; + pmd_t *pmd; + + p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_zero_pte)); + continue; + } if (p4d_none(*p4d)) { p4d_populate(&init_mm, p4d, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 945fd1ca49b5..bf84529a1145 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -817,7 +817,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) static bool hugepage_vma_check(struct vm_area_struct *vma) { if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) + (vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_file(vma->vm_file)) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 20036d4f9f13..7780cd83a495 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -150,7 +150,7 @@ struct kmemleak_scan_area { */ struct kmemleak_object { spinlock_t lock; - unsigned long flags; /* object status flags */ + unsigned int flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; struct rb_node rb_node; @@ -159,6 +159,8 @@ struct kmemleak_object { atomic_t use_count; unsigned long pointer; size_t size; + /* pass surplus references to this pointer */ + unsigned long excess_ref; /* minimum number of a pointers found before it is considered leak */ int min_count; /* the total number of pointers found pointing to this object */ @@ -253,7 +255,8 @@ enum { KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN + KMEMLEAK_NO_SCAN, + KMEMLEAK_SET_EXCESS_REF }; /* @@ -262,9 +265,12 @@ enum { */ struct early_log { int op_type; /* kmemleak operation type */ - const void *ptr; /* allocated/freed memory block */ - size_t size; /* memory block size */ int min_count; /* minimum reference count */ + const void *ptr; /* allocated/freed memory block */ + union { + size_t size; /* memory block size */ + unsigned long excess_ref; /* surplus reference passing */ + }; unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -393,7 +399,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); - pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); @@ -562,6 +568,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object->flags = OBJECT_ALLOCATED; object->pointer = ptr; object->size = size; + object->excess_ref = 0; object->min_count = min_count; object->count = 0; /* white color initially */ object->jiffies = jiffies; @@ -795,6 +802,30 @@ out: } /* + * Any surplus references (object already gray) to 'ptr' are passed to + * 'excess_ref'. This is used in the vmalloc() case where a pointer to + * vm_struct may be used as an alternative reference to the vmalloc'ed object + * (see free_thread_stack()). + */ +static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n", + ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->excess_ref = excess_ref; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* * Set the OBJECT_NO_SCAN flag for the object corresponding to the give * pointer. Such object will not be scanned by kmemleak but references to it * are searched. @@ -908,7 +939,7 @@ static void early_alloc_percpu(struct early_log *log) * @gfp: kmalloc() flags used for kmemleak internal memory allocations * * This function is called from the kernel allocators when a new object - * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). + * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.). */ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) @@ -952,6 +983,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); /** + * kmemleak_vmalloc - register a newly vmalloc'ed object + * @area: pointer to vm_struct + * @size: size of the object + * @gfp: __vmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the vmalloc() kernel allocator when a new + * object (memory block) is allocated. + */ +void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + + /* + * A min_count = 2 is needed because vm_struct contains a reference to + * the virtual address of the vmalloc'ed block. + */ + if (kmemleak_enabled) { + create_object((unsigned long)area->addr, size, 2, gfp); + object_set_excess_ref((unsigned long)area, + (unsigned long)area->addr); + } else if (kmemleak_early_log) { + log_early(KMEMLEAK_ALLOC, area->addr, size, 2); + /* reusing early_log.size for storing area->addr */ + log_early(KMEMLEAK_SET_EXCESS_REF, + area, (unsigned long)area->addr, 0); + } +} +EXPORT_SYMBOL_GPL(kmemleak_vmalloc); + +/** * kmemleak_free - unregister a previously registered object * @ptr: pointer to beginning of the object * @@ -1188,6 +1249,30 @@ static bool update_checksum(struct kmemleak_object *object) } /* + * Update an object's references. object->lock must be held by the caller. + */ +static void update_refs(struct kmemleak_object *object) +{ + if (!color_white(object)) { + /* non-orphan, ignored or new */ + return; + } + + /* + * Increase the object's reference count (number of pointers to the + * memory block). If this count reaches the required minimum, the + * object's color will become gray and it will be added to the + * gray_list. + */ + object->count++; + if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); + list_add_tail(&object->gray_list, &gray_list); + } +} + +/* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occurred. */ @@ -1224,6 +1309,7 @@ static void scan_block(void *_start, void *_end, for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; unsigned long pointer; + unsigned long excess_ref; if (scan_should_stop()) break; @@ -1259,25 +1345,27 @@ static void scan_block(void *_start, void *_end, * enclosed by scan_mutex. */ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - if (!color_white(object)) { - /* non-orphan, ignored or new */ - spin_unlock(&object->lock); - continue; - } - - /* - * Increase the object's reference count (number of pointers - * to the memory block). If this count reaches the required - * minimum, the object's color will become gray and it will be - * added to the gray_list. - */ - object->count++; + /* only pass surplus references (object already gray) */ if (color_gray(object)) { - /* put_object() called when removing from gray_list */ - WARN_ON(!get_object(object)); - list_add_tail(&object->gray_list, &gray_list); + excess_ref = object->excess_ref; + /* no need for update_refs() if object already gray */ + } else { + excess_ref = 0; + update_refs(object); } spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); + if (!object) + continue; + if (object == scanned) + /* circular reference, ignore */ + continue; + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); + spin_unlock(&object->lock); + } } read_unlock_irqrestore(&kmemleak_lock, flags); } @@ -1980,6 +2068,10 @@ void __init kmemleak_init(void) case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); break; + case KMEMLEAK_SET_EXCESS_REF: + object_set_excess_ref((unsigned long)log->ptr, + log->excess_ref); + break; default: kmemleak_warn("Unknown early log operation: %d\n", log->op_type); @@ -128,9 +128,12 @@ struct ksm_scan { * struct stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @chain_prune_time: time of the last full garbage collection + * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { @@ -138,11 +141,24 @@ struct stable_node { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; - struct list_head list; + struct { + struct hlist_node hlist_dup; + struct list_head list; + }; }; }; struct hlist_head hlist; - unsigned long kpfn; + union { + unsigned long kpfn; + unsigned long chain_prune_time; + }; + /* + * STABLE_NODE_CHAIN can be any negative number in + * rmap_hlist_len negative range, but better not -1 to be able + * to reliably detect underflows. + */ +#define STABLE_NODE_CHAIN -1024 + int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif @@ -192,6 +208,7 @@ static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); +#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -219,6 +236,18 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; +/* The number of stable_node chains */ +static unsigned long ksm_stable_node_chains; + +/* The number of stable_node dups linked to the stable_node chains */ +static unsigned long ksm_stable_node_dups; + +/* Delay in pruning stale stable_node_dups in the stable_node_chains */ +static int ksm_stable_node_chains_prune_millisecs = 2000; + +/* Maximum number of page slots sharing a stable node */ +static int ksm_max_page_sharing = 256; + /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -287,6 +316,45 @@ static void __init ksm_slab_free(void) mm_slot_cache = NULL; } +static __always_inline bool is_stable_node_chain(struct stable_node *chain) +{ + return chain->rmap_hlist_len == STABLE_NODE_CHAIN; +} + +static __always_inline bool is_stable_node_dup(struct stable_node *dup) +{ + return dup->head == STABLE_NODE_DUP_HEAD; +} + +static inline void stable_node_chain_add_dup(struct stable_node *dup, + struct stable_node *chain) +{ + VM_BUG_ON(is_stable_node_dup(dup)); + dup->head = STABLE_NODE_DUP_HEAD; + VM_BUG_ON(!is_stable_node_chain(chain)); + hlist_add_head(&dup->hlist_dup, &chain->hlist); + ksm_stable_node_dups++; +} + +static inline void __stable_node_dup_del(struct stable_node *dup) +{ + VM_BUG_ON(!is_stable_node_dup(dup)); + hlist_del(&dup->hlist_dup); + ksm_stable_node_dups--; +} + +static inline void stable_node_dup_del(struct stable_node *dup) +{ + VM_BUG_ON(is_stable_node_chain(dup)); + if (is_stable_node_dup(dup)) + __stable_node_dup_del(dup); + else + rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); +#ifdef CONFIG_DEBUG_VM + dup->head = NULL; +#endif +} + static inline struct rmap_item *alloc_rmap_item(void) { struct rmap_item *rmap_item; @@ -317,6 +385,8 @@ static inline struct stable_node *alloc_stable_node(void) static inline void free_stable_node(struct stable_node *stable_node) { + VM_BUG_ON(stable_node->rmap_hlist_len && + !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } @@ -498,25 +568,82 @@ static inline int get_kpfn_nid(unsigned long kpfn) return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } +static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, + struct rb_root *root) +{ + struct stable_node *chain = alloc_stable_node(); + VM_BUG_ON(is_stable_node_chain(dup)); + if (likely(chain)) { + INIT_HLIST_HEAD(&chain->hlist); + chain->chain_prune_time = jiffies; + chain->rmap_hlist_len = STABLE_NODE_CHAIN; +#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) + chain->nid = -1; /* debug */ +#endif + ksm_stable_node_chains++; + + /* + * Put the stable node chain in the first dimension of + * the stable tree and at the same time remove the old + * stable node. + */ + rb_replace_node(&dup->node, &chain->node, root); + + /* + * Move the old stable node to the second dimension + * queued in the hlist_dup. The invariant is that all + * dup stable_nodes in the chain->hlist point to pages + * that are wrprotected and have the exact same + * content. + */ + stable_node_chain_add_dup(dup, chain); + } + return chain; +} + +static inline void free_stable_node_chain(struct stable_node *chain, + struct rb_root *root) +{ + rb_erase(&chain->node, root); + free_stable_node(chain); + ksm_stable_node_chains--; +} + static void remove_node_from_stable_tree(struct stable_node *stable_node) { struct rmap_item *rmap_item; + /* check it's not STABLE_NODE_CHAIN or negative */ + BUG_ON(stable_node->rmap_hlist_len < 0); + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } + /* + * We need the second aligned pointer of the migrate_nodes + * list_head to stay clear from the rb_parent_color union + * (aligned and different than any node) and also different + * from &migrate_nodes. This will verify that future list.h changes + * don't break STABLE_NODE_DUP_HEAD. + */ +#if GCC_VERSION >= 40903 /* only recent gcc can handle it */ + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); +#endif + if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + stable_node_dup_del(stable_node); free_stable_node(stable_node); } @@ -635,6 +762,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; @@ -743,6 +872,31 @@ static int remove_stable_node(struct stable_node *stable_node) return err; } +static int remove_stable_node_chain(struct stable_node *stable_node, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + if (remove_stable_node(stable_node)) + return true; + else + return false; + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + if (remove_stable_node(dup)) + return true; + } + BUG_ON(!hlist_empty(&stable_node->hlist)); + free_stable_node_chain(stable_node, root); + return false; +} + static int remove_all_stable_nodes(void) { struct stable_node *stable_node, *next; @@ -753,7 +907,8 @@ static int remove_all_stable_nodes(void) while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct stable_node, node); - if (remove_stable_node(stable_node)) { + if (remove_stable_node_chain(stable_node, + root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } @@ -1138,6 +1293,214 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, return err ? NULL : page; } +static __always_inline +bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +{ + VM_BUG_ON(stable_node->rmap_hlist_len < 0); + /* + * Check that at least one mapping still exists, otherwise + * there's no much point to merge and share with this + * stable_node, as the underlying tree_page of the other + * sharer is going to be freed soon. + */ + return stable_node->rmap_hlist_len && + stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; +} + +static __always_inline +bool is_page_sharing_candidate(struct stable_node *stable_node) +{ + return __is_page_sharing_candidate(stable_node, 0); +} + +struct page *stable_node_dup(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; + struct hlist_node *hlist_safe; + struct page *_tree_page, *tree_page = NULL; + int nr = 0; + int found_rmap_hlist_len; + + if (!prune_stale_stable_nodes || + time_before(jiffies, stable_node->chain_prune_time + + msecs_to_jiffies( + ksm_stable_node_chains_prune_millisecs))) + prune_stale_stable_nodes = false; + else + stable_node->chain_prune_time = jiffies; + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + cond_resched(); + /* + * We must walk all stable_node_dup to prune the stale + * stable nodes during lookup. + * + * get_ksm_page can drop the nodes from the + * stable_node->hlist if they point to freed pages + * (that's why we do a _safe walk). The "dup" + * stable_node parameter itself will be freed from + * under us if it returns NULL. + */ + _tree_page = get_ksm_page(dup, false); + if (!_tree_page) + continue; + nr += 1; + if (is_page_sharing_candidate(dup)) { + if (!found || + dup->rmap_hlist_len > found_rmap_hlist_len) { + if (found) + put_page(tree_page); + found = dup; + found_rmap_hlist_len = found->rmap_hlist_len; + tree_page = _tree_page; + + /* skip put_page for found dup */ + if (!prune_stale_stable_nodes) + break; + continue; + } + } + put_page(_tree_page); + } + + if (found) { + /* + * nr is counting all dups in the chain only if + * prune_stale_stable_nodes is true, otherwise we may + * break the loop at nr == 1 even if there are + * multiple entries. + */ + if (prune_stale_stable_nodes && nr == 1) { + /* + * If there's not just one entry it would + * corrupt memory, better BUG_ON. In KSM + * context with no lock held it's not even + * fatal. + */ + BUG_ON(stable_node->hlist.first->next); + + /* + * There's just one entry and it is below the + * deduplication limit so drop the chain. + */ + rb_replace_node(&stable_node->node, &found->node, + root); + free_stable_node(stable_node); + ksm_stable_node_chains--; + ksm_stable_node_dups--; + /* + * NOTE: the caller depends on the stable_node + * to be equal to stable_node_dup if the chain + * was collapsed. + */ + *_stable_node = found; + /* + * Just for robustneess as stable_node is + * otherwise left as a stable pointer, the + * compiler shall optimize it away at build + * time. + */ + stable_node = NULL; + } else if (stable_node->hlist.first != &found->hlist_dup && + __is_page_sharing_candidate(found, 1)) { + /* + * If the found stable_node dup can accept one + * more future merge (in addition to the one + * that is underway) and is not at the head of + * the chain, put it there so next search will + * be quicker in the !prune_stale_stable_nodes + * case. + * + * NOTE: it would be inaccurate to use nr > 1 + * instead of checking the hlist.first pointer + * directly, because in the + * prune_stale_stable_nodes case "nr" isn't + * the position of the found dup in the chain, + * but the total number of dups in the chain. + */ + hlist_del(&found->hlist_dup); + hlist_add_head(&found->hlist_dup, + &stable_node->hlist); + } + } + + *_stable_node_dup = found; + return tree_page; +} + +static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, + struct rb_root *root) +{ + if (!is_stable_node_chain(stable_node)) + return stable_node; + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return NULL; + } + return hlist_entry(stable_node->hlist.first, + typeof(*stable_node), hlist_dup); +} + +/* + * Like for get_ksm_page, this function can free the *_stable_node and + * *_stable_node_dup if the returned tree_page is NULL. + * + * It can also free and overwrite *_stable_node with the found + * stable_node_dup if the chain is collapsed (in which case + * *_stable_node will be equal to *_stable_node_dup like if the chain + * never existed). It's up to the caller to verify tree_page is not + * NULL before dereferencing *_stable_node or *_stable_node_dup. + * + * *_stable_node_dup is really a second output parameter of this + * function and will be overwritten in all cases, the caller doesn't + * need to initialize it. + */ +static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct stable_node *stable_node = *_stable_node; + if (!is_stable_node_chain(stable_node)) { + if (is_page_sharing_candidate(stable_node)) { + *_stable_node_dup = stable_node; + return get_ksm_page(stable_node, false); + } + /* + * _stable_node_dup set to NULL means the stable_node + * reached the ksm_max_page_sharing limit. + */ + *_stable_node_dup = NULL; + return NULL; + } + return stable_node_dup(_stable_node_dup, _stable_node, root, + prune_stale_stable_nodes); +} + +static __always_inline struct page *chain_prune(struct stable_node **s_n_d, + struct stable_node **s_n, + struct rb_root *root) +{ + return __stable_node_chain(s_n_d, s_n, root, true); +} + +static __always_inline struct page *chain(struct stable_node **s_n_d, + struct stable_node *s_n, + struct rb_root *root) +{ + struct stable_node *old_stable_node = s_n; + struct page *tree_page; + + tree_page = __stable_node_chain(s_n_d, &s_n, root, false); + /* not pruning dups so s_n cannot have changed */ + VM_BUG_ON(s_n != old_stable_node); + return tree_page; +} + /* * stable_tree_search - search for page inside the stable tree * @@ -1153,7 +1516,7 @@ static struct page *stable_tree_search(struct page *page) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; struct stable_node *page_node; page_node = page_stable_node(page); @@ -1175,7 +1538,44 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + tree_page = chain_prune(&stable_node_dup, &stable_node, root); + /* + * NOTE: stable_node may have been freed by + * chain_prune() if the returned stable_node_dup is + * not NULL. stable_node_dup may have been inserted in + * the rbtree instead as a regular stable_node (in + * order to collapse the stable_node chain if a single + * stable_node dup was found in it). In such case the + * stable_node is overwritten by the calleee to point + * to the stable_node_dup that was collapsed in the + * stable rbtree and stable_node will be equal to + * stable_node_dup like if the chain never existed. + */ + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. + */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. + */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1198,6 +1598,34 @@ again: else if (ret > 0) new = &parent->rb_right; else { + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + /* + * Test if the migrated page should be merged + * into a stable node dup. If the mapcount is + * 1 we can migrate it with another KSM page + * without adding it to the chain. + */ + if (page_mapcount(page) > 1) + goto chain_append; + } + + if (!stable_node_dup) { + /* + * If the stable_node is a chain and + * we got a payload match in memcmp + * but we cannot merge the scanned + * page in any of the existing + * stable_node dups because they're + * all full, we need to wait the + * scanned page to find itself a match + * in the unstable tree to create a + * brand new KSM page to add later to + * the dups of this stable_node. + */ + return NULL; + } + /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page @@ -1205,23 +1633,21 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node, true); - if (tree_page) { - unlock_page(tree_page); - if (get_kpfn_nid(stable_node->kpfn) != - NUMA(stable_node->nid)) { - put_page(tree_page); - goto replace; - } - return tree_page; - } - /* - * There is now a place for page_node, but the tree may - * have been rebalanced, so re-evaluate parent and new. - */ - if (page_node) + tree_page = get_ksm_page(stable_node_dup, true); + if (unlikely(!tree_page)) + /* + * The tree may have been rebalanced, + * so re-evaluate parent and new. + */ goto again; - return NULL; + unlock_page(tree_page); + + if (get_kpfn_nid(stable_node_dup->kpfn) != + NUMA(stable_node_dup->nid)) { + put_page(tree_page); + goto replace; + } + return tree_page; } } @@ -1232,22 +1658,95 @@ again: DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); - get_page(page); - return page; +out: + if (is_page_sharing_candidate(page_node)) { + get_page(page); + return page; + } else + return NULL; replace: - if (page_node) { - list_del(&page_node->list); - DO_NUMA(page_node->nid = nid); - rb_replace_node(&stable_node->node, &page_node->node, root); - get_page(page); + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. + */ + if (stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); + /* there is no chain */ + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_replace_node(&stable_node_dup->node, + &page_node->node, + root); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + rb_erase(&stable_node_dup->node, root); + page = NULL; + } } else { - rb_erase(&stable_node->node, root); - page = NULL; + VM_BUG_ON(!is_stable_node_chain(stable_node)); + __stable_node_dup_del(stable_node_dup); + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + page = NULL; + } } - stable_node->head = &migrate_nodes; - list_add(&stable_node->list, stable_node->head); + stable_node_dup->head = &migrate_nodes; + list_add(&stable_node_dup->list, stable_node_dup->head); return page; + +chain_append: + /* stable_node_dup could be null if it reached the limit */ + if (!stable_node_dup) + stable_node_dup = stable_node_any; + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. + */ + if (stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(stable_node_dup, + root); + if (!stable_node) + return NULL; + } + /* + * Add this stable_node dup that was + * migrated to the stable_node chain + * of the current nid for this page + * content. + */ + VM_BUG_ON(!is_stable_node_chain(stable_node)); + VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + goto out; } /* @@ -1264,7 +1763,8 @@ static struct stable_node *stable_tree_insert(struct page *kpage) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; + bool need_chain = false; kpfn = page_to_pfn(kpage); nid = get_kpfn_nid(kpfn); @@ -1279,7 +1779,32 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + tree_page = chain(&stable_node_dup, stable_node, root); + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. + */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. + */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1302,27 +1827,37 @@ again: else if (ret > 0) new = &parent->rb_right; else { - /* - * It is not a bug that stable_tree_search() didn't - * find this node: because at that time our page was - * not yet write-protected, so may have changed since. - */ - return NULL; + need_chain = true; + break; } } - stable_node = alloc_stable_node(); - if (!stable_node) + stable_node_dup = alloc_stable_node(); + if (!stable_node_dup) return NULL; - INIT_HLIST_HEAD(&stable_node->hlist); - stable_node->kpfn = kpfn; - set_page_stable_node(kpage, stable_node); - DO_NUMA(stable_node->nid = nid); - rb_link_node(&stable_node->node, parent, new); - rb_insert_color(&stable_node->node, root); + INIT_HLIST_HEAD(&stable_node_dup->hlist); + stable_node_dup->kpfn = kpfn; + set_page_stable_node(kpage, stable_node_dup); + stable_node_dup->rmap_hlist_len = 0; + DO_NUMA(stable_node_dup->nid = nid); + if (!need_chain) { + rb_link_node(&stable_node_dup->node, parent, new); + rb_insert_color(&stable_node_dup->node, root); + } else { + if (!is_stable_node_chain(stable_node)) { + struct stable_node *orig = stable_node; + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(orig, root); + if (!stable_node) { + free_stable_node(stable_node_dup); + return NULL; + } + } + stable_node_chain_add_dup(stable_node_dup, stable_node); + } - return stable_node; + return stable_node_dup; } /* @@ -1412,8 +1947,27 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, * the same ksm page. */ static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node) + struct stable_node *stable_node, + bool max_page_sharing_bypass) { + /* + * rmap won't find this mapping if we don't insert the + * rmap_item in the right stable_node + * duplicate. page_migration could break later if rmap breaks, + * so we can as well crash here. We really need to check for + * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check + * for other negative values as an undeflow if detected here + * for the first time (and not when decreasing rmap_hlist_len) + * would be sign of memory corruption in the stable_node. + */ + BUG_ON(stable_node->rmap_hlist_len < 0); + + stable_node->rmap_hlist_len++; + if (!max_page_sharing_bypass) + /* possibly non fatal but unexpected overflow, only warn */ + WARN_ON_ONCE(stable_node->rmap_hlist_len > + ksm_max_page_sharing); + rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); @@ -1441,19 +1995,26 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) struct page *kpage; unsigned int checksum; int err; + bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && - get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != + NUMA(stable_node->nid)) { + stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; + /* + * If it's a KSM fork, allow it to go over the sharing limit + * without warnings. + */ + if (!is_page_sharing_candidate(stable_node)) + max_page_sharing_bypass = true; } /* We first start with searching the page inside the stable tree */ @@ -1473,7 +2034,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * add its rmap_item to the stable tree. */ lock_page(kpage); - stable_tree_append(rmap_item, page_stable_node(kpage)); + stable_tree_append(rmap_item, page_stable_node(kpage), + max_page_sharing_bypass); unlock_page(kpage); } put_page(kpage); @@ -1523,8 +2085,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) lock_page(kpage); stable_node = stable_tree_insert(kpage); if (stable_node) { - stable_tree_append(tree_rmap_item, stable_node); - stable_tree_append(rmap_item, stable_node); + stable_tree_append(tree_rmap_item, stable_node, + false); + stable_tree_append(rmap_item, stable_node, + false); } unlock_page(kpage); @@ -2028,6 +2592,48 @@ static void wait_while_offlining(void) } } +static bool stable_node_dup_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn) +{ + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) { + /* + * Don't get_ksm_page, page has already gone: + * which is why we keep kpfn instead of page* + */ + remove_node_from_stable_tree(stable_node); + return true; + } + return false; +} + +static bool stable_node_chain_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + return stable_node_dup_remove_range(stable_node, start_pfn, + end_pfn); + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + stable_node_dup_remove_range(dup, start_pfn, end_pfn); + } + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return true; /* notify caller that tree was rebalanced */ + } else + return false; +} + static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { @@ -2039,15 +2645,12 @@ static void ksm_check_stable_tree(unsigned long start_pfn, node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct stable_node, node); - if (stable_node->kpfn >= start_pfn && - stable_node->kpfn < end_pfn) { - /* - * Don't get_ksm_page, page has already gone: - * which is why we keep kpfn instead of page* - */ - remove_node_from_stable_tree(stable_node); + if (stable_node_chain_remove_range(stable_node, + start_pfn, end_pfn, + root_stable_tree + + nid)) node = rb_first(root_stable_tree + nid); - } else + else node = rb_next(node); cond_resched(); } @@ -2293,6 +2896,47 @@ static ssize_t use_zero_pages_store(struct kobject *kobj, } KSM_ATTR(use_zero_pages); +static ssize_t max_page_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_max_page_sharing); +} + +static ssize_t max_page_sharing_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + int knob; + + err = kstrtoint(buf, 10, &knob); + if (err) + return err; + /* + * When a KSM page is created it is shared by 2 mappings. This + * being a signed comparison, it implicitly verifies it's not + * negative. + */ + if (knob < 2) + return -EINVAL; + + if (READ_ONCE(ksm_max_page_sharing) == knob) + return count; + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_max_page_sharing != knob) { + if (ksm_pages_shared || remove_all_stable_nodes()) + err = -EBUSY; + else + ksm_max_page_sharing = knob; + } + mutex_unlock(&ksm_thread_mutex); + + return err ? err : count; +} +KSM_ATTR(max_page_sharing); + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2331,6 +2975,46 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t stable_node_dups_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_dups); +} +KSM_ATTR_RO(stable_node_dups); + +static ssize_t stable_node_chains_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_chains); +} +KSM_ATTR_RO(stable_node_chains); + +static ssize_t +stable_node_chains_prune_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); +} + +static ssize_t +stable_node_chains_prune_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + ksm_stable_node_chains_prune_millisecs = msecs; + + return count; +} +KSM_ATTR(stable_node_chains_prune_millisecs); + static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2350,6 +3034,10 @@ static struct attribute *ksm_attrs[] = { #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif + &max_page_sharing_attr.attr, + &stable_node_chains_attr.attr, + &stable_node_dups_attr.attr, + &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, NULL, }; diff --git a/mm/madvise.c b/mm/madvise.c index 25b78ee4fc2c..8eda1841c576 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -205,7 +205,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, continue; page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, - vma, index); + vma, index, false); if (page) put_page(page); } @@ -246,7 +246,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, } swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, - NULL, 0); + NULL, 0, false); if (page) put_page(page); } diff --git a/mm/memblock.c b/mm/memblock.c index 7b8a5db76a2f..2cb25fe4452c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,9 +54,6 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; -#ifdef CONFIG_MOVABLE_NODE -bool movable_node_enabled __initdata_memblock = false; -#endif static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94172089f52f..e3fe4d0913b3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2376,10 +2376,9 @@ void mem_cgroup_split_huge_fixup(struct page *head) #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) + int nr_entries) { - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); + this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries); } /** @@ -2405,8 +2404,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, false); - mem_cgroup_swap_statistics(to, true); + mem_cgroup_swap_statistics(from, -1); + mem_cgroup_swap_statistics(to, 1); return 0; } return -EINVAL; @@ -3574,6 +3573,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); + seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } @@ -4122,6 +4122,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; + pn->lruvec_stat = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat) { + kfree(pn); + return 1; + } + lruvec_init(&pn->lruvec); pn->usage_in_excess = 0; pn->on_tree = false; @@ -4133,7 +4139,10 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->nodeinfo[node]); + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + + free_percpu(pn->lruvec_stat); + kfree(pn); } static void __mem_cgroup_free(struct mem_cgroup *memcg) @@ -5165,6 +5174,7 @@ static int memory_events_show(struct seq_file *m, void *v) seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } @@ -5197,8 +5207,8 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "kernel_stack %llu\n", (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + - stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(stat[NR_SLAB_RECLAIMABLE] + + stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5222,15 +5232,25 @@ static int memory_stat_show(struct seq_file *m, void *v) } seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + + events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + + events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "workingset_refault %lu\n", stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", @@ -5297,38 +5317,52 @@ struct cgroup_subsys memory_cgrp_subsys = { /** * mem_cgroup_low - check if memory consumption is below the normal range - * @root: the highest ancestor to consider + * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * Returns %true if memory consumption of @memcg, and that of all - * configurable ancestors up to @root, is below the normal range. + * ancestors up to (but not including) @root, is below the normal range. + * + * @root is exclusive; it is never low when looked at directly and isn't + * checked when traversing the hierarchy. + * + * Excluding @root enables using memory.low to prioritize memory usage + * between cgroups within a subtree of the hierarchy that is limited by + * memory.high or memory.max. + * + * For example, given cgroup A with children B and C: + * + * A + * / \ + * B C + * + * and + * + * 1. A/memory.current > A/memory.high + * 2. A/B/memory.current < A/B/memory.low + * 3. A/C/memory.current >= A/C/memory.low + * + * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we + * should reclaim from 'C' until 'A' is no longer high or until we can + * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by + * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered + * low and we will reclaim indiscriminately from both 'B' and 'C'. */ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return false; - /* - * The toplevel group doesn't have a configurable range, so - * it's never low when looked at directly, and it is not - * considered an ancestor when assessing the hierarchy. - */ - - if (memcg == root_mem_cgroup) - return false; - - if (page_counter_read(&memcg->memory) >= memcg->low) + if (!root) + root = root_mem_cgroup; + if (memcg == root) return false; - while (memcg != root) { - memcg = parent_mem_cgroup(memcg); - - if (memcg == root_mem_cgroup) - break; - + for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { if (page_counter_read(&memcg->memory) >= memcg->low) return false; } + return true; } @@ -5445,7 +5479,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry); + mem_cgroup_uncharge_swap(entry, nr_pages); } } @@ -5873,9 +5907,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * ancestor for the swap instead and transfer the memory+swap charge. */ swap_memcg = mem_cgroup_id_get_online(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, true); + mem_cgroup_swap_statistics(swap_memcg, 1); page->mem_cgroup = NULL; @@ -5902,19 +5936,20 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) css_put(&memcg->css); } -/* - * mem_cgroup_try_charge_swap - try charging a swap entry +/** + * mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap entry to charge * - * Try to charge @entry to the memcg that @page belongs to. + * Try to charge @page's memcg for the swap space at @entry. * * Returns 0 on success, -ENOMEM on failure. */ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + unsigned int nr_pages = hpage_nr_pages(page); struct page_counter *counter; + struct mem_cgroup *memcg; unsigned short oldid; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) @@ -5929,25 +5964,27 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && - !page_counter_try_charge(&memcg->swap, 1, &counter)) { + !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { mem_cgroup_id_put(memcg); return -ENOMEM; } - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* Get references for the tail pages, too */ + if (nr_pages > 1) + mem_cgroup_id_get_many(memcg, nr_pages - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(memcg, nr_pages); return 0; } /** - * mem_cgroup_uncharge_swap - uncharge a swap entry + * mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge - * - * Drop the swap charge associated with @entry. + * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry) +void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; @@ -5955,18 +5992,18 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) if (!do_swap_account) return; - id = swap_cgroup_record(entry, 0); + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, 1); + page_counter_uncharge(&memcg->swap, nr_pages); else - page_counter_uncharge(&memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, nr_pages); } - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_id_put(memcg); + mem_cgroup_swap_statistics(memcg, -nr_pages); + mem_cgroup_id_put_many(memcg, nr_pages); } rcu_read_unlock(); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 342fac9ba89b..e2e0cb0e1d0f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -49,7 +49,6 @@ #include <linux/swap.h> #include <linux/backing-dev.h> #include <linux/migrate.h> -#include <linux/page-isolation.h> #include <linux/suspend.h> #include <linux/slab.h> #include <linux/swapops.h> @@ -555,6 +554,39 @@ static int delete_from_lru_cache(struct page *p) return -EIO; } +static int truncate_error_page(struct page *p, unsigned long pfn, + struct address_space *mapping) +{ + int ret = MF_FAILED; + + if (mapping->a_ops->error_remove_page) { + int err = mapping->a_ops->error_remove_page(mapping, p); + + if (err != 0) { + pr_info("Memory failure: %#lx: Failed to punch page: %d\n", + pfn, err); + } else if (page_has_private(p) && + !try_to_release_page(p, GFP_NOIO)) { + pr_info("Memory failure: %#lx: failed to release buffers\n", + pfn); + } else { + ret = MF_RECOVERED; + } + } else { + /* + * If the file system doesn't support it just invalidate + * This fails on dirty or anything with private pages + */ + if (invalidate_inode_page(p)) + ret = MF_RECOVERED; + else + pr_info("Memory failure: %#lx: Failed to invalidate\n", + pfn); + } + + return ret; +} + /* * Error hit kernel page. * Do nothing, try to be lucky and not touch this instead. For a few cases we @@ -579,8 +611,6 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { - int err; - int ret = MF_FAILED; struct address_space *mapping; delete_from_lru_cache(p); @@ -612,30 +642,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. */ - if (mapping->a_ops->error_remove_page) { - err = mapping->a_ops->error_remove_page(mapping, p); - if (err != 0) { - pr_info("Memory failure: %#lx: Failed to punch page: %d\n", - pfn, err); - } else if (page_has_private(p) && - !try_to_release_page(p, GFP_NOIO)) { - pr_info("Memory failure: %#lx: failed to release buffers\n", - pfn); - } else { - ret = MF_RECOVERED; - } - } else { - /* - * If the file system doesn't support it just invalidate - * This fails on dirty or anything with private pages - */ - if (invalidate_inode_page(p)) - ret = MF_RECOVERED; - else - pr_info("Memory failure: %#lx: Failed to invalidate\n", - pfn); - } - return ret; + return truncate_error_page(p, pfn, mapping); } /* @@ -684,7 +691,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * the first EIO, but we're not worse than other parts * of the kernel. */ - mapping_set_error(mapping, EIO); + mapping_set_error(mapping, -EIO); } return me_pagecache_clean(p, pfn); @@ -741,24 +748,29 @@ static int me_huge_page(struct page *p, unsigned long pfn) { int res = 0; struct page *hpage = compound_head(p); + struct address_space *mapping; if (!PageHuge(hpage)) return MF_DELAYED; - /* - * We can safely recover from error on free or reserved (i.e. - * not in-use) hugepage by dequeuing it from freelist. - * To check whether a hugepage is in-use or not, we can't use - * page->lru because it can be used in other hugepage operations, - * such as __unmap_hugepage_range() and gather_surplus_pages(). - * So instead we use page_mapping() and PageAnon(). - */ - if (!(page_mapping(hpage) || PageAnon(hpage))) { - res = dequeue_hwpoisoned_huge_page(hpage); - if (!res) - return MF_RECOVERED; + mapping = page_mapping(hpage); + if (mapping) { + res = truncate_error_page(hpage, pfn, mapping); + } else { + unlock_page(hpage); + /* + * migration entry prevents later access on error anonymous + * hugepage, so we can free and dissolve it into buddy to + * save healthy subpages. + */ + if (PageAnon(hpage)) + put_page(hpage); + dissolve_free_huge_page(p); + res = MF_RECOVERED; + lock_page(hpage); } - return MF_DELAYED; + + return res; } /* @@ -857,7 +869,7 @@ static int page_action(struct page_state *ps, struct page *p, count = page_count(p) - 1; if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; - if (count != 0) { + if (count > 0) { pr_err("Memory failure: %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); result = MF_FAILED; @@ -1010,20 +1022,84 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, return unmap_success; } -static void set_page_hwpoison_huge_page(struct page *hpage) +static int identify_page_state(unsigned long pfn, struct page *p, + unsigned long page_flags) { - int i; - int nr_pages = 1 << compound_order(hpage); - for (i = 0; i < nr_pages; i++) - SetPageHWPoison(hpage + i); + struct page_state *ps; + + /* + * The first check uses the current page flags which may not have any + * relevant information. The second check with the saved page flags is + * carried out only if the first check can't determine the page status. + */ + for (ps = error_states;; ps++) + if ((p->flags & ps->mask) == ps->res) + break; + + page_flags |= (p->flags & (1UL << PG_dirty)); + + if (!ps->mask) + for (ps = error_states;; ps++) + if ((page_flags & ps->mask) == ps->res) + break; + return page_action(ps, p, pfn); } -static void clear_page_hwpoison_huge_page(struct page *hpage) +static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags) { - int i; - int nr_pages = 1 << compound_order(hpage); - for (i = 0; i < nr_pages; i++) - ClearPageHWPoison(hpage + i); + struct page *p = pfn_to_page(pfn); + struct page *head = compound_head(p); + int res; + unsigned long page_flags; + + if (TestSetPageHWPoison(head)) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", + pfn); + return 0; + } + + num_poisoned_pages_inc(); + + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { + /* + * Check "filter hit" and "race with other subpage." + */ + lock_page(head); + if (PageHWPoison(head)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != head && TestSetPageHWPoison(head))) { + num_poisoned_pages_dec(); + unlock_page(head); + return 0; + } + } + unlock_page(head); + dissolve_free_huge_page(p); + action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED); + return 0; + } + + lock_page(head); + page_flags = head->flags; + + if (!PageHWPoison(head)) { + pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); + num_poisoned_pages_dec(); + unlock_page(head); + put_hwpoison_page(head); + return 0; + } + + if (!hwpoison_user_mappings(p, pfn, trapno, flags, &head)) { + action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); + res = -EBUSY; + goto out; + } + + res = identify_page_state(pfn, p, page_flags); +out: + unlock_page(head); + return res; } /** @@ -1046,12 +1122,10 @@ static void clear_page_hwpoison_huge_page(struct page *hpage) */ int memory_failure(unsigned long pfn, int trapno, int flags) { - struct page_state *ps; struct page *p; struct page *hpage; struct page *orig_head; int res; - unsigned int nr_pages; unsigned long page_flags; if (!sysctl_memory_failure_recovery) @@ -1064,34 +1138,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); - orig_head = hpage = compound_head(p); + if (PageHuge(p)) + return memory_failure_hugetlb(pfn, trapno, flags); if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); return 0; } - /* - * Currently errors on hugetlbfs pages are measured in hugepage units, - * so nr_pages should be 1 << compound_order. OTOH when errors are on - * transparent hugepages, they are supposed to be split and error - * measurement is done in normal page units. So nr_pages should be one - * in this case. - */ - if (PageHuge(p)) - nr_pages = 1 << compound_order(hpage); - else /* normal page or thp */ - nr_pages = 1; - num_poisoned_pages_add(nr_pages); + orig_head = hpage = compound_head(p); + num_poisoned_pages_inc(); /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's a free hugepage, which is also safe: - * an affected hugepage will be dequeued from hugepage freelist, - * so there's no concern about reusing it ever after. - * 3) it's part of a non-compound high order page. + * 2) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. @@ -1102,32 +1164,13 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (is_free_buddy_page(p)) { action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); return 0; - } else if (PageHuge(hpage)) { - /* - * Check "filter hit" and "race with other subpage." - */ - lock_page(hpage); - if (PageHWPoison(hpage)) { - if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != hpage && TestSetPageHWPoison(hpage))) { - num_poisoned_pages_sub(nr_pages); - unlock_page(hpage); - return 0; - } - } - set_page_hwpoison_huge_page(hpage); - res = dequeue_hwpoisoned_huge_page(hpage); - action_result(pfn, MF_MSG_FREE_HUGE, - res ? MF_IGNORED : MF_DELAYED); - unlock_page(hpage); - return res; } else { action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); return -EBUSY; } } - if (!PageHuge(p) && PageTransHuge(hpage)) { + if (PageTransHuge(hpage)) { lock_page(p); if (!PageAnon(p) || unlikely(split_huge_page(p))) { unlock_page(p); @@ -1138,7 +1181,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) pr_err("Memory failure: %#lx: thp split failed\n", pfn); if (TestClearPageHWPoison(p)) - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); put_hwpoison_page(p); return -EBUSY; } @@ -1165,7 +1208,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - lock_page(hpage); + lock_page(p); /* * The page could have changed compound pages during the locking. @@ -1184,49 +1227,33 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * page_remove_rmap() in try_to_unmap_one(). So to determine page status * correctly, we save a copy of the page flags at this time. */ - page_flags = p->flags; + if (PageHuge(p)) + page_flags = hpage->flags; + else + page_flags = p->flags; /* * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_sub(nr_pages); - unlock_page(hpage); - put_hwpoison_page(hpage); + num_poisoned_pages_dec(); + unlock_page(p); + put_hwpoison_page(p); return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - num_poisoned_pages_sub(nr_pages); - unlock_page(hpage); - put_hwpoison_page(hpage); + num_poisoned_pages_dec(); + unlock_page(p); + put_hwpoison_page(p); return 0; } - if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) + if (!PageTransTail(p) && !PageLRU(p)) goto identify_page_state; /* - * For error on the tail page, we should set PG_hwpoison - * on the head page to show that the hugepage is hwpoisoned - */ - if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { - action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); - unlock_page(hpage); - put_hwpoison_page(hpage); - return 0; - } - /* - * Set PG_hwpoison on all pages in an error hugepage, - * because containment is done in hugepage unit for now. - * Since we have done TestSetPageHWPoison() for the head page with - * page lock held, we can safely set PG_hwpoison bits on tail pages. - */ - if (PageHuge(p)) - set_page_hwpoison_huge_page(hpage); - - /* * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ @@ -1255,25 +1282,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } identify_page_state: - res = -EBUSY; - /* - * The first check uses the current page flags which may not have any - * relevant information. The second check with the saved page flagss is - * carried out only if the first check can't determine the page status. - */ - for (ps = error_states;; ps++) - if ((p->flags & ps->mask) == ps->res) - break; - - page_flags |= (p->flags & (1UL << PG_dirty)); - - if (!ps->mask) - for (ps = error_states;; ps++) - if ((page_flags & ps->mask) == ps->res) - break; - res = page_action(ps, p, pfn); + res = identify_page_state(pfn, p, page_flags); out: - unlock_page(hpage); + unlock_page(p); return res; } EXPORT_SYMBOL_GPL(memory_failure); @@ -1395,7 +1406,6 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; - unsigned int nr_pages; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1440,20 +1450,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); - if (!get_hwpoison_page(p)) { - /* - * Since HWPoisoned hugepage should have non-zero refcount, - * race between memory failure and unpoison seems to happen. - * In such case unpoison fails and memory failure runs - * to the end. - */ - if (PageHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n", - pfn, &unpoison_rs); - return 0; - } if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", @@ -1471,10 +1468,8 @@ int unpoison_memory(unsigned long pfn) if (TestClearPageHWPoison(page)) { unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", pfn, &unpoison_rs); - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); freeit = 1; - if (PageHuge(page)) - clear_page_hwpoison_huge_page(page); } unlock_page(page); @@ -1489,11 +1484,16 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - nid); - else + if (PageHuge(p)) { + struct hstate *hstate = page_hstate(compound_head(p)); + + if (hstate_is_gigantic(hstate)) + return alloc_huge_page_node(hstate, NUMA_NO_NODE); + + return alloc_huge_page_node(hstate, nid); + } else { return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); + } } /* @@ -1600,15 +1600,8 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - /* overcommit hugetlb page will be freed to buddy */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - SetPageHWPoison(page); - num_poisoned_pages_inc(); - } + if (PageHuge(page)) + dissolve_free_huge_page(page); } return ret; } @@ -1724,15 +1717,12 @@ static int soft_offline_in_use_page(struct page *page, int flags) static void soft_offline_free_page(struct page *page) { - if (PageHuge(page)) { - struct page *hpage = compound_head(page); + struct page *head = compound_head(page); - set_page_hwpoison_huge_page(hpage); - if (!dequeue_hwpoisoned_huge_page(hpage)) - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - if (!TestSetPageHWPoison(page)) - num_poisoned_pages_inc(); + if (!TestSetPageHWPoison(head)) { + num_poisoned_pages_inc(); + if (PageHuge(head)) + dissolve_free_huge_page(page); } } diff --git a/mm/memory.c b/mm/memory.c index 2e65df1831d9..b1b97b490791 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,7 +2719,7 @@ int do_swap_page(struct vm_fault *vmf) /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -3300,14 +3300,14 @@ static int fault_around_bytes_set(void *data, u64 val) fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, +DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { void *ret; - ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, + ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); if (!ret) pr_warn("Failed to create fault_around_bytes in debugfs"); @@ -3875,7 +3875,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); + count_memcg_event_mm(vma->vm_mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); @@ -4052,8 +4052,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); - if (!ptep) - goto out; if (!pte_present(*ptep)) goto unlock; *ptepp = ptep; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b63d7d1239df..d61509752112 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -79,6 +79,8 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +bool movable_node_enabled = false; + #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; #else @@ -300,229 +302,38 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_zone_end_pfn; - - zone_span_writelock(zone); - - old_zone_end_pfn = zone_end_pfn(zone); - if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) - zone->zone_start_pfn = start_pfn; - - zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - - zone->zone_start_pfn; - - zone_span_writeunlock(zone); -} - -static void resize_zone(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - zone_span_writelock(zone); - - if (end_pfn - start_pfn) { - zone->zone_start_pfn = start_pfn; - zone->spanned_pages = end_pfn - start_pfn; - } else { - /* - * make it consist as free_area_init_core(), - * if spanned_pages = 0, then keep start_pfn = 0 - */ - zone->zone_start_pfn = 0; - zone->spanned_pages = 0; - } - - zone_span_writeunlock(zone); -} - -static void fix_zone_id(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - enum zone_type zid = zone_idx(zone); - int nid = zone->zone_pgdat->node_id; - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) - set_page_links(pfn_to_page(pfn), zid, nid, pfn); -} - -/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or - * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ -static int __ref ensure_zone_is_initialized(struct zone *zone, - unsigned long start_pfn, unsigned long num_pages) -{ - if (!zone_is_initialized(zone)) - return init_currently_empty_zone(zone, start_pfn, num_pages); - - return 0; -} - -static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) +static int __meminit __add_section(int nid, unsigned long phys_start_pfn, + bool want_memblock) { int ret; - unsigned long flags; - unsigned long z1_start_pfn; - - ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are higher than @z2 */ - if (end_pfn > zone_end_pfn(z2)) - goto out_fail; - /* the move out part must be at the left most of @z2 */ - if (start_pfn > z2->zone_start_pfn) - goto out_fail; - /* must included/overlap */ - if (end_pfn <= z2->zone_start_pfn) - goto out_fail; - - /* use start_pfn for z1's start_pfn if z1 is empty */ - if (!zone_is_empty(z1)) - z1_start_pfn = z1->zone_start_pfn; - else - z1_start_pfn = start_pfn; - - resize_zone(z1, z1_start_pfn, end_pfn); - resize_zone(z2, end_pfn, zone_end_pfn(z2)); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z1, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) -{ - int ret; - unsigned long flags; - unsigned long z2_end_pfn; - - ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are lower than @z1 */ - if (z1->zone_start_pfn > start_pfn) - goto out_fail; - /* the move out part mast at the right most of @z1 */ - if (zone_end_pfn(z1) > end_pfn) - goto out_fail; - /* must included/overlap */ - if (start_pfn >= zone_end_pfn(z1)) - goto out_fail; - - /* use end_pfn for z2's end_pfn if z2 is empty */ - if (!zone_is_empty(z2)) - z2_end_pfn = zone_end_pfn(z2); - else - z2_end_pfn = end_pfn; - - resize_zone(z1, z1->zone_start_pfn, start_pfn); - resize_zone(z2, start_pfn, z2_end_pfn); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z2, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static struct zone * __meminit move_pfn_range(int zone_shift, - unsigned long start_pfn, unsigned long end_pfn) -{ - struct zone *zone = page_zone(pfn_to_page(start_pfn)); - int ret = 0; - - if (zone_shift < 0) - ret = move_pfn_range_left(zone + zone_shift, zone, - start_pfn, end_pfn); - else if (zone_shift) - ret = move_pfn_range_right(zone, zone + zone_shift, - start_pfn, end_pfn); - - if (ret) - return NULL; - - return zone + zone_shift; -} - -static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); - - if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) - pgdat->node_start_pfn = start_pfn; - - pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - - pgdat->node_start_pfn; -} + int i; -static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; - int nid = pgdat->node_id; - int zone_type; - unsigned long flags, pfn; - int ret; + if (pfn_valid(phys_start_pfn)) + return -EEXIST; - zone_type = zone - pgdat->node_zones; - ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); - if (ret) + ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); + if (ret < 0) return ret; - pgdat_resize_lock(zone->zone_pgdat, &flags); - grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); - grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, - phys_start_pfn + nr_pages); - pgdat_resize_unlock(zone->zone_pgdat, &flags); - memmap_init_zone(nr_pages, nid, zone_type, - phys_start_pfn, MEMMAP_HOTPLUG); - - /* online_page_range is called later and expects pages reserved */ - for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { + /* + * Make all the pages reserved so that nobody will stumble over half + * initialized state. + * FIXME: We also have to associate it with a node because pfn_to_node + * relies on having page with the proper node. + */ + for (i = 0; i < PAGES_PER_SECTION; i++) { + unsigned long pfn = phys_start_pfn + i; + struct page *page; if (!pfn_valid(pfn)) continue; - SetPageReserved(pfn_to_page(pfn)); + page = pfn_to_page(pfn); + set_page_node(page, nid); + SetPageReserved(page); } - return 0; -} - -static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn) -{ - int ret; - - if (pfn_valid(phys_start_pfn)) - return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn); - - if (ret < 0) - return ret; - - ret = __add_zone(zone, phys_start_pfn); - - if (ret < 0) - return ret; + if (!want_memblock) + return 0; return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } @@ -533,16 +344,14 @@ static int __meminit __add_section(int nid, struct zone *zone, * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) +int __ref __add_pages(int nid, unsigned long phys_start_pfn, + unsigned long nr_pages, bool want_memblock) { unsigned long i; int err = 0; int start_sec, end_sec; struct vmem_altmap *altmap; - clear_zone_contiguous(zone); - /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -562,7 +371,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i)); + err = __add_section(nid, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision @@ -575,7 +384,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } vmemmap_populate_print_last(); out: - set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -939,33 +747,24 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; struct page *page; + if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); + if (PageHWPoison(page)) { + ClearPageReserved(page); + continue; + } (*online_page_callback)(page); onlined_pages++; } + + online_mem_sections(start_pfn, start_pfn + nr_pages); + *(unsigned long *)arg = onlined_pages; return 0; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have - * normal memory. - */ -static bool can_online_high_movable(struct zone *zone) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure every online node has NORMAL memory */ -static bool can_online_high_movable(struct zone *zone) -{ - return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); -} -#endif /* CONFIG_MOVABLE_NODE */ - /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -1040,39 +839,136 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift) +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) { - struct zone *zone = page_zone(pfn_to_page(pfn)); - enum zone_type idx = zone_idx(zone); - int i; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); + + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. + * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(default_zone) <= pfn; + } - *zone_shift = 0; + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} - if (idx < target) { - /* pages must be at end of current zone */ - if (pfn + nr_pages != zone_end_pfn(zone)) - return false; +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = zone_end_pfn(zone); + + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} + +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = pgdat_end_pfn(pgdat); + + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; - /* no zones in use between current zone and target */ - for (i = idx + 1; i < target; i++) - if (zone_is_initialized(zone - idx + i)) - return false; + pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; +} + +void __ref move_pfn_range_to_zone(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + unsigned long flags; + unsigned long i; + + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + + clear_zone_contiguous(zone); + + /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ + pgdat_resize_lock(pgdat, &flags); + zone_span_writelock(zone); + resize_zone_range(zone, start_pfn, nr_pages); + zone_span_writeunlock(zone); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + pgdat_resize_unlock(pgdat, &flags); + + /* + * TODO now we have a visible range of pages which are not associated + * with their zone properly. Not nice but set_pfnblock_flags_mask + * expects the zone spans the pfn range. All the pages in the range + * are reserved so nobody should be touching them so we should be safe + */ + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + for (i = 0; i < nr_pages; i++) { + unsigned long pfn = start_pfn + i; + set_page_links(pfn_to_page(pfn), zone_idx(zone), nid, pfn); } - if (target < idx) { - /* pages must be at beginning of current zone */ - if (pfn != zone->zone_start_pfn) - return false; + set_zone_contiguous(zone); +} + +/* + * Returns a default kernel memory zone for the given pfn range. + * If no kernel zone covers this pfn range it will automatically go + * to the ZONE_NORMAL. + */ +struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, + unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + int zid; + + for (zid = 0; zid <= ZONE_NORMAL; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; - /* no zones in use between current zone and target */ - for (i = target + 1; i < idx; i++) - if (zone_is_initialized(zone - idx + i)) - return false; + if (zone_intersects(zone, start_pfn, nr_pages)) + return zone; } - *zone_shift = target - idx; - return true; + return &pgdat->node_zones[ZONE_NORMAL]; +} + +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. + */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); + + if (online_type == MMOP_ONLINE_KEEP) { + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + /* + * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use + * movable zone if that is not possible (e.g. we are within + * or past the existing movable zone) + */ + if (!allow_online_pfn_range(nid, start_pfn, nr_pages, + MMOP_ONLINE_KERNEL)) + zone = movable_zone; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + zone = &pgdat->node_zones[ZONE_MOVABLE]; + } + + move_pfn_range_to_zone(zone, start_pfn, nr_pages); + return zone; } /* Must be protected by mem_hotplug_begin() */ @@ -1085,38 +981,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; - int zone_shift = 0; - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - - if ((zone_idx(zone) > ZONE_NORMAL || - online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(zone)) + nid = pfn_to_nid(pfn); + if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) { - if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) - return -EINVAL; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) - return -EINVAL; - } - - zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); - if (!zone) - return -EINVAL; + /* associate pfn range with the zone */ + zone = move_pfn_range(online_type, nid, pfn, nr_pages); arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = zone_to_nid(zone); - ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) @@ -1311,39 +1187,6 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } -/* - * If movable zone has already been setup, newly added memory should be check. - * If its address is higher than movable zone, it should be added as movable. - * Without this check, movable zone may overlap with other zone. - */ -static int should_add_memory_movable(int nid, u64 start, u64 size) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; - - if (zone_is_empty(movable_zone)) - return 0; - - if (movable_zone->zone_start_pfn <= start_pfn) - return 1; - - return 0; -} - -int zone_for_memory(int nid, u64 start, u64 size, int zone_default, - bool for_device) -{ -#ifdef CONFIG_ZONE_DEVICE - if (for_device) - return ZONE_DEVICE; -#endif - if (should_add_memory_movable(nid, start, size)) - return ZONE_MOVABLE; - - return zone_default; -} - static int online_memory_block(struct memory_block *mem, void *arg) { return device_online(&mem->dev); @@ -1389,7 +1232,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, false); + ret = arch_add_memory(nid, start, size, true); if (ret < 0) goto error; @@ -1398,7 +1241,22 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) node_set_online(nid); if (new_node) { - ret = register_one_node(nid); + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + ret = __register_one_node(nid); + if (ret) + goto register_fail; + + /* + * link memory sections under this node. This is already + * done when creatig memory section in register_new_memory + * but that depends to have the node registered so offline + * nodes have to go through register_node. + * TODO clean up this mess. + */ + ret = link_mem_sections(nid, start_pfn, nr_pages); +register_fail: /* * If sysfs file of new node can't create, cpu on the node * can't be hot-added. There is no rollback way now. @@ -1419,7 +1277,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) error: /* rollback pgdat allocation and others */ - if (new_pgdat) + if (new_pgdat && pgdat) rollback_node_hotadd(nid, pgdat); memblock_remove(start, size); @@ -1592,11 +1450,9 @@ static struct page *new_node_page(struct page *page, unsigned long private, gfp_mask |= __GFP_HIGHMEM; if (!nodes_empty(nmask)) - new_page = __alloc_pages_nodemask(gfp_mask, 0, - node_zonelist(nid, gfp_mask), &nmask); + new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask); if (!new_page) - new_page = __alloc_pages(gfp_mask, 0, - node_zonelist(nid, gfp_mask)); + new_page = __alloc_pages(gfp_mask, 0, nid); return new_page; } @@ -1725,47 +1581,12 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have - * normal memory. - */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure the node has NORMAL memory if it is still online */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long present_pages = 0; - enum zone_type zt; - - for (zt = 0; zt <= ZONE_NORMAL; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - if (present_pages > nr_pages) - return true; - - present_pages = 0; - for (; zt <= ZONE_MOVABLE; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - /* - * we can't offline the last normal memory until all - * higher memory is offlined. - */ - return present_pages == 0; -} -#endif /* CONFIG_MOVABLE_NODE */ - static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_MOVABLE_NODE +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; #else - pr_warn("movable_node option not supported\n"); + pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); #endif return 0; } @@ -1887,9 +1708,6 @@ static int __ref __offline_pages(unsigned long start_pfn, node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; - if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) - return -EINVAL; - /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 37d0b334bfe9..7d8e56214ac0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - /* - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } -/* - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; @@ -325,41 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - /* - * if step == 1, we use ->w.cpuset_mems_allowed to cache the - * result - */ - if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { - nodes_remap(tmp, pol->v.nodes, - pol->w.cpuset_mems_allowed, *nodes); - pol->w.cpuset_mems_allowed = step ? tmp : *nodes; - } else if (step == MPOL_REBIND_STEP2) { - tmp = pol->w.cpuset_mems_allowed; - pol->w.cpuset_mems_allowed = *nodes; - } else - BUG(); + nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = tmp; } if (nodes_empty(tmp)) tmp = *nodes; - if (step == MPOL_REBIND_STEP1) - nodes_or(pol->v.nodes, pol->v.nodes, tmp); - else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) - pol->v.nodes = tmp; - else - BUG(); - - if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node_in(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = numa_node_id(); - } + pol->v.nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes, - enum mpol_rebind_step step) + const nodemask_t *nodes) { nodemask_t tmp; @@ -385,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol, /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes + * Per-vma policies are protected by mmap_sem. Allocations using per-task + * policies are protected by task->mems_allowed_seq to prevent a premature + * OOM/allocation failure due to parallel nodemask modification. */ -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, - enum mpol_rebind_step step) +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) - return; - - if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) - BUG(); - - if (step == MPOL_REBIND_STEP1) - pol->flags |= MPOL_F_REBINDING; - else if (step == MPOL_REBIND_STEP2) - pol->flags &= ~MPOL_F_REBINDING; - else if (step >= MPOL_REBIND_NSTEP) - BUG(); - - mpol_ops[pol->mode].rebind(pol, newmask, step); + mpol_ops[pol->mode].rebind(pol, newmask); } /* @@ -430,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - mpol_rebind_policy(tsk->mempolicy, new, step); + mpol_rebind_policy(tsk->mempolicy, new); } /* @@ -448,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); + mpol_rebind_policy(vma->vm_policy, new); up_write(&mm->mmap_sem); } @@ -812,9 +743,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE && - nodes_weight(new->v.nodes)) - current->il_next = first_node(new->v.nodes); + if (new && new->mode == MPOL_INTERLEAVE) + current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; @@ -916,7 +846,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { - *policy = current->il_next; + *policy = next_node_in(current->il_prev, pol->v.nodes); } else { err = -EINVAL; goto out; @@ -1676,9 +1606,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) return NULL; } -/* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, - int nd) +/* Return the node id preferred by the given mempolicy, or the given id */ +static int policy_node(gfp_t gfp, struct mempolicy *policy, + int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = policy->v.preferred_node; @@ -1691,20 +1621,19 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } - return node_zonelist(nd, gfp); + return nd; } /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { - unsigned nid, next; + unsigned next; struct task_struct *me = current; - nid = me->il_next; - next = next_node_in(nid, policy->v.nodes); + next = next_node_in(me->il_prev, policy->v.nodes); if (next < MAX_NUMNODES) - me->il_next = next; - return nid; + me->il_prev = next; + return next; } /* @@ -1799,38 +1728,37 @@ static inline unsigned interleave_nid(struct mempolicy *pol, #ifdef CONFIG_HUGETLBFS /* - * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * - * Returns a zonelist suitable for a huge page allocation and a pointer + * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags, struct mempolicy **mpol, - nodemask_t **nodemask) +int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask) { - struct zonelist *zl; + int nid; *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { - zl = node_zonelist(interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))), gfp_flags); + nid = interleave_nid(*mpol, vma, addr, + huge_page_shift(hstate_vma(vma))); } else { - zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); + nid = policy_node(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } - return zl; + return nid; } /* @@ -1932,12 +1860,10 @@ out: static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) { - struct zonelist *zl; struct page *page; - zl = node_zonelist(nid, gfp); - page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) + page = __alloc_pages(gfp, order, nid); + if (page && page_to_nid(page) == nid) inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1971,13 +1897,10 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, { struct mempolicy *pol; struct page *page; - unsigned int cpuset_mems_cookie; - struct zonelist *zl; + int preferred_nid; nodemask_t *nmask; -retry_cpuset: pol = get_vma_policy(vma, addr); - cpuset_mems_cookie = read_mems_allowed_begin(); if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; @@ -2015,12 +1938,10 @@ retry_cpuset: } nmask = policy_nodemask(gfp, pol); - zl = policy_zonelist(gfp, pol, node); - page = __alloc_pages_nodemask(gfp, order, zl, nmask); + preferred_nid = policy_node(gfp, pol, node); + page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; return page; } @@ -2038,23 +1959,15 @@ out: * Allocate a page from the kernel page pool. When not in * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. - * - * Don't call cpuset_update_task_memory_state() unless - * 1) it's ok to take cpuset_sem (can WAIT), and - * 2) allocating for current task (not interrupt). */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; - unsigned int cpuset_mems_cookie; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); - /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -2063,12 +1976,9 @@ retry_cpuset: page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, numa_node_id()), + policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - return page; } EXPORT_SYMBOL(alloc_pages_current); @@ -2112,10 +2022,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - if (new->flags & MPOL_F_REBINDING) - mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); - else - mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); + mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; diff --git a/mm/migrate.c b/mm/migrate.c index 89a0a1707f4c..8935cbe362ce 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -227,25 +227,26 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); + flush_dcache_page(new); #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, vma, new, 0); - } -#endif - flush_dcache_page(new); - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - - if (PageHuge(new)) { + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); - } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, pvmw.address, false); - else - page_add_file_rmap(new, false); + } else +#endif + { + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + if (PageAnon(new)) + page_add_anon_rmap(new, vma, pvmw.address, false); + else + page_add_file_rmap(new, false); + } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); @@ -1251,6 +1252,8 @@ put_anon: out: if (rc != -EAGAIN) putback_active_hugepage(hpage); + if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage)) + num_poisoned_pages_inc(); /* * If migration was not successful and there's a freeing callback, use diff --git a/mm/mmap.c b/mm/mmap.c index f82741e199c0..3bd5ecd20d4d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -94,7 +94,7 @@ static void unmap_region(struct mm_struct *mm, * w: (no) no * x: (yes) yes */ -pgprot_t protection_map[16] = { +pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; diff --git a/mm/mprotect.c b/mm/mprotect.c index 8edd0d576254..1a8c9ca83e48 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,8 +58,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * reading. */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (!pte) - return 0; /* Get target node for single threaded private VMAs */ if (prot_numa && !(vma->vm_flags & VM_SHARED) && diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 487dad610731..36454d0f96ee 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -118,7 +118,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start, unsigned long end_pfn = min_t(unsigned long, PFN_DOWN(end), max_low_pfn); - if (start_pfn > end_pfn) + if (start_pfn >= end_pfn) return 0; __free_pages_memory(start_pfn, end_pfn); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 04c9143a8625..0e2c925e7826 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -876,6 +876,11 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; mmgrab(mm); + + /* Raise event before sending signal: task reaper must see this */ + count_vm_event(OOM_KILL); + count_memcg_event_mm(mm, OOM_KILL); + /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 143c1c25d680..36c62fda96bc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2366,15 +2366,16 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) } /** - * write_one_page - write out a single page and optionally wait on I/O + * write_one_page - write out a single page and wait on I/O * @page: the page to write - * @wait: if true, wait on writeout * * The page must be locked by the caller and will be unlocked upon return. * - * write_one_page() returns a negative error code if I/O failed. + * write_one_page() returns a negative error code if I/O failed. Note that + * the address_space is not marked for error. The caller must do this if + * needed. */ -int write_one_page(struct page *page, int wait) +int write_one_page(struct page *page) { struct address_space *mapping = page->mapping; int ret = 0; @@ -2385,13 +2386,12 @@ int write_one_page(struct page *page, int wait) BUG_ON(!PageLocked(page)); - if (wait) - wait_on_page_writeback(page); + wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { get_page(page); ret = mapping->a_ops->writepage(page, &wbc); - if (ret == 0 && wait) { + if (ret == 0) { wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; @@ -2433,8 +2433,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inode_attach_wb(inode, page); wb = inode_to_wb(inode); - inc_memcg_page_state(page, NR_FILE_DIRTY); - __inc_node_page_state(page, NR_FILE_DIRTY); + __inc_lruvec_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2455,8 +2454,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); + dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_SIZE); @@ -2712,8 +2710,7 @@ int clear_page_dirty_for_io(struct page *page) */ wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); + dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; @@ -2759,8 +2756,7 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - dec_memcg_page_state(page, NR_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK); + dec_lruvec_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } @@ -2814,8 +2810,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - inc_memcg_page_state(page, NR_WRITEBACK); - inc_node_page_state(page, NR_WRITEBACK); + inc_lruvec_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } unlock_page_memcg(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2302f250d6b1..b896897dcda7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -113,9 +113,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = { { [0] = 1UL } }, #endif -#ifdef CONFIG_MOVABLE_NODE [N_MEMORY] = { { [0] = 1UL } }, -#endif [N_CPU] = { { [0] = 1UL } }, #endif /* NUMA */ }; @@ -511,7 +509,7 @@ static int page_is_consistent(struct zone *zone, struct page *page) /* * Temporary debugging check for pages not lying within a given zone. */ -static int bad_range(struct zone *zone, struct page *page) +static int __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) return 1; @@ -521,7 +519,7 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } #else -static inline int bad_range(struct zone *zone, struct page *page) +static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) { return 0; } @@ -1297,8 +1295,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn) #endif #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { int nid; @@ -1320,8 +1319,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { return true; } -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { return true; } @@ -1365,7 +1365,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) return NULL; - start_page = pfn_to_page(start_pfn); + start_page = pfn_to_online_page(start_pfn); + if (!start_page) + return NULL; if (page_zone(start_page) != zone) return NULL; @@ -2214,7 +2216,11 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) int fallback_mt; bool can_steal; - /* Find the largest possible block of pages in the other list */ + /* + * Find the largest available free page in the other list. This roughly + * approximates finding the pageblock with the most free pages, which + * would be too costly to do exactly. + */ for (current_order = MAX_ORDER-1; current_order >= order && current_order <= MAX_ORDER-1; --current_order) { @@ -2224,19 +2230,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) if (fallback_mt == -1) continue; - page = list_first_entry(&area->free_list[fallback_mt], - struct page, lru); + /* + * We cannot steal all free pages from the pageblock and the + * requested migratetype is movable. In that case it's better to + * steal and split the smallest available page instead of the + * largest available page, because even if the next movable + * allocation falls back into a different pageblock than this + * one, it won't cause permanent fragmentation. + */ + if (!can_steal && start_migratetype == MIGRATE_MOVABLE + && current_order > order) + goto find_smallest; - steal_suitable_fallback(zone, page, start_migratetype, - can_steal); + goto do_steal; + } - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, fallback_mt); + return false; - return true; +find_smallest: + for (current_order = order; current_order < MAX_ORDER; + current_order++) { + area = &(zone->free_area[current_order]); + fallback_mt = find_suitable_fallback(area, current_order, + start_migratetype, false, &can_steal); + if (fallback_mt != -1) + break; } - return false; + /* + * This should not happen - we already found a suitable fallback + * when looking for the largest page. + */ + VM_BUG_ON(current_order == MAX_ORDER); + +do_steal: + page = list_first_entry(&area->free_list[fallback_mt], + struct page, lru); + + steal_suitable_fallback(zone, page, start_migratetype, can_steal); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + + return true; + } /* @@ -3673,6 +3710,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, return false; } +static inline bool +check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) +{ + /* + * It's possible that cpuset's mems_allowed and the nodemask from + * mempolicy don't intersect. This should be normally dealt with by + * policy_nodemask(), but it's possible to race with cpuset update in + * such a way the check therein was true, and then it became false + * before we got our cpuset_mems_cookie here. + * This assumes that for all allocations, ac->nodemask can come only + * from MPOL_BIND mempolicy (whose documented semantics is to be ignored + * when it does not intersect with the cpuset restrictions) or the + * caller can deal with a violated nodemask. + */ + if (cpusets_enabled() && ac->nodemask && + !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { + ac->nodemask = NULL; + return true; + } + + /* + * When updating a task's mems_allowed or mempolicy nodemask, it is + * possible to race with parallel threads in such a way that our + * allocation can fail while the mask is being updated. If we are about + * to fail, check if the cpuset changed during allocation and if so, + * retry. + */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + return true; + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -3868,11 +3938,9 @@ retry: &compaction_retries)) goto retry; - /* - * It's possible we raced with cpuset update so the OOM would be - * premature (see below the nopage: label for full explanation). - */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + + /* Deal with possible cpuset update races before we start OOM killing */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* Reclaim has failed us, start killing things */ @@ -3893,14 +3961,8 @@ retry: } nopage: - /* - * When updating a task's mems_allowed or mempolicy nodemask, it is - * possible to race with parallel threads in such a way that our - * allocation can fail while the mask is being updated. If we are about - * to fail, check if the cpuset changed during allocation and if so, - * retry. - */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + /* Deal with possible cpuset update races before we fail */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* @@ -3951,12 +4013,12 @@ got_pg: } static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask, + int preferred_nid, nodemask_t *nodemask, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { ac->high_zoneidx = gfp_zone(gfp_mask); - ac->zonelist = zonelist; + ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; ac->migratetype = gfpflags_to_migratetype(gfp_mask); @@ -4001,8 +4063,8 @@ static inline void finalise_ac(gfp_t gfp_mask, * This is the 'heart' of the zoned buddy allocator. */ struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4010,7 +4072,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct alloc_context ac = { }; gfp_mask &= gfp_allowed_mask; - if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; finalise_ac(gfp_mask, order, &ac); @@ -4614,8 +4676,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " slab_reclaimable:%lukB" - " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" " bounce:%lukB" @@ -4637,8 +4697,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), - K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), @@ -5124,6 +5182,7 @@ static void build_zonelists(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void setup_zone_pageset(struct zone *zone); /* @@ -5528,7 +5587,7 @@ static __meminit void zone_pcp_init(struct zone *zone) zone_batchsize(zone)); } -int __meminit init_currently_empty_zone(struct zone *zone, +void __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size) { @@ -5546,8 +5605,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_init_free_lists(zone); zone->initialized = 1; - - return 0; } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5794,6 +5851,11 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); + + /* If this node has no page within this zone, return 0. */ + if (zone_start_pfn == zone_end_pfn) + return 0; + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); /* @@ -6005,7 +6067,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - int ret; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6027,6 +6088,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); + pgdat->per_cpu_nodestats = &boot_nodestats; + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; @@ -6087,8 +6150,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, size); - BUG_ON(ret); + init_currently_empty_zone(zone, zone_start_pfn, size); memmap_init(size, nid, j, zone_start_pfn); } } @@ -7182,6 +7244,21 @@ static unsigned long __init arch_reserved_kernel_pages(void) #endif /* + * Adaptive scale is meant to reduce sizes of hash tables on large memory + * machines. As memory size is increased the scale is also increased but at + * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory + * quadruples the scale is increased by one, which means the size of hash table + * only doubles, instead of quadrupling as well. + * Because 32-bit systems cannot have large physical memory, where this scaling + * makes sense, it is disabled on such platforms. + */ +#if __BITS_PER_LONG > 32 +#define ADAPT_SCALE_BASE (64ul << 30) +#define ADAPT_SCALE_SHIFT 2 +#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) +#endif + +/* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 * quantity of entries @@ -7200,6 +7277,7 @@ void *__init alloc_large_system_hash(const char *tablename, unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; + gfp_t gfp_flags; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -7211,6 +7289,16 @@ void *__init alloc_large_system_hash(const char *tablename, if (PAGE_SHIFT < 20) numentries = round_up(numentries, (1<<20)/PAGE_SIZE); +#if __BITS_PER_LONG > 32 + if (!high_limit) { + unsigned long adapt; + + for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; + adapt <<= ADAPT_SCALE_SHIFT) + scale++; + } +#endif + /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) numentries >>= (scale - PAGE_SHIFT); @@ -7244,12 +7332,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); + /* + * memblock allocator returns zeroed memory already, so HASH_ZERO is + * currently not used when HASH_EARLY is specified. + */ + gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; if (flags & HASH_EARLY) table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) - table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags, PAGE_KERNEL); else { /* * If bucketsize is not a power-of-two, we may free @@ -7257,8 +7350,8 @@ void *__init alloc_large_system_hash(const char *tablename, * alloc_pages_exact() automatically does */ if (get_order(size) < MAX_ORDER) { - table = alloc_pages_exact(size, GFP_ATOMIC); - kmemleak_alloc(table, size, 1, GFP_ATOMIC); + table = alloc_pages_exact(size, gfp_flags); + kmemleak_alloc(table, size, 1, gfp_flags); } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -7660,6 +7753,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) break; if (pfn == end_pfn) return; + offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); pfn = start_pfn; diff --git a/mm/page_io.c b/mm/page_io.c index 23f6d0d3470f..d0b78a94208d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -117,6 +117,7 @@ static void swap_slot_free_notify(struct page *page) static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; + struct task_struct *waiter = bio->bi_private; if (bio->bi_error) { SetPageError(page); @@ -132,7 +133,9 @@ static void end_swap_bio_read(struct bio *bio) swap_slot_free_notify(page); out: unlock_page(page); + WRITE_ONCE(bio->bi_private, NULL); bio_put(bio); + wake_up_process(waiter); } int generic_swapfile_activate(struct swap_info_struct *sis, @@ -329,11 +332,13 @@ out: return ret; } -int swap_readpage(struct page *page) +int swap_readpage(struct page *page, bool do_poll) { struct bio *bio; int ret = 0; struct swap_info_struct *sis = page_swap_info(page); + blk_qc_t qc; + struct block_device *bdev; VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -372,9 +377,23 @@ int swap_readpage(struct page *page) ret = -ENOMEM; goto out; } + bdev = bio->bi_bdev; + bio->bi_private = current; bio_set_op_attrs(bio, REQ_OP_READ, 0); count_vm_event(PSWPIN); - submit_bio(bio); + bio_get(bio); + qc = submit_bio(bio); + while (do_poll) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio->bi_private)) + break; + + if (!blk_mq_poll(bdev_get_queue(bdev), qc)) + break; + } + __set_current_state(TASK_RUNNING); + bio_put(bio); + out: return ret; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5092e4ef00c8..3606104893e0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -138,12 +138,18 @@ static inline struct page * __first_valid_page(unsigned long pfn, unsigned long nr_pages) { int i; - for (i = 0; i < nr_pages; i++) - if (pfn_valid_within(pfn + i)) - break; - if (unlikely(i == nr_pages)) - return NULL; - return pfn_to_page(pfn + i); + + for (i = 0; i < nr_pages; i++) { + struct page *page; + + if (!pfn_valid_within(pfn + i)) + continue; + page = pfn_to_online_page(pfn + i); + if (!page) + continue; + return page; + } + return NULL; } /* @@ -184,8 +190,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, undo: for (pfn = start_pfn; pfn < undo_pfn; - pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn), migratetype); + pfn += pageblock_nr_pages) { + struct page *page = pfn_to_online_page(pfn); + if (!page) + continue; + unset_migratetype_isolate(page, migratetype); + } return -EBUSY; } diff --git a/mm/page_owner.c b/mm/page_owner.c index 60634dc53a88..c3cee247f2e6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } @@ -527,7 +527,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index de9c40d7304a..8ec6ba230bb9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address); + pvmw->pte = huge_pte_offset(mm, pvmw->address, + PAGE_SIZE << compound_order(page)); if (!pvmw->pte) return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 60f7856e508f..1a4197965415 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); + unsigned long sz = huge_page_size(h); pte_t *pte; int err = 0; do { next = hugetlb_entry_end(h, addr, end); - pte = huge_pte_offset(walk->mm, addr & hmask); + pte = huge_pte_offset(walk->mm, addr & hmask, sz); if (pte && walk->hugetlb_entry) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); if (err) diff --git a/mm/rmap.c b/mm/rmap.c index 130c238fe384..ced14f1af6dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1145,8 +1145,7 @@ void page_add_file_rmap(struct page *page, bool compound) if (!atomic_inc_and_test(&page->_mapcount)) goto out; } - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1181,12 +1180,11 @@ static void page_remove_file_rmap(struct page *page, bool compound) } /* - * We use the irq-unsafe __{inc|mod}_zone_page_state because + * We use the irq-unsafe __{inc|mod}_lruvec_page_state because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1367,15 +1365,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { int nr = 1 << compound_order(page); hugetlb_count_sub(nr, mm); + set_huge_swap_pte_at(mm, address, + pvmw.pte, pteval, + vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); + set_pte_at(mm, address, pvmw.pte, pteval); } - pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); - set_pte_at(mm, address, pvmw.pte, pteval); } else if (pte_unused(pteval)) { /* * The guest indicated that the page content is of no diff --git a/mm/shmem.c b/mm/shmem.c index 391f2dcca727..3b5c4f31248a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1291,7 +1291,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) SetPageUptodate(page); } - swap = get_swap_page(); + swap = get_swap_page(page); if (!swap.val) goto redirty; @@ -1327,7 +1327,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) mutex_unlock(&shmem_swaplist_mutex); free_swap: - swapcache_free(swap); + put_swap_page(page, swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) @@ -1646,8 +1646,7 @@ repeat: if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(charge_mm, - PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); @@ -1978,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf) } sgp = SGP_CACHE; - if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - else if (vma->vm_flags & VM_NOHUGEPAGE) + + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) sgp = SGP_NOHUGE; + else if (vma->vm_flags & VM_HUGEPAGE) + sgp = SGP_HUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); diff --git a/mm/slab.c b/mm/slab.c index 2a31ee3c5814..908908aa8250 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1425,11 +1425,9 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - add_zone_page_state(page_zone(page), - NR_SLAB_RECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages); else - add_zone_page_state(page_zone(page), - NR_SLAB_UNRECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ @@ -1459,11 +1457,9 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - sub_zone_page_state(page_zone(page), - NR_SLAB_RECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); else - sub_zone_page_state(page_zone(page), - NR_SLAB_UNRECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed); BUG_ON(!PageSlab(page)); __ClearPageSlabPfmemalloc(page); diff --git a/mm/slab.h b/mm/slab.h index 9cfcf099709c..6885e1192ec5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -274,22 +274,11 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - int ret; - if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - - ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); - if (ret) - return ret; - - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, - 1 << order); - return 0; + return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); } static __always_inline void memcg_uncharge_slab(struct page *page, int order, @@ -297,11 +286,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, { if (!memcg_kmem_enabled()) return; - - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, - -(1 << order)); memcg_kmem_uncharge(page, order); } diff --git a/mm/slub.c b/mm/slub.c index 7449593fca72..632cc0929f9c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1615,7 +1615,7 @@ out: if (!page) return NULL; - mod_zone_page_state(page_zone(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); @@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) kmemcheck_free_shadow(page, compound_order(page)); - mod_zone_page_state(page_zone(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); @@ -1829,7 +1829,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, stat(s, CPU_PARTIAL_NODE); } if (!kmem_cache_has_cpu_partial(s) - || available > s->cpu_partial / 2) + || available > slub_cpu_partial(s) / 2) break; } @@ -1993,7 +1993,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct page *page, - void *freelist) + void *freelist, struct kmem_cache_cpu *c) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -2132,6 +2132,9 @@ redo: discard_slab(s, page); stat(s, FREE_SLAB); } + + c->page = NULL; + c->freelist = NULL; } /* @@ -2266,11 +2269,9 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c->page, c->freelist); + deactivate_slab(s, c->page, c->freelist, c); c->tid = next_tid(c->tid); - c->page = NULL; - c->freelist = NULL; } /* @@ -2302,7 +2303,7 @@ static bool has_cpu_slab(int cpu, void *info) struct kmem_cache *s = info; struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - return c->page || c->partial; + return c->page || slub_percpu_partial(c); } static void flush_all(struct kmem_cache *s) @@ -2521,9 +2522,7 @@ redo: if (unlikely(!node_match(page, searchnode))) { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } } @@ -2534,9 +2533,7 @@ redo: * information when the page leaves the per-cpu allocator */ if (unlikely(!pfmemalloc_match(page, gfpflags))) { - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } @@ -2568,11 +2565,10 @@ load_freelist: new_slab: - if (c->partial) { - page = c->page = c->partial; - c->partial = page->next; + if (slub_percpu_partial(c)) { + page = c->page = slub_percpu_partial(c); + slub_set_percpu_partial(c, page); stat(s, CPU_PARTIAL_ALLOC); - c->freelist = NULL; goto redo; } @@ -2592,9 +2588,7 @@ new_slab: !alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. Next slab needed */ - deactivate_slab(s, page, get_freepointer(s, freelist)); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, get_freepointer(s, freelist), c); return freelist; } @@ -3410,6 +3404,39 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) s->min_partial = min; } +static void set_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch + * 50% to keep some capacity around for frees. + */ + if (!kmem_cache_has_cpu_partial(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; +#endif +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -3568,33 +3595,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) */ set_min_partial(s, ilog2(s->size) / 2); - /* - * cpu_partial determined the maximum number of objects kept in the - * per cpu partial lists of a processor. - * - * Per cpu partial lists mainly contain slabs that just have one - * object freed. If they are used for allocation then they can be - * filled up again with minimal effort. The slab will never hit the - * per node partial lists and therefore no locking will be required. - * - * This setting also determines - * - * A) The number of objects from per cpu partial slabs dumped to the - * per node list when we reach the limit. - * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch - * 50% to keep some capacity around for frees. - */ - if (!kmem_cache_has_cpu_partial(s)) - s->cpu_partial = 0; - else if (s->size >= PAGE_SIZE) - s->cpu_partial = 2; - else if (s->size >= 1024) - s->cpu_partial = 6; - else if (s->size >= 256) - s->cpu_partial = 13; - else - s->cpu_partial = 30; + set_cpu_partial(s); #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -3981,7 +3982,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) * Disable empty slabs caching. Used to avoid pinning offline * memory cgroups by kmem pages that can be freed. */ - s->cpu_partial = 0; + slub_set_cpu_partial(s, 0); s->min_partial = 0; /* @@ -4760,7 +4761,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[node] += x; - page = READ_ONCE(c->partial); + page = slub_percpu_partial_read_once(c); if (page) { node = page_to_nid(page); if (flags & SO_TOTAL) @@ -4921,7 +4922,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->cpu_partial); + return sprintf(buf, "%u\n", slub_cpu_partial(s)); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -4936,7 +4937,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, if (objects && !kmem_cache_has_cpu_partial(s)) return -EINVAL; - s->cpu_partial = objects; + slub_set_cpu_partial(s, objects); flush_all(s); return length; } @@ -4988,7 +4989,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) int len; for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page) { pages += page->pages; @@ -5000,7 +5003,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) #ifdef CONFIG_SMP for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page && len < PAGE_SIZE - 20) len += sprintf(buf + len, " C%d=%d(%d)", cpu, diff --git a/mm/sparse.c b/mm/sparse.c index 6903c8fc3085..7b4be3fd5cac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -168,6 +168,44 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, } } +/* + * There are a number of times that we loop over NR_MEM_SECTIONS, + * looking for section_present() on each. But, when we have very + * large physical address spaces, NR_MEM_SECTIONS can also be + * very large which makes the loops quite long. + * + * Keeping track of this gives us an easy way to break out of + * those loops early. + */ +int __highest_present_section_nr; +static void section_mark_present(struct mem_section *ms) +{ + int section_nr = __section_nr(ms); + + if (section_nr > __highest_present_section_nr) + __highest_present_section_nr = section_nr; + + ms->section_mem_map |= SECTION_MARKED_PRESENT; +} + +static inline int next_present_section_nr(int section_nr) +{ + do { + section_nr++; + if (present_section_nr(section_nr)) + return section_nr; + } while ((section_nr < NR_MEM_SECTIONS) && + (section_nr <= __highest_present_section_nr)); + + return -1; +} +#define for_each_present_section_nr(start, section_nr) \ + for (section_nr = next_present_section_nr(start-1); \ + ((section_nr >= 0) && \ + (section_nr < NR_MEM_SECTIONS) && \ + (section_nr <= __highest_present_section_nr)); \ + section_nr = next_present_section_nr(section_nr)) + /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -183,9 +221,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) set_section_nid(section, nid); ms = __nr_to_section(section); - if (!ms->section_mem_map) + if (!ms->section_mem_map) { ms->section_mem_map = sparse_encode_early_nid(nid) | - SECTION_MARKED_PRESENT; + SECTION_IS_ONLINE; + section_mark_present(ms); + } } } @@ -476,23 +516,19 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) int nodeid_begin = 0; unsigned long pnum_begin = 0; - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(0, pnum) { struct mem_section *ms; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid_begin = sparse_early_nid(ms); pnum_begin = pnum; break; } map_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(pnum_begin + 1, pnum) { struct mem_section *ms; int nodeid; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid = sparse_early_nid(ms); if (nodeid == nodeid_begin) { @@ -561,10 +597,7 @@ void __init sparse_init(void) (void *)map_map); #endif - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - if (!present_section_nr(pnum)) - continue; - + for_each_present_section_nr(0, pnum) { usemap = usemap_map[pnum]; if (!usemap) continue; @@ -590,6 +623,48 @@ void __init sparse_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + + /* onlining code should never touch invalid ranges */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map |= SECTION_IS_ONLINE; + } +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* Mark all memory sections within the pfn range as online */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + + /* + * TODO this needs some double checking. Offlining code makes + * sure to check pfn_valid but those checks might be just bogus + */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map &= ~SECTION_IS_ONLINE; + } +} +#endif + #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { @@ -686,10 +761,9 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct pglist_data *pgdat = zone->zone_pgdat; struct mem_section *ms; struct page *memmap; unsigned long *usemap; @@ -722,7 +796,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); - ms->section_mem_map |= SECTION_MARKED_PRESENT; + section_mark_present(ms); ret = sparse_init_one_section(ms, section_nr, memmap, usemap); diff --git a/mm/swap.c b/mm/swap.c index 98d08b4579fa..4f44dbd7f780 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -591,6 +591,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + count_memcg_page_event(page, PGLAZYFREE); update_page_reclaim_stat(lruvec, 1, 0); } } diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index ac6318a064d3..fcd2740f4ed7 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type) if (!page) goto not_enough_page; ctrl->map[idx] = page; + + if (!(idx % SWAP_CLUSTER_MAX)) + cond_resched(); } return 0; not_enough_page: @@ -58,21 +61,27 @@ not_enough_page: return -ENOMEM; } +static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, + pgoff_t offset) +{ + struct page *mappage; + struct swap_cgroup *sc; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, struct swap_cgroup_ctrl **ctrlp) { pgoff_t offset = swp_offset(ent); struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; if (ctrlp) *ctrlp = ctrl; - - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; + return __lookup_swap_cgroup(ctrl, offset); } /** @@ -105,25 +114,39 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, } /** - * swap_cgroup_record - record mem_cgroup for this swp_entry. - * @ent: swap entry to be recorded into + * swap_cgroup_record - record mem_cgroup for a set of swap entries + * @ent: the first swap entry to be recorded into * @id: mem_cgroup to be recorded + * @nr_ents: number of swap entries to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) */ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents) { struct swap_cgroup_ctrl *ctrl; struct swap_cgroup *sc; unsigned short old; unsigned long flags; + pgoff_t offset = swp_offset(ent); + pgoff_t end = offset + nr_ents; sc = lookup_swap_cgroup(ent, &ctrl); spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; - sc->id = id; + for (;;) { + VM_BUG_ON(sc->id != old); + sc->id = id; + offset++; + if (offset == end) + break; + if (offset % SC_PER_PAGE) + sc++; + else + sc = __lookup_swap_cgroup(ctrl, offset); + } spin_unlock_irqrestore(&ctrl->lock, flags); return old; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 58f6c78f1dad..90c1032a8ac3 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -263,7 +263,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, + cache->slots); return cache->nr; } @@ -301,11 +302,19 @@ direct_free: return 0; } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry, *pentry; struct swap_slots_cache *cache; + entry.val = 0; + + if (PageTransHuge(page)) { + if (IS_ENABLED(CONFIG_THP_SWAP)) + get_swap_pages(1, true, &entry); + return entry; + } + /* * Preemption is allowed here, because we may sleep * in refill_swap_slots_cache(). But it is safe, because @@ -317,7 +326,6 @@ swp_entry_t get_swap_page(void) */ cache = raw_cpu_ptr(&swp_slots); - entry.val = 0; if (check_cache_active()) { mutex_lock(&cache->alloc_lock); if (cache->slots) { @@ -337,7 +345,7 @@ repeat: return entry; } - get_swap_pages(1, &entry); + get_swap_pages(1, false, &entry); return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 539b8885e3d1..b68c93014f50 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -19,6 +19,7 @@ #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/swap_slots.h> +#include <linux/huge_mm.h> #include <asm/pgtable.h> @@ -38,6 +39,7 @@ struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) +#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) static struct { unsigned long add_total; @@ -90,39 +92,46 @@ void show_swap_cache_info(void) */ int __add_to_swap_cache(struct page *page, swp_entry_t entry) { - int error; + int error, i, nr = hpage_nr_pages(page); struct address_space *address_space; + pgoff_t idx = swp_offset(entry); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - get_page(page); + page_ref_add(page, nr); SetPageSwapCache(page); - set_page_private(page, entry.val); address_space = swap_address_space(entry); spin_lock_irq(&address_space->tree_lock); - error = radix_tree_insert(&address_space->page_tree, - swp_offset(entry), page); - if (likely(!error)) { - address_space->nrpages++; - __inc_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); + for (i = 0; i < nr; i++) { + set_page_private(page + i, entry.val + i); + error = radix_tree_insert(&address_space->page_tree, + idx + i, page + i); + if (unlikely(error)) + break; } - spin_unlock_irq(&address_space->tree_lock); - - if (unlikely(error)) { + if (likely(!error)) { + address_space->nrpages += nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + ADD_CACHE_INFO(add_total, nr); + } else { /* * Only the context which have set SWAP_HAS_CACHE flag * would call add_to_swap_cache(). * So add_to_swap_cache() doesn't returns -EEXIST. */ VM_BUG_ON(error == -EEXIST); - set_page_private(page, 0UL); + set_page_private(page + i, 0UL); + while (i--) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0UL); + } ClearPageSwapCache(page); - put_page(page); + page_ref_sub(page, nr); } + spin_unlock_irq(&address_space->tree_lock); return error; } @@ -132,7 +141,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_maybe_preload(gfp_mask); + error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -146,8 +155,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t entry; struct address_space *address_space; + int i, nr = hpage_nr_pages(page); + swp_entry_t entry; + pgoff_t idx; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); @@ -155,12 +166,15 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, swp_offset(entry)); - set_page_private(page, 0); + idx = swp_offset(entry); + for (i = 0; i < nr; i++) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0); + } ClearPageSwapCache(page); - address_space->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(del_total); + address_space->nrpages -= nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + ADD_CACHE_INFO(del_total, nr); } /** @@ -170,7 +184,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page, struct list_head *list) +int add_to_swap(struct page *page) { swp_entry_t entry; int err; @@ -178,20 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - entry = get_swap_page(); + entry = get_swap_page(page); if (!entry.val) return 0; - if (mem_cgroup_try_charge_swap(page, entry)) { - swapcache_free(entry); - return 0; - } - - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page_to_list(page, list))) { - swapcache_free(entry); - return 0; - } + if (mem_cgroup_try_charge_swap(page, entry)) + goto fail; /* * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -206,17 +212,19 @@ int add_to_swap(struct page *page, struct list_head *list) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - if (!err) { - return 1; - } else { /* -ENOMEM radix-tree allocation failure */ + /* -ENOMEM radix-tree allocation failure */ + if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry); - return 0; - } + goto fail; + + return 1; + +fail: + put_swap_page(page, entry); + return 0; } /* @@ -237,8 +245,8 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - swapcache_free(entry); - put_page(page); + put_swap_page(page, entry); + page_ref_sub(page, hpage_nr_pages(page)); } /* @@ -295,7 +303,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), swp_offset(entry)); - if (page) { + if (page && likely(!PageTransCompound(page))) { INC_CACHE_INFO(find_success); if (TestClearPageReadahead(page)) atomic_inc(&swapin_readahead_hits); @@ -389,7 +397,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry); + put_swap_page(new_page, entry); } while (err != -ENOMEM); if (new_page) @@ -404,14 +412,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * the swap entry is no longer in use. */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) + struct vm_area_struct *vma, unsigned long addr, bool do_poll) { bool page_was_allocated; struct page *retpage = __read_swap_cache_async(entry, gfp_mask, vma, addr, &page_was_allocated); if (page_was_allocated) - swap_readpage(retpage); + swap_readpage(retpage, do_poll); return retpage; } @@ -488,11 +496,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long start_offset, end_offset; unsigned long mask; struct blk_plug plug; + bool do_poll = true; mask = swapin_nr_pages(offset) - 1; if (!mask) goto skip; + do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -503,10 +513,10 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = read_swap_cache_async(swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr); + gfp_mask, vma, addr, false); if (!page) continue; - if (offset != entry_offset) + if (offset != entry_offset && likely(!PageTransCompound(page))) SetPageReadahead(page); put_page(page); } @@ -514,7 +524,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: - return read_swap_cache_async(entry, gfp_mask, vma, addr); + return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll); } int init_swap_address_space(unsigned int type, unsigned long nr_pages) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f6cba1b6632..6ba4aab2db0b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -37,6 +37,7 @@ #include <linux/swapfile.h> #include <linux/export.h> #include <linux/swap_slots.h> +#include <linux/sort.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -199,7 +200,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR +#else #define SWAPFILE_CLUSTER 256 +#endif #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, @@ -374,6 +379,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, schedule_work(&si->discard_work); } +static void __free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); + cluster_list_add_tail(&si->free_clusters, ci, idx); +} + /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. caller should hold si->lock. @@ -394,10 +407,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) spin_lock(&si->lock); ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); - cluster_set_flag(ci, CLUSTER_FLAG_FREE); - unlock_cluster(ci); - cluster_list_add_tail(&si->free_clusters, info, idx); - ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); + __free_cluster(si, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); unlock_cluster(ci); @@ -415,6 +425,34 @@ static void swap_discard_work(struct work_struct *work) spin_unlock(&si->lock); } +static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); + cluster_list_del_first(&si->free_clusters, ci); + cluster_set_count_flag(ci + idx, 0, 0); +} + +static void free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info + idx; + + VM_BUG_ON(cluster_count(ci) != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed + * after discard. + */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { + swap_cluster_schedule_discard(si, idx); + return; + } + + __free_cluster(si, idx); +} + /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased. @@ -426,11 +464,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (!cluster_info) return; - if (cluster_is_free(&cluster_info[idx])) { - VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); - cluster_list_del_first(&p->free_clusters, cluster_info); - cluster_set_count_flag(&cluster_info[idx], 0, 0); - } + if (cluster_is_free(&cluster_info[idx])) + alloc_cluster(p, idx); VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], @@ -454,21 +489,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p, cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) - 1); - if (cluster_count(&cluster_info[idx]) == 0) { - /* - * If the swap is discardable, prepare discard the cluster - * instead of free it immediately. The cluster will be freed - * after discard. - */ - if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == - (SWP_WRITEOK | SWP_PAGE_DISCARD)) { - swap_cluster_schedule_discard(p, idx); - return; - } - - cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - cluster_list_add_tail(&p->free_clusters, cluster_info, idx); - } + if (cluster_count(&cluster_info[idx]) == 0) + free_cluster(p, idx); } /* @@ -558,6 +580,60 @@ new_cluster: return found_free; } +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned int end = offset + nr_entries - 1; + + if (offset == si->lowest_bit) + si->lowest_bit += nr_entries; + if (end == si->highest_bit) + si->highest_bit -= nr_entries; + si->inuse_pages += nr_entries; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } +} + +static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned long end = offset + nr_entries - 1; + void (*swap_slot_free_notify)(struct block_device *, unsigned long); + + if (offset < si->lowest_bit) + si->lowest_bit = offset; + if (end > si->highest_bit) { + bool was_full = !si->highest_bit; + + si->highest_bit = end; + if (was_full && (si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&si->avail_list)); + if (plist_node_empty(&si->avail_list)) + plist_add(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } + atomic_long_add(nr_entries, &nr_swap_pages); + si->inuse_pages -= nr_entries; + if (si->flags & SWP_BLKDEV) + swap_slot_free_notify = + si->bdev->bd_disk->fops->swap_slot_free_notify; + else + swap_slot_free_notify = NULL; + while (offset <= end) { + frontswap_invalidate_page(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; + } +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -676,18 +752,7 @@ checks: inc_cluster_info_page(si, si->cluster_info, offset); unlock_cluster(ci); - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - si->inuse_pages++; - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + swap_range_alloc(si, offset, 1); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); @@ -766,6 +831,52 @@ no_page: return n_ret; } +#ifdef CONFIG_THP_SWAP +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + unsigned long idx; + struct swap_cluster_info *ci; + unsigned long offset, i; + unsigned char *map; + + if (cluster_list_empty(&si->free_clusters)) + return 0; + + idx = cluster_list_first(&si->free_clusters); + offset = idx * SWAPFILE_CLUSTER; + ci = lock_cluster(si, offset); + alloc_cluster(si, idx); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] = SWAP_HAS_CACHE; + unlock_cluster(ci); + swap_range_alloc(si, offset, SWAPFILE_CLUSTER); + *slot = swp_entry(si->type, offset); + + return 1; +} + +static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + unsigned long offset = idx * SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + cluster_set_count_flag(ci, 0, 0); + free_cluster(si, idx); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); +} +#else +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + VM_WARN_ON_ONCE(1); + return 0; +} +#endif /* CONFIG_THP_SWAP */ + static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) { @@ -781,13 +892,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) { + unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; - avail_pgs = atomic_long_read(&nr_swap_pages); + /* Only single cluster request supported */ + WARN_ON_ONCE(n_goal > 1 && cluster); + + avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; if (avail_pgs <= 0) goto noswap; @@ -797,7 +912,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal, &nr_swap_pages); + atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -823,10 +938,13 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries); + if (cluster) + n_ret = swap_alloc_cluster(si, swp_entries); + else + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret) + if (n_ret || cluster) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -852,7 +970,8 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); + atomic_long_add((long)(n_goal - n_ret) * nr_pages, + &nr_swap_pages); noswap: return n_ret; } @@ -1008,32 +1127,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) { - bool was_full = !p->highest_bit; - - p->highest_bit = offset; - if (was_full && (p->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&p->avail_list)); - if (plist_node_empty(&p->avail_list)) - plist_add(&p->avail_list, - &swap_avail_head); - spin_unlock(&swap_avail_lock); - } - } - atomic_long_inc(&nr_swap_pages); - p->inuse_pages--; - frontswap_invalidate_page(p->type, offset); - if (p->flags & SWP_BLKDEV) { - struct gendisk *disk = p->bdev->bd_disk; - - if (disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, - offset); - } + mem_cgroup_uncharge_swap(entry, 1); + swap_range_free(p, offset, 1); } /* @@ -1054,7 +1149,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void swapcache_free(swp_entry_t entry) +static void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; @@ -1065,6 +1160,52 @@ void swapcache_free(swp_entry_t entry) } } +#ifdef CONFIG_THP_SWAP +static void swapcache_free_cluster(swp_entry_t entry) +{ + unsigned long offset = swp_offset(entry); + unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned char *map; + unsigned int i; + + si = swap_info_get(entry); + if (!si) + return; + + ci = lock_cluster(si, offset); + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + VM_BUG_ON(map[i] != SWAP_HAS_CACHE); + map[i] = 0; + } + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); +} +#else +static inline void swapcache_free_cluster(swp_entry_t entry) +{ +} +#endif /* CONFIG_THP_SWAP */ + +void put_swap_page(struct page *page, swp_entry_t entry) +{ + if (!PageTransHuge(page)) + swapcache_free(entry); + else + swapcache_free_cluster(entry); +} + +static int swp_entry_cmp(const void *ent1, const void *ent2) +{ + const swp_entry_t *e1 = ent1, *e2 = ent2; + + return (int)swp_type(*e1) - (int)swp_type(*e2); +} + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; @@ -1075,6 +1216,14 @@ void swapcache_free_entries(swp_entry_t *entries, int n) prev = NULL; p = NULL; + + /* + * Sort swap entries by swap device, so each lock is only taken once. + * nr_swapfiles isn't absolutely correct, but the overhead of sort() is + * so low that it isn't necessary to optimize further. + */ + if (nr_swapfiles > 1) + sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) @@ -1719,7 +1868,7 @@ int try_to_unuse(unsigned int type, bool frontswap, swap_map = &si->swap_map[i]; entry = swp_entry(type, i); page = read_swap_cache_async(entry, - GFP_HIGHUSER_MOVABLE, NULL, 0); + GFP_HIGHUSER_MOVABLE, NULL, 0, false); if (!page) { /* * Either swap_duplicate() failed because entry diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 34a1c3e46ed7..96db9005af5a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -314,6 +314,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ +#define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); @@ -1486,6 +1487,7 @@ struct vm_struct *remove_vm_area(const void *addr) spin_lock(&vmap_area_lock); va->vm = NULL; va->flags &= ~VM_VM_AREA; + va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); @@ -1759,12 +1761,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */ clear_vm_uninitialized_flag(area); - /* - * A ref_count = 2 is needed because vm_struct allocated in - * __get_vm_area_node() contains a reference to the virtual address of - * the vmalloc'ed block. - */ - kmemleak_alloc(addr, real_size, 2, gfp_mask); + kmemleak_vmalloc(area, size, gfp_mask); return addr; @@ -2698,8 +2695,14 @@ static int s_show(struct seq_file *m, void *p) * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. */ - if (!(va->flags & VM_VM_AREA)) + if (!(va->flags & VM_VM_AREA)) { + seq_printf(m, "0x%pK-0x%pK %7ld %s\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start, + va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); + return 0; + } v = va->vm; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 6063581f705c..0781b1363e0a 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -93,12 +93,25 @@ enum vmpressure_levels { VMPRESSURE_NUM_LEVELS, }; +enum vmpressure_modes { + VMPRESSURE_NO_PASSTHROUGH = 0, + VMPRESSURE_HIERARCHY, + VMPRESSURE_LOCAL, + VMPRESSURE_NUM_MODES, +}; + static const char * const vmpressure_str_levels[] = { [VMPRESSURE_LOW] = "low", [VMPRESSURE_MEDIUM] = "medium", [VMPRESSURE_CRITICAL] = "critical", }; +static const char * const vmpressure_str_modes[] = { + [VMPRESSURE_NO_PASSTHROUGH] = "default", + [VMPRESSURE_HIERARCHY] = "hierarchy", + [VMPRESSURE_LOCAL] = "local", +}; + static enum vmpressure_levels vmpressure_level(unsigned long pressure) { if (pressure >= vmpressure_level_critical) @@ -141,27 +154,31 @@ out: struct vmpressure_event { struct eventfd_ctx *efd; enum vmpressure_levels level; + enum vmpressure_modes mode; struct list_head node; }; static bool vmpressure_event(struct vmpressure *vmpr, - enum vmpressure_levels level) + const enum vmpressure_levels level, + bool ancestor, bool signalled) { struct vmpressure_event *ev; - bool signalled = false; + bool ret = false; mutex_lock(&vmpr->events_lock); - list_for_each_entry(ev, &vmpr->events, node) { - if (level >= ev->level) { - eventfd_signal(ev->efd, 1); - signalled = true; - } + if (ancestor && ev->mode == VMPRESSURE_LOCAL) + continue; + if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) + continue; + if (level < ev->level) + continue; + eventfd_signal(ev->efd, 1); + ret = true; } - mutex_unlock(&vmpr->events_lock); - return signalled; + return ret; } static void vmpressure_work_fn(struct work_struct *work) @@ -170,6 +187,8 @@ static void vmpressure_work_fn(struct work_struct *work) unsigned long scanned; unsigned long reclaimed; enum vmpressure_levels level; + bool ancestor = false; + bool signalled = false; spin_lock(&vmpr->sr_lock); /* @@ -194,12 +213,9 @@ static void vmpressure_work_fn(struct work_struct *work) level = vmpressure_calc_level(scanned, reclaimed); do { - if (vmpressure_event(vmpr, level)) - break; - /* - * If not handled, propagate the event upward into the - * hierarchy. - */ + if (vmpressure_event(vmpr, level, ancestor, signalled)) + signalled = true; + ancestor = true; } while ((vmpr = vmpressure_parent(vmpr))); } @@ -326,17 +342,40 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) vmpressure(gfp, memcg, true, vmpressure_win, 0); } +static enum vmpressure_levels str_to_level(const char *arg) +{ + enum vmpressure_levels level; + + for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) + if (!strcmp(vmpressure_str_levels[level], arg)) + return level; + return -1; +} + +static enum vmpressure_modes str_to_mode(const char *arg) +{ + enum vmpressure_modes mode; + + for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++) + if (!strcmp(vmpressure_str_modes[mode], arg)) + return mode; + return -1; +} + +#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) + /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with - * @args: event arguments (used to set up a pressure level threshold) + * @args: event arguments (pressure level threshold, optional mode) * * This function associates eventfd context with the vmpressure * infrastructure, so that the notifications will be delivered to the - * @eventfd. The @args parameter is a string that denotes pressure level - * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or - * "critical"). + * @eventfd. The @args parameter is a comma-delimited string that denotes a + * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", + * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. + * "hierarchy" or "local"). * * To be used as memcg event method. */ @@ -345,28 +384,53 @@ int vmpressure_register_event(struct mem_cgroup *memcg, { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; - int level; + enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; + enum vmpressure_levels level = -1; + char *spec = NULL; + char *token; + int ret = 0; + + spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL); + if (!spec) { + ret = -ENOMEM; + goto out; + } + strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN); - for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { - if (!strcmp(vmpressure_str_levels[level], args)) - break; + /* Find required level */ + token = strsep(&spec, ","); + level = str_to_level(token); + if (level == -1) { + ret = -EINVAL; + goto out; } - if (level >= VMPRESSURE_NUM_LEVELS) - return -EINVAL; + /* Find optional mode */ + token = strsep(&spec, ","); + if (token) { + mode = str_to_mode(token); + if (mode == -1) { + ret = -EINVAL; + goto out; + } + } ev = kzalloc(sizeof(*ev), GFP_KERNEL); - if (!ev) - return -ENOMEM; + if (!ev) { + ret = -ENOMEM; + goto out; + } ev->efd = eventfd; ev->level = level; + ev->mode = mode; mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock); - - return 0; +out: + kfree(spec); + return ret; } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index c3c1c6ac62da..f84cdd3751e1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -708,7 +708,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - swapcache_free(swap); + put_swap_page(page, swap); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -1125,8 +1125,36 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page, page_list)) + if (PageTransHuge(page)) { + /* cannot split THP, skip it */ + if (!can_split_huge_page(page, NULL)) + goto activate_locked; + /* + * Split pages without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. + */ + if (!compound_mapcount(page) && + split_huge_page_to_list(page, page_list)) + goto activate_locked; + } + if (!add_to_swap(page)) { + if (!PageTransHuge(page)) + goto activate_locked; + /* Split THP and swap individual base pages */ + if (split_huge_page_to_list(page, page_list)) + goto activate_locked; + if (!add_to_swap(page)) + goto activate_locked; + } + + /* XXX: We don't support THP writes */ + if (PageTransHuge(page) && + split_huge_page_to_list(page, page_list)) { + delete_from_swap_cache(page); goto activate_locked; + } + may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1266,6 +1294,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } count_vm_event(PGLAZYFREED); + count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -1295,6 +1324,7 @@ activate_locked: if (!PageMlocked(page)) { SetPageActive(page); pgactivate++; + count_memcg_page_event(page, PGACTIVATE); } keep_locked: unlock_page(page); @@ -1734,11 +1764,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); - else + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, + nr_scanned); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_DIRECT, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, + nr_scanned); } spin_unlock_irq(&pgdat->lru_lock); @@ -1750,11 +1785,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); - else + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, + nr_reclaimed); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, + nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); @@ -1899,8 +1939,11 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, } } - if (!is_active_lru(lru)) + if (!is_active_lru(lru)) { __count_vm_events(PGDEACTIVATE, nr_moved); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_moved); + } return nr_moved; } @@ -1938,6 +1981,7 @@ static void shrink_active_list(unsigned long nr_to_scan, reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -2184,8 +2228,17 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, } if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { - scan_balance = SCAN_ANON; - goto out; + /* + * Force SCAN_ANON if there are enough inactive + * anonymous pages on the LRU in eligible zones. + * Otherwise, the small LRU gets thrashed. + */ + if (!inactive_list_is_low(lruvec, false, memcg, sc, false) && + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) + >> sc->priority) { + scan_balance = SCAN_ANON; + goto out; + } } } @@ -2579,16 +2632,23 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - nr_scanned, node_lru_pages); + /* + * Record the subtree's reclaim efficiency. The reclaimed + * pages from slab is excluded here because the corresponding + * scanned pages is not accounted. Moreover, freeing a page + * by slab shrinking depends on each slab's object population, + * making the cost model (i.e. scan:free) different from that + * of LRU. + */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); - if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; @@ -2967,7 +3027,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2982,12 +3042,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * 1 is returned so that the page allocator does not OOM kill at this * point. */ - if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, - gfp_mask, + sc.gfp_mask, sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3774,17 +3834,16 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - int classzone_idx = gfp_zone(gfp_mask); unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, - .reclaim_idx = classzone_idx, + .reclaim_idx = gfp_zone(gfp_mask), }; cond_resched(); @@ -3795,7 +3854,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(gfp_mask); + lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3831,7 +3890,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && - sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 76f73670200a..744ceaeb42a0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -928,8 +928,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", "nr_bounce", @@ -952,6 +950,8 @@ const char * const vmstat_text[] = { "nr_inactive_file", "nr_active_file", "nr_unevictable", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", "workingset_refault", @@ -1018,6 +1018,7 @@ const char * const vmstat_text[] = { "drop_pagecache", "drop_slab", + "oom_kill", #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", @@ -1223,11 +1224,10 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { struct page *page; - if (!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - page = pfn_to_page(pfn); - /* Watch for unexpected holes punched in the memmap */ if (!memmap_valid_within(pfn, page, zone)) continue; @@ -1322,7 +1322,7 @@ static int fragmentation_open(struct inode *inode, struct file *file) return seq_open(file, &fragmentation_op); } -static const struct file_operations fragmentation_file_operations = { +static const struct file_operations buddyinfo_file_operations = { .open = fragmentation_open, .read = seq_read, .llseek = seq_lseek, @@ -1341,7 +1341,7 @@ static int pagetypeinfo_open(struct inode *inode, struct file *file) return seq_open(file, &pagetypeinfo_op); } -static const struct file_operations pagetypeinfo_file_ops = { +static const struct file_operations pagetypeinfo_file_operations = { .open = pagetypeinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1463,7 +1463,7 @@ static int zoneinfo_open(struct inode *inode, struct file *file) return seq_open(file, &zoneinfo_op); } -static const struct file_operations proc_zoneinfo_file_operations = { +static const struct file_operations zoneinfo_file_operations = { .open = zoneinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1552,7 +1552,7 @@ static int vmstat_open(struct inode *inode, struct file *file) return seq_open(file, &vmstat_op); } -static const struct file_operations proc_vmstat_file_operations = { +static const struct file_operations vmstat_file_operations = { .open = vmstat_open, .read = seq_read, .llseek = seq_lseek, @@ -1785,10 +1785,10 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif #ifdef CONFIG_PROC_FS - proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); - proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); - proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); - proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); + proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations); + proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations); + proc_create("vmstat", 0444, NULL, &vmstat_file_operations); + proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations); #endif } diff --git a/mm/workingset.c b/mm/workingset.c index b8c9ab678479..7119cd745ace 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -288,12 +288,10 @@ bool workingset_refault(void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; - inc_node_state(pgdat, WORKINGSET_REFAULT); - inc_memcg_state(memcg, WORKINGSET_REFAULT); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT); if (refault_distance <= active_file) { - inc_node_state(pgdat, WORKINGSET_ACTIVATE); - inc_memcg_state(memcg, WORKINGSET_ACTIVATE); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); rcu_read_unlock(); return true; } @@ -474,8 +472,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; - inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d41edd28298b..15959d35fc26 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -469,7 +469,7 @@ static bool is_zspage_isolated(struct zspage *zspage) return zspage->isolated; } -static int is_first_page(struct page *page) +static __maybe_unused int is_first_page(struct page *page) { return PagePrivate(page); } diff --git a/mm/zswap.c b/mm/zswap.c index eedc27894b10..d39581a076c3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -371,10 +371,9 @@ static int zswap_dstmem_prepare(unsigned int cpu) u8 *dst; dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) { - pr_err("can't allocate compressor buffer\n"); + if (!dst) return -ENOMEM; - } + per_cpu(zswap_dstmem, cpu) = dst; return 0; } @@ -515,10 +514,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) } pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) { - pr_err("pool alloc failed\n"); + if (!pool) return NULL; - } /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); @@ -1158,7 +1155,7 @@ static void zswap_frontswap_init(unsigned type) { struct zswap_tree *tree; - tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return; diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c index aa243db93f01..be0d4a5fdf53 100644 --- a/samples/kfifo/dma-example.c +++ b/samples/kfifo/dma-example.c @@ -75,8 +75,8 @@ static int __init example_init(void) for (i = 0; i < nents; i++) { printk(KERN_INFO "sg[%d] -> " - "page_link 0x%.8lx offset 0x%.8x length 0x%.8x\n", - i, sg[i].page_link, sg[i].offset, sg[i].length); + "page %p offset 0x%.8x length 0x%.8x\n", + i, sg_page(&sg[i]), sg[i].offset, sg[i].length); if (sg_is_last(&sg[i])) break; @@ -104,8 +104,8 @@ static int __init example_init(void) for (i = 0; i < nents; i++) { printk(KERN_INFO "sg[%d] -> " - "page_link 0x%.8lx offset 0x%.8x length 0x%.8x\n", - i, sg[i].page_link, sg[i].offset, sg[i].length); + "page %p offset 0x%.8x length 0x%.8x\n", + i, sg_page(&sg[i]), sg[i].offset, sg[i].length); if (sg_is_last(&sg[i])) break; diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f4ac18970e7e..7fcaf5ca997b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2776,6 +2776,17 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } +# check for MAINTAINERS entries that don't have the right form + if ($realfile =~ /^MAINTAINERS$/ && + $rawline =~ /^\+[A-Z]:/ && + $rawline !~ /^\+[A-Z]:\t\S/) { + if (WARN("MAINTAINERS_STYLE", + "MAINTAINERS entries use one tab after TYPE:\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/^(\+[A-Z]):\s*/$1:\t/; + } + } + # discourage the use of boolean for type definition attributes of Kconfig options if ($realfile =~ /Kconfig/ && $line =~ /^\+\s*\bboolean\b/) { @@ -5311,7 +5322,7 @@ sub process { my ($s, $c) = ctx_statement_block($linenr - 3, $realcnt, 0); # print("line: <$line>\nprevline: <$prevline>\ns: <$s>\nc: <$c>\n\n\n"); - if ($c =~ /(?:^|\n)[ \+]\s*(?:$Type\s*)?\Q$testval\E\s*=\s*(?:\([^\)]*\)\s*)?\s*(?:devm_)?(?:[kv][czm]alloc(?:_node|_array)?\b|kstrdup|(?:dev_)?alloc_skb)/) { + if ($s =~ /(?:^|\n)[ \+]\s*(?:$Type\s*)?\Q$testval\E\s*=\s*(?:\([^\)]*\)\s*)?\s*(?:devm_)?(?:[kv][czm]alloc(?:_node|_array)?\b|kstrdup|kmemdup|(?:dev_)?alloc_skb)/) { WARN("OOM_MESSAGE", "Possible unnecessary 'out of memory' message\n" . $hereprev); } diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 7986f4e0da12..7aad82406422 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -14,6 +14,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/of_fdt.h> /* We need to stringify expanded macros so that they can be parsed */ @@ -50,3 +51,9 @@ LX_VALUE(MNT_NOEXEC) LX_VALUE(MNT_NOATIME) LX_VALUE(MNT_NODIRATIME) LX_VALUE(MNT_RELATIME) + +/* linux/of_fdt.h> */ +LX_VALUE(OF_DT_HEADER) + +/* Kernel Configs */ +LX_CONFIG(CONFIG_OF) diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 38b1f09d1cd9..086d27223c0c 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -16,6 +16,7 @@ from linux import constants from linux import utils from linux import tasks from linux import lists +from struct import * class LxCmdLine(gdb.Command): @@ -195,3 +196,75 @@ values of that process namespace""" info_opts(MNT_INFO, m_flags))) LxMounts() + + +class LxFdtDump(gdb.Command): + """Output Flattened Device Tree header and dump FDT blob to the filename + specified as the command argument. Equivalent to + 'cat /proc/fdt > fdtdump.dtb' on a running target""" + + def __init__(self): + super(LxFdtDump, self).__init__("lx-fdtdump", gdb.COMMAND_DATA, + gdb.COMPLETE_FILENAME) + + def fdthdr_to_cpu(self, fdt_header): + + fdt_header_be = ">IIIIIII" + fdt_header_le = "<IIIIIII" + + if utils.get_target_endianness() == 1: + output_fmt = fdt_header_le + else: + output_fmt = fdt_header_be + + return unpack(output_fmt, pack(fdt_header_be, + fdt_header['magic'], + fdt_header['totalsize'], + fdt_header['off_dt_struct'], + fdt_header['off_dt_strings'], + fdt_header['off_mem_rsvmap'], + fdt_header['version'], + fdt_header['last_comp_version'])) + + def invoke(self, arg, from_tty): + + if not constants.LX_CONFIG_OF: + raise gdb.GdbError("Kernel not compiled with CONFIG_OF\n") + + if len(arg) == 0: + filename = "fdtdump.dtb" + else: + filename = arg + + py_fdt_header_ptr = gdb.parse_and_eval( + "(const struct fdt_header *) initial_boot_params") + py_fdt_header = py_fdt_header_ptr.dereference() + + fdt_header = self.fdthdr_to_cpu(py_fdt_header) + + if fdt_header[0] != constants.LX_OF_DT_HEADER: + raise gdb.GdbError("No flattened device tree magic found\n") + + gdb.write("fdt_magic: 0x{:02X}\n".format(fdt_header[0])) + gdb.write("fdt_totalsize: 0x{:02X}\n".format(fdt_header[1])) + gdb.write("off_dt_struct: 0x{:02X}\n".format(fdt_header[2])) + gdb.write("off_dt_strings: 0x{:02X}\n".format(fdt_header[3])) + gdb.write("off_mem_rsvmap: 0x{:02X}\n".format(fdt_header[4])) + gdb.write("version: {}\n".format(fdt_header[5])) + gdb.write("last_comp_version: {}\n".format(fdt_header[6])) + + inf = gdb.inferiors()[0] + fdt_buf = utils.read_memoryview(inf, py_fdt_header_ptr, + fdt_header[1]).tobytes() + + try: + f = open(filename, 'wb') + except: + raise gdb.GdbError("Could not open file to dump fdt") + + f.write(fdt_buf) + f.close() + + gdb.write("Dumped fdt blob to " + filename + "\n") + +LxFdtDump() diff --git a/scripts/gen_initramfs_list.sh b/scripts/gen_initramfs_list.sh index 0055b07b03b6..cb470fdee733 100755 --- a/scripts/gen_initramfs_list.sh +++ b/scripts/gen_initramfs_list.sh @@ -271,10 +271,12 @@ while [ $# -gt 0 ]; do case "$arg" in "-u") # map $1 to uid=0 (root) root_uid="$1" + [ "$root_uid" = "-1" ] && root_uid=$(id -u || echo 0) shift ;; "-g") # map $1 to gid=0 (root) root_gid="$1" + [ "$root_gid" = "-1" ] && root_gid=$(id -g || echo 0) shift ;; "-d") # display default initramfs list diff --git a/tools/testing/selftests/sysctl/Makefile b/tools/testing/selftests/sysctl/Makefile index b3c33e071f10..95c320b354e8 100644 --- a/tools/testing/selftests/sysctl/Makefile +++ b/tools/testing/selftests/sysctl/Makefile @@ -4,8 +4,7 @@ # No binaries, but make sure arg-less "make" doesn't trigger "run_tests". all: -TEST_PROGS := run_numerictests run_stringtests -TEST_FILES := common_tests +TEST_PROGS := sysctl.sh include ../lib.mk diff --git a/tools/testing/selftests/sysctl/common_tests b/tools/testing/selftests/sysctl/common_tests deleted file mode 100644 index 17d534b1b7b4..000000000000 --- a/tools/testing/selftests/sysctl/common_tests +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/sh - -TEST_FILE=$(mktemp) - -echo "== Testing sysctl behavior against ${TARGET} ==" - -set_orig() -{ - echo "${ORIG}" > "${TARGET}" -} - -set_test() -{ - echo "${TEST_STR}" > "${TARGET}" -} - -verify() -{ - local seen - seen=$(cat "$1") - if [ "${seen}" != "${TEST_STR}" ]; then - return 1 - fi - return 0 -} - -trap 'set_orig; rm -f "${TEST_FILE}"' EXIT - -rc=0 - -echo -n "Writing test file ... " -echo "${TEST_STR}" > "${TEST_FILE}" -if ! verify "${TEST_FILE}"; then - echo "FAIL" >&2 - exit 1 -else - echo "ok" -fi - -echo -n "Checking sysctl is not set to test value ... " -if verify "${TARGET}"; then - echo "FAIL" >&2 - exit 1 -else - echo "ok" -fi - -echo -n "Writing sysctl from shell ... " -set_test -if ! verify "${TARGET}"; then - echo "FAIL" >&2 - exit 1 -else - echo "ok" -fi - -echo -n "Resetting sysctl to original value ... " -set_orig -if verify "${TARGET}"; then - echo "FAIL" >&2 - exit 1 -else - echo "ok" -fi - -# Now that we've validated the sanity of "set_test" and "set_orig", -# we can use those functions to set starting states before running -# specific behavioral tests. - -echo -n "Writing entire sysctl in single write ... " -set_orig -dd if="${TEST_FILE}" of="${TARGET}" bs=4096 2>/dev/null -if ! verify "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi - -echo -n "Writing middle of sysctl after synchronized seek ... " -set_test -dd if="${TEST_FILE}" of="${TARGET}" bs=1 seek=1 skip=1 2>/dev/null -if ! verify "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi - -echo -n "Writing beyond end of sysctl ... " -set_orig -dd if="${TEST_FILE}" of="${TARGET}" bs=20 seek=2 2>/dev/null -if verify "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi - -echo -n "Writing sysctl with multiple long writes ... " -set_orig -(perl -e 'print "A" x 50;'; echo "${TEST_STR}") | \ - dd of="${TARGET}" bs=50 2>/dev/null -if verify "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi diff --git a/tools/testing/selftests/sysctl/config b/tools/testing/selftests/sysctl/config new file mode 100644 index 000000000000..6ca14800d755 --- /dev/null +++ b/tools/testing/selftests/sysctl/config @@ -0,0 +1 @@ +CONFIG_TEST_SYSCTL=y diff --git a/tools/testing/selftests/sysctl/run_numerictests b/tools/testing/selftests/sysctl/run_numerictests deleted file mode 100755 index 8510f93f2d14..000000000000 --- a/tools/testing/selftests/sysctl/run_numerictests +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh - -SYSCTL="/proc/sys" -TARGET="${SYSCTL}/vm/swappiness" -ORIG=$(cat "${TARGET}") -TEST_STR=$(( $ORIG + 1 )) - -. ./common_tests - -exit $rc diff --git a/tools/testing/selftests/sysctl/run_stringtests b/tools/testing/selftests/sysctl/run_stringtests deleted file mode 100755 index 90a9293d520c..000000000000 --- a/tools/testing/selftests/sysctl/run_stringtests +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/sh - -SYSCTL="/proc/sys" -TARGET="${SYSCTL}/kernel/domainname" -ORIG=$(cat "${TARGET}") -TEST_STR="Testing sysctl" - -. ./common_tests - -# Only string sysctls support seeking/appending. -MAXLEN=65 - -echo -n "Writing entire sysctl in short writes ... " -set_orig -dd if="${TEST_FILE}" of="${TARGET}" bs=1 2>/dev/null -if ! verify "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi - -echo -n "Writing middle of sysctl after unsynchronized seek ... " -set_test -dd if="${TEST_FILE}" of="${TARGET}" bs=1 seek=1 2>/dev/null -if verify "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi - -echo -n "Checking sysctl maxlen is at least $MAXLEN ... " -set_orig -perl -e 'print "A" x ('"${MAXLEN}"'-2), "B";' | \ - dd of="${TARGET}" bs="${MAXLEN}" 2>/dev/null -if ! grep -q B "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi - -echo -n "Checking sysctl keeps original string on overflow append ... " -set_orig -perl -e 'print "A" x ('"${MAXLEN}"'-1), "B";' | \ - dd of="${TARGET}" bs=$(( MAXLEN - 1 )) 2>/dev/null -if grep -q B "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi - -echo -n "Checking sysctl stays NULL terminated on write ... " -set_orig -perl -e 'print "A" x ('"${MAXLEN}"'-1), "B";' | \ - dd of="${TARGET}" bs="${MAXLEN}" 2>/dev/null -if grep -q B "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi - -echo -n "Checking sysctl stays NULL terminated on overwrite ... " -set_orig -perl -e 'print "A" x ('"${MAXLEN}"'-1), "BB";' | \ - dd of="${TARGET}" bs=$(( $MAXLEN + 1 )) 2>/dev/null -if grep -q B "${TARGET}"; then - echo "FAIL" >&2 - rc=1 -else - echo "ok" -fi - -exit $rc diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh new file mode 100644 index 000000000000..764d24b68597 --- /dev/null +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -0,0 +1,745 @@ +#!/bin/bash +# Copyright (C) 2017 Luis R. Rodriguez <mcgrof@kernel.org> +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or at your option any +# later version; or, when distributed separately from the Linux kernel or +# when incorporated into other software packages, subject to the following +# license: +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of copyleft-next (version 0.3.1 or later) as published +# at http://copyleft-next.org/. + +# This performs a series tests against the proc sysctl interface. + +TEST_NAME="sysctl" +TEST_DRIVER="test_${TEST_NAME}" +TEST_DIR=$(dirname $0) +TEST_FILE=$(mktemp) + +# This represents +# +# TEST_ID:TEST_COUNT:ENABLED +# +# TEST_ID: is the test id number +# TEST_COUNT: number of times we should run the test +# ENABLED: 1 if enabled, 0 otherwise +# +# Once these are enabled please leave them as-is. Write your own test, +# we have tons of space. +ALL_TESTS="0001:1:1" +ALL_TESTS="$ALL_TESTS 0002:1:1" +ALL_TESTS="$ALL_TESTS 0003:1:1" +ALL_TESTS="$ALL_TESTS 0004:1:1" +ALL_TESTS="$ALL_TESTS 0005:3:1" + +test_modprobe() +{ + if [ ! -d $DIR ]; then + echo "$0: $DIR not present" >&2 + echo "You must have the following enabled in your kernel:" >&2 + cat $TEST_DIR/config >&2 + exit 1 + fi +} + +function allow_user_defaults() +{ + if [ -z $DIR ]; then + DIR="/sys/module/test_sysctl/" + fi + if [ -z $DEFAULT_NUM_TESTS ]; then + DEFAULT_NUM_TESTS=50 + fi + if [ -z $SYSCTL ]; then + SYSCTL="/proc/sys/debug/test_sysctl" + fi + if [ -z $PAGE_SIZE ]; then + PAGE_SIZE=$(getconf PAGESIZE) + fi + if [ -z $MAX_DIGITS ]; then + MAX_DIGITS=$(($PAGE_SIZE/8)) + fi + if [ -z $INT_MAX ]; then + INT_MAX=$(getconf INT_MAX) + fi + if [ -z $UINT_MAX ]; then + UINT_MAX=$(getconf UINT_MAX) + fi +} + +test_reqs() +{ + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo $msg must be run as root >&2 + exit 0 + fi + + if ! which perl 2> /dev/null > /dev/null; then + echo "$0: You need perl installed" + exit 1 + fi + if ! which getconf 2> /dev/null > /dev/null; then + echo "$0: You need getconf installed" + exit 1 + fi + if ! which diff 2> /dev/null > /dev/null; then + echo "$0: You need diff installed" + exit 1 + fi +} + +function load_req_mod() +{ + trap "test_modprobe" EXIT + + if [ ! -d $DIR ]; then + modprobe $TEST_DRIVER + if [ $? -ne 0 ]; then + exit + fi + fi +} + +reset_vals() +{ + VAL="" + TRIGGER=$(basename ${TARGET}) + case "$TRIGGER" in + int_0001) + VAL="60" + ;; + int_0002) + VAL="1" + ;; + uint_0001) + VAL="314" + ;; + string_0001) + VAL="(none)" + ;; + *) + ;; + esac + echo -n $VAL > $TARGET +} + +set_orig() +{ + if [ ! -z $TARGET ]; then + echo "${ORIG}" > "${TARGET}" + fi +} + +set_test() +{ + echo "${TEST_STR}" > "${TARGET}" +} + +verify() +{ + local seen + seen=$(cat "$1") + if [ "${seen}" != "${TEST_STR}" ]; then + return 1 + fi + return 0 +} + +verify_diff_w() +{ + echo "$TEST_STR" | diff -q -w -u - $1 + return $? +} + +test_rc() +{ + if [[ $rc != 0 ]]; then + echo "Failed test, return value: $rc" >&2 + exit $rc + fi +} + +test_finish() +{ + set_orig + rm -f "${TEST_FILE}" +} + +run_numerictests() +{ + echo "== Testing sysctl behavior against ${TARGET} ==" + + rc=0 + + echo -n "Writing test file ... " + echo "${TEST_STR}" > "${TEST_FILE}" + if ! verify "${TEST_FILE}"; then + echo "FAIL" >&2 + exit 1 + else + echo "ok" + fi + + echo -n "Checking sysctl is not set to test value ... " + if verify "${TARGET}"; then + echo "FAIL" >&2 + exit 1 + else + echo "ok" + fi + + echo -n "Writing sysctl from shell ... " + set_test + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + exit 1 + else + echo "ok" + fi + + echo -n "Resetting sysctl to original value ... " + set_orig + if verify "${TARGET}"; then + echo "FAIL" >&2 + exit 1 + else + echo "ok" + fi + + # Now that we've validated the sanity of "set_test" and "set_orig", + # we can use those functions to set starting states before running + # specific behavioral tests. + + echo -n "Writing entire sysctl in single write ... " + set_orig + dd if="${TEST_FILE}" of="${TARGET}" bs=4096 2>/dev/null + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + + echo -n "Writing middle of sysctl after synchronized seek ... " + set_test + dd if="${TEST_FILE}" of="${TARGET}" bs=1 seek=1 skip=1 2>/dev/null + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + + echo -n "Writing beyond end of sysctl ... " + set_orig + dd if="${TEST_FILE}" of="${TARGET}" bs=20 seek=2 2>/dev/null + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + + echo -n "Writing sysctl with multiple long writes ... " + set_orig + (perl -e 'print "A" x 50;'; echo "${TEST_STR}") | \ + dd of="${TARGET}" bs=50 2>/dev/null + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc +} + +# Your test must accept digits 3 and 4 to use this +run_limit_digit() +{ + echo -n "Checking ignoring spaces up to PAGE_SIZE works on write ..." + reset_vals + + LIMIT=$((MAX_DIGITS -1)) + TEST_STR="3" + (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \ + dd of="${TARGET}" 2>/dev/null + + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Checking passing PAGE_SIZE of spaces fails on write ..." + reset_vals + + LIMIT=$((MAX_DIGITS)) + TEST_STR="4" + (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \ + dd of="${TARGET}" 2>/dev/null + + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc +} + +# You are using an int +run_limit_digit_int() +{ + echo -n "Testing INT_MAX works ..." + reset_vals + TEST_STR="$INT_MAX" + echo -n $TEST_STR > $TARGET + + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing INT_MAX + 1 will fail as expected..." + reset_vals + let TEST_STR=$INT_MAX+1 + echo -n $TEST_STR > $TARGET 2> /dev/null + + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing negative values will work as expected..." + reset_vals + TEST_STR="-3" + echo -n $TEST_STR > $TARGET 2> /dev/null + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc +} + +# You used an int array +run_limit_digit_int_array() +{ + echo -n "Testing array works as expected ... " + TEST_STR="4 3 2 1" + echo -n $TEST_STR > $TARGET + + if ! verify_diff_w "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing skipping trailing array elements works ... " + # Do not reset_vals, carry on the values from the last test. + # If we only echo in two digits the last two are left intact + TEST_STR="100 101" + echo -n $TEST_STR > $TARGET + # After we echo in, to help diff we need to set on TEST_STR what + # we expect the result to be. + TEST_STR="100 101 2 1" + + if ! verify_diff_w "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing PAGE_SIZE limit on array works ... " + # Do not reset_vals, carry on the values from the last test. + # Even if you use an int array, you are still restricted to + # MAX_DIGITS, this is a known limitation. Test limit works. + LIMIT=$((MAX_DIGITS -1)) + TEST_STR="9" + (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \ + dd of="${TARGET}" 2>/dev/null + + TEST_STR="9 101 2 1" + if ! verify_diff_w "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing exceeding PAGE_SIZE limit fails as expected ... " + # Do not reset_vals, carry on the values from the last test. + # Now go over limit. + LIMIT=$((MAX_DIGITS)) + TEST_STR="7" + (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \ + dd of="${TARGET}" 2>/dev/null + + TEST_STR="7 101 2 1" + if verify_diff_w "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc +} + +# You are using an unsigned int +run_limit_digit_uint() +{ + echo -n "Testing UINT_MAX works ..." + reset_vals + TEST_STR="$UINT_MAX" + echo -n $TEST_STR > $TARGET + + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing UINT_MAX + 1 will fail as expected..." + reset_vals + TEST_STR=$(($UINT_MAX+1)) + echo -n $TEST_STR > $TARGET 2> /dev/null + + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc + + echo -n "Testing negative values will not work as expected ..." + reset_vals + TEST_STR="-3" + echo -n $TEST_STR > $TARGET 2> /dev/null + + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + test_rc +} + +run_stringtests() +{ + echo -n "Writing entire sysctl in short writes ... " + set_orig + dd if="${TEST_FILE}" of="${TARGET}" bs=1 2>/dev/null + if ! verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + + echo -n "Writing middle of sysctl after unsynchronized seek ... " + set_test + dd if="${TEST_FILE}" of="${TARGET}" bs=1 seek=1 2>/dev/null + if verify "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + + echo -n "Checking sysctl maxlen is at least $MAXLEN ... " + set_orig + perl -e 'print "A" x ('"${MAXLEN}"'-2), "B";' | \ + dd of="${TARGET}" bs="${MAXLEN}" 2>/dev/null + if ! grep -q B "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + + echo -n "Checking sysctl keeps original string on overflow append ... " + set_orig + perl -e 'print "A" x ('"${MAXLEN}"'-1), "B";' | \ + dd of="${TARGET}" bs=$(( MAXLEN - 1 )) 2>/dev/null + if grep -q B "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + + echo -n "Checking sysctl stays NULL terminated on write ... " + set_orig + perl -e 'print "A" x ('"${MAXLEN}"'-1), "B";' | \ + dd of="${TARGET}" bs="${MAXLEN}" 2>/dev/null + if grep -q B "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + + echo -n "Checking sysctl stays NULL terminated on overwrite ... " + set_orig + perl -e 'print "A" x ('"${MAXLEN}"'-1), "BB";' | \ + dd of="${TARGET}" bs=$(( $MAXLEN + 1 )) 2>/dev/null + if grep -q B "${TARGET}"; then + echo "FAIL" >&2 + rc=1 + else + echo "ok" + fi + + test_rc +} + +sysctl_test_0001() +{ + TARGET="${SYSCTL}/int_0001" + reset_vals + ORIG=$(cat "${TARGET}") + TEST_STR=$(( $ORIG + 1 )) + + run_numerictests + run_limit_digit +} + +sysctl_test_0002() +{ + TARGET="${SYSCTL}/string_0001" + reset_vals + ORIG=$(cat "${TARGET}") + TEST_STR="Testing sysctl" + # Only string sysctls support seeking/appending. + MAXLEN=65 + + run_numerictests + run_stringtests +} + +sysctl_test_0003() +{ + TARGET="${SYSCTL}/int_0002" + reset_vals + ORIG=$(cat "${TARGET}") + TEST_STR=$(( $ORIG + 1 )) + + run_numerictests + run_limit_digit + run_limit_digit_int +} + +sysctl_test_0004() +{ + TARGET="${SYSCTL}/uint_0001" + reset_vals + ORIG=$(cat "${TARGET}") + TEST_STR=$(( $ORIG + 1 )) + + run_numerictests + run_limit_digit + run_limit_digit_uint +} + +sysctl_test_0005() +{ + TARGET="${SYSCTL}/int_0003" + reset_vals + ORIG=$(cat "${TARGET}") + + run_limit_digit_int_array +} + +list_tests() +{ + echo "Test ID list:" + echo + echo "TEST_ID x NUM_TEST" + echo "TEST_ID: Test ID" + echo "NUM_TESTS: Number of recommended times to run the test" + echo + echo "0001 x $(get_test_count 0001) - tests proc_dointvec_minmax()" + echo "0002 x $(get_test_count 0002) - tests proc_dostring()" + echo "0003 x $(get_test_count 0003) - tests proc_dointvec()" + echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" + echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" +} + +test_reqs + +usage() +{ + NUM_TESTS=$(grep -o ' ' <<<"$ALL_TESTS" | grep -c .) + let NUM_TESTS=$NUM_TESTS+1 + MAX_TEST=$(printf "%04d\n" $NUM_TESTS) + echo "Usage: $0 [ -t <4-number-digit> ] | [ -w <4-number-digit> ] |" + echo " [ -s <4-number-digit> ] | [ -c <4-number-digit> <test- count>" + echo " [ all ] [ -h | --help ] [ -l ]" + echo "" + echo "Valid tests: 0001-$MAX_TEST" + echo "" + echo " all Runs all tests (default)" + echo " -t Run test ID the number amount of times is recommended" + echo " -w Watch test ID run until it runs into an error" + echo " -c Run test ID once" + echo " -s Run test ID x test-count number of times" + echo " -l List all test ID list" + echo " -h|--help Help" + echo + echo "If an error every occurs execution will immediately terminate." + echo "If you are adding a new test try using -w <test-ID> first to" + echo "make sure the test passes a series of tests." + echo + echo Example uses: + echo + echo "$TEST_NAME.sh -- executes all tests" + echo "$TEST_NAME.sh -t 0002 -- Executes test ID 0002 number of times is recomended" + echo "$TEST_NAME.sh -w 0002 -- Watch test ID 0002 run until an error occurs" + echo "$TEST_NAME.sh -s 0002 -- Run test ID 0002 once" + echo "$TEST_NAME.sh -c 0002 3 -- Run test ID 0002 three times" + echo + list_tests + exit 1 +} + +function test_num() +{ + re='^[0-9]+$' + if ! [[ $1 =~ $re ]]; then + usage + fi +} + +function get_test_count() +{ + test_num $1 + TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + LAST_TWO=${TEST_DATA#*:*} + echo ${LAST_TWO%:*} +} + +function get_test_enabled() +{ + test_num $1 + TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + echo ${TEST_DATA#*:*:} +} + +function run_all_tests() +{ + for i in $ALL_TESTS ; do + TEST_ID=${i%:*:*} + ENABLED=$(get_test_enabled $TEST_ID) + TEST_COUNT=$(get_test_count $TEST_ID) + if [[ $ENABLED -eq "1" ]]; then + test_case $TEST_ID $TEST_COUNT + fi + done +} + +function watch_log() +{ + if [ $# -ne 3 ]; then + clear + fi + date + echo "Running test: $2 - run #$1" +} + +function watch_case() +{ + i=0 + while [ 1 ]; do + + if [ $# -eq 1 ]; then + test_num $1 + watch_log $i ${TEST_NAME}_test_$1 + ${TEST_NAME}_test_$1 + else + watch_log $i all + run_all_tests + fi + let i=$i+1 + done +} + +function test_case() +{ + NUM_TESTS=$DEFAULT_NUM_TESTS + if [ $# -eq 2 ]; then + NUM_TESTS=$2 + fi + + i=0 + while [ $i -lt $NUM_TESTS ]; do + test_num $1 + watch_log $i ${TEST_NAME}_test_$1 noclear + RUN_TEST=${TEST_NAME}_test_$1 + $RUN_TEST + let i=$i+1 + done +} + +function parse_args() +{ + if [ $# -eq 0 ]; then + run_all_tests + else + if [[ "$1" = "all" ]]; then + run_all_tests + elif [[ "$1" = "-w" ]]; then + shift + watch_case $@ + elif [[ "$1" = "-t" ]]; then + shift + test_num $1 + test_case $1 $(get_test_count $1) + elif [[ "$1" = "-c" ]]; then + shift + test_num $1 + test_num $2 + test_case $1 $2 + elif [[ "$1" = "-s" ]]; then + shift + test_case $1 1 + elif [[ "$1" = "-l" ]]; then + list_tests + elif [[ "$1" = "-h" || "$1" = "--help" ]]; then + usage + else + usage + fi + fi +} + +test_reqs +allow_user_defaults +load_req_mod + +trap "test_finish" EXIT + +parse_args $@ + +exit 0 diff --git a/usr/Kconfig b/usr/Kconfig index ad0543e21760..d53112fdbf5a 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -36,10 +36,8 @@ config INITRAMFS_ROOT_UID depends on INITRAMFS_SOURCE!="" default "0" help - This setting is only meaningful if the INITRAMFS_SOURCE is - contains a directory. Setting this user ID (UID) to something - other than "0" will cause all files owned by that UID to be - owned by user root in the initial ramdisk image. + If INITRAMFS_SOURCE points to a directory, files owned by this UID + (-1 = current user) will be owned by root in the resulting image. If you are not sure, leave it set to "0". @@ -48,15 +46,13 @@ config INITRAMFS_ROOT_GID depends on INITRAMFS_SOURCE!="" default "0" help - This setting is only meaningful if the INITRAMFS_SOURCE is - contains a directory. Setting this group ID (GID) to something - other than "0" will cause all files owned by that GID to be - owned by group root in the initial ramdisk image. + If INITRAMFS_SOURCE points to a directory, files owned by this GID + (-1 = current group) will be owned by root in the resulting image. If you are not sure, leave it set to "0". config RD_GZIP - bool "Support initial ramdisks compressed using gzip" + bool "Support initial ramdisk/ramfs compressed using gzip" depends on BLK_DEV_INITRD default y select DECOMPRESS_GZIP @@ -65,7 +61,7 @@ config RD_GZIP If unsure, say Y. config RD_BZIP2 - bool "Support initial ramdisks compressed using bzip2" + bool "Support initial ramdisk/ramfs compressed using bzip2" default y depends on BLK_DEV_INITRD select DECOMPRESS_BZIP2 @@ -74,7 +70,7 @@ config RD_BZIP2 If unsure, say N. config RD_LZMA - bool "Support initial ramdisks compressed using LZMA" + bool "Support initial ramdisk/ramfs compressed using LZMA" default y depends on BLK_DEV_INITRD select DECOMPRESS_LZMA @@ -83,7 +79,7 @@ config RD_LZMA If unsure, say N. config RD_XZ - bool "Support initial ramdisks compressed using XZ" + bool "Support initial ramdisk/ramfs compressed using XZ" depends on BLK_DEV_INITRD default y select DECOMPRESS_XZ @@ -92,7 +88,7 @@ config RD_XZ If unsure, say N. config RD_LZO - bool "Support initial ramdisks compressed using LZO" + bool "Support initial ramdisk/ramfs compressed using LZO" default y depends on BLK_DEV_INITRD select DECOMPRESS_LZO @@ -101,7 +97,7 @@ config RD_LZO If unsure, say N. config RD_LZ4 - bool "Support initial ramdisks compressed using LZ4" + bool "Support initial ramdisk/ramfs compressed using LZ4" default y depends on BLK_DEV_INITRD select DECOMPRESS_LZ4 |