105 files changed, 2096 insertions, 1296 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 49d7c997fa1e..e50b95c25868 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back. Amount of memory used in network transmission buffers + shmem + + Amount of cached filesystem data that is swap-backed, + such as tmpfs, shm segments, shared anonymous mmap()s + file_mapped Amount of cached filesystem data mapped with mmap() diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9036dbf16156..4cddbce85ac9 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -413,6 +413,7 @@ Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB +LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB @@ -442,6 +443,11 @@ accessed. "Anonymous" shows the amount of memory that does not belong to any file. Even a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. +"LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE). +The memory isn't freed immediately with madvise(). It's freed in memory +pressure if the memory is clean. Please note that the printed value might +be lower than the real value due to optimizations used in the current +implementation. If this is not desirable please file a bug report. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. "ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by huge pages. diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include <linux/personality.h> /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include <asm/page-nommu.h> diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 8c35b3152e1e..44b50646582d 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -618,7 +618,7 @@ static int mipspmu_event_init(struct perf_event *event) return -ENOENT; } - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 1c0b58545c04..534a10c00c54 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -823,7 +823,7 @@ static int cpumsf_pmu_event_init(struct perf_event *event) } /* Check online status of the CPU to which the event is pinned */ - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/arch/sparc/lib/NG4memset.S b/arch/sparc/lib/NG4memset.S index 7c0c81f18837..c07834f3a6fc 100644 --- a/arch/sparc/lib/NG4memset.S +++ b/arch/sparc/lib/NG4memset.S @@ -13,14 +13,14 @@ .globl NG4memset NG4memset: andcc %o1, 0xff, %o4 - be,pt %icc, 1f + be,pt %xcc, 1f mov %o2, %o1 sllx %o4, 8, %g1 or %g1, %o4, %o2 sllx %o2, 16, %g1 or %g1, %o2, %o2 sllx %o2, 32, %g1 - ba,pt %icc, 1f + ba,pt %xcc, 1f or %g1, %o2, %o4 .size NG4memset,.-NG4memset @@ -29,7 +29,7 @@ NG4memset: NG4bzero: clr %o4 1: cmp %o1, 16 - ble %icc, .Ltiny + ble %xcc, .Ltiny mov %o0, %o3 sub %g0, %o0, %g1 and %g1, 0x7, %g1 @@ -37,7 
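An aside on the MADV_FREE / "LazyFree" documentation added above: the sketch below shows one way the new counter can be exercised from userspace, assuming an x86-64 Linux machine running a kernel that carries this series. The MADV_FREE fallback define, the 16 MiB size and the grep are illustrative choices, not part of the patch; depending on kernel version and swap configuration the counter may simply read zero.

```c
/*
 * Minimal sketch: mark anonymous memory as lazily freeable with
 * madvise(MADV_FREE) and look at /proc/self/smaps, where the new
 * "LazyFree:" line accounts for such pages. Assumes an x86-64 host;
 * the #define is only a fallback for old libc headers (MADV_FREE is 8
 * in the x86 UAPI headers).
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8
#endif

int main(void)
{
	size_t len = 16UL << 20;	/* 16 MiB */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	memset(buf, 0xaa, len);		/* fault the pages in and dirty them */

	/* Tell the kernel it may reclaim these pages lazily. */
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");

	/*
	 * One VMA should now report roughly 16384 kB under "LazyFree:"
	 * until the range is written to again or reclaimed; as the
	 * documentation above notes, the printed value may lag behind.
	 */
	system("grep LazyFree /proc/self/smaps");

	munmap(buf, len);
	return 0;
}
```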
+37,7 @@ NG4bzero: sub %o1, %g1, %o1 1: stb %o4, [%o0 + 0x00] subcc %g1, 1, %g1 - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 1, %o0 .Laligned8: cmp %o1, 64 + (64 - 8) @@ -48,7 +48,7 @@ NG4bzero: sub %o1, %g1, %o1 1: stx %o4, [%o0 + 0x00] subcc %g1, 8, %g1 - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 0x8, %o0 .Laligned64: andn %o1, 64 - 1, %g1 @@ -58,30 +58,30 @@ NG4bzero: 1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P subcc %g1, 0x40, %g1 stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 0x40, %o0 .Lpostloop: cmp %o1, 8 - bl,pn %icc, .Ltiny + bl,pn %xcc, .Ltiny membar #StoreStore|#StoreLoad .Lmedium: andn %o1, 0x7, %g1 sub %o1, %g1, %o1 1: stx %o4, [%o0 + 0x00] subcc %g1, 0x8, %g1 - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 0x08, %o0 andcc %o1, 0x4, %g1 - be,pt %icc, .Ltiny + be,pt %xcc, .Ltiny sub %o1, %g1, %o1 stw %o4, [%o0 + 0x00] add %o0, 0x4, %o0 .Ltiny: cmp %o1, 0 - be,pn %icc, .Lexit + be,pn %xcc, .Lexit 1: subcc %o1, 1, %o1 stb %o4, [%o0 + 0x00] - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 1, %o0 .Lexit: retl @@ -99,8 +99,8 @@ NG4bzero: stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 0x30, %o0 - ba,a,pt %icc, .Lpostloop + ba,a,pt %xcc, .Lpostloop nop .size NG4bzero,.-NG4bzero diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index caa5798c92f4..f717b73182e7 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -16,36 +16,46 @@ #define ATOMIC_INIT(i) { (i) } /** - * atomic_read - read atomic variable + * arch_atomic_read - read atomic variable * @v: pointer of type atomic_t * * Atomically reads the value of @v. */ -static __always_inline int atomic_read(const atomic_t *v) +static __always_inline int arch_atomic_read(const atomic_t *v) { - return READ_ONCE((v)->counter); + /* + * We use READ_ONCE_NOCHECK() because atomic_read() contains KASAN + * instrumentation. Double instrumentation is unnecessary. + */ + return READ_ONCE_NOCHECK((v)->counter); } /** - * atomic_set - set atomic variable + * arch_atomic_set - set atomic variable * @v: pointer of type atomic_t * @i: required value * * Atomically sets the value of @v to @i. */ -static __always_inline void atomic_set(atomic_t *v, int i) +static __always_inline void arch_atomic_set(atomic_t *v, int i) { + /* + * We could use WRITE_ONCE_NOCHECK() if it exists, similar to + * READ_ONCE_NOCHECK() in arch_atomic_read(). But there is no such + * thing at the moment, and introducing it for this case does not + * worth it. + */ WRITE_ONCE(v->counter, i); } /** - * atomic_add - add integer to atomic variable + * arch_atomic_add - add integer to atomic variable * @i: integer value to add * @v: pointer of type atomic_t * * Atomically adds @i to @v. */ -static __always_inline void atomic_add(int i, atomic_t *v) +static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) @@ -53,13 +63,13 @@ static __always_inline void atomic_add(int i, atomic_t *v) } /** - * atomic_sub - subtract integer from atomic variable + * arch_atomic_sub - subtract integer from atomic variable * @i: integer value to subtract * @v: pointer of type atomic_t * * Atomically subtracts @i from @v. 
*/ -static __always_inline void atomic_sub(int i, atomic_t *v) +static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) @@ -67,7 +77,7 @@ static __always_inline void atomic_sub(int i, atomic_t *v) } /** - * atomic_sub_and_test - subtract value from variable and test result + * arch_atomic_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer of type atomic_t * @@ -75,63 +85,63 @@ static __always_inline void atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) +static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); } /** - * atomic_inc - increment atomic variable + * arch_atomic_inc - increment atomic variable * @v: pointer of type atomic_t * * Atomically increments @v by 1. */ -static __always_inline void atomic_inc(atomic_t *v) +static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter)); } /** - * atomic_dec - decrement atomic variable + * arch_atomic_dec - decrement atomic variable * @v: pointer of type atomic_t * * Atomically decrements @v by 1. */ -static __always_inline void atomic_dec(atomic_t *v) +static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter)); } /** - * atomic_dec_and_test - decrement and test + * arch_atomic_dec_and_test - decrement and test * @v: pointer of type atomic_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. */ -static __always_inline bool atomic_dec_and_test(atomic_t *v) +static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); } /** - * atomic_inc_and_test - increment and test + * arch_atomic_inc_and_test - increment and test * @v: pointer of type atomic_t * * Atomically increments @v by 1 * and returns true if the result is zero, or false for all * other cases. */ -static __always_inline bool atomic_inc_and_test(atomic_t *v) +static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); } /** - * atomic_add_negative - add and test if negative + * arch_atomic_add_negative - add and test if negative * @i: integer value to add * @v: pointer of type atomic_t * @@ -139,51 +149,51 @@ static __always_inline bool atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. 
*/ -static __always_inline bool atomic_add_negative(int i, atomic_t *v) +static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); } /** - * atomic_add_return - add integer and return + * arch_atomic_add_return - add integer and return * @i: integer value to add * @v: pointer of type atomic_t * * Atomically adds @i to @v and returns @i + @v */ -static __always_inline int atomic_add_return(int i, atomic_t *v) +static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } /** - * atomic_sub_return - subtract integer and return + * arch_atomic_sub_return - subtract integer and return * @v: pointer of type atomic_t * @i: integer value to subtract * * Atomically subtracts @i from @v and returns @v - @i */ -static __always_inline int atomic_sub_return(int i, atomic_t *v) +static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) { - return atomic_add_return(-i, v); + return arch_atomic_add_return(-i, v); } -#define atomic_inc_return(v) (atomic_add_return(1, v)) -#define atomic_dec_return(v) (atomic_sub_return(1, v)) +#define arch_atomic_inc_return(v) (arch_atomic_add_return(1, v)) +#define arch_atomic_dec_return(v) (arch_atomic_sub_return(1, v)) -static __always_inline int atomic_fetch_add(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return xadd(&v->counter, i); } -static __always_inline int atomic_fetch_sub(int i, atomic_t *v) +static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) { return xadd(&v->counter, -i); } -static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { - return cmpxchg(&v->counter, old, new); + return arch_cmpxchg(&v->counter, old, new); } #define atomic_try_cmpxchg atomic_try_cmpxchg @@ -192,13 +202,13 @@ static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) return try_cmpxchg(&v->counter, old, new); } -static inline int atomic_xchg(atomic_t *v, int new) +static inline int arch_atomic_xchg(atomic_t *v, int new) { - return xchg(&v->counter, new); + return arch_xchg(&v->counter, new); } #define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ +static inline void arch_atomic_##op(int i, atomic_t *v) \ { \ asm volatile(LOCK_PREFIX #op"l %1,%0" \ : "+m" (v->counter) \ @@ -207,9 +217,9 @@ static inline void atomic_##op(int i, atomic_t *v) \ } #define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ +static inline int arch_atomic_fetch_##op(int i, atomic_t *v) \ { \ - int val = atomic_read(v); \ + int val = arch_atomic_read(v); \ do { \ } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ return val; \ @@ -228,7 +238,7 @@ ATOMIC_OPS(xor, ^) #undef ATOMIC_OP /** - * __atomic_add_unless - add unless the number is already a given value + * __arch_atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -236,9 +246,9 @@ ATOMIC_OPS(xor, ^) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. 
*/ -static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int __arch_atomic_add_unless(atomic_t *v, int a, int u) { - int c = atomic_read(v); + int c = arch_atomic_read(v); do { if (unlikely(c == u)) break; @@ -247,13 +257,13 @@ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) } /** - * atomic_inc_short - increment of a short integer + * arch_atomic_inc_short - increment of a short integer * @v: pointer to type int * * Atomically adds 1 to @v * Returns the new value of @u */ -static __always_inline short int atomic_inc_short(short int *v) +static __always_inline short int arch_atomic_inc_short(short int *v) { asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); return *v; @@ -265,4 +275,6 @@ static __always_inline short int atomic_inc_short(short int *v) # include <asm/atomic64_64.h> #endif +#include <asm-generic/atomic-instrumented.h> + #endif /* _ASM_X86_ATOMIC_H */ diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705fb303..4a48b20000f3 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -61,7 +61,7 @@ ATOMIC64_DECL(add_unless); #undef ATOMIC64_EXPORT /** - * atomic64_cmpxchg - cmpxchg atomic64 variable + * arch_atomic64_cmpxchg - cmpxchg atomic64 variable * @v: pointer to type atomic64_t * @o: expected value * @n: new value @@ -70,20 +70,21 @@ ATOMIC64_DECL(add_unless); * the old value. */ -static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n) +static inline long long arch_atomic64_cmpxchg(atomic64_t *v, long long o, + long long n) { - return cmpxchg64(&v->counter, o, n); + return arch_cmpxchg64(&v->counter, o, n); } /** - * atomic64_xchg - xchg atomic64 variable + * arch_atomic64_xchg - xchg atomic64 variable * @v: pointer to type atomic64_t * @n: value to assign * * Atomically xchgs the value of @v to @n and returns * the old value. */ -static inline long long atomic64_xchg(atomic64_t *v, long long n) +static inline long long arch_atomic64_xchg(atomic64_t *v, long long n) { long long o; unsigned high = (unsigned)(n >> 32); @@ -95,13 +96,13 @@ static inline long long atomic64_xchg(atomic64_t *v, long long n) } /** - * atomic64_set - set atomic64 variable + * arch_atomic64_set - set atomic64 variable * @v: pointer to type atomic64_t * @i: value to assign * * Atomically sets the value of @v to @n. */ -static inline void atomic64_set(atomic64_t *v, long long i) +static inline void arch_atomic64_set(atomic64_t *v, long long i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; @@ -111,12 +112,12 @@ static inline void atomic64_set(atomic64_t *v, long long i) } /** - * atomic64_read - read atomic64 variable + * arch_atomic64_read - read atomic64 variable * @v: pointer to type atomic64_t * * Atomically reads the value of @v and returns it. 
*/ -static inline long long atomic64_read(const atomic64_t *v) +static inline long long arch_atomic64_read(const atomic64_t *v) { long long r; alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); @@ -124,13 +125,13 @@ static inline long long atomic64_read(const atomic64_t *v) } /** - * atomic64_add_return - add and return + * arch_atomic64_add_return - add and return * @i: integer value to add * @v: pointer to type atomic64_t * * Atomically adds @i to @v and returns @i + *@v */ -static inline long long atomic64_add_return(long long i, atomic64_t *v) +static inline long long arch_atomic64_add_return(long long i, atomic64_t *v) { alternative_atomic64(add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -141,7 +142,7 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v) /* * Other variants with different arithmetic operators: */ -static inline long long atomic64_sub_return(long long i, atomic64_t *v) +static inline long long arch_atomic64_sub_return(long long i, atomic64_t *v) { alternative_atomic64(sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -149,7 +150,7 @@ static inline long long atomic64_sub_return(long long i, atomic64_t *v) return i; } -static inline long long atomic64_inc_return(atomic64_t *v) +static inline long long arch_atomic64_inc_return(atomic64_t *v) { long long a; alternative_atomic64(inc_return, "=&A" (a), @@ -157,7 +158,7 @@ static inline long long atomic64_inc_return(atomic64_t *v) return a; } -static inline long long atomic64_dec_return(atomic64_t *v) +static inline long long arch_atomic64_dec_return(atomic64_t *v) { long long a; alternative_atomic64(dec_return, "=&A" (a), @@ -166,13 +167,13 @@ static inline long long atomic64_dec_return(atomic64_t *v) } /** - * atomic64_add - add integer to atomic64 variable + * arch_atomic64_add - add integer to atomic64 variable * @i: integer value to add * @v: pointer to type atomic64_t * * Atomically adds @i to @v. */ -static inline long long atomic64_add(long long i, atomic64_t *v) +static inline long long arch_atomic64_add(long long i, atomic64_t *v) { __alternative_atomic64(add, add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -181,13 +182,13 @@ static inline long long atomic64_add(long long i, atomic64_t *v) } /** - * atomic64_sub - subtract the atomic64 variable + * arch_atomic64_sub - subtract the atomic64 variable * @i: integer value to subtract * @v: pointer to type atomic64_t * * Atomically subtracts @i from @v. */ -static inline long long atomic64_sub(long long i, atomic64_t *v) +static inline long long arch_atomic64_sub(long long i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -196,7 +197,7 @@ static inline long long atomic64_sub(long long i, atomic64_t *v) } /** - * atomic64_sub_and_test - subtract value from variable and test result + * arch_atomic64_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer to type atomic64_t * @@ -204,46 +205,46 @@ static inline long long atomic64_sub(long long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -static inline int atomic64_sub_and_test(long long i, atomic64_t *v) +static inline int arch_atomic64_sub_and_test(long long i, atomic64_t *v) { - return atomic64_sub_return(i, v) == 0; + return arch_atomic64_sub_return(i, v) == 0; } /** - * atomic64_inc - increment atomic64 variable + * arch_atomic64_inc - increment atomic64 variable * @v: pointer to type atomic64_t * * Atomically increments @v by 1. 
*/ -static inline void atomic64_inc(atomic64_t *v) +static inline void arch_atomic64_inc(atomic64_t *v) { __alternative_atomic64(inc, inc_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); } /** - * atomic64_dec - decrement atomic64 variable + * arch_atomic64_dec - decrement atomic64 variable * @v: pointer to type atomic64_t * * Atomically decrements @v by 1. */ -static inline void atomic64_dec(atomic64_t *v) +static inline void arch_atomic64_dec(atomic64_t *v) { __alternative_atomic64(dec, dec_return, /* no output */, "S" (v) : "memory", "eax", "ecx", "edx"); } /** - * atomic64_dec_and_test - decrement and test + * arch_atomic64_dec_and_test - decrement and test * @v: pointer to type atomic64_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic64_dec_and_test(atomic64_t *v) +static inline int arch_atomic64_dec_and_test(atomic64_t *v) { - return atomic64_dec_return(v) == 0; + return arch_atomic64_dec_return(v) == 0; } /** @@ -254,13 +255,13 @@ static inline int atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline int atomic64_inc_and_test(atomic64_t *v) +static inline int arch_atomic64_inc_and_test(atomic64_t *v) { - return atomic64_inc_return(v) == 0; + return arch_atomic64_inc_return(v) == 0; } /** - * atomic64_add_negative - add and test if negative + * arch_atomic64_add_negative - add and test if negative * @i: integer value to add * @v: pointer to type atomic64_t * @@ -268,13 +269,13 @@ static inline int atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int atomic64_add_negative(long long i, atomic64_t *v) +static inline int arch_atomic64_add_negative(long long i, atomic64_t *v) { - return atomic64_add_return(i, v) < 0; + return arch_atomic64_add_return(i, v) < 0; } /** - * atomic64_add_unless - add unless the number is a given value + * arch_atomic64_add_unless - add unless the number is a given value * @v: pointer of type atomic64_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -282,7 +283,8 @@ static inline int atomic64_add_negative(long long i, atomic64_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns non-zero if the add was done, zero otherwise. 
*/ -static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) +static inline int arch_atomic64_add_unless(atomic64_t *v, long long a, + long long u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); @@ -293,7 +295,7 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) } -static inline int atomic64_inc_not_zero(atomic64_t *v) +static inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; alternative_atomic64(inc_not_zero, "=&a" (r), @@ -301,7 +303,7 @@ static inline int atomic64_inc_not_zero(atomic64_t *v) return r; } -static inline long long atomic64_dec_if_positive(atomic64_t *v) +static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) { long long r; alternative_atomic64(dec_if_positive, "=&A" (r), @@ -313,25 +315,25 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef __alternative_atomic64 #define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ +static inline void arch_atomic64_##op(long long i, atomic64_t *v) \ { \ long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ + while ((old = arch_atomic64_cmpxchg(v, c, c c_op i)) != c) \ c = old; \ } #define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ +static inline long long arch_atomic64_fetch_##op(long long i, atomic64_t *v) \ { \ long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ + while ((old = arch_atomic64_cmpxchg(v, c, c c_op i)) != c) \ c = old; \ return old; \ } ATOMIC64_FETCH_OP(add, +) -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) +#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), (v)) #define ATOMIC64_OPS(op, c_op) \ ATOMIC64_OP(op, c_op) \ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a433c9a9..d4b36753d3b5 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -16,31 +16,31 @@ * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -static inline long atomic64_read(const atomic64_t *v) +static inline long arch_atomic64_read(const atomic64_t *v) { - return READ_ONCE((v)->counter); + return READ_ONCE_NOCHECK((v)->counter); } /** - * atomic64_set - set atomic64 variable + * arch_atomic64_set - set atomic64 variable * @v: pointer to type atomic64_t * @i: required value * * Atomically sets the value of @v to @i. */ -static inline void atomic64_set(atomic64_t *v, long i) +static inline void arch_atomic64_set(atomic64_t *v, long i) { WRITE_ONCE(v->counter, i); } /** - * atomic64_add - add integer to atomic64 variable + * arch_atomic64_add - add integer to atomic64 variable * @i: integer value to add * @v: pointer to type atomic64_t * * Atomically adds @i to @v. */ -static __always_inline void atomic64_add(long i, atomic64_t *v) +static __always_inline void arch_atomic64_add(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) @@ -48,13 +48,13 @@ static __always_inline void atomic64_add(long i, atomic64_t *v) } /** - * atomic64_sub - subtract the atomic64 variable + * arch_atomic64_sub - subtract the atomic64 variable * @i: integer value to subtract * @v: pointer to type atomic64_t * * Atomically subtracts @i from @v. 
*/ -static inline void atomic64_sub(long i, atomic64_t *v) +static inline void arch_atomic64_sub(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" : "=m" (v->counter) @@ -62,7 +62,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) } /** - * atomic64_sub_and_test - subtract value from variable and test result + * arch_atomic64_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer to type atomic64_t * @@ -70,18 +70,18 @@ static inline void atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -static inline bool atomic64_sub_and_test(long i, atomic64_t *v) +static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); } /** - * atomic64_inc - increment atomic64 variable + * arch_atomic64_inc - increment atomic64 variable * @v: pointer to type atomic64_t * * Atomically increments @v by 1. */ -static __always_inline void atomic64_inc(atomic64_t *v) +static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) @@ -89,12 +89,12 @@ static __always_inline void atomic64_inc(atomic64_t *v) } /** - * atomic64_dec - decrement atomic64 variable + * arch_atomic64_dec - decrement atomic64 variable * @v: pointer to type atomic64_t * * Atomically decrements @v by 1. */ -static __always_inline void atomic64_dec(atomic64_t *v) +static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) @@ -102,33 +102,33 @@ static __always_inline void atomic64_dec(atomic64_t *v) } /** - * atomic64_dec_and_test - decrement and test + * arch_atomic64_dec_and_test - decrement and test * @v: pointer to type atomic64_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. */ -static inline bool atomic64_dec_and_test(atomic64_t *v) +static inline bool arch_atomic64_dec_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); } /** - * atomic64_inc_and_test - increment and test + * arch_atomic64_inc_and_test - increment and test * @v: pointer to type atomic64_t * * Atomically increments @v by 1 * and returns true if the result is zero, or false for all * other cases. */ -static inline bool atomic64_inc_and_test(atomic64_t *v) +static inline bool arch_atomic64_inc_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); } /** - * atomic64_add_negative - add and test if negative + * arch_atomic64_add_negative - add and test if negative * @i: integer value to add * @v: pointer to type atomic64_t * @@ -136,44 +136,44 @@ static inline bool atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. 
*/ -static inline bool atomic64_add_negative(long i, atomic64_t *v) +static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); } /** - * atomic64_add_return - add and return + * arch_atomic64_add_return - add and return * @i: integer value to add * @v: pointer to type atomic64_t * * Atomically adds @i to @v and returns @i + @v */ -static __always_inline long atomic64_add_return(long i, atomic64_t *v) +static __always_inline long arch_atomic64_add_return(long i, atomic64_t *v) { return i + xadd(&v->counter, i); } -static inline long atomic64_sub_return(long i, atomic64_t *v) +static inline long arch_atomic64_sub_return(long i, atomic64_t *v) { - return atomic64_add_return(-i, v); + return arch_atomic64_add_return(-i, v); } -static inline long atomic64_fetch_add(long i, atomic64_t *v) +static inline long arch_atomic64_fetch_add(long i, atomic64_t *v) { return xadd(&v->counter, i); } -static inline long atomic64_fetch_sub(long i, atomic64_t *v) +static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v) { return xadd(&v->counter, -i); } -#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) -#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) +#define arch_atomic64_inc_return(v) (arch_atomic64_add_return(1, (v))) +#define arch_atomic64_dec_return(v) (arch_atomic64_sub_return(1, (v))) -static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) +static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new) { - return cmpxchg(&v->counter, old, new); + return arch_cmpxchg(&v->counter, old, new); } #define atomic64_try_cmpxchg atomic64_try_cmpxchg @@ -182,13 +182,13 @@ static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long return try_cmpxchg(&v->counter, old, new); } -static inline long atomic64_xchg(atomic64_t *v, long new) +static inline long arch_atomic64_xchg(atomic64_t *v, long new) { - return xchg(&v->counter, new); + return arch_xchg(&v->counter, new); } /** - * atomic64_add_unless - add unless the number is a given value + * arch_atomic64_add_unless - add unless the number is a given value * @v: pointer of type atomic64_t * @a: the amount to add to v... * @u: ...unless v is equal to u. @@ -196,9 +196,9 @@ static inline long atomic64_xchg(atomic64_t *v, long new) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) +static inline bool arch_atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + long c = arch_atomic64_read(v); do { if (unlikely(c == u)) return false; @@ -206,18 +206,18 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) return true; } -#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) +#define arch_atomic64_inc_not_zero(v) arch_atomic64_add_unless((v), 1, 0) /* - * atomic64_dec_if_positive - decrement by 1 if old value positive + * arch_atomic64_dec_if_positive - decrement by 1 if old value positive * @v: pointer of type atomic_t * * The function returns the old value of *v minus 1, even if * the atomic variable, v, was not decremented. 
*/ -static inline long atomic64_dec_if_positive(atomic64_t *v) +static inline long arch_atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + long dec, c = arch_atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -227,7 +227,7 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) } #define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ +static inline void arch_atomic64_##op(long i, atomic64_t *v) \ { \ asm volatile(LOCK_PREFIX #op"q %1,%0" \ : "+m" (v->counter) \ @@ -236,9 +236,9 @@ static inline void atomic64_##op(long i, atomic64_t *v) \ } #define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ +static inline long arch_atomic64_fetch_##op(long i, atomic64_t *v) \ { \ - long val = atomic64_read(v); \ + long val = arch_atomic64_read(v); \ do { \ } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ return val; \ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index fb961db51a2a..b580f147e048 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -74,7 +74,7 @@ extern void __add_wrong_size(void) * use "asm volatile" and "memory" clobbers to prevent gcc from moving * information around. */ -#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") +#define arch_xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") /* * Atomic compare and exchange. Compare OLD with MEM, if identical, @@ -144,13 +144,13 @@ extern void __add_wrong_size(void) # include <asm/cmpxchg_64.h> #endif -#define cmpxchg(ptr, old, new) \ +#define arch_cmpxchg(ptr, old, new) \ __cmpxchg(ptr, old, new, sizeof(*(ptr))) -#define sync_cmpxchg(ptr, old, new) \ +#define arch_sync_cmpxchg(ptr, old, new) \ __sync_cmpxchg(ptr, old, new, sizeof(*(ptr))) -#define cmpxchg_local(ptr, old, new) \ +#define arch_cmpxchg_local(ptr, old, new) \ __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) @@ -248,10 +248,10 @@ extern void __add_wrong_size(void) __ret; \ }) -#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \ +#define arch_cmpxchg_double(p1, p2, o1, o2, n1, n2) \ __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2) -#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ +#define arch_cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ __cmpxchg_double(, p1, p2, o1, o2, n1, n2) #endif /* ASM_X86_CMPXCHG_H */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index e4959d023af8..d897291d2bf9 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -35,10 +35,10 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) } #ifdef CONFIG_X86_CMPXCHG64 -#define cmpxchg64(ptr, o, n) \ +#define arch_cmpxchg64(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ (unsigned long long)(n))) -#define cmpxchg64_local(ptr, o, n) \ +#define arch_cmpxchg64_local(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ (unsigned long long)(n))) #endif @@ -75,7 +75,7 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) * to simulate the cmpxchg8b on the 80386 and 80486 CPU. 
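For context on the arch_atomic_*/arch_cmpxchg* renames above: they exist so that a generic instrumentation layer, pulled in via the new #include <asm-generic/atomic-instrumented.h> at the bottom of atomic.h, can define the public atomic_*() names on top of the uninstrumented arch_ primitives. A heavily reduced sketch of that layer is shown below (two operations only, as a header fragment rather than a standalone file; the real header covers the whole atomic and cmpxchg API):

```c
/*
 * Simplified sketch of what <asm-generic/atomic-instrumented.h> layers
 * on top of the arch_atomic_*() primitives: each generic helper first
 * reports the access to KASAN and then calls the architecture
 * implementation.
 */
#include <linux/kasan-checks.h>

static __always_inline int atomic_read(const atomic_t *v)
{
	kasan_check_read(v, sizeof(*v));
	return arch_atomic_read(v);
}

static __always_inline void atomic_set(atomic_t *v, int i)
{
	kasan_check_write(v, sizeof(*v));
	arch_atomic_set(v, i);
}
```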
*/ -#define cmpxchg64(ptr, o, n) \ +#define arch_cmpxchg64(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ __typeof__(*(ptr)) __old = (o); \ @@ -92,7 +92,7 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) __ret; }) -#define cmpxchg64_local(ptr, o, n) \ +#define arch_cmpxchg64_local(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ __typeof__(*(ptr)) __old = (o); \ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index caa23a34c963..fafaebacca2d 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -6,13 +6,13 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) *ptr = val; } -#define cmpxchg64(ptr, o, n) \ +#define arch_cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ cmpxchg((ptr), (o), (n)); \ }) -#define cmpxchg64_local(ptr, o, n) \ +#define arch_cmpxchg64_local(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ cmpxchg_local((ptr), (o), (n)); \ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 085c3b300d32..b41ee7a91fe9 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -351,6 +351,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/block/genhd.c b/block/genhd.c index 510aac1486cb..cf67c6f60254 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -889,7 +889,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index dceb5edd1e54..01944419b1f3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -177,15 +177,17 @@ static bool page_same_filled(void *ptr, unsigned long *element) { unsigned int pos; unsigned long *page; + unsigned long val; page = (unsigned long *)ptr; + val = page[0]; - for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) { - if (page[pos] != page[pos + 1]) + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (val != page[pos]) return false; } - *element = page[pos]; + *element = val; return true; } diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 150ce2abf6c8..d3eca879a0a8 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -243,11 +243,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list)); /* Get the physical addresses of the source buffer */ - down_read(¤t->mm->mmap_sem); - num_pinned = get_user_pages(param.local_vaddr - lb_offset, - num_pages, (param.source == -1) ? 0 : FOLL_WRITE, - pages, NULL); - up_read(¤t->mm->mmap_sem); + num_pinned = get_user_pages_unlocked(param.local_vaddr - lb_offset, + num_pages, pages, (param.source == -1) ? 
0 : FOLL_WRITE); if (num_pinned != num_pages) { /* get_user_pages() failed */ diff --git a/firmware/Makefile b/firmware/Makefile index e297e1b52636..fa3e81c2a97b 100644 --- a/firmware/Makefile +++ b/firmware/Makefile @@ -176,7 +176,8 @@ quiet_cmd_fwbin = MK_FW $@ wordsize_deps := $(wildcard include/config/64bit.h include/config/32bit.h \ include/config/ppc32.h include/config/ppc64.h \ include/config/superh32.h include/config/superh64.h \ - include/config/x86_32.h include/config/x86_64.h) + include/config/x86_32.h include/config/x86_64.h \ + firmware/Makefile) $(patsubst %,$(obj)/%.gen.S, $(fw-shipped-y)): %: $(wordsize_deps) $(call cmd,fwbin,$(patsubst %.gen.S,%,$@)) diff --git a/fs/buffer.c b/fs/buffer.c index 9196f2a270da..2ba905716d34 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1938,7 +1938,7 @@ EXPORT_SYMBOL(page_zero_new_buffers); static void iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, - struct iomap *iomap) + const struct iomap *iomap) { loff_t offset = block << inode->i_blkbits; @@ -1991,7 +1991,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, } int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap) + get_block_t *get_block, const struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -546,21 +546,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { + struct inode *inode = mapping->host; struct page *page; int ret; /* Hole page already exists? Return it... */ if (!radix_tree_exceptional_entry(*entry)) { page = *entry; - goto out; + goto finish_fault; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - out: + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + +finish_fault: vmf->page = page; ret = finish_fault(vmf); vmf->page = NULL; @@ -568,8 +572,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry, if (!ret) { /* Grab reference for PTE that is now referencing the page */ get_page(page); - return VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; } +out: + trace_dax_load_hole(inode, vmf, ret); return ret; } @@ -838,6 +844,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); + trace_dax_writeback_one(mapping->host, index, dax.size >> PAGE_SHIFT); unmap: dax_unmap_atomic(bdev, &dax); put_locked_mapping_entry(mapping, index, entry); @@ -873,6 +880,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; + trace_dax_writeback_range(inode, start_index, end_index); + tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); @@ -893,10 +902,12 @@ int dax_writeback_mapping_range(struct address_space *mapping, ret = dax_writeback_one(bdev, mapping, indices[i], pvec.pages[i]); if (ret < 0) - return ret; + goto out; } } - return 0; +out: + trace_dax_writeback_range_done(inode, start_index, end_index); + return (ret < 0 ? 
ret : 0); } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); @@ -921,6 +932,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(ret); *entryp = ret; + trace_dax_insert_mapping(mapping->host, vmf, ret); return vm_insert_mixed(vma, vaddr, dax.pfn); } @@ -932,6 +944,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -941,6 +954,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) if (entry) put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); + trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); @@ -953,6 +967,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) */ finish_mkwrite_fault(vmf); put_locked_mapping_entry(mapping, index, entry); + trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -993,14 +1008,14 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) +static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos) { return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); } static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + const struct iomap *iomap) { struct iov_iter *iter = data; loff_t end = pos + length, done = 0; @@ -1133,13 +1148,16 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, int vmf_ret = 0; void *entry; + trace_dax_pte_fault(inode, vmf, vmf_ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) - return VM_FAULT_SIGBUS; + if (pos >= i_size_read(inode)) { + vmf_ret = VM_FAULT_SIGBUS; + goto out; + } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; @@ -1150,8 +1168,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); - if (error) - return dax_fault_return(error); + if (error) { + vmf_ret = dax_fault_return(error); + goto out; + } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ goto finish_iomap; @@ -1235,6 +1255,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } +out: + trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } diff --git a/fs/dcache.c b/fs/dcache.c index 95d71eda8142..808ea99062c2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3548,8 +3548,6 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. 
*/ @@ -3561,24 +3559,19 @@ static void __init dcache_init_early(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - unsigned int loop; - - /* + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature - * of the dcache. + * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); @@ -3592,14 +3585,11 @@ static void __init dcache_init(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - 0, + HASH_ZERO | HASH_ADAPT, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ diff --git a/fs/inode.c b/fs/inode.c index 88110fd0b282..32c8ee454ef0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1916,8 +1916,6 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -1929,20 +1927,15 @@ void __init inode_init_early(void) sizeof(struct hlist_head), ihash_entries, 14, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - unsigned int loop; - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), @@ -1960,14 +1953,11 @@ void __init inode_init(void) sizeof(struct hlist_head), ihash_entries, 14, - 0, + HASH_ZERO | HASH_ADAPT, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) diff --git a/fs/internal.h b/fs/internal.h index 11c6d89dce9c..cef253ac176c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -42,7 +42,7 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) */ extern void guard_bio_eod(int rw, struct bio *bio); extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap); + get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c @@ -179,7 +179,7 @@ extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); * iomap support: */ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap); + void *data, const struct iomap *iomap); loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, const struct iomap_ops *ops, void *data, diff --git a/fs/iomap.c b/fs/iomap.c index 141c3cd55a8b..fb17dba3dac5 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -108,7 +108,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) static int iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, struct iomap *iomap) + struct page **pagep, const struct iomap *iomap) { pgoff_t index = pos >> PAGE_SHIFT; struct page *page; @@ -151,7 +151,7 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len, static loff_t iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void 
*data, - struct iomap *iomap) + const struct iomap *iomap) { struct iov_iter *i = data; long status = 0; @@ -273,7 +273,7 @@ __iomap_read_page(struct inode *inode, loff_t offset) static loff_t iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + const struct iomap *iomap) { long status = 0; ssize_t written = 0; @@ -338,7 +338,7 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, EXPORT_SYMBOL_GPL(iomap_file_dirty); static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, - unsigned bytes, struct iomap *iomap) + unsigned bytes, const struct iomap *iomap) { struct page *page; int status; @@ -355,7 +355,7 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, } static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, - struct iomap *iomap) + const struct iomap *iomap) { sector_t sector = iomap->blkno + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); @@ -365,7 +365,7 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, - void *data, struct iomap *iomap) + void *data, const struct iomap *iomap) { bool *did_zero = data; loff_t written = 0; @@ -434,7 +434,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page); static loff_t iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, const struct iomap *iomap) { struct page *page = data; int ret; @@ -525,7 +525,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, static loff_t iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + const struct iomap *iomap) { struct fiemap_ctx *ctx = data; loff_t ret = length; @@ -710,7 +710,7 @@ static void iomap_dio_bio_end_io(struct bio *bio) } static blk_qc_t -iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, +iomap_dio_zero(struct iomap_dio *dio, const struct iomap *iomap, loff_t pos, unsigned len) { struct page *page = ZERO_PAGE(0); @@ -734,7 +734,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, static loff_t iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, const struct iomap *iomap) { struct iomap_dio *dio = data; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 92b255e1ba58..7fe5428dd170 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -43,6 +43,7 @@ #include <linux/backing-dev.h> #include <linux/bitops.h> #include <linux/ratelimit.h> +#include <linux/sched/mm.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -206,6 +207,14 @@ static int kjournald2(void *arg) wake_up(&journal->j_wait_done_commit); /* + * Make sure that no allocations from this kernel thread will ever + * recurse to the fs layer because we are responsible for the + * transaction commit and any fs involvement might get stuck waiting for + * the trasn. commit. + */ + memalloc_nofs_save(); + + /* * And now, wait forever for commit wakeup events. 
*/ write_lock(&journal->j_state_lock); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5e659ee08d6a..9ee4832b6f8b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -29,6 +29,7 @@ #include <linux/backing-dev.h> #include <linux/bug.h> #include <linux/module.h> +#include <linux/sched/mm.h> #include <trace/events/jbd2.h> @@ -388,6 +389,11 @@ repeat: rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); + /* + * Ensure that no allocations done while the transaction is open are + * going to recurse back to the fs layer. + */ + handle->saved_alloc_context = memalloc_nofs_save(); return 0; } @@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); + return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: + /* + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); return err; } diff --git a/fs/namespace.c b/fs/namespace.c index cc1375eff88c..c8d4324857bd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3241,7 +3241,6 @@ static void __init init_mount_tree(void) void __init mnt_init(void) { - unsigned u; int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), @@ -3250,22 +3249,17 @@ void __init mnt_init(void) mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, - 0, + HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, - 0, + HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - for (u = 0; u <= m_hash_mask; u++) - INIT_HLIST_HEAD(&mount_hashtable[u]); - for (u = 0; u <= mp_hash_mask; u++) - INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - kernfs_init(); err = sysfs_init(); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d0ab7e56d0b4..13014f83042c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = o2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; + setup_timer(&sc->sc_idle_timeout, o2net_idle_timer, + (unsigned long)sc); sclog(sc, "alloced\n"); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..f0072145eead 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2612,20 +2612,48 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + if (ret == -EEXIST) { + if (oldmle) + __dlm_put_mle(oldmle); + + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + mlog(0, "another process is already migrating it\n"); + goto fail; + } + + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again. 
Because it has been unhasded from the map + * in the function dlm_add_migration_mle. + * Otherwise the memory will be leaked. It will not be found again from + * the hash map. + */ + if (oldmle) { + /* master is known, detach if not already detached */ + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + + /* + * If the type of the mle is BLOCK, it should be put once for + * release. Otherwise a memory leak may be caused because + * oldmle has been unhashed from the hash map and it will not + * be found any more. + */ + if (oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + /* get an extra reference on the mle. * otherwise the assert_master from the new * master will destroy this. */ dlm_get_mle_inuse(mle); + mle_added = 1; + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); - if (ret == -EEXIST) { - mlog(0, "another process is already migrating it\n"); - goto fail; - } - mle_added = 1; - /* * set the MIGRATING flag and flush asts * if we fail after this we need to re-dirty the lockres @@ -2642,12 +2670,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (ret != -EEXIST && oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (ret < 0) { if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); @@ -3182,16 +3204,24 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, if (ret < 0) kmem_cache_free(dlm_mle_cache, mle); + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again because it has been unhashed from the map + * in the dlm_add_migration_mle(). + * Otherwise the memory will be leaked. It will not be found again from + * the hash map. + */ + if (oldmle) { + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); - if (oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (res) dlm_lockres_put(res); leave: diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..908b05942282 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2268,6 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct dlm_lock *lock, *next; unsigned int freed = 0; + struct list_head *queue = NULL; + int i; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2280,31 +2282,19 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ /* TODO: check pending_asts, pending_basts here */ - list_for_each_entry_safe(lock, next, &res->granted, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->converting, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->blocked, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - 
freed++; + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* + * Can't schedule DLM_UNLOCK_FREE_LOCK: do + * manually + */ + dlm_lock_put(lock); + freed++; + } } } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2cc7a8030275..e250910cffc8 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) struct proc_inode *ei; struct inode *inode; - ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f08bd31c1081..f0c8b33d99b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -441,6 +441,7 @@ struct mem_size_stats { unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; + unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; unsigned long swap; @@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; - if (PageAnon(page)) + if (PageAnon(page)) { mss->anonymous += size; + if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + mss->lazyfree += size; + } mss->resident += size; /* Accumulate the size in pages that have been accessed. */ @@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" + "LazyFree: %8lu kB\n" "AnonHugePages: %8lu kB\n" "ShmemPmdMapped: %8lu kB\n" "Shared_Hugetlb: %8lu kB\n" @@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.private_dirty >> 10, mss.referenced >> 10, mss.anonymous >> 10, + mss.lazyfree >> 10, mss.anonymous_thp >> 10, mss.shmem_thp >> 10, mss.shared_hugetlb >> 10, @@ -900,7 +906,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); + pmd_t pmd = *pmdp; + + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 70a5b55e0870..780fc8986dab 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { - unsigned noio_flag = 0; + unsigned nofs_flag = 0; void *ptr; gfp_t lflags; @@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * __vmalloc() will allocate data pages and auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context * here. Hence we need to tell memory reclaim that we are in such a - * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. 
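The LazyFree accounting added to fs/proc/task_mmu.c above counts clean anonymous pages that were marked with madvise(MADV_FREE) but have not yet been reclaimed. A small userspace test (hypothetical, assuming a libc that exposes MADV_FREE) that makes the counter visible in /proc/<pid>/smaps:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;		/* 64MB private anonymous mapping */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xaa, len);			/* fault in and dirty the pages */
	madvise(buf, len, MADV_FREE);		/* mark them lazily freeable */
	printf("check LazyFree in /proc/%d/smaps\n", getpid());
	pause();				/* keep the mapping alive for inspection */
	return 0;
}

Pages are reported as LazyFree only while they stay clean; writing to the buffer again turns them back into ordinary anonymous memory.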
*/ - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) - noio_flag = memalloc_noio_save(); + if (flags & KM_NOFS) + nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) - memalloc_noio_restore(noio_flag); + if (flags & KM_NOFS) + memalloc_nofs_restore(nofs_flag); return ptr; } diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index f0fc84fcaac2..d6ea520162b2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if (flags & KM_NOFS) lflags &= ~__GFP_FS; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c3decedc9455..3059a3ec7ecb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2886,7 +2886,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_FSTRANS; + unsigned long new_pflags = PF_MEMALLOC_NOFS; /* * we are in a transaction context here, but may also be doing work diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 61494295d92f..05eca126c688 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return 0; } @@ -252,7 +252,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. * Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -1016,7 +1016,7 @@ xfs_do_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. */ - if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) goto redirty; /* diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b6208728ba39..ca09061369cb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -443,17 +443,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; - unsigned noio_flag; + unsigned nofs_flag; /* * vm_map_ram() will allocate auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we are likely to be under * GFP_NOFS context here. Hence we need to tell memory reclaim - * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * that we are in such a context via PF_MEMALLOC_NOFS to prevent * memory reclaim re-entering the filesystem here and * potentially deadlocking. 
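The jbd2 and xfs conversions above all follow the same scoped pattern: instead of hard-coding GFP_NOFS at each call site, the section that must not recurse from reclaim back into the filesystem is bracketed with memalloc_nofs_save()/memalloc_nofs_restore(), and every allocation inside the bracket is implicitly demoted to GFP_NOFS. A minimal caller sketch (the function name is made up for illustration):

#include <linux/errno.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

/* Illustrative only; mirrors the pattern used by the conversions above. */
static int my_fs_critical_section(void)
{
	unsigned int nofs_flag;
	void *buf;

	/*
	 * Until the matching restore, direct reclaim triggered by any
	 * allocation in this task may not re-enter the filesystem:
	 * plain GFP_KERNEL requests behave as GFP_NOFS here.
	 */
	nofs_flag = memalloc_nofs_save();

	buf = kmalloc(1024, GFP_KERNEL);
	if (buf) {
		/* ... work done while holding fs locks or an open transaction ... */
		kfree(buf);
	}

	memalloc_nofs_restore(nofs_flag);
	return buf ? 0 : -ENOMEM;
}

jbd2 keeps the value returned by memalloc_nofs_save() in handle->saved_alloc_context so that the scope can stretch from start_this_handle() to jbd2_journal_stop().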
*/ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -461,7 +461,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); if (!bp->b_addr) return -ENOMEM; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 70f42ea86dfb..f5969c8274fc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,7 +134,7 @@ xfs_trans_reserve( bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); /* * Attempt to reserve the needed disk blocks by decrementing @@ -144,7 +144,7 @@ xfs_trans_reserve( if (blocks > 0) { error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; } tp->t_blk_res += blocks; @@ -221,7 +221,7 @@ undo_blocks: tp->t_blk_res = 0; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return error; } @@ -914,7 +914,7 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -944,7 +944,7 @@ out_unreserve: if (commit_lsn == -1 && !error) error = -EIO; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); @@ -998,7 +998,7 @@ xfs_trans_cancel( xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); diff --git a/include/asm-generic/atomic-instrumented.h b/include/asm-generic/atomic-instrumented.h new file mode 100644 index 000000000000..951bcd083925 --- /dev/null +++ b/include/asm-generic/atomic-instrumented.h @@ -0,0 +1,246 @@ +/* + * This file provides wrappers with KASAN instrumentation for atomic operations. + * To use this functionality an arch's atomic.h file needs to define all + * atomic operations with arch_ prefix (e.g. arch_atomic_read()) and include + * this file at the end. This file provides atomic_read() that forwards to + * arch_atomic_read() for actual atomic operation. + * Note: if an arch atomic operation is implemented by means of other atomic + * operations (e.g. atomic_read()/atomic_cmpxchg() loop), then it needs to use + * arch_ variants (i.e. arch_atomic_read()/arch_atomic_cmpxchg()) to avoid + * double instrumentation. 
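As the header comment above says, an architecture adopts the instrumented wrappers by renaming its own primitives with an arch_ prefix and including the generic header last. A rough sketch of the expected layout (not taken from any real architecture; the full set of arch_atomic*(), arch_atomic64_*(), arch_xchg() and arch_cmpxchg() helpers would have to be provided):

/* Hypothetical <asm/atomic.h> excerpt */
static __always_inline int arch_atomic_read(const atomic_t *v)
{
	return READ_ONCE(v->counter);
}

static __always_inline void arch_atomic_set(atomic_t *v, int i)
{
	WRITE_ONCE(v->counter, i);
}

/* ... remaining arch_ operations ... */

/* Included last: it layers the KASAN-checked atomic_*() API on the arch_ ops. */
#include <asm-generic/atomic-instrumented.h>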
+ */ +#ifndef _LINUX_ATOMIC_INSTRUMENTED_H +#define _LINUX_ATOMIC_INSTRUMENTED_H + +#include <linux/kasan-checks.h> + +static __always_inline int atomic_read(const atomic_t *v) +{ + kasan_check_read(v, sizeof(*v)); + return arch_atomic_read(v); +} + +static __always_inline long long atomic64_read(const atomic64_t *v) +{ + kasan_check_read(v, sizeof(*v)); + return arch_atomic64_read(v); +} + + +static __always_inline void atomic_set(atomic_t *v, int i) +{ + kasan_check_write(v, sizeof(*v)); + arch_atomic_set(v, i); +} + +static __always_inline void atomic64_set(atomic64_t *v, long long i) +{ + kasan_check_write(v, sizeof(*v)); + arch_atomic64_set(v, i); +} + +static __always_inline int atomic_xchg(atomic_t *v, int i) +{ + kasan_check_write(v, sizeof(*v)); + return arch_atomic_xchg(v, i); +} + +static __always_inline long long atomic64_xchg(atomic64_t *v, long long i) +{ + kasan_check_write(v, sizeof(*v)); + return arch_atomic64_xchg(v, i); +} + +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + kasan_check_write(v, sizeof(*v)); + return arch_atomic_cmpxchg(v, old, new); +} + +static __always_inline long long atomic64_cmpxchg(atomic64_t *v, long long old, + long long new) +{ + kasan_check_write(v, sizeof(*v)); + return arch_atomic64_cmpxchg(v, old, new); +} + +static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) +{ + kasan_check_write(v, sizeof(*v)); + return __arch_atomic_add_unless(v, a, u); +} + + +static __always_inline bool atomic64_add_unless(atomic64_t *v, long long a, + long long u) +{ + kasan_check_write(v, sizeof(*v)); + return arch_atomic64_add_unless(v, a, u); +} + +static __always_inline short int atomic_inc_short(short int *v) +{ + kasan_check_write(v, sizeof(*v)); + return arch_atomic_inc_short(v); +} + +#define __INSTR_VOID1(op, sz) \ +static __always_inline void atomic##sz##_##op(atomic##sz##_t *v) \ +{ \ + kasan_check_write(v, sizeof(*v)); \ + arch_atomic##sz##_##op(v); \ +} + +#define INSTR_VOID1(op) \ +__INSTR_VOID1(op,); \ +__INSTR_VOID1(op, 64) + +INSTR_VOID1(inc); +INSTR_VOID1(dec); + +#undef __INSTR_VOID1 +#undef INSTR_VOID1 + +#define __INSTR_VOID2(op, sz, type) \ +static __always_inline void atomic##sz##_##op(type i, atomic##sz##_t *v)\ +{ \ + kasan_check_write(v, sizeof(*v)); \ + arch_atomic##sz##_##op(i, v); \ +} + +#define INSTR_VOID2(op) \ +__INSTR_VOID2(op, , int); \ +__INSTR_VOID2(op, 64, long long) + +INSTR_VOID2(add); +INSTR_VOID2(sub); +INSTR_VOID2(and); +INSTR_VOID2(or); +INSTR_VOID2(xor); + +#undef __INSTR_VOID2 +#undef INSTR_VOID2 + +#define __INSTR_RET1(op, sz, type, rtype) \ +static __always_inline rtype atomic##sz##_##op(atomic##sz##_t *v) \ +{ \ + kasan_check_write(v, sizeof(*v)); \ + return arch_atomic##sz##_##op(v); \ +} + +#define INSTR_RET1(op) \ +__INSTR_RET1(op, , int, int); \ +__INSTR_RET1(op, 64, long long, long long) + +INSTR_RET1(inc_return); +INSTR_RET1(dec_return); +__INSTR_RET1(inc_not_zero, 64, long long, long long); +__INSTR_RET1(dec_if_positive, 64, long long, long long); + +#define INSTR_RET_BOOL1(op) \ +__INSTR_RET1(op, , int, bool); \ +__INSTR_RET1(op, 64, long long, bool) + +INSTR_RET_BOOL1(dec_and_test); +INSTR_RET_BOOL1(inc_and_test); + +#undef __INSTR_RET1 +#undef INSTR_RET1 +#undef INSTR_RET_BOOL1 + +#define __INSTR_RET2(op, sz, type, rtype) \ +static __always_inline rtype atomic##sz##_##op(type i, atomic##sz##_t *v) \ +{ \ + kasan_check_write(v, sizeof(*v)); \ + return arch_atomic##sz##_##op(i, v); \ +} + +#define INSTR_RET2(op) \ +__INSTR_RET2(op, , int, int); \ 
+__INSTR_RET2(op, 64, long long, long long) + +INSTR_RET2(add_return); +INSTR_RET2(sub_return); +INSTR_RET2(fetch_add); +INSTR_RET2(fetch_sub); +INSTR_RET2(fetch_and); +INSTR_RET2(fetch_or); +INSTR_RET2(fetch_xor); + +#define INSTR_RET_BOOL2(op) \ +__INSTR_RET2(op, , int, bool); \ +__INSTR_RET2(op, 64, long long, bool) + +INSTR_RET_BOOL2(sub_and_test); +INSTR_RET_BOOL2(add_negative); + +#undef __INSTR_RET2 +#undef INSTR_RET2 +#undef INSTR_RET_BOOL2 + +/* + * In the following macros we need to be careful to not clash with arch_ macros. + * arch_xchg() can be defined as an extended statement expression as well, + * if we define a __ptr variable, and arch_xchg() also defines __ptr variable, + * and we pass __ptr as an argument to arch_xchg(), it will use own __ptr + * instead of ours. This leads to unpleasant crashes. To avoid the problem + * the following macros declare variables with lots of underscores. + */ + +#define xchg(ptr, v) \ +({ \ + __typeof__(ptr) ____ptr = (ptr); \ + kasan_check_write(____ptr, sizeof(*____ptr)); \ + arch_xchg(____ptr, (v)); \ +}) + +#define cmpxchg(ptr, old, new) \ +({ \ + __typeof__(ptr) ___ptr = (ptr); \ + kasan_check_write(___ptr, sizeof(*___ptr)); \ + arch_cmpxchg(___ptr, (old), (new)); \ +}) + +#define sync_cmpxchg(ptr, old, new) \ +({ \ + __typeof__(ptr) ___ptr = (ptr); \ + kasan_check_write(___ptr, sizeof(*___ptr)); \ + arch_sync_cmpxchg(___ptr, (old), (new)); \ +}) + +#define cmpxchg_local(ptr, old, new) \ +({ \ + __typeof__(ptr) ____ptr = (ptr); \ + kasan_check_write(____ptr, sizeof(*____ptr)); \ + arch_cmpxchg_local(____ptr, (old), (new)); \ +}) + +#define cmpxchg64(ptr, old, new) \ +({ \ + __typeof__(ptr) ____ptr = (ptr); \ + kasan_check_write(____ptr, sizeof(*____ptr)); \ + arch_cmpxchg64(____ptr, (old), (new)); \ +}) + +#define cmpxchg64_local(ptr, old, new) \ +({ \ + __typeof__(ptr) ____ptr = (ptr); \ + kasan_check_write(____ptr, sizeof(*____ptr)); \ + arch_cmpxchg64_local(____ptr, (old), (new)); \ +}) + +#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \ +({ \ + __typeof__(p1) ____p1 = (p1); \ + kasan_check_write(____p1, 2 * sizeof(*____p1)); \ + arch_cmpxchg_double(____p1, (p2), (o1), (o2), (n1), (n2)); \ +}) + +#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ +({ \ + __typeof__(p1) ____p1 = (p1); \ + kasan_check_write(____p1, 2 * sizeof(*____p1)); \ + arch_cmpxchg_double_local(____p1, (p2), (o1), (o2), (n1), (n2));\ +}) + +#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 962164d36506..dbaf312b3317 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -358,6 +358,8 @@ extern void *alloc_large_system_hash(const char *tablename, #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ #define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min * shift passed via *_hash_shift */ +#define HASH_ZERO 0x00000004 /* Zero allocated hash table */ +#define HASH_ADAPT 0x00000008 /* Adaptive scale for large memory */ /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 96f1e88b767c..a3ba193f042e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -40,9 +40,9 @@ extern int nr_cpu_ids; #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. 
*/ -#define nr_cpumask_bits nr_cpu_ids +#define nr_cpumask_bits ((unsigned int)nr_cpu_ids) #else -#define nr_cpumask_bits NR_CPUS +#define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* diff --git a/include/linux/gfp.h b/include/linux/gfp.h index db373b9d3223..2bfcfd33e476 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -40,6 +40,11 @@ struct vm_area_struct; #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u +#ifdef CONFIG_LOCKDEP +#define ___GFP_NOLOCKDEP 0x4000000u +#else +#define ___GFP_NOLOCKDEP 0 +#endif /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -179,8 +184,11 @@ struct vm_area_struct; #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) +/* Disable lockdep for GFP context tracking */ +#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) + /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT 25 +#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* @@ -202,8 +210,16 @@ struct vm_area_struct; * * GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. * * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. * * GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. 
It is typically used by hardware diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index dfaa1f4dcb0c..606b6bce3a5b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -491,6 +491,8 @@ struct jbd2_journal_handle unsigned long h_start_jiffies; unsigned int h_requested_credits; + + unsigned int saved_alloc_context; }; diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index b7f8aced7870..41960fecf783 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,11 +2,13 @@ #define _LINUX_KASAN_CHECKS_H #ifdef CONFIG_KASAN -void kasan_check_read(const void *p, unsigned int size); -void kasan_check_write(const void *p, unsigned int size); +void kasan_check_read(const volatile void *p, unsigned int size); +void kasan_check_write(const volatile void *p, unsigned int size); #else -static inline void kasan_check_read(const void *p, unsigned int size) { } -static inline void kasan_check_write(const void *p, unsigned int size) { } +static inline void kasan_check_read(const volatile void *p, unsigned int size) +{ } +static inline void kasan_check_write(const volatile void *p, unsigned int size) +{ } #endif #endif diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d419d0e51fe5..e98e546b543c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -283,6 +283,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e1cfda4bee58..78b44a024eaa 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -61,7 +61,7 @@ static inline void set_page_stable_node(struct page *page, struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); #else /* !CONFIG_KSM */ @@ -94,10 +94,9 @@ static inline int page_referenced_ksm(struct page *page, return 0; } -static inline int rmap_walk_ksm(struct page *page, +static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { - return 0; } static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5af377303880..baa274150210 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -46,6 +46,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ + MEM_CGROUP_STAT_SHMEM, /* # of pages charged as shmem */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ diff --git a/include/linux/mm.h b/include/linux/mm.h index e197d3ca3e8a..a835edd2db34 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,8 @@ struct user_struct; struct writeback_control; struct bdi_writeback; +void init_mm_internals(void); + #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't 
use mapnrs, do it properly */ extern unsigned long max_mapnr; diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 51891fb0d3ce..c91b3bcd158f 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -394,18 +394,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pud; \ }) -#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ - pmd_t ___pmd; \ - \ - ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ - mmu_notifier_invalidate_range(__mm, ___haddr, \ - ___haddr + HPAGE_PMD_SIZE); \ - \ - ___pmd; \ -}) - /* * set_pte_at_notify() sets the pte _after_ running the notifier. * This is safe to start by updating the secondary MMUs, because the primary MMU @@ -489,7 +477,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush -#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8e02b3750fe0..618499159a7c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,7 +35,7 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 -enum { +enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, @@ -74,6 +74,11 @@ extern char * const migratetype_names[MIGRATE_TYPES]; # define is_migrate_cma_page(_page) false #endif +static inline bool is_migrate_movable(int mt) +{ + return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; +} + #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) @@ -149,7 +154,6 @@ enum node_stat_item { NR_UNEVICTABLE, /* " " " " " */ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ - NR_PAGES_SCANNED, /* pages scanned since last reclaim */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, @@ -630,6 +634,8 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_classzone_idx; + int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 047d64706f2a..d4cd2014fa6f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,10 +33,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); -int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, - int migratetype); + int migratetype, int *num_movable); /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. 
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 8c89e902df3e..43ef2c30cb0f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,19 +83,17 @@ struct anon_vma_chain { }; enum ttu_flags { - TTU_UNMAP = 1, /* unmap mode */ - TTU_MIGRATION = 2, /* migration mode */ - TTU_MUNLOCK = 4, /* munlock mode */ - TTU_LZFREE = 8, /* lazy free mode */ - TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */ - - TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ - TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ - TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ - TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible + TTU_MIGRATION = 0x1, /* migration mode */ + TTU_MUNLOCK = 0x2, /* munlock mode */ + + TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ + TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ + TTU_IGNORE_ACCESS = 0x10, /* don't age */ + TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ + TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ - TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock: + TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock: * caller holds it */ }; @@ -193,9 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) - -int try_to_unmap(struct page *, enum ttu_flags flags); +bool try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) @@ -239,7 +235,7 @@ int page_mkclean(struct page *); * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ -int try_to_munlock(struct page *); +void try_to_munlock(struct page *); void remove_migration_ptes(struct page *old, struct page *new, bool locked); @@ -261,15 +257,19 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; - int (*rmap_one)(struct page *page, struct vm_area_struct *vma, + /* + * Return false if page table scanning in rmap_walk should be stopped. + * Otherwise, return true. 
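Under the new contract in the rmap changes here (try_to_unmap() and the ->rmap_one() callback now return bool, rmap_walk() itself returns void), a walker stops the scan by returning false from its callback and reports any result through ->arg. A rough caller sketch (the helper and struct names are made up for illustration):

#include <linux/rmap.h>

struct first_mapping {				/* illustrative result carrier */
	struct vm_area_struct *vma;
	unsigned long address;
};

static bool record_first_mapping(struct page *page, struct vm_area_struct *vma,
				 unsigned long address, void *arg)
{
	struct first_mapping *fm = arg;

	fm->vma = vma;
	fm->address = address;
	return false;				/* false: stop after the first mapping */
}

static void find_first_mapping(struct page *page, struct first_mapping *fm)
{
	struct rmap_walk_control rwc = {
		.rmap_one = record_first_mapping,
		.arg = fm,
	};

	fm->vma = NULL;
	rmap_walk(page, &rwc);			/* returns void; outcome lives in *fm */
}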
+ */ + bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -int rmap_walk(struct page *page, struct rmap_walk_control *rwc); -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ @@ -285,7 +285,7 @@ static inline int page_referenced(struct page *page, int is_locked, return 0; } -#define try_to_unmap(page, refs) SWAP_FAIL +#define try_to_unmap(page, refs) false static inline int page_mkclean(struct page *page) { @@ -295,13 +295,4 @@ static inline int page_mkclean(struct page *page) #endif /* CONFIG_MMU */ -/* - * Return values of try_to_unmap - */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 -#define SWAP_MLOCK 3 -#define SWAP_LZFREE 4 - #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h index ea05f6c51413..84766bcdd01f 100644 --- a/include/linux/rodata_test.h +++ b/include/linux/rodata_test.h @@ -14,7 +14,6 @@ #define _RODATA_TEST_H #ifdef CONFIG_DEBUG_RODATA_TEST -extern const int rodata_test_data; void rodata_test(void); #else static inline void rodata_test(void) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c2045decbc9..9f8c9857abc0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1222,9 +1222,9 @@ extern struct pid *cad_pid; #define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ -#define PF_FSTRANS 0x00020000 /* Inside a filesystem transaction */ -#define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 830953ebb391..9daabe138c99 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk) return ret; } -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags - * __GFP_FS is also cleared as it implies __GFP_IO. +/* + * Applies per-task gfp context to the given allocation flags. 
+ * PF_MEMALLOC_NOIO implies GFP_NOIO + * PF_MEMALLOC_NOFS implies GFP_NOFS */ -static inline gfp_t memalloc_noio_flags(gfp_t flags) +static inline gfp_t current_gfp_context(gfp_t flags) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precendence + */ if (unlikely(current->flags & PF_MEMALLOC_NOIO)) flags &= ~(__GFP_IO | __GFP_FS); + else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) + flags &= ~__GFP_FS; return flags; } @@ -171,4 +179,16 @@ static inline void memalloc_noio_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } +static inline unsigned int memalloc_nofs_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; + current->flags |= PF_MEMALLOC_NOFS; + return flags; +} + +static inline void memalloc_nofs_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 45e91dd6716d..486494e6b2fc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); -extern void deactivate_page(struct page *page); +extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a80b7b59cf33..d84ae90ccd5c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), FOR_ALL_ZONES(ALLOCSTALL), FOR_ALL_ZONES(PGSCAN_SKIP), - PGFREE, PGACTIVATE, PGDEACTIVATE, + PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, PGREFILL, diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index c566ddc87f73..08bb3ed18dcc 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -150,6 +150,136 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); +DECLARE_EVENT_CLASS(dax_pte_fault_class, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), + TP_ARGS(inode, vmf, result), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(pgoff_t, pgoff) + __field(dev_t, dev) + __field(unsigned int, flags) + __field(int, result) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->flags = vmf->flags; + __entry->pgoff = vmf->pgoff; + __entry->result = result; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx pgoff %#lx %s", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? 
"shared" : "private", + __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), + __entry->address, + __entry->pgoff, + __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE) + ) +) + +#define DEFINE_PTE_FAULT_EVENT(name) \ +DEFINE_EVENT(dax_pte_fault_class, name, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), \ + TP_ARGS(inode, vmf, result)) + +DEFINE_PTE_FAULT_EVENT(dax_pte_fault); +DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); +DEFINE_PTE_FAULT_EVENT(dax_load_hole); + +TRACE_EVENT(dax_insert_mapping, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), + TP_ARGS(inode, vmf, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(void *, radix_entry) + __field(dev_t, dev) + __field(int, write) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->write = vmf->flags & FAULT_FLAG_WRITE; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->write ? "write" : "read", + __entry->address, + (unsigned long)__entry->radix_entry + ) +) + +DECLARE_EVENT_CLASS(dax_writeback_range_class, + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index), + TP_ARGS(inode, start_index, end_index), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, start_index) + __field(pgoff_t, end_index) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start_index = start_index; + __entry->end_index = end_index; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx-%#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->start_index, + __entry->end_index + ) +) + +#define DEFINE_WRITEBACK_RANGE_EVENT(name) \ +DEFINE_EVENT(dax_writeback_range_class, name, \ + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index),\ + TP_ARGS(inode, start_index, end_index)) + +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range); +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range_done); + +TRACE_EVENT(dax_writeback_one, + TP_PROTO(struct inode *inode, pgoff_t pgoff, pgoff_t pglen), + TP_ARGS(inode, pgoff, pglen), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, pgoff) + __field(pgoff_t, pglen) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgoff = pgoff; + __entry->pglen = pglen; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx pglen %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->pglen + ) +) + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index e13d48058b8d..c6602bdd2a10 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -26,6 +26,10 @@ #include <linux/types.h> #include <linux/compiler.h> +#ifndef __KERNEL__ +#include <stddef.h> /* For size_t. */ +#endif + #define CTL_MAXNAME 10 /* how many path components do we allow in a call to sysctl? 
In other words, what is the largest acceptable value for the nlen diff --git a/init/do_mounts.h b/init/do_mounts.h index 067af1d9e8b6..282d65bfd674 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -19,29 +19,15 @@ static inline int create_dev(char *name, dev_t dev) return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); } -#if BITS_PER_LONG == 32 static inline u32 bstat(char *name) { - struct stat64 stat; - if (sys_stat64(name, &stat) != 0) + struct kstat stat; + if (vfs_stat(name, &stat) != 0) return 0; - if (!S_ISBLK(stat.st_mode)) + if (!S_ISBLK(stat.mode)) return 0; - if (stat.st_rdev != (u32)stat.st_rdev) - return 0; - return stat.st_rdev; -} -#else -static inline u32 bstat(char *name) -{ - struct stat stat; - if (sys_newstat(name, &stat) != 0) - return 0; - if (!S_ISBLK(stat.st_mode)) - return 0; - return stat.st_rdev; + return stat.rdev; } -#endif #ifdef CONFIG_BLK_DEV_RAM diff --git a/init/initramfs.c b/init/initramfs.c index 981f286c1d16..a5b686696535 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -312,10 +312,10 @@ static int __init maybe_link(void) static void __init clean_path(char *path, umode_t fmode) { - struct stat st; + struct kstat st; - if (!sys_newlstat(path, &st) && (st.st_mode ^ fmode) & S_IFMT) { - if (S_ISDIR(st.st_mode)) + if (!vfs_lstat(path, &st) && (st.mode ^ fmode) & S_IFMT) { + if (S_ISDIR(st.mode)) sys_rmdir(path); else sys_unlink(path); @@ -581,13 +581,13 @@ static void __init clean_rootfs(void) num = sys_getdents64(fd, dirp, BUF_SIZE); while (num > 0) { while (num > 0) { - struct stat st; + struct kstat st; int ret; - ret = sys_newlstat(dirp->d_name, &st); + ret = vfs_lstat(dirp->d_name, &st); WARN_ON_ONCE(ret); if (!ret) { - if (S_ISDIR(st.st_mode)) + if (S_ISDIR(st.mode)) sys_rmdir(dirp->d_name); else sys_unlink(dirp->d_name); @@ -611,7 +611,7 @@ static int __init populate_rootfs(void) char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic("%s", err); /* Failed to decompress INTERNAL initramfs */ - if (initrd_start) { + if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { #ifdef CONFIG_BLK_DEV_RAM int fd; printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); diff --git a/init/main.c b/init/main.c index f9c9d9948203..b0c11cbf5ddf 100644 --- a/init/main.c +++ b/init/main.c @@ -1022,6 +1022,8 @@ static noinline void __init kernel_init_freeable(void) workqueue_init(); + init_mm_internals(); + do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f0f8e2a8496f..751593ed7c0b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; +static bool hung_task_show_lock; static struct task_struct *watchdog_task; @@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_all_locks(); + hung_task_show_lock = true; } touch_nmi_watchdog(); if (sysctl_hung_task_panic) { + if (hung_task_show_lock) + debug_show_all_locks(); trigger_all_cpu_backtrace(); panic("hung_task: blocked tasks"); } @@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; + hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { if (!max_count--) 
@@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); + if (hung_task_show_lock) + debug_show_all_locks(); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 23ca53e8d3fb..c0e31bfee25c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/sched/task.h> +#include <linux/sched/mm.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/proc_fs.h> @@ -2876,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; + gfp_mask = current_gfp_context(gfp_mask); + /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; @@ -2885,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) + if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) return; /* @@ -2894,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; + /* Disable lockdep if explicitly requested */ + if (gfp_mask & __GFP_NOLOCKDEP) + return; + mark_held_locks(curr, RECLAIM_FS); } @@ -3947,7 +3954,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = gfp_mask; + current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); } EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e6b2f7ad3e51..4ccfcaae5b89 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void) */ pv_lock_hash = alloc_large_system_hash("PV qspinlock", sizeof(struct pv_hash_entry), - pv_hash_size, 0, HASH_EARLY, + pv_hash_size, 0, + HASH_EARLY | HASH_ZERO, &pv_lock_hash_bits, NULL, pv_hash_size, pv_hash_size); } diff --git a/kernel/pid.c b/kernel/pid.c index 0143ac0ddceb..a1f8459c70ac 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -573,16 +573,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int i, pidhash_size; + unsigned int pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL, + HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); pidhash_size = 1U << pidhash_shift; - - for (i = 0; i < pidhash_size; i++) - INIT_HLIST_HEAD(&pid_hash[i]); } void __init pidmap_init(void) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8a5e44236f78..4559e914452b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -30,6 +30,7 @@ #include <linux/pid_namespace.h> #include <net/genetlink.h> #include <linux/atomic.h> +#include <linux/sched/cputime.h> /* * Maximum length of a cpumask that can be specified in @@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; + u64 delta, utime, stime; + u64 start_time; /* * Add additional stats from live tasks except zombie thread group @@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) memset(stats, 0, sizeof(*stats)); tsk = first; + start_time 
= ktime_get_ns(); do { if (tsk->exit_state) continue; @@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) */ delayacct_add_tsk(stats, tsk); + /* calculate task elapsed time in nsec */ + delta = start_time - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime += delta; + + task_cputime(tsk, &utime, &stime); + stats->ac_utime += div_u64(utime, NSEC_PER_USEC); + stats->ac_stime += div_u64(stime, NSEC_PER_USEC); + stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; } while_each_thread(first, tsk); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 22c75bcfb498..77fadface4f9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1719,19 +1719,21 @@ config LKDTM Documentation/fault-injection/provoke-crashes.txt config TEST_LIST_SORT - bool "Linked list sorting test" - depends on DEBUG_KERNEL + tristate "Linked list sorting test" + depends on DEBUG_KERNEL || m help Enable this to turn on 'list_sort()' function test. This test is - executed only once during system boot, so affects only boot time. + executed only once during system boot (so affects only boot time), + or at module load time. If unsure, say N. config TEST_SORT - bool "Array-based sort test" - depends on DEBUG_KERNEL + tristate "Array-based sort test" + depends on DEBUG_KERNEL || m help - This option enables the self-test function of 'sort()' at boot. + This option enables the self-test function of 'sort()' at boot, + or at module load time. If unsure, say N. diff --git a/lib/Makefile b/lib/Makefile index acbc16bed9af..0f64ef3956bf 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o +obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_SORT) += test_sort.o diff --git a/lib/list_sort.c b/lib/list_sort.c index 3fe401067e20..9e9acc37652f 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,6 +1,3 @@ - -#define pr_fmt(fmt) "list_sort_test: " fmt - #include <linux/kernel.h> #include <linux/bug.h> #include <linux/compiler.h> @@ -145,149 +142,3 @@ void list_sort(void *priv, struct list_head *head, merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); } EXPORT_SYMBOL(list_sort); - -#ifdef CONFIG_TEST_LIST_SORT - -#include <linux/slab.h> -#include <linux/random.h> - -/* - * The pattern of set bits in the list length determines which cases - * are hit in list_sort(). 
- */ -#define TEST_LIST_LEN (512+128+2) /* not including head */ - -#define TEST_POISON1 0xDEADBEEF -#define TEST_POISON2 0xA324354C - -struct debug_el { - unsigned int poison1; - struct list_head list; - unsigned int poison2; - int value; - unsigned serial; -}; - -/* Array, containing pointers to all elements in the test list */ -static struct debug_el **elts __initdata; - -static int __init check(struct debug_el *ela, struct debug_el *elb) -{ - if (ela->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", ela->serial); - return -EINVAL; - } - if (elb->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", elb->serial); - return -EINVAL; - } - if (elts[ela->serial] != ela || elts[elb->serial] != elb) { - pr_err("error: phantom element\n"); - return -EINVAL; - } - if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - ela->poison1, ela->poison2); - return -EINVAL; - } - if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - elb->poison1, elb->poison2); - return -EINVAL; - } - return 0; -} - -static int __init cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct debug_el *ela, *elb; - - ela = container_of(a, struct debug_el, list); - elb = container_of(b, struct debug_el, list); - - check(ela, elb); - return ela->value - elb->value; -} - -static int __init list_sort_test(void) -{ - int i, count = 1, err = -ENOMEM; - struct debug_el *el; - struct list_head *cur; - LIST_HEAD(head); - - pr_debug("start testing list_sort()\n"); - - elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); - if (!elts) { - pr_err("error: cannot allocate memory\n"); - return err; - } - - for (i = 0; i < TEST_LIST_LEN; i++) { - el = kmalloc(sizeof(*el), GFP_KERNEL); - if (!el) { - pr_err("error: cannot allocate memory\n"); - goto exit; - } - /* force some equivalencies */ - el->value = prandom_u32() % (TEST_LIST_LEN / 3); - el->serial = i; - el->poison1 = TEST_POISON1; - el->poison2 = TEST_POISON2; - elts[i] = el; - list_add_tail(&el->list, &head); - } - - list_sort(NULL, &head, cmp); - - err = -EINVAL; - for (cur = head.next; cur->next != &head; cur = cur->next) { - struct debug_el *el1; - int cmp_result; - - if (cur->next->prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - cmp_result = cmp(NULL, cur, cur->next); - if (cmp_result > 0) { - pr_err("error: list is not sorted\n"); - goto exit; - } - - el = container_of(cur, struct debug_el, list); - el1 = container_of(cur->next, struct debug_el, list); - if (cmp_result == 0 && el->serial >= el1->serial) { - pr_err("error: order of equivalent elements not " - "preserved\n"); - goto exit; - } - - if (check(el, el1)) { - pr_err("error: element check failed\n"); - goto exit; - } - count++; - } - if (head.prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - - if (count != TEST_LIST_LEN) { - pr_err("error: bad list length %d", count); - goto exit; - } - - err = 0; -exit: - for (i = 0; i < TEST_LIST_LEN; i++) - kfree(elts[i]); - kfree(elts); - return err; -} -late_initcall(list_sort_test); -#endif /* CONFIG_TEST_LIST_SORT */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 691a9ad48497..898e87998417 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu) void __init radix_tree_init(void) { int ret; + + BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); radix_tree_node_cachep = 
kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c new file mode 100644 index 000000000000..28e817387b04 --- /dev/null +++ b/lib/test_list_sort.c @@ -0,0 +1,150 @@ +#define pr_fmt(fmt) "list_sort_test: " fmt + +#include <linux/kernel.h> +#include <linux/list_sort.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/random.h> + +/* + * The pattern of set bits in the list length determines which cases + * are hit in list_sort(). + */ +#define TEST_LIST_LEN (512+128+2) /* not including head */ + +#define TEST_POISON1 0xDEADBEEF +#define TEST_POISON2 0xA324354C + +struct debug_el { + unsigned int poison1; + struct list_head list; + unsigned int poison2; + int value; + unsigned serial; +}; + +/* Array, containing pointers to all elements in the test list */ +static struct debug_el **elts __initdata; + +static int __init check(struct debug_el *ela, struct debug_el *elb) +{ + if (ela->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", ela->serial); + return -EINVAL; + } + if (elb->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", elb->serial); + return -EINVAL; + } + if (elts[ela->serial] != ela || elts[elb->serial] != elb) { + pr_err("error: phantom element\n"); + return -EINVAL; + } + if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); + return -EINVAL; + } + if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); + return -EINVAL; + } + return 0; +} + +static int __init cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct debug_el *ela, *elb; + + ela = container_of(a, struct debug_el, list); + elb = container_of(b, struct debug_el, list); + + check(ela, elb); + return ela->value - elb->value; +} + +static int __init list_sort_test(void) +{ + int i, count = 1, err = -ENOMEM; + struct debug_el *el; + struct list_head *cur; + LIST_HEAD(head); + + pr_debug("start testing list_sort()\n"); + + elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); + if (!elts) { + pr_err("error: cannot allocate memory\n"); + return err; + } + + for (i = 0; i < TEST_LIST_LEN; i++) { + el = kmalloc(sizeof(*el), GFP_KERNEL); + if (!el) { + pr_err("error: cannot allocate memory\n"); + goto exit; + } + /* force some equivalencies */ + el->value = prandom_u32() % (TEST_LIST_LEN / 3); + el->serial = i; + el->poison1 = TEST_POISON1; + el->poison2 = TEST_POISON2; + elts[i] = el; + list_add_tail(&el->list, &head); + } + + list_sort(NULL, &head, cmp); + + err = -EINVAL; + for (cur = head.next; cur->next != &head; cur = cur->next) { + struct debug_el *el1; + int cmp_result; + + if (cur->next->prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + cmp_result = cmp(NULL, cur, cur->next); + if (cmp_result > 0) { + pr_err("error: list is not sorted\n"); + goto exit; + } + + el = container_of(cur, struct debug_el, list); + el1 = container_of(cur->next, struct debug_el, list); + if (cmp_result == 0 && el->serial >= el1->serial) { + pr_err("error: order of equivalent elements not " + "preserved\n"); + goto exit; + } + + if (check(el, el1)) { + pr_err("error: element check failed\n"); + goto exit; + } + count++; + } + if (head.prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + + if 
(count != TEST_LIST_LEN) { + pr_err("error: bad list length %d", count); + goto exit; + } + + err = 0; +exit: + for (i = 0; i < TEST_LIST_LEN; i++) + kfree(elts[i]); + kfree(elts); + return err; +} +module_init(list_sort_test); +MODULE_LICENSE("GPL"); diff --git a/lib/test_sort.c b/lib/test_sort.c index 4db3911db50a..d389c1cc2f6c 100644 --- a/lib/test_sort.c +++ b/lib/test_sort.c @@ -1,11 +1,8 @@ #include <linux/sort.h> #include <linux/slab.h> -#include <linux/init.h> +#include <linux/module.h> -/* - * A simple boot-time regression test - * License: GPL - */ +/* a simple boot-time regression test */ #define TEST_LEN 1000 @@ -41,4 +38,6 @@ exit: kfree(a); return err; } -subsys_initcall(test_sort_init); + +module_init(test_sort_init); +MODULE_LICENSE("GPL"); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 40d3cb42b0e7..21dbdf05dfdf 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1477,6 +1477,9 @@ int kptr_restrict __read_mostly; * by an extra set of alphanumeric characters that are extended format * specifiers. * + * Please update scripts/checkpatch.pl when adding/removing conversion + * characters. (Search for "check for vsprintf extension"). + * * Right now we handle: * * - 'F' For symbolic function descriptor pointers with offset diff --git a/mm/compaction.c b/mm/compaction.c index 81e1eaa2a2cf..613c59e928cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,11 +89,6 @@ static void map_pages(struct list_head *list) list_splice(&tmp_list, list); } -static inline bool migrate_async_suitable(int migratetype) -{ - return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; -} - #ifdef CONFIG_COMPACTION int PageMovable(struct page *page) @@ -988,13 +983,26 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct compact_control *cc, +static bool suitable_migration_source(struct compact_control *cc, struct page *page) { - if (cc->ignore_block_suitable) + int block_mt; + + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; + block_mt = get_pageblock_migratetype(page); + + if (cc->migratetype == MIGRATE_MOVABLE) + return is_migrate_movable(block_mt); + else + return block_mt == cc->migratetype; +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct compact_control *cc, + struct page *page) +{ /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1006,8 +1014,11 @@ static bool suitable_migration_target(struct compact_control *cc, return false; } + if (cc->ignore_block_suitable) + return true; + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(get_pageblock_migratetype(page))) + if (is_migrate_movable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -1242,8 +1253,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * Async compaction is optimistic to see if the minimum amount * of work satisfies the allocation. 
*/ - if (cc->mode == MIGRATE_ASYNC && - !migrate_async_suitable(get_pageblock_migratetype(page))) + if (!suitable_migration_source(cc, page)) continue; /* Perform the isolation */ @@ -1276,11 +1286,11 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result __compact_finished(struct zone *zone, + struct compact_control *cc) { unsigned int order; - unsigned long watermark; + const int migratetype = cc->migratetype; if (cc->contended || fatal_signal_pending(current)) return COMPACT_CONTENDED; @@ -1308,12 +1318,16 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; - /* Compaction run is not finished if the watermark is not met */ - watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; - - if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, - cc->alloc_flags)) - return COMPACT_CONTINUE; + if (cc->finishing_block) { + /* + * We have finished the pageblock, but better check again that + * we really succeeded. + */ + if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + cc->finishing_block = false; + else + return COMPACT_CONTINUE; + } /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { @@ -1335,20 +1349,40 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) - return COMPACT_SUCCESS; + true, &can_steal) != -1) { + + /* movable pages are OK in any pageblock */ + if (migratetype == MIGRATE_MOVABLE) + return COMPACT_SUCCESS; + + /* + * We are stealing for a non-movable allocation. Make + * sure we finish compacting the current pageblock + * first so it is as free as possible and we won't + * have to steal another one soon. This only applies + * to sync compaction, as async compaction operates + * on pageblocks of the same migratetype. 
+ */ + if (cc->mode == MIGRATE_ASYNC || + IS_ALIGNED(cc->migrate_pfn, + pageblock_nr_pages)) { + return COMPACT_SUCCESS; + } + + cc->finishing_block = true; + return COMPACT_CONTINUE; + } } return COMPACT_NO_SUITABLE_PAGE; } static enum compact_result compact_finished(struct zone *zone, - struct compact_control *cc, - const int migratetype) + struct compact_control *cc) { int ret; - ret = __compact_finished(zone, cc, migratetype); + ret = __compact_finished(zone, cc); trace_mm_compaction_finished(zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; @@ -1481,9 +1515,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); - const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ @@ -1533,8 +1567,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro migrate_prep_local(); - while ((ret = compact_finished(zone, cc, migratetype)) == - COMPACT_CONTINUE) { + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { int err; switch (isolate_migratepages(zone, cc)) { diff --git a/mm/filemap.c b/mm/filemap.c index 1694623a6289..68b166a9eda0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2202,12 +2202,12 @@ int filemap_fault(struct vm_fault *vmf) struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; + pgoff_t max_off; struct page *page; - loff_t size; int ret = 0; - size = round_up(i_size_read(inode), PAGE_SIZE); - if (offset >= size >> PAGE_SHIFT) + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) return VM_FAULT_SIGBUS; /* @@ -2256,8 +2256,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. 
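On the filemap_fault()/filemap_map_pages() conversion above (the same hunk continues right after this note): the old round_up(i_size, PAGE_SIZE) >> PAGE_SHIFT expression and the new DIV_ROUND_UP(i_size, PAGE_SIZE) both compute the number of pages needed to cover i_size; the new form simply works in page units throughout. A small userspace sketch, with PAGE_SHIFT hard-coded purely for illustration, showing the two forms agree:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

/* Old form: round the size up to a page boundary, then convert to pages. */
static unsigned long long pages_old(unsigned long long size)
{
	unsigned long long rounded = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
	return rounded >> PAGE_SHIFT;
}

/* New form: DIV_ROUND_UP(size, PAGE_SIZE), computed directly in pages. */
static unsigned long long pages_new(unsigned long long size)
{
	return (size + PAGE_SIZE - 1) / PAGE_SIZE;
}

int main(void)
{
	unsigned long long sizes[] = { 0, 1, PAGE_SIZE, PAGE_SIZE + 1, 12345678 };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%llu bytes -> old %llu pages, new %llu pages\n",
		       sizes[i], pages_old(sizes[i]), pages_new(sizes[i]));
	return 0;
}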
*/ - size = round_up(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= size >> PAGE_SHIFT)) { + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) { unlock_page(page); put_page(page); return VM_FAULT_SIGBUS; @@ -2323,7 +2323,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - loff_t size; + unsigned long max_idx; struct page *head, *page; rcu_read_lock(); @@ -2369,8 +2369,8 @@ repeat: if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = round_up(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= size >> PAGE_SHIFT) + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (page->index >= max_idx) goto unlock; if (file->f_ra.mmap_miss > 0) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ebc93e179f3..d14dd961f626 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1564,18 +1564,16 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ClearPageDirty(page); unlock_page(page); - if (PageActive(page)) - deactivate_page(page); - if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + + mark_page_lazyfree(page); ret = true; out: spin_unlock(ptl); @@ -1724,37 +1722,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - int ret = 0; + pmd_t entry; + bool preserve_write; + int ret; ptl = __pmd_trans_huge_lock(pmd, vma); - if (ptl) { - pmd_t entry; - bool preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; + if (!ptl) + return 0; - /* - * Avoid trapping faults against the zero page. The read-only - * data is likely to be read-cached on the local CPU and - * local/remote hits to the zero page are not interesting. - */ - if (prot_numa && is_huge_zero_pmd(*pmd)) { - spin_unlock(ptl); - return ret; - } + preserve_write = prot_numa && pmd_write(*pmd); + ret = 1; - if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); - entry = pmd_modify(entry, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); - ret = HPAGE_PMD_NR; - set_pmd_at(mm, addr, pmd, entry); - BUG_ON(vma_is_anonymous(vma) && !preserve_write && - pmd_write(entry)); - } - spin_unlock(ptl); - } + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) + goto unlock; + + if (prot_numa && pmd_protnone(*pmd)) + goto unlock; + + /* + * In case prot_numa, we are under down_read(mmap_sem). It's critical + * to not clear pmd intermittently to avoid race with MADV_DONTNEED + * which is also under down_read(mmap_sem): + * + * CPU0: CPU1: + * change_huge_pmd(prot_numa=1) + * pmdp_huge_get_and_clear_notify() + * madvise_dontneed() + * zap_pmd_range() + * pmd_trans_huge(*pmd) == 0 (without ptl) + * // skip the pmd + * set_pmd_at(); + * // pmd is re-established + * + * The race makes MADV_DONTNEED miss the huge pmd and don't clear it + * which may break userspace. 
+ * + * pmdp_invalidate() is required to make sure we don't miss + * dirty/young flags set by hardware. + */ + entry = *pmd; + pmdp_invalidate(vma, addr, pmd); + + /* + * Recover dirty/young flags. It relies on pmdp_invalidate to not + * corrupt them. + */ + if (pmd_dirty(*pmd)) + entry = pmd_mkdirty(entry); + if (pmd_young(*pmd)) + entry = pmd_mkyoung(entry); + entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mk_savedwrite(entry); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); +unlock: + spin_unlock(ptl); return ret; } @@ -2114,15 +2144,15 @@ static void freeze_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - int ret; + bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_MIGRATION; - ret = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(ret, page); + unmap_success = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(!unmap_success, page); } static void unfreeze_page(struct page *page) @@ -2368,7 +2398,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); if (PageAnon(head)) { diff --git a/mm/internal.h b/mm/internal.h index ccfc2a2969f4..0e4f558412fb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -81,11 +81,16 @@ static inline void set_page_refcounted(struct page *page) extern unsigned long highest_memmap_pfn; /* + * Maximum number of reclaim retries without progress before the OOM + * killer is consider the only way forward. + */ +#define MAX_RECLAIM_RETRIES 16 + +/* * in mm/vmscan.c: */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: @@ -178,6 +183,7 @@ extern int user_min_free_kbytes; struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ + struct zone *zone; unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long total_migrate_scanned; @@ -185,17 +191,18 @@ struct compact_control { unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + int order; /* order a direct compactor needs */ + int migratetype; /* migratetype of direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... 
*/ bool whole_zone; /* Whole zone should/has been scanned */ - int order; /* order a direct compactor needs */ - const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ - struct zone *zone; bool contended; /* Signal lock or sched contention */ + bool finishing_block; /* Finishing current pageblock */ }; unsigned long @@ -481,6 +488,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, enum ttu_flags; struct tlbflush_unmap_batch; + +/* + * only for MM internal work items which do not depend on + * any allocations or locks which might depend on allocations + */ +extern struct workqueue_struct *mm_percpu_wq; + #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); @@ -498,4 +512,14 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; +static inline bool is_migrate_highatomic(enum migratetype migratetype) +{ + return migratetype == MIGRATE_HIGHATOMIC; +} + +static inline bool is_migrate_highatomic_page(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 9249649baa69..42b178148f9c 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -333,13 +333,13 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } -void kasan_check_read(const void *p, unsigned int size) +void kasan_check_read(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(kasan_check_read); -void kasan_check_write(const void *p, unsigned int size) +void kasan_check_write(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, true, _RET_IP_); } @@ -579,7 +579,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { - kasan_report_double_free(cache, object, shadow_byte); + kasan_report_double_free(cache, object, + __builtin_return_address(1)); return true; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1c260e6b3b3c..75729173ade9 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -104,7 +104,7 @@ static inline bool kasan_report_enabled(void) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow); + void *ip); #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index f479365530b6..718a10a48a19 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -49,7 +49,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size) return first_bad_addr; } -static void print_error_description(struct kasan_access_info *info) +static bool addr_has_shadow(struct kasan_access_info *info) +{ + return (info->access_addr >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -96,12 +102,39 @@ static void 
print_error_description(struct kasan_access_info *info) break; } - pr_err("BUG: KASAN: %s in %pS at addr %p\n", - bug_type, (void *)info->ip, - info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); + return bug_type; +} + +const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +static const char *get_bug_type(struct kasan_access_info *info) +{ + if (addr_has_shadow(info)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = get_bug_type(info); + + pr_err("BUG: KASAN: %s in %pS\n", + bug_type, (void *)info->ip); + pr_err("%s of size %zu at addr %p by task %s/%d\n", + info->is_write ? "Write" : "Read", info->access_size, + info->access_addr, current->comm, task_pid_nr(current)); } static inline bool kernel_or_module_addr(const void *addr) @@ -142,9 +175,9 @@ static void kasan_end_report(unsigned long *flags) kasan_enable_current(); } -static void print_track(struct kasan_track *track) +static void print_track(struct kasan_track *track, const char *prefix) { - pr_err("PID = %u\n", track->pid); + pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { struct stack_trace trace; @@ -155,59 +188,84 @@ static void print_track(struct kasan_track *track) } } -static void kasan_object_err(struct kmem_cache *cache, void *object) +static struct page *addr_to_page(const void *addr) { - struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) + return virt_to_head_page(addr); + return NULL; +} - dump_stack(); - pr_err("Object at %p, in cache %s size: %d\n", object, cache->name, - cache->object_size); +static void describe_object_addr(struct kmem_cache *cache, void *object, + const void *addr) +{ + unsigned long access_addr = (unsigned long)addr; + unsigned long object_addr = (unsigned long)object; + const char *rel_type; + int rel_bytes; - if (!(cache->flags & SLAB_KASAN)) + pr_err("The buggy address belongs to the object at %p\n" + " which belongs to the cache %s of size %d\n", + object, cache->name, cache->object_size); + + if (!addr) return; - pr_err("Allocated:\n"); - print_track(&alloc_info->alloc_track); - pr_err("Freed:\n"); - print_track(&alloc_info->free_track); + if (access_addr < object_addr) { + rel_type = "to the left"; + rel_bytes = object_addr - access_addr; + } else if (access_addr >= object_addr + cache->object_size) { + rel_type = "to the right"; + rel_bytes = access_addr - (object_addr + cache->object_size); + } else { + rel_type = "inside"; + rel_bytes = access_addr - object_addr; + } + + pr_err("The buggy address is located %d bytes %s of\n" + " %d-byte region [%p, %p)\n", + rel_bytes, rel_type, cache->object_size, (void *)object_addr, + (void *)(object_addr + cache->object_size)); } -void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow) +static void describe_object(struct kmem_cache *cache, void *object, + const void *addr) { - unsigned long flags; + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - kasan_start_report(&flags); - 
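The describe_object_addr() helper added above reports where the bad access falls relative to the slab object ("to the left", "to the right", or "inside"). A minimal standalone sketch of just that classification, with simplified names and printing; it is not the kernel code:

#include <stdio.h>

/*
 * Classify an access address against an object [start, start + size),
 * roughly what describe_object_addr() prints in the new report layout.
 */
static void describe(unsigned long access, unsigned long start,
		     unsigned long size)
{
	const char *where;
	unsigned long bytes;

	if (access < start) {
		where = "to the left of";
		bytes = start - access;
	} else if (access >= start + size) {
		where = "to the right of";
		bytes = access - (start + size);
	} else {
		where = "inside";
		bytes = access - start;
	}
	printf("address is %lu bytes %s the %lu-byte object\n",
	       bytes, where, size);
}

int main(void)
{
	describe(0x1000 - 4, 0x1000, 64);	/* 4 bytes to the left */
	describe(0x1000 + 80, 0x1000, 64);	/* 16 bytes to the right */
	describe(0x1000 + 8, 0x1000, 64);	/* 8 bytes inside */
	return 0;
}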
pr_err("BUG: Double free or freeing an invalid pointer\n"); - pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); - kasan_object_err(cache, object); - kasan_end_report(&flags); + if (cache->flags & SLAB_KASAN) { + print_track(&alloc_info->alloc_track, "Allocated"); + pr_err("\n"); + print_track(&alloc_info->free_track, "Freed"); + pr_err("\n"); + } + + describe_object_addr(cache, object, addr); } -static void print_address_description(struct kasan_access_info *info) +static void print_address_description(void *addr) { - const void *addr = info->access_addr; + struct page *page = addr_to_page(addr); - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) { - struct page *page = virt_to_head_page(addr); - - if (PageSlab(page)) { - void *object; - struct kmem_cache *cache = page->slab_cache; - object = nearest_obj(cache, page, - (void *)info->access_addr); - kasan_object_err(cache, object); - return; - } - dump_page(page, "kasan: bad access detected"); + dump_stack(); + pr_err("\n"); + + if (page && PageSlab(page)) { + struct kmem_cache *cache = page->slab_cache; + void *object = nearest_obj(cache, page, addr); + + describe_object(cache, object, addr); } - if (kernel_or_module_addr(addr)) { - if (!init_task_stack_addr(addr)) - pr_err("Address belongs to variable %pS\n", addr); + if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { + pr_err("The buggy address belongs to the variable:\n"); + pr_err(" %pS\n", addr); + } + + if (page) { + pr_err("The buggy address belongs to the page:\n"); + dump_page(page, "kasan: bad access detected"); } - dump_stack(); } static bool row_is_guilty(const void *row, const void *guilty) @@ -262,31 +320,34 @@ static void print_shadow_for_address(const void *addr) } } +void kasan_report_double_free(struct kmem_cache *cache, void *object, + void *ip) +{ + unsigned long flags; + + kasan_start_report(&flags); + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); + pr_err("\n"); + print_address_description(object); + pr_err("\n"); + print_shadow_for_address(object); + kasan_end_report(&flags); +} + static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; - const char *bug_type; kasan_start_report(&flags); - if (info->access_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { - if ((unsigned long)info->access_addr < PAGE_SIZE) - bug_type = "null-ptr-deref"; - else if ((unsigned long)info->access_addr < TASK_SIZE) - bug_type = "user-memory-access"; - else - bug_type = "wild-memory-access"; - pr_err("BUG: KASAN: %s on address %p\n", - bug_type, info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? 
"Write" : "Read", - info->access_size, current->comm, - task_pid_nr(current)); + print_error_description(info); + pr_err("\n"); + + if (!addr_has_shadow(info)) { dump_stack(); } else { - print_error_description(info); - print_address_description(info); + print_address_description((void *)info->access_addr); + pr_err("\n"); print_shadow_for_address(info->first_bad_addr); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ba40b7f673f4..7cb9c88bb4a3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - /* 0 stands for page_is_file_cache(page) == false */ - dec_node_page_state(page, NR_ISOLATED_ANON + 0); + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); unlock_page(page); putback_lru_page(page); } @@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); /* * We can do it before isolate_lru_page because the @@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { unlock_page(page); result = SCAN_PAGE_COUNT; goto out; @@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - /* 0 stands for page_is_file_cache(page) == false */ - inc_node_page_state(page, NR_ISOLATED_ANON + 0); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -1183,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * The page must only be referenced by the scanned process * and page swap cache. 
*/ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; int search_new_forks = 0; VM_BUG_ON_PAGE(!PageKsm(page), page); @@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) stable_node = page_stable_node(page); if (!stable_node) - return ret; + return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; @@ -1978,23 +1977,20 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, - rmap_item->address, rwc->arg); - if (ret != SWAP_AGAIN) { + if (!rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } if (rwc->done && rwc->done(page)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; -out: - return ret; } #ifdef CONFIG_MIGRATION diff --git a/mm/madvise.c b/mm/madvise.c index 7a2abf0127ae..a09d2d3dfae9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, ptent = pte_mkold(ptent); ptent = pte_mkclean(ptent); set_pte_at(mm, addr, pte, ptent); - if (PageActive(page)) - deactivate_page(page); tlb_remove_tlb_entry(tlb, pte, addr); } + mark_page_lazyfree(page); } out: if (nr_swap) { @@ -651,13 +650,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_FREE: - /* - * XXX: In this implementation, MADV_FREE works like - * MADV_DONTNEED on swapless system or full swap. 
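With the madvise_vma() change above, MADV_FREE no longer falls back to MADV_DONTNEED on swapless systems; lazy-freed pages are only reclaimed under memory pressure while they stay clean, and they show up in smaps as LazyFree. A small userspace example of marking a private anonymous mapping lazy-free (error handling trimmed; MADV_FREE needs a sufficiently recent kernel and libc headers):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 << 20;	/* 64 MB */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);		/* fault the pages in */

	/* Tell the kernel it may reclaim these pages lazily. */
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");

	/*
	 * Until the pages are actually reclaimed, reads may still return
	 * the old contents; any new write cancels the lazy free for the
	 * touched pages.
	 */
	buf[0] = 1;

	munmap(buf, len);
	return 0;
}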
- */ - if (get_nr_swap_pages() > 0) - return madvise_free(vma, prev, start, end); - /* passthrough */ + return madvise_free(vma, prev, start, end); case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd7541d7c11..490d5b4676c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -104,6 +104,7 @@ static const char * const mem_cgroup_stat_names[] = { "cache", "rss", "rss_huge", + "shmem", "mapped_file", "dirty", "writeback", @@ -608,9 +609,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); - else + else { __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + if (PageSwapBacked(page)) + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], + nr_pages); + } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -5208,6 +5213,8 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + seq_printf(m, "shmem %llu\n", + (u64)stat[MEM_CGROUP_STAT_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", @@ -5476,8 +5483,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, unsigned long nr_kmem, - struct page *dummy_page) + unsigned long nr_kmem, unsigned long nr_huge, + unsigned long nr_shmem, struct page *dummy_page) { unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; @@ -5495,6 +5502,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem); __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); @@ -5507,6 +5515,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, static void uncharge_list(struct list_head *page_list) { struct mem_cgroup *memcg = NULL; + unsigned long nr_shmem = 0; unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; @@ -5539,9 +5548,9 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); - pgpgout = nr_anon = nr_file = - nr_huge = nr_kmem = 0; + nr_kmem, nr_huge, nr_shmem, page); + pgpgout = nr_anon = nr_file = nr_kmem = 0; + nr_huge = nr_shmem = 0; } memcg = page->mem_cgroup; } @@ -5555,8 +5564,11 @@ static void uncharge_list(struct list_head *page_list) } if (PageAnon(page)) nr_anon += nr_pages; - else + else { nr_file += nr_pages; + if (PageSwapBacked(page)) + nr_shmem += nr_pages; + } pgpgout++; } else { nr_kmem += 1 << compound_order(page); @@ -5568,7 +5580,7 @@ static void uncharge_list(struct list_head *page_list) if (memcg) uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); + nr_kmem, nr_huge, nr_shmem, page); } /** diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 27f7210e7fab..3d3cf6add4c1 
100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -322,7 +322,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, - int fail, struct page *page, unsigned long pfn, + bool fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -904,13 +904,13 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ -static int hwpoison_user_mappings(struct page *p, unsigned long pfn, +static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); - int ret; + bool unmap_success; int kill = 1, forcekill; struct page *hpage = *hpagep; @@ -919,20 +919,20 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * other types of pages. */ if (PageReserved(p) || PageSlab(p)) - return SWAP_SUCCESS; + return true; if (!(PageLRU(hpage) || PageHuge(p))) - return SWAP_SUCCESS; + return true; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ if (!page_mapped(hpage)) - return SWAP_SUCCESS; + return true; if (PageKsm(p)) { pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); - return SWAP_FAIL; + return false; } if (PageSwapCache(p)) { @@ -971,8 +971,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(hpage, ttu); - if (ret != SWAP_SUCCESS) + unmap_success = try_to_unmap(hpage, ttu); + if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); @@ -987,10 +987,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * any accesses to the poisoned memory. */ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); - kill_procs(&tokill, forcekill, trapno, - ret != SWAP_SUCCESS, p, pfn, flags); + kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags); - return ret; + return unmap_success; } static void set_page_hwpoison_huge_page(struct page *hpage) @@ -1230,8 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * When the raw error page is thp tail page, hpage points to the raw * page after thp split. 
*/ - if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) - != SWAP_SUCCESS) { + if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6fa7208bcd56..587eccfd4588 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1211,7 +1211,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* Reset the nr_zones, order and classzone_idx before reuse */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; } /* we can use NODE_DATA(nid) from here */ diff --git a/mm/migrate.c b/mm/migrate.c index 9a0897a14d37..410b56bdabed 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, +static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { struct page_vma_mapped_walk pvmw = { @@ -209,8 +209,11 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageTail(page), page); while (page_vma_mapped_walk(&pvmw)) { - new = page - pvmw.page->index + - linear_page_index(vma, pvmw.address); + if (PageKsm(page)) + new = page; + else + new = page - pvmw.page->index + + linear_page_index(vma, pvmw.address); get_page(new); pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); @@ -250,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, update_mmu_cache(vma, pvmw.address, pvmw.pte); } - return SWAP_AGAIN; + return true; } /* @@ -1719,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, { int z; - if (!pgdat_reclaimable(pgdat)) - return false; - for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; @@ -1944,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Prepare a page as a migration target */ __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); + if (PageSwapBacked(page)) + __SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; diff --git a/mm/mlock.c b/mm/mlock.c index 0dd9ca18e19e..c483c5c20b4b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) */ static void __munlock_isolated_page(struct page *page) { - int ret = SWAP_AGAIN; - /* * Optimization: if the page was mapped just once, that's our mapping * and we don't need to check all the other vmas. */ if (page_mapcount(page) > 1) - ret = try_to_munlock(page); + try_to_munlock(page); /* Did try_to_unlock() succeed or punt? 
*/ - if (ret != SWAP_MLOCK) + if (!PageMlocked(page)) count_vm_event(UNEVICTABLE_PGMUNLOCKED); putback_lru_page(page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d8ac2a7fb9e7..33df0583edb9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) spin_lock_init(&dom->lock); - init_timer_deferrable(&dom->period_timer); - dom->period_timer.function = writeout_period; - dom->period_timer.data = (unsigned long)dom; + setup_deferrable_timer(&dom->period_timer, writeout_period, + (unsigned long)dom); dom->dirty_limit_tstamp = jiffies; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cbde310abed..7c86a805bb45 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1090,15 +1090,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned, flags; + unsigned long flags; bool isolated_pageblocks; spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - while (count) { struct page *page; struct list_head *list; @@ -1150,13 +1146,10 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned, flags; + unsigned long flags; + spin_lock_irqsave(&zone->lock, flags); __count_vm_events(PGFREE, 1 << order); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); @@ -1844,9 +1837,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, +static int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, - int migratetype) + int migratetype, int *num_movable) { struct page *page; unsigned int order; @@ -1863,6 +1856,9 @@ int move_freepages(struct zone *zone, VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif + if (num_movable) + *num_movable = 0; + for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -1873,6 +1869,15 @@ int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!PageBuddy(page)) { + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. 
+ */ + if (num_movable && + (PageLRU(page) || __PageMovable(page))) + (*num_movable)++; + page++; continue; } @@ -1888,7 +1893,7 @@ int move_freepages(struct zone *zone, } int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int migratetype, int *num_movable) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -1905,7 +1910,8 @@ int move_freepages_block(struct zone *zone, struct page *page, if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype); + return move_freepages(zone, start_page, end_page, migratetype, + num_movable); } static void change_pageblock_range(struct page *pageblock_page, @@ -1955,28 +1961,78 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this - * pageblock and check whether half of pages are moved or not. If half of - * pages are moved, we can change migratetype of pageblock and permanently - * use it's pages as requested migratetype in the future. + * pageblock to our migratetype and determine how many already-allocated pages + * are there in the pageblock with a compatible migratetype. If at least half + * of pages are free or compatible, we can change migratetype of the pageblock + * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type) + int start_type, bool whole_block) { unsigned int current_order = page_order(page); - int pages; + struct free_area *area; + int free_pages, movable_pages, alike_pages; + int old_block_type; + + old_block_type = get_pageblock_migratetype(page); + + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ + if (is_migrate_highatomic(old_block_type)) + goto single_page; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return; + goto single_page; } - pages = move_freepages_block(zone, page, start_type); + /* We are not allowed to try stealing from the whole block */ + if (!whole_block) + goto single_page; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || + free_pages = move_freepages_block(zone, page, start_type, + &movable_pages); + /* moving whole block can fail due to zone boundary conditions */ + if (!free_pages) + goto single_page; + /* + * Determine how many pages are compatible with our allocation. + * For movable allocation, it's the number of movable pages which + * we just obtained. For other types it's a bit more tricky. + */ + if (start_type == MIGRATE_MOVABLE) { + alike_pages = movable_pages; + } else { + /* + * If we are falling back a RECLAIMABLE or UNMOVABLE allocation + * to MOVABLE pageblock, consider all non-movable pages as + * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ + if (old_block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else + alike_pages = 0; + } + + /* + * If a sufficient number of pages in the block are either free or of + * comparable migratability as our allocation, claim the whole block. 
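The new heuristic in steal_suitable_fallback() claims the whole pageblock when at least half of its pages are free or of a type compatible with the stealing allocation, so future frees land on the right free list. A toy sketch of just that threshold decision, with the pageblock size hard-coded purely for illustration:

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512	/* illustrative: 2MB pageblock, 4kB pages */

/*
 * Claim ownership of the pageblock if free pages plus already-allocated
 * pages of a compatible migratetype cover at least half of the block.
 */
static bool claim_whole_block(int free_pages, int alike_pages)
{
	return free_pages + alike_pages >= PAGEBLOCK_NR_PAGES / 2;
}

int main(void)
{
	printf("%d\n", claim_whole_block(100, 100));	/* 0: keep old type */
	printf("%d\n", claim_whole_block(200, 100));	/* 1: claim the block */
	return 0;
}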
+ */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); + + return; + +single_page: + area = &zone->free_area[current_order]; + list_move(&page->lru, &area->free_list[start_type]); } /* @@ -2042,11 +2098,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_HIGHATOMIC && - !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) + && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); } out_unlock: @@ -2100,8 +2156,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. */ - if (get_pageblock_migratetype(page) == - MIGRATE_HIGHATOMIC) { + if (is_migrate_highatomic_page(page)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2124,7 +2179,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype, + NULL); if (ret) { spin_unlock_irqrestore(&zone->lock, flags); return ret; @@ -2136,8 +2192,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, return false; } -/* Remove an element from the buddy allocator from the fallback list */ -static inline struct page * +/* + * Try finding a free buddy page on the fallback list and put it on the free + * list of requested migratetype, possibly along with other pages from the same + * block, depending on fragmentation avoidance heuristics. Returns true if + * fallback was found so that __rmqueue_smallest() can grab it. + */ +static inline bool __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; @@ -2158,33 +2219,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && - get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) - steal_suitable_fallback(zone, page, start_migratetype); - /* Remove the page from the freelists */ - area->nr_free--; - list_del(&page->lru); - rmv_page_order(page); - - expand(zone, page, order, current_order, area, - start_migratetype); - /* - * The pcppage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * find_suitable_fallback(). This is OK as long as it does not - * differ for MIGRATE_CMA pageblocks. 
Those can be used as - * fallback only via special __rmqueue_cma_fallback() function - */ - set_pcppage_migratetype(page, start_migratetype); + steal_suitable_fallback(zone, page, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return page; + return true; } - return NULL; + return false; } /* @@ -2196,13 +2241,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); - if (!page) - page = __rmqueue_fallback(zone, order, migratetype); + if (!page && __rmqueue_fallback(zone, order, migratetype)) + goto retry; } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -2373,6 +2419,13 @@ void drain_all_pages(struct zone *zone) */ static cpumask_t cpus_with_pcps; + /* + * Make sure nobody triggers this path before mm_percpu_wq is fully + * initialized. + */ + if (WARN_ON_ONCE(!mm_percpu_wq)) + return; + /* Workqueues cannot recurse */ if (current->flags & PF_WQ_WORKER) return; @@ -2422,7 +2475,7 @@ void drain_all_pages(struct zone *zone) for_each_cpu(cpu, &cpus_with_pcps) { struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); INIT_WORK(work, drain_local_pages_wq); - schedule_work_on(cpu, work); + queue_work_on(cpu, mm_percpu_wq, work); } for_each_cpu(cpu, &cpus_with_pcps) flush_work(per_cpu_ptr(&pcpu_drain, cpu)); @@ -2496,7 +2549,7 @@ void free_hot_cold_page(struct page *page, bool cold) /* * We only track unmovable, reclaimable and movable on pcp lists. * Free ISOLATE pages back to the allocator because they are being - * offlined but treat RESERVE as movable pages so we can get those + * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ @@ -2607,7 +2660,7 @@ int __isolate_free_page(struct page *page, unsigned int order) for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && mt != MIGRATE_HIGHATOMIC) + && !is_migrate_highatomic(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -3518,19 +3571,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) } /* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - -/* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. - * The reclaim feedback represented by did_some_progress (any progress during - * the last reclaim round) and no_progress_loops (number of reclaim rounds without - * any progress in a row) is considered as well as the reclaimable pages on the - * applicable zone list (with a backoff mechanism which is a function of - * no_progress_loops). + * + * We give up when we either have tried MAX_RECLAIM_RETRIES in a row + * without success, or when we couldn't even meet the watermark if we + * reclaimed all remaining pages on the LRU lists. * * Returns true if a retry is viable or false to enter the oom path. 
*/ @@ -3575,13 +3621,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, bool wmark; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP((*no_progress_loops) * available, - MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); /* - * Would the allocation succeed if we reclaimed the whole - * available? + * Would the allocation succeed if we reclaimed all + * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, ac_classzone_idx(ac), alloc_flags, available); @@ -3632,6 +3676,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; @@ -3699,12 +3744,17 @@ retry_cpuset: /* * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. Don't try - * that for allocations that are allowed to ignore watermarks, as the - * ALLOC_NO_WATERMARKS attempt didn't yet happen. + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + * Don't try this for allocations that are allowed to ignore + * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && - !gfp_pfmemalloc_allowed(gfp_mask)) { + if (can_direct_reclaim && + (costly_order || + (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) + && !gfp_pfmemalloc_allowed(gfp_mask)) { page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, @@ -3716,7 +3766,7 @@ retry_cpuset: * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations */ - if (gfp_mask & __GFP_NORETRY) { + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If compaction is deferred for high-order allocations, * it is because sync compaction recently failed. If @@ -3767,7 +3817,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, + warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; @@ -3797,7 +3847,7 @@ retry: * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_REPEAT)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -3967,10 +4017,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Runtime PM, block IO and its error handling path can deadlock - * because I/O on the device might not complete. + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. 
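The current_gfp_context() call above applies the scoped GFP_NOFS/GFP_NOIO constraints set up with memalloc_no{fs,io}_{save,restore}(): inside such a scope, __GFP_FS (and for NOIO also __GFP_IO) is stripped from every allocation. A minimal userspace sketch of that masking; the X_* names are stand-ins, not the kernel's flag values:

#include <stdio.h>

/* Illustrative stand-ins; not the kernel's actual flag values. */
#define X_GFP_IO	 0x1u
#define X_GFP_FS	 0x2u
#define X_GFP_KERNEL	 (X_GFP_IO | X_GFP_FS | 0x4u)

#define X_PF_MEMALLOC_NOIO 0x1u
#define X_PF_MEMALLOC_NOFS 0x2u

/*
 * Shape of current_gfp_context(): if the task has entered a NOIO or NOFS
 * scope, strip the matching reclaim capabilities from every allocation
 * performed inside that scope.
 */
static unsigned int gfp_context(unsigned int task_flags, unsigned int gfp)
{
	if (task_flags & X_PF_MEMALLOC_NOIO)
		gfp &= ~(X_GFP_IO | X_GFP_FS);
	else if (task_flags & X_PF_MEMALLOC_NOFS)
		gfp &= ~X_GFP_FS;
	return gfp;
}

int main(void)
{
	printf("%#x\n", gfp_context(0, X_GFP_KERNEL));
	printf("%#x\n", gfp_context(X_PF_MEMALLOC_NOFS, X_GFP_KERNEL));
	printf("%#x\n", gfp_context(X_PF_MEMALLOC_NOIO, X_GFP_KERNEL));
	return 0;
}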
*/ - alloc_mask = memalloc_noio_flags(gfp_mask); + alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -4505,7 +4557,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif " writeback_tmp:%lukB" " unstable:%lukB" - " pages_scanned:%lu" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -4528,8 +4579,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_SHMEM)), K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), - node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? "yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { @@ -5738,6 +5789,11 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); + + /* If this node has no page within this zone, return 0. */ + if (zone_start_pfn == zone_end_pfn) + return 0; + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); /* @@ -7126,6 +7182,17 @@ static unsigned long __init arch_reserved_kernel_pages(void) #endif /* + * Adaptive scale is meant to reduce sizes of hash tables on large memory + * machines. As memory size is increased the scale is also increased but at + * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory + * quadruples the scale is increased by one, which means the size of hash table + * only doubles, instead of quadrupling as well. + */ +#define ADAPT_SCALE_BASE (64ul << 30) +#define ADAPT_SCALE_SHIFT 2 +#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) + +/* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 * quantity of entries @@ -7144,6 +7211,7 @@ void *__init alloc_large_system_hash(const char *tablename, unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; + gfp_t gfp_flags; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -7155,6 +7223,14 @@ void *__init alloc_large_system_hash(const char *tablename, if (PAGE_SHIFT < 20) numentries = round_up(numentries, (1<<20)/PAGE_SIZE); + if (flags & HASH_ADAPT) { + unsigned long adapt; + + for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; + adapt <<= ADAPT_SCALE_SHIFT) + scale++; + } + /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) numentries >>= (scale - PAGE_SHIFT); @@ -7188,12 +7264,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); + /* + * memblock allocator returns zeroed memory already, so HASH_ZERO is + * currently not used when HASH_EARLY is specified. + */ + gfp_flags = (flags & HASH_ZERO) ? 
GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; if (flags & HASH_EARLY) table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) - table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags, PAGE_KERNEL); else { /* * If bucketsize is not a power-of-two, we may free @@ -7201,8 +7282,8 @@ void *__init alloc_large_system_hash(const char *tablename, * alloc_pages_exact() automatically does */ if (get_order(size) < MAX_ORDER) { - table = alloc_pages_exact(size, GFP_ATOMIC); - kmemleak_alloc(table, size, 1, GFP_ATOMIC); + table = alloc_pages_exact(size, gfp_flags); + kmemleak_alloc(table, size, 1, gfp_flags); } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -7424,7 +7505,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = memalloc_noio_flags(gfp_mask), + .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/page_idle.c b/mm/page_idle.c index b0ee56c56b58..1b0f48c62316 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn) return page; } -static int page_idle_clear_pte_refs_one(struct page *page, +static bool page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { @@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page, */ set_page_young(page); } - return SWAP_AGAIN; + return true; } static void page_idle_clear_pte_refs(struct page *page) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f4e17a57926a..5092e4ef00c8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -66,7 +66,8 @@ out: set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, + NULL); __mod_zone_freepage_state(zone, -nr_pages, migratetype); } @@ -88,7 +89,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!is_migrate_isolate_page(page)) goto out; /* @@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * pageblock scanning for freepage moving. 
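Returning briefly to the HASH_ADAPT scaling added to alloc_large_system_hash() above: each quadrupling of memory beyond ADAPT_SCALE_BASE (64GB) adds one to the scale, so the hash table only doubles per step instead of quadrupling. A userspace sketch of that loop, with PAGE_SHIFT hard-coded for illustration:

#include <stdio.h>

#define PAGE_SHIFT	   12
#define ADAPT_SCALE_BASE   (64ull << 30)		/* 64 GB */
#define ADAPT_SCALE_SHIFT  2
#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)

/*
 * Mirror of the HASH_ADAPT loop: count how many times memory has
 * quadrupled past 64GB; each step bumps the scale by one.
 */
static int extra_scale(unsigned long long nr_pages)
{
	unsigned long long adapt;
	int scale = 0;

	for (adapt = ADAPT_SCALE_NPAGES; adapt < nr_pages;
	     adapt <<= ADAPT_SCALE_SHIFT)
		scale++;
	return scale;
}

int main(void)
{
	unsigned long long gb = 1ull << 30;
	unsigned long long sizes_gb[] = { 32, 64, 256, 1024, 4096 };

	for (unsigned i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++)
		printf("%llu GB -> +%d scale\n", sizes_gb[i],
		       extra_scale((sizes_gb[i] * gb) >> PAGE_SHIFT));
	return 0;
}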
*/ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); @@ -205,7 +206,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!page || !is_migrate_isolate_page(page)) continue; unset_migratetype_isolate(page, migratetype); } @@ -262,7 +263,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (page && !is_migrate_isolate_page(page)) break; } page = __first_valid_page(start_pfn, end_pfn - start_pfn); diff --git a/mm/page_owner.c b/mm/page_owner.c index 60634dc53a88..c3cee247f2e6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } @@ -527,7 +527,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } diff --git a/mm/rmap.c b/mm/rmap.c index 8ffd59df8a3f..d26f9a036159 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -724,7 +724,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -static int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_referenced_arg *pra = arg; @@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ + return false; /* To break the loop */ } if (pvmw.pte) { @@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (!pra->mapcount) - return SWAP_SUCCESS; /* To break the loop */ + return false; /* To break the loop */ - return SWAP_AGAIN; + return true; } static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) @@ -812,7 +812,6 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int ret; int we_locked = 0; struct page_referenced_arg pra = { .mapcount = total_mapcount(page), @@ -846,7 +845,7 @@ int page_referenced(struct page *page, rwc.invalid_vma = invalid_page_referenced_vma; } - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); *vm_flags = pra.vm_flags; if (we_locked) @@ -855,7 +854,7 @@ int page_referenced(struct page *page, return pra.referenced; } -static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, +static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_vma_mapped_walk pvmw = { @@ -908,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } } - return SWAP_AGAIN; + return true; } static bool 
invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) @@ -1288,15 +1287,10 @@ void page_remove_rmap(struct page *page, bool compound) */ } -struct rmap_private { - enum ttu_flags flags; - int lazyfreed; -}; - /* * @arg: enum ttu_flags will be passed to this argument */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1307,13 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, }; pte_t pteval; struct page *subpage; - int ret = SWAP_AGAIN; - struct rmap_private *rp = arg; - enum ttu_flags flags = rp->flags; + bool ret = true; + enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - return SWAP_AGAIN; + return true; if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, @@ -1336,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_MLOCK; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1354,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1424,18 +1417,34 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { + WARN_ON_ONCE(1); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + + /* MADV_FREE page check */ + if (!PageSwapBacked(page)) { + if (!PageDirty(page)) { + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } - if (!PageDirty(page) && (flags & TTU_LZFREE)) { - /* It's a freeable page by MADV_FREE */ - dec_mm_counter(mm, MM_ANONPAGES); - rp->lazyfreed++; - goto discard; + /* + * If the page was redirtied, it cannot be + * discarded. Remap the page to page table. + */ + set_pte_at(mm, address, pvmw.pte, pteval); + SetPageSwapBacked(page); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1492,24 +1501,14 @@ static int page_mapcount_is_zero(struct page *page) * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. - * Return values are: * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a mapping, try again later - * SWAP_FAIL - the page is unswappable - * SWAP_MLOCK - page is mlocked. + * If unmap is successful, return true. Otherwise, false. 
*/ -int try_to_unmap(struct page *page, enum ttu_flags flags) +bool try_to_unmap(struct page *page, enum ttu_flags flags) { - int ret; - struct rmap_private rp = { - .flags = flags, - .lazyfreed = 0, - }; - struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)flags, .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; @@ -1526,16 +1525,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - ret = rmap_walk_locked(page, &rwc); + rmap_walk_locked(page, &rwc); else - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapcount(page)) { - ret = SWAP_SUCCESS; - if (rp.lazyfreed && !PageDirty(page)) - ret = SWAP_LZFREE; - } - return ret; + return !page_mapcount(page) ? true : false; } static int page_not_mapped(struct page *page) @@ -1550,34 +1544,23 @@ static int page_not_mapped(struct page *page) * Called from munlock code. Checks all of the VMAs mapping the page * to make sure nobody else has this page mlocked. The page will be * returned with PG_mlocked cleared if no other vmas have it mlocked. - * - * Return values are: - * - * SWAP_AGAIN - no vma is holding page mlocked, or, - * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem - * SWAP_FAIL - page cannot be located at present - * SWAP_MLOCK - page is now mlocked. */ -int try_to_munlock(struct page *page) -{ - int ret; - struct rmap_private rp = { - .flags = TTU_MUNLOCK, - .lazyfreed = 0, - }; +void try_to_munlock(struct page *page) +{ struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); + VM_BUG_ON_PAGE(PageMlocked(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - ret = rmap_walk(page, &rwc); - return ret; + rmap_walk(page, &rwc); } void __put_anon_vma(struct anon_vma *anon_vma) @@ -1625,13 +1608,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; if (locked) { anon_vma = page_anon_vma(page); @@ -1641,7 +1623,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, anon_vma = rmap_walk_anon_lock(page, rwc); } if (!anon_vma) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1655,8 +1637,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1664,7 +1645,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (!locked) anon_vma_unlock_read(anon_vma); - return ret; } /* @@ -1680,13 +1660,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. 
*/ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = page_mapping(page); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; - int ret = SWAP_AGAIN; /* * The page lock not only makes sure that page->mapping cannot @@ -1697,7 +1676,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1712,8 +1691,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; @@ -1722,28 +1700,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, done: if (!locked) i_mmap_unlock_read(mapping); - return ret; } -int rmap_walk(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk(struct page *page, struct rmap_walk_control *rwc) { if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rwc); + rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc, false); + rmap_walk_anon(page, rwc, false); else - return rmap_walk_file(page, rwc, false); + rmap_walk_file(page, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_PAGE(PageKsm(page), page); if (PageAnon(page)) - return rmap_walk_anon(page, rwc, true); + rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc, true); + rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 0fd21670b513..6bb4deb12e78 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,11 +9,12 @@ * as published by the Free Software Foundation; version 2 * of the License. 
*/ +#define pr_fmt(fmt) "rodata_test: " fmt + #include <linux/uaccess.h> #include <asm/sections.h> const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); void rodata_test(void) { @@ -23,20 +24,20 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ if (!rodata_test_data) { - pr_err("rodata_test: test 1 fails (start data)\n"); + pr_err("test 1 fails (start data)\n"); return; } /* test 2: write to the variable; this should fault */ if (!probe_kernel_write((void *)&rodata_test_data, - (void *)&zero, sizeof(zero))) { - pr_err("rodata_test: test data was not read only\n"); + (void *)&zero, sizeof(zero))) { + pr_err("test data was not read only\n"); return; } /* test 3: check the value hasn't changed */ if (rodata_test_data == zero) { - pr_err("rodata_test: test data was changed\n"); + pr_err("test data was changed\n"); return; } @@ -44,13 +45,13 @@ void rodata_test(void) start = (unsigned long)__start_rodata; end = (unsigned long)__end_rodata; if (start & (PAGE_SIZE - 1)) { - pr_err("rodata_test: start of .rodata is not page size aligned\n"); + pr_err("start of .rodata is not page size aligned\n"); return; } if (end & (PAGE_SIZE - 1)) { - pr_err("rodata_test: end of .rodata is not page size aligned\n"); + pr_err("end of .rodata is not page size aligned\n"); return; } - pr_info("rodata_test: all tests were successful\n"); + pr_info("all tests were successful\n"); } diff --git a/mm/sparse.c b/mm/sparse.c index db6bf3c97ea2..6903c8fc3085 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, unsigned long usemap_size(void) { - unsigned long size_bytes; - size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; - size_bytes = roundup(size_bytes, sizeof(unsigned long)); - return size_bytes; + return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/swap.c b/mm/swap.c index c4910f14f957..361bdb1575ab 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -46,7 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); #endif @@ -561,20 +561,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + bool active = PageActive(page); - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + del_page_from_lru_list(page, lruvec, + LRU_INACTIVE_ANON + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); + /* + * lazyfree pages are clean anonymous pages. 
They have + * SwapBacked flag cleared to distinguish normal anonymous + * pages + */ + ClearPageSwapBacked(page); + add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); - __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); + __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, 1, 0); } } @@ -604,9 +611,9 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); activate_page_drain(cpu); } @@ -638,22 +645,22 @@ void deactivate_file_page(struct page *page) } /** - * deactivate_page - deactivate a page + * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate * - * deactivate_page() moves @page to the inactive list if @page was on the active - * list and was not an unevictable page. This is done to accelerate the reclaim - * of @page. + * mark_page_lazyfree() moves @page to the inactive file list. + * This is done to accelerate the reclaim of @page. */ -void deactivate_page(struct page *page) +void mark_page_lazyfree(struct page *page) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); + put_cpu_var(lru_lazyfree_pvecs); } } @@ -670,30 +677,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -/* - * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM - * workqueue, aiding in getting memory freed. - */ -static struct workqueue_struct *lru_add_drain_wq; - -static int __init lru_init(void) -{ - lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0); - - if (WARN(!lru_add_drain_wq, - "Failed to create workqueue lru_add_drain_wq")) - return -ENOMEM; - - return 0; -} -early_initcall(lru_init); - void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; int cpu; + /* + * Make sure nobody triggers this path before mm_percpu_wq is fully + * initialized. 
+ */ + if (WARN_ON(!mm_percpu_wq)) + return; + mutex_lock(&lock); get_online_cpus(); cpumask_clear(&has_work); @@ -704,10 +700,10 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); - queue_work_on(cpu, lru_add_drain_wq, work); + queue_work_on(cpu, mm_percpu_wq, work); cpumask_set_cpu(cpu, &has_work); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index 473b71e052a8..7bfb9bd1ca21 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet, while - * the other end is scheduled away waiting on discard - * I/O completion at scan_swap_map(). - * - * In order to avoid turning this transitory state - * into a permanent loop around this -EEXIST case - * if !CONFIG_PREEMPT and the I/O completion happens - * to be waiting on the CPU waitqueue where we are now - * busy looping, we just conditionally invoke the - * scheduler here, if there are some more important - * tasks to run. + * has not been brought into the swapcache yet. */ cond_resched(); continue; diff --git a/mm/swapfile.c b/mm/swapfile.c index 178130880b90..53b5881ee0d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, ci_tail = ci + tail; spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); cluster_set_next(ci_tail, idx); - unlock_cluster(ci_tail); + spin_unlock(&ci_tail->lock); cluster_set_next_flag(&list->tail, idx, 0); } } @@ -672,6 +672,9 @@ checks: else goto done; } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); if (offset == si->lowest_bit) si->lowest_bit++; @@ -685,9 +688,6 @@ checks: plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); } - si->swap_map[offset] = usage; - inc_cluster_info_page(si, si->cluster_info, offset); - unlock_cluster(ci); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); @@ -1111,6 +1111,18 @@ int page_swapcount(struct page *page) return count; } +static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +{ + int count = 0; + pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci; + + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); + return count; +} + /* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, @@ -1119,17 +1131,11 @@ int page_swapcount(struct page *page) int __swp_swapcount(swp_entry_t entry) { int count = 0; - pgoff_t offset; struct swap_info_struct *si; - struct swap_cluster_info *ci; si = __swap_info_get(entry); - if (si) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); - count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); - } + if (si) + count = swap_swapcount(si, entry); return count; } @@ -1291,7 +1297,8 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). 
*/ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page))) { + (!page_mapped(page) || mem_cgroup_swap_full(page)) && + !swap_swapcount(p, entry)) { delete_from_swap_cache(page); SetPageDirty(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index bc8031ef994d..07507c5971e7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -100,6 +100,9 @@ struct scan_control { /* Can cgroups be reclaimed below their normal consumption range? */ unsigned int may_thrash:1; + /* Did we have any memcg protected by the low limit */ + unsigned int memcg_low_protection:1; + unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ @@ -230,12 +233,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) return nr; } -bool pgdat_reclaimable(struct pglist_data *pgdat) -{ - return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < - pgdat_reclaimable_pages(pgdat) * 6; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector @@ -912,7 +909,8 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ - if (!page_is_file_cache(page)) { + if (!page_is_file_cache(page) || + (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; return; @@ -972,8 +970,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - bool lazyfree = false; - int ret = SWAP_SUCCESS; cond_resched(); @@ -988,13 +984,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; if (unlikely(!page_evictable(page))) - goto cull_mlocked; + goto activate_locked; if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) + if ((page_mapped(page) || PageSwapCache(page)) && + !(PageAnon(page) && !PageSwapBacked(page))) sc->nr_scanned++; may_enter_fs = (sc->gfp_mask & __GFP_FS) || @@ -1120,13 +1117,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. + * Lazyfree page could be freed directly */ - if (PageAnon(page) && !PageSwapCache(page)) { + if (PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (!add_to_swap(page, page_list)) goto activate_locked; - lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1143,21 +1141,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page) && mapping) { - switch (ret = try_to_unmap(page, lazyfree ? 
- (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : - (ttu_flags | TTU_BATCH_FLUSH))) { - case SWAP_FAIL: + if (page_mapped(page)) { + if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { nr_unmap_fail++; goto activate_locked; - case SWAP_AGAIN: - goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; - case SWAP_LZFREE: - goto lazyfree; - case SWAP_SUCCESS: - ; /* try to free the page below */ } } @@ -1267,10 +1254,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } -lazyfree: - if (!mapping || !__remove_mapping(mapping, page, true)) - goto keep_locked; + if (PageAnon(page) && !PageSwapBacked(page)) { + /* follow __remove_mapping for reference */ + if (!page_ref_freeze(page, 1)) + goto keep_locked; + if (PageDirty(page)) { + page_ref_unfreeze(page, 1); + goto keep_locked; + } + count_vm_event(PGLAZYFREED); + } else if (!mapping || !__remove_mapping(mapping, page, true)) + goto keep_locked; /* * At this point, we have no other references and there is * no way to pick any more up (removed from LRU, removed @@ -1280,9 +1275,6 @@ lazyfree: */ __ClearPageLocked(page); free_it: - if (ret == SWAP_LZFREE) - count_vm_event(PGLAZYFREED); - nr_reclaimed++; /* @@ -1292,20 +1284,16 @@ free_it: list_add(&page->lru, &free_pages); continue; -cull_mlocked: - if (PageSwapCache(page)) - try_to_free_swap(page); - unlock_page(page); - list_add(&page->lru, &ret_pages); - continue; - activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && mem_cgroup_swap_full(page)) + if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || + PageMlocked(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); - SetPageActive(page); - pgactivate++; + if (!PageMlocked(page)) { + SetPageActive(page); + pgactivate++; + } keep_locked: unlock_page(page); keep: @@ -1354,7 +1342,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); + TTU_IGNORE_ACCESS, NULL, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1478,12 +1466,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0, total_skipped = 0; + unsigned long skipped = 0; unsigned long scan, nr_pages; LIST_HEAD(pages_skipped); for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src);) { + !list_empty(src); scan++) { struct page *page; page = lru_to_page(src); @@ -1497,12 +1485,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; } - /* - * Account for scanned and skipped separetly to avoid the pgdat - * being prematurely marked unreclaimable by pgdat_reclaimable. - */ - scan++; - switch (__isolate_lru_page(page, mode)) { case 0: nr_pages = hpage_nr_pages(page); @@ -1531,6 +1513,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (!list_empty(&pages_skipped)) { int zid; + list_splice(&pages_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; @@ -1538,17 +1521,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } - - /* - * Account skipped pages as a partial scan as the pgdat may be - * close to unreclaimable. 
If the LRU list is empty, account - * skipped pages as a full scan. - */ - total_skipped = list_empty(src) ? skipped : skipped >> 2; - - list_splice(&pages_skipped, src); } - *nr_scanned = scan + total_skipped; + *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); @@ -1750,7 +1724,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else @@ -1761,7 +1734,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false); spin_lock_irq(&pgdat->lru_lock); @@ -1953,8 +1926,6 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); __count_vm_events(PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -2123,30 +2094,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; - bool force_scan = false; unsigned long ap, fp; enum lru_list lru; - bool some_scanned; - int pass; - - /* - * If the zone or memcg is small, nr[l] can be 0. This - * results in no scanning on this priority and a potential - * priority drop. Global direct reclaim can go to the next - * zone and tends to have no problems. Global kswapd is for - * zone balancing and it needs to scan a minimum amount. When - * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount there as - * well. - */ - if (current_is_kswapd()) { - if (!pgdat_reclaimable(pgdat)) - force_scan = true; - if (!mem_cgroup_online(memcg)) - force_scan = true; - } - if (!global_reclaim(sc)) - force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { @@ -2277,55 +2226,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fraction[1] = fp; denominator = ap + fp + 1; out: - some_scanned = false; - /* Only use force_scan on second pass. */ - for (pass = 0; !some_scanned && pass < 2; pass++) { - *lru_pages = 0; - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; - - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; - - if (!scan && pass && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); - - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: - /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. 
- */ - scan = div64_u64(scan * fraction[file], - denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) { - size = 0; - scan = 0; - } - break; - default: - /* Look ma, no brain */ - BUG(); - } + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - *lru_pages += size; - nr[lru] = scan; + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = size >> sc->priority; + /* + * If the cgroup's already been deleted, make sure to + * scrape out the remaining cache. + */ + if (!scan && !mem_cgroup_online(memcg)) + scan = min(size, SWAP_CLUSTER_MAX); + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: /* - * Skip the second pass and don't force_scan, - * if we found something to scan. + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. */ - some_scanned |= !!scan; + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) { + size = 0; + scan = 0; + } + break; + default: + /* Look ma, no brain */ + BUG(); } + + *lru_pages += size; + nr[lru] = scan; } } @@ -2557,6 +2499,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long scanned; if (mem_cgroup_low(root, memcg)) { + sc->memcg_low_protection = 1; + if (!sc->may_thrash) continue; mem_cgroup_events(memcg, MEMCG_LOW, 1); @@ -2604,22 +2548,38 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - nr_scanned, node_lru_pages); + /* + * Record the subtree's reclaim efficiency. The reclaimed + * pages from slab is excluded here because the corresponding + * scanned pages is not accounted. Moreover, freeing a page + * by slab shrinking depends on each slab's object population, + * making the cost model (i.e. scan:free) different from that + * of LRU. + */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); - if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2694,10 +2654,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. @@ -2808,7 +2764,7 @@ retry: return 1; /* Untapped cgroup reserves? Don't OOM, retry. 
*/ - if (!sc->may_thrash) { + if (sc->memcg_low_protection && !sc->may_thrash) { sc->priority = initial_priority; sc->may_thrash = 1; goto retry; @@ -2817,7 +2773,7 @@ retry: return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2825,10 +2781,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!managed_zone(zone) || - pgdat_reclaimable_pages(pgdat) == 0) + if (!managed_zone(zone)) + continue; + + if (!zone_reclaimable_pages(zone)) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2905,7 +2866,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2927,14 +2888,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -2950,7 +2911,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -3030,7 +2991,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, int nid; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, @@ -3084,22 +3045,44 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, int classzone_idx) +/* + * Returns true if there is an eligible zone balanced for the request order + * and classzone_idx + */ +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { - unsigned long mark = high_wmark_pages(zone); + int i; + unsigned long mark = -1; + struct zone *zone; - if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) - return false; + for (i = 0; i <= classzone_idx; i++) { + zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + mark = high_wmark_pages(zone); + if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + return true; + } /* - * If any eligible zone is balanced then the node is not considered - * to be congested or dirty + * If a node has no populated zone within classzone_idx, it does not + * need balancing by definition. This can happen if a zone-restricted + * allocation tries to wake a remote kswapd. 
*/ - clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); - clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); - clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); + if (mark == -1) + return true; - return true; + return false; +} + +/* Clear pgdat state for congested, dirty or under writeback. */ +static void clear_pgdat_congested(pg_data_t *pgdat) +{ + clear_bit(PGDAT_CONGESTED, &pgdat->flags); + clear_bit(PGDAT_DIRTY, &pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } /* @@ -3110,11 +3093,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) { - int i; - /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3128,17 +3109,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; - if (!zone_balanced(zone, order, classzone_idx)) - return false; + if (pgdat_balanced(pgdat, order, classzone_idx)) { + clear_pgdat_congested(pgdat); + return true; } - return true; + return false; } /* @@ -3214,9 +3194,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = classzone_idx; /* @@ -3241,23 +3221,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) } /* - * Only reclaim if there are no eligible zones. Check from - * high to low zone as allocations prefer higher zones. - * Scanning from low to high zone would allow congestion to be - * cleared during a very small window when a small low - * zone was balanced even under extreme pressure when the - * overall node may be congested. Note that sc.reclaim_idx - * is not used as buffer_heads_over_limit may have adjusted - * it. + * Only reclaim if there are no eligible zones. Note that + * sc.reclaim_idx is not used as buffer_heads_over_limit may + * have adjusted it. */ - for (i = classzone_idx; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, sc.order, classzone_idx)) - goto out; - } + if (pgdat_balanced(pgdat, sc.order, classzone_idx)) + goto out; /* * Do some background aging of the anon list, to give @@ -3271,7 +3240,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) + if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ @@ -3295,7 +3264,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. 
Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ @@ -3306,10 +3275,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: /* * Return the order kswapd stopped reclaiming at as @@ -3320,6 +3293,22 @@ out: return sc.order; } +/* + * pgdat->kswapd_classzone_idx is the highest zone index that a recent + * allocation request woke kswapd for. When kswapd has not woken recently, + * the value is MAX_NR_ZONES which is not a valid index. This compares a + * given classzone and returns it or the highest classzone index kswapd + * was recently woke for. + */ +static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, + enum zone_type classzone_idx) +{ + if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) + return classzone_idx; + + return max(pgdat->kswapd_classzone_idx, classzone_idx); +} + static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int classzone_idx) { @@ -3331,7 +3320,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - /* Try to sleep for a short interval */ + /* + * Try to sleep for a short interval. Note that kcompactd will only be + * woken if it is possible to sleep for a short interval. This is + * deliberate on the assumption that if reclaim cannot keep an + * eligible zone balanced that it's also unlikely that compaction will + * succeed. + */ if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* * Compaction records what page blocks it recently failed to @@ -3355,7 +3350,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * the previous request that slept prematurely. 
*/ if (remaining) { - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); } @@ -3409,7 +3404,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ static int kswapd(void *p) { - unsigned int alloc_order, reclaim_order, classzone_idx; + unsigned int alloc_order, reclaim_order; + unsigned int classzone_idx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -3439,20 +3435,23 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - pgdat->kswapd_order = alloc_order = reclaim_order = 0; - pgdat->kswapd_classzone_idx = classzone_idx = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; for ( ; ; ) { bool ret; + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, classzone_idx); /* Read the new order and classzone_idx */ alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; + classzone_idx = kswapd_classzone_idx(pgdat, 0); pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; ret = try_to_freeze(); if (kthread_should_stop()) @@ -3478,9 +3477,6 @@ kswapd_try_sleep: reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; - - alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); @@ -3496,7 +3492,6 @@ kswapd_try_sleep: void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; - int z; if (!managed_zone(zone)) return; @@ -3504,22 +3499,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, + classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; - /* Only wake kswapd if all zones are unbalanced */ - for (z = 0; z <= classzone_idx; z++) { - zone = pgdat->node_zones + z; - if (!managed_zone(zone)) - continue; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; - if (zone_balanced(zone, order, classzone_idx)) - return; - } + if (pgdat_balanced(pgdat, order, classzone_idx)) + return; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3725,7 +3718,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), @@ -3779,9 +3772,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t 
gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index b1947f0cbee2..757be8303aa0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -954,7 +954,6 @@ const char * const vmstat_text[] = { "nr_unevictable", "nr_isolated_anon", "nr_isolated_file", - "nr_pages_scanned", "workingset_refault", "workingset_activate", "workingset_nodereclaim", @@ -992,6 +991,7 @@ const char * const vmstat_text[] = { "pgfree", "pgactivate", "pgdeactivate", + "pglazyfree", "pgfault", "pgmajfault", @@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg) { } -/* Walk all the zones in a node and print using a callback */ +/* + * Walk zones in a node and print using a callback. + * If @assert_populated is true, only use callback for zones that are populated. + */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + bool assert_populated, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, unsigned long flags; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) + if (assert_populated && !populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, frag_show_print); + walk_zones_in_node(m, pgdat, true, frag_show_print); return 0; } @@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print); return 0; } @@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print); return 0; } @@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1378,7 +1382,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1386,23 +1389,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], - zone_page_state(zone, i)); - seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) seq_printf(m, ", %ld", 
zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); + seq_putc(m, ')'); + + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; @@ -1425,19 +1433,22 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n node_unreclaimable: %u" "\n start_pfn: %lu" "\n node_inactive_ratio: %u", - !pgdat_reclaimable(zone->zone_pgdat), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn, zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); } /* - * Output information about zones in @pgdat. + * Output information about zones in @pgdat. All zones are printed regardless + * of whether they are populated or not: lowmem_reserve_ratio operates on the + * set of all zones and userspace would not be aware of such zones if they are + * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio). */ static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); return 0; } @@ -1552,7 +1563,6 @@ static const struct file_operations proc_vmstat_file_operations = { #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP -static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; @@ -1587,22 +1597,9 @@ int vmstat_refresh(struct ctl_table *table, int write, for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { - switch (i) { - case NR_PAGES_SCANNED: - /* - * This is often seen to go negative in - * recent kernels, but not to go permanently - * negative. Whilst it would be nicer not to - * have exceptions, rooting them out would be - * another task, of rather low priority. - */ - break; - default: - pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i], val); - err = -EINVAL; - break; - } + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; } } if (err) @@ -1623,7 +1620,7 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. Keep on running the * update worker thread. 
*/ - queue_delayed_work_on(smp_processor_id(), vmstat_wq, + queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } @@ -1702,7 +1699,7 @@ static void vmstat_shepherd(struct work_struct *w) struct delayed_work *dw = &per_cpu(vmstat_work, cpu); if (!delayed_work_pending(dw) && need_update(cpu)) - queue_delayed_work_on(cpu, vmstat_wq, dw, 0); + queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } put_online_cpus(); @@ -1718,7 +1715,6 @@ static void __init start_shepherd_timer(void) INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); - vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } @@ -1764,11 +1760,16 @@ static int vmstat_cpu_dead(unsigned int cpu) #endif -static int __init setup_vmstat(void) +struct workqueue_struct *mm_percpu_wq; + +void __init init_mm_internals(void) { -#ifdef CONFIG_SMP - int ret; + int ret __maybe_unused; + + mm_percpu_wq = alloc_workqueue("mm_percpu_wq", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); +#ifdef CONFIG_SMP ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", NULL, vmstat_cpu_dead); if (ret < 0) @@ -1792,9 +1793,7 @@ static int __init setup_vmstat(void) proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #endif - return 0; } -module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) @@ -1856,7 +1855,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, unusable_show_print); + walk_zones_in_node(m, pgdat, true, unusable_show_print); return 0; } @@ -1908,7 +1907,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, extfrag_show_print); + walk_zones_in_node(m, pgdat, true, extfrag_show_print); return 0; } diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index baa3c7be04ad..832e8150dba3 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2757,13 +2757,6 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } -# discourage the addition of CONFIG_EXPERIMENTAL in Kconfig. - if ($realfile =~ /Kconfig/ && - $line =~ /.\s*depends on\s+.*\bEXPERIMENTAL\b/) { - WARN("CONFIG_EXPERIMENTAL", - "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); - } - # discourage the use of boolean for type definition attributes of Kconfig options if ($realfile =~ /Kconfig/ && $line =~ /^\+\s*\bboolean\b/) { @@ -3133,6 +3126,17 @@ sub process { # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# check if this appears to be the start function declaration, save the name + if ($sline =~ /^\+\{\s*$/ && + $prevline =~ /^\+(?:(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*)?($Ident)\(/) { + $context_function = $1; + } + +# check if this appears to be the end of function declaration + if ($sline =~ /^\+\}\s*$/) { + undef $context_function; + } + # check indentation of any line with a bare else # (but not if it is a multiple line "if (foo) return bar; else return baz;") # if the previous line is a break or return and is indented 1 tab more... @@ -3157,12 +3161,6 @@ sub process { } } -# discourage the addition of CONFIG_EXPERIMENTAL in #if(def). 
- if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { - WARN("CONFIG_EXPERIMENTAL", - "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); - } - # check for RCS/CVS revision markers if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) { WARN("CVS_KEYWORD", @@ -5676,6 +5674,32 @@ sub process { } } + # check for vsprintf extension %p<foo> misuses + if ($^V && $^V ge 5.10.0 && + defined $stat && + $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s && + $1 !~ /^_*volatile_*$/) { + my $bad_extension = ""; + my $lc = $stat =~ tr@\n@@; + $lc = $lc + $linenr; + for (my $count = $linenr; $count <= $lc; $count++) { + my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0)); + $fmt =~ s/%%//g; + if ($fmt =~ /(\%[\*\d\.]*p(?![\WFfSsBKRraEhMmIiUDdgVCbGN]).)/) { + $bad_extension = $1; + last; + } + } + if ($bad_extension ne "") { + my $stat_real = raw_line($linenr, 0); + for (my $count = $linenr + 1; $count <= $lc; $count++) { + $stat_real = $stat_real . "\n" . raw_line($count, 0); + } + WARN("VSPRINTF_POINTER_EXTENSION", + "Invalid vsprintf pointer extension '$bad_extension'\n" . "$here\n$stat_real\n"); + } + } + # Check for misused memsets if ($^V && $^V ge 5.10.0 && defined $stat && diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 7986f4e0da12..7aad82406422 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -14,6 +14,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/of_fdt.h> /* We need to stringify expanded macros so that they can be parsed */ @@ -50,3 +51,9 @@ LX_VALUE(MNT_NOEXEC) LX_VALUE(MNT_NOATIME) LX_VALUE(MNT_NODIRATIME) LX_VALUE(MNT_RELATIME) + +/* linux/of_fdt.h> */ +LX_VALUE(OF_DT_HEADER) + +/* Kernel Configs */ +LX_CONFIG(CONFIG_OF) diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 38b1f09d1cd9..086d27223c0c 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -16,6 +16,7 @@ from linux import constants from linux import utils from linux import tasks from linux import lists +from struct import * class LxCmdLine(gdb.Command): @@ -195,3 +196,75 @@ values of that process namespace""" info_opts(MNT_INFO, m_flags))) LxMounts() + + +class LxFdtDump(gdb.Command): + """Output Flattened Device Tree header and dump FDT blob to the filename + specified as the command argument. 
Equivalent to + 'cat /proc/fdt > fdtdump.dtb' on a running target""" + + def __init__(self): + super(LxFdtDump, self).__init__("lx-fdtdump", gdb.COMMAND_DATA, + gdb.COMPLETE_FILENAME) + + def fdthdr_to_cpu(self, fdt_header): + + fdt_header_be = ">IIIIIII" + fdt_header_le = "<IIIIIII" + + if utils.get_target_endianness() == 1: + output_fmt = fdt_header_le + else: + output_fmt = fdt_header_be + + return unpack(output_fmt, pack(fdt_header_be, + fdt_header['magic'], + fdt_header['totalsize'], + fdt_header['off_dt_struct'], + fdt_header['off_dt_strings'], + fdt_header['off_mem_rsvmap'], + fdt_header['version'], + fdt_header['last_comp_version'])) + + def invoke(self, arg, from_tty): + + if not constants.LX_CONFIG_OF: + raise gdb.GdbError("Kernel not compiled with CONFIG_OF\n") + + if len(arg) == 0: + filename = "fdtdump.dtb" + else: + filename = arg + + py_fdt_header_ptr = gdb.parse_and_eval( + "(const struct fdt_header *) initial_boot_params") + py_fdt_header = py_fdt_header_ptr.dereference() + + fdt_header = self.fdthdr_to_cpu(py_fdt_header) + + if fdt_header[0] != constants.LX_OF_DT_HEADER: + raise gdb.GdbError("No flattened device tree magic found\n") + + gdb.write("fdt_magic: 0x{:02X}\n".format(fdt_header[0])) + gdb.write("fdt_totalsize: 0x{:02X}\n".format(fdt_header[1])) + gdb.write("off_dt_struct: 0x{:02X}\n".format(fdt_header[2])) + gdb.write("off_dt_strings: 0x{:02X}\n".format(fdt_header[3])) + gdb.write("off_mem_rsvmap: 0x{:02X}\n".format(fdt_header[4])) + gdb.write("version: {}\n".format(fdt_header[5])) + gdb.write("last_comp_version: {}\n".format(fdt_header[6])) + + inf = gdb.inferiors()[0] + fdt_buf = utils.read_memoryview(inf, py_fdt_header_ptr, + fdt_header[1]).tobytes() + + try: + f = open(filename, 'wb') + except: + raise gdb.GdbError("Could not open file to dump fdt") + + f.write(fdt_buf) + f.close() + + gdb.write("Dumped fdt blob to " + filename + "\n") + +LxFdtDump() diff --git a/usr/Kconfig b/usr/Kconfig index 6278f135256d..c0c48507e44e 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -21,6 +21,16 @@ config INITRAMFS_SOURCE If you are not sure, leave it blank. +config INITRAMFS_FORCE + bool "Ignore the initramfs passed by the bootloader" + depends on CMDLINE_EXTEND || CMDLINE_FORCE + help + This option causes the kernel to ignore the initramfs image + (or initrd image) passed to it by the bootloader. This is + analogous to CMDLINE_FORCE, which is found on some architectures, + and is useful if you cannot or don't want to change the image + your bootloader passes to the kernel. + config INITRAMFS_ROOT_UID int "User ID to map to 0 (user root)" depends on INITRAMFS_SOURCE!="" |