diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-04-12 16:44:46 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-04-12 16:44:48 +1000 |
commit | b3857aacfa35f5097984626733a2d09eb6c3bcb3 (patch) | |
tree | 222d458b0ad3fc84e96c4d80ee071f654e5fb29d | |
parent | 8ac0ff067dfee4f357f61c509bedef45a50d7b92 (diff) | |
parent | 8bc56dc7e20fd98df3d5aa9380edfa964de0b031 (diff) |
Merge branch 'akpm-current/current'
151 files changed, 3585 insertions, 2274 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 49d7c997fa1e..e50b95c25868 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back. Amount of memory used in network transmission buffers + shmem + + Amount of cached filesystem data that is swap-backed, + such as tmpfs, shm segments, shared anonymous mmap()s + file_mapped Amount of cached filesystem data mapped with mmap() diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 415484f3d59a..918972babcd8 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -134,6 +134,23 @@ use the boot option: fail_futex= mmc_core.fail_request=<interval>,<probability>,<space>,<times> +o proc entries + +- /proc/<pid>/fail-nth: +- /proc/self/task/<tid>/fail-nth: + + Write to this file of integer N makes N-th call in the task fail. + Read from this file returns a integer value. A value of '0' indicates + that the fault setup with a previous write to this file was injected. + A positive integer N indicates that the fault wasn't yet injected. + Note that this file enables all types of faults (slab, futex, etc). + This setting takes precedence over all other generic debugfs settings + like probability, interval, times, etc. But per-capability settings + (e.g. fail_futex/ignore-private) take precedence over it. + + This feature is intended for systematic testing of faults in a single + system call. See an example below. + How to add new fault injection capability ----------------------------------------- @@ -278,3 +295,65 @@ allocation failure. # env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ -- make -C tools/testing/selftests/ run_tests + +Systematic faults using fail-nth +--------------------------------- + +The following code systematically faults 0-th, 1-st, 2-nd and so on +capabilities in the socketpair() system call. + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/syscall.h> +#include <fcntl.h> +#include <unistd.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> + +int main() +{ + int i, err, res, fail_nth, fds[2]; + char buf[128]; + + system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); + sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); + fail_nth = open(buf, O_RDWR); + for (i = 1;; i++) { + sprintf(buf, "%d", i); + write(fail_nth, buf, strlen(buf)); + res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); + err = errno; + pread(fail_nth, buf, sizeof(buf), 0); + if (res == 0) { + close(fds[0]); + close(fds[1]); + } + printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y', + res, err); + if (atoi(buf)) + break; + } + return 0; +} + +An example output: + +1-th fault Y: res=-1/23 +2-th fault Y: res=-1/23 +3-th fault Y: res=-1/12 +4-th fault Y: res=-1/12 +5-th fault Y: res=-1/23 +6-th fault Y: res=-1/23 +7-th fault Y: res=-1/23 +8-th fault Y: res=-1/12 +9-th fault Y: res=-1/12 +10-th fault Y: res=-1/12 +11-th fault Y: res=-1/12 +12-th fault Y: res=-1/12 +13-th fault Y: res=-1/12 +14-th fault Y: res=-1/12 +15-th fault Y: res=-1/12 +16-th fault N: res=0/12 diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9036dbf16156..4cddbce85ac9 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -413,6 +413,7 @@ Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB +LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB @@ -442,6 +443,11 @@ accessed. "Anonymous" shows the amount of memory that does not belong to any file. Even a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. +"LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE). +The memory isn't freed immediately with madvise(). It's freed in memory +pressure if the memory is clean. Please note that the printed value might +be lower than the real value due to optimizations used in the current +implementation. If this is not desirable please file a bug report. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. "ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by huge pages. diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 19b1e3d09a19..9cabaf8a207e 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -55,10 +55,14 @@ as follows: booted with restricted memory. By default, the boot memory size will be the larger of 5% of system RAM or 256MB. Alternatively, user can also specify boot memory size - through boot parameter 'fadump_reserve_mem=' which will - override the default calculated size. Use this option - if default boot memory size is not sufficient for second - kernel to boot successfully. + through boot parameter 'crashkernel=' which will override + the default calculated size. Use this option if default + boot memory size is not sufficient for second kernel to + boot successfully. For syntax of crashkernel= parameter, + refer to Documentation/kdump/kdump.txt. If any offset is + provided in crashkernel= parameter, it will be ignored + as fadump reserves memory at end of RAM for boot memory + dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the firmware will reset PCI and other hardware state. It will @@ -158,13 +162,16 @@ How to enable firmware-assisted dump (fadump): 1. Set config option CONFIG_FA_DUMP=y and build kernel. 2. Boot into linux kernel with 'fadump=on' kernel cmdline option. -3. Optionally, user can also set 'fadump_reserve_mem=' kernel cmdline +3. Optionally, user can also set 'crashkernel=' kernel cmdline to specify size of the memory to reserve for boot memory dump preservation. -NOTE: If firmware-assisted dump fails to reserve memory then it will - fallback to existing kdump mechanism if 'crashkernel=' option - is set at kernel cmdline. +NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead + use 'crashkernel=' to specify size of the memory to reserve + for boot memory dump preservation. + 2. If firmware-assisted dump fails to reserve memory then it + will fallback to existing kdump mechanism if 'crashkernel=' + option is set at kernel cmdline. Sysfs/debugfs files: ------------ diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 6a5e2a102a45..11d3d8dcb449 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -12,6 +12,8 @@ highmem.txt - Outline of highmem and common issues. hugetlbpage.txt - a brief summary of hugetlbpage support in the Linux kernel. +hugetlbfs_reserv.txt + - A brief overview of hugetlbfs reservation design/implementation. hwpoison.txt - explains what hwpoison is idle_page_tracking.txt diff --git a/Documentation/vm/hugetlbfs_reserv.txt b/Documentation/vm/hugetlbfs_reserv.txt new file mode 100644 index 000000000000..9aca09a76bed --- /dev/null +++ b/Documentation/vm/hugetlbfs_reserv.txt @@ -0,0 +1,529 @@ +Hugetlbfs Reservation Overview +------------------------------ +Huge pages as described at 'Documentation/vm/hugetlbpage.txt' are typically +preallocated for application use. These huge pages are instantiated in a +task's address space at page fault time if the VMA indicates huge pages are +to be used. If no huge page exists at page fault time, the task is sent +a SIGBUS and often dies an unhappy death. Shortly after huge page support +was added, it was determined that it would be better to detect a shortage +of huge pages at mmap() time. The idea is that if there were not enough +huge pages to cover the mapping, the mmap() would fail. This was first +done with a simple check in the code at mmap() time to determine if there +were enough free huge pages to cover the mapping. Like most things in the +kernel, the code has evolved over time. However, the basic idea was to +'reserve' huge pages at mmap() time to ensure that huge pages would be +available for page faults in that mapping. The description below attempts to +describe how huge page reserve processing is done in the v4.10 kernel. + + +Audience +-------- +This description is primarily targeted at kernel developers who are modifying +hugetlbfs code. + + +The Data Structures +------------------- +resv_huge_pages + This is a global (per-hstate) count of reserved huge pages. Reserved + huge pages are only available to the task which reserved them. + Therefore, the number of huge pages generally available is computed + as (free_huge_pages - resv_huge_pages). +Reserve Map + A reserve map is described by the structure: + struct resv_map { + struct kref refs; + spinlock_t lock; + struct list_head regions; + long adds_in_progress; + struct list_head region_cache; + long region_cache_count; + }; + There is one reserve map for each huge page mapping in the system. + The regions list within the resv_map describes the regions within + the mapping. A region is described as: + struct file_region { + struct list_head link; + long from; + long to; + }; + The 'from' and 'to' fields of the file region structure are huge page + indices into the mapping. Depending on the type of mapping, a + region in the reserv_map may indicate reservations exist for the + range, or reservations do not exist. +Flags for MAP_PRIVATE Reservations + These are stored in the bottom bits of the reservation map pointer. + #define HPAGE_RESV_OWNER (1UL << 0) Indicates this task is the + owner of the reservations associated with the mapping. + #define HPAGE_RESV_UNMAPPED (1UL << 1) Indicates task originally + mapping this range (and creating reserves) has unmapped a + page from this task (the child) due to a failed COW. +Page Flags + The PagePrivate page flag is used to indicate that a huge page + reservation must be restored when the huge page is freed. More + details will be discussed in the "Freeing huge pages" section. + + +Reservation Map Location (Private or Shared) +-------------------------------------------- +A huge page mapping or segment is either private or shared. If private, +it is typically only available to a single address space (task). If shared, +it can be mapped into multiple address spaces (tasks). The location and +semantics of the reservation map is significantly different for two types +of mappings. Location differences are: +- For private mappings, the reservation map hangs off the the VMA structure. + Specifically, vma->vm_private_data. This reserve map is created at the + time the mapping (mmap(MAP_PRIVATE)) is created. +- For shared mappings, the reservation map hangs off the inode. Specifically, + inode->i_mapping->private_data. Since shared mappings are always backed + by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode + contains a reservation map. As a result, the reservation map is allocated + when the inode is created. + + +Creating Reservations +--------------------- +Reservations are created when a huge page backed shared memory segment is +created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB). +These operations result in a call to the routine hugetlb_reserve_pages() + +int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma, + vm_flags_t vm_flags) + +The first thing hugetlb_reserve_pages() does is check for the NORESERVE +flag was specified in either the shmget() or mmap() call. If NORESERVE +was specified, then this routine returns immediately as no reservation +are desired. + +The arguments 'from' and 'to' are huge page indices into the mapping or +underlying file. For shmget(), 'from' is always 0 and 'to' corresponds to +the length of the segment/mapping. For mmap(), the offset argument could +be used to specify the offset into the underlying file. In such a case +the 'from' and 'to' arguments have been adjusted by this offset. + +One of the big differences between PRIVATE and SHARED mappings is the way +in which reservations are represented in the reservation map. +- For shared mappings, an entry in the reservation map indicates a reservation + exists or did exist for the corresponding page. As reservations are + consumed, the reservation map is not modified. +- For private mappings, the lack of an entry in the reservation map indicates + a reservation exists for the corresponding page. As reservations are + consumed, entries are added to the reservation map. Therefore, the + reservation map can also be used to determine which reservations have + been consumed. + +For private mappings, hugetlb_reserve_pages() creates the reservation map and +hangs it off the VMA structure. In addition, the HPAGE_RESV_OWNER flag is set +to indicate this VMA owns the reservations. + +The reservation map is consulted to determine how many huge page reservations +are needed for the current mapping/segment. For private mappings, this is +always the value (to - from). However, for shared mappings it is possible that some reservations may already exist within the range (to - from). See the +section "Reservation Map Modifications" for details on how this is accomplished. + +The mapping may be associated with a subpool. If so, the subpool is consulted +to ensure there is sufficient space for the mapping. It is possible that the +subpool has set aside reservations that can be used for the mapping. See the +section "Subpool Reservations" for more details. + +After consulting the reservation map and subpool, the number of needed new +reservations is known. The routine hugetlb_acct_memory() is called to check +for and take the requested number of reservations. hugetlb_acct_memory() +calls into routines that potentially allocate and adjust surplus page counts. +However, within those routines the code is simply checking to ensure there +are enough free huge pages to accommodate the reservation. If there are, +the global reservation count resv_huge_pages is adjusted something like the +following. + if (resv_needed <= (resv_huge_pages - free_huge_pages)) + resv_huge_pages += resv_needed; +Note that the global lock hugetlb_lock is held when checking and adjusting +these counters. + +If there were enough free huge pages and the global count resv_huge_pages +was adjusted, then the reservation map associated with the mapping is +modified to reflect the reservations. In the case of a shared mapping, a +file_region will exist that includes the range 'from' 'to'. For private +mappings, no modifications are made to the reservation map as lack of an +entry indicates a reservation exists. + +If hugetlb_reserve_pages() was successful, the global reservation count and +reservation map associated with the mapping will be modified as required to +ensure reservations exist for the range 'from' - 'to'. + + +Consuming Reservations/Allocating a Huge Page +--------------------------------------------- +Reservations are consumed when huge pages associated with the reservations +are allocated and instantiated in the corresponding mapping. The allocation +is performed within the routine alloc_huge_page(). +struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +alloc_huge_page is passed a VMA pointer and a virtual address, so it can +consult the reservation map to determine if a reservation exists. In addition, +alloc_huge_page takes the argument avoid_reserve which indicates reserves +should not be used even if it appears they have been set aside for the +specified address. The avoid_reserve argument is most often used in the case +of Copy on Write and Page Migration where additional copies of an existing +page are being allocated. + +The helper routine vma_needs_reservation() is called to determine if a +reservation exists for the address within the mapping(vma). See the section +"Reservation Map Helper Routines" for detailed information on what this +routine does. The value returned from vma_needs_reservation() is generally +0 or 1. 0 if a reservation exists for the address, 1 if no reservation exists. +If a reservation does not exist, and there is a subpool associated with the +mapping the subpool is consulted to determine if it contains reservations. +If the subpool contains reservations, one can be used for this allocation. +However, in every case the avoid_reserve argument overrides the use of +a reservation for the allocation. After determining whether a reservation +exists and can be used for the allocation, the routine dequeue_huge_page_vma() +is called. This routine takes two arguments related to reservations: +- avoid_reserve, this is the same value/argument passed to alloc_huge_page() +- chg, even though this argument is of type long only the values 0 or 1 are + passed to dequeue_huge_page_vma. If the value is 0, it indicates a + reservation exists (see the section "Memory Policy and Reservations" for + possible issues). If the value is 1, it indicates a reservation does not + exist and the page must be taken from the global free pool if possible. +The free lists associated with the memory policy of the VMA are searched for +a free page. If a page is found, the value free_huge_pages is decremented +when the page is removed from the free list. If there was a reservation +associated with the page, the following adjustments are made: + SetPagePrivate(page); /* Indicates allocating this page consumed + * a reservation, and if an error is + * encountered such that the page must be + * freed, the reservation will be restored. */ + resv_huge_pages--; /* Decrement the global reservation count */ +Note, if no huge page can be found that satisfies the VMA's memory policy +an attempt will be made to allocate one using the buddy allocator. This +brings up the issue of surplus huge pages and overcommit which is beyond +the scope reservations. Even if a surplus page is allocated, the same +reservation based adjustments as above will be made: SetPagePrivate(page) and +resv_huge_pages--. + +After obtaining a new huge page, (page)->private is set to the value of +the subpool associated with the page if it exists. This will be used for +subpool accounting when the page is freed. + +The routine vma_commit_reservation() is then called to adjust the reserve +map based on the consumption of the reservation. In general, this involves +ensuring the page is represented within a file_region structure of the region +map. For shared mappings where the the reservation was present, an entry +in the reserve map already existed so no change is made. However, if there +was no reservation in a shared mapping or this was a private mapping a new +entry must be created. + +It is possible that the reserve map could have been changed between the call +to vma_needs_reservation() at the beginning of alloc_huge_page() and the +call to vma_commit_reservation() after the page was allocated. This would +be possible if hugetlb_reserve_pages was called for the same page in a shared +mapping. In such cases, the reservation count and subpool free page count +will be off by one. This rare condition can be identified by comparing the +return value from vma_needs_reservation and vma_commit_reservation. If such +a race is detected, the subpool and global reserve counts are adjusted to +compensate. See the section "Reservation Map Helper Routines" for more +information on these routines. + + +Instantiate Huge Pages +---------------------- +After huge page allocation, the page is typically added to the page tables +of the allocating task. Before this, pages in a shared mapping are added +to the page cache and pages in private mappings are added to an anonymous +reverse mapping. In both cases, the PagePrivate flag is cleared. Therefore, +when a huge page that has been instantiated is freed no adjustment is made +to the global reservation count (resv_huge_pages). + + +Freeing Huge Pages +------------------ +Huge page freeing is performed by the routine free_huge_page(). This routine +is the destructor for hugetlbfs compound pages. As a result, it is only +passed a pointer to the page struct. When a huge page is freed, reservation +accounting may need to be performed. This would be the case if the page was +associated with a subpool that contained reserves, or the page is being freed +on an error path where a global reserve count must be restored. + +The page->private field points to any subpool associated with the page. +If the PagePrivate flag is set, it indicates the global reserve count should +be adjusted (see the section "Consuming Reservations/Allocating a Huge Page" +for information on how these are set). + +The routine first calls hugepage_subpool_put_pages() for the page. If this +routine returns a value of 0 (which does not equal the value passed 1) it +indicates reserves are associated with the subpool, and this newly free page +must be used to keep the number of subpool reserves above the minimum size. +Therefore, the global resv_huge_pages counter is incremented in this case. + +If the PagePrivate flag was set in the page, the global resv_huge_pages counter +will always be incremented. + + +Subpool Reservations +-------------------- +There is a struct hstate associated with each huge page size. The hstate +tracks all huge pages of the specified size. A subpool represents a subset +of pages within a hstate that is associated with a mounted hugetlbfs +filesystem. + +When a hugetlbfs filesystem is mounted a min_size option can be specified +which indicates the minimum number of huge pages required by the filesystem. +If this option is specified, the number of huge pages corresponding to +min_size are reserved for use by the filesystem. This number is tracked in +the min_hpages field of a struct hugepage_subpool. At mount time, +hugetlb_acct_memory(min_hpages) is called to reserve the specified number of +huge pages. If they can not be reserved, the mount fails. + +The routines hugepage_subpool_get/put_pages() are called when pages are +obtained from or released back to a subpool. They perform all subpool +accounting, and track any reservations associated with the subpool. +hugepage_subpool_get/put_pages are passed the number of huge pages by which +to adjust the subpool 'used page' count (down for get, up for put). Normally, +they return the same value that was passed or an error if not enough pages +exist in the subpool. + +However, if reserves are associated with the subpool a return value less +than the passed value may be returned. This return value indicates the +number of additional global pool adjustments which must be made. For example, +suppose a subpool contains 3 reserved huge pages and someone asks for 5. +The 3 reserved pages associated with the subpool can be used to satisfy part +of the request. But, 2 pages must be obtained from the global pools. To +relay this information to the caller, the value 2 is returned. The caller +is then responsible for attempting to obtain the additional two pages from +the global pools. + + +COW and Reservations +-------------------- +Since shared mappings all point to and use the same underlying pages, the +biggest reservation concern for COW is private mappings. In this case, +two tasks can be pointing at the same previously allocated page. One task +attempts to write to the page, so a new page must be allocated so that each +task points to its own page. + +When the page was originally allocated, the reservation for that page was +consumed. When an attempt to allocate a new page is made as a result of +COW, it is possible that no free huge pages are free and the allocation +will fail. + +When the private mapping was originally created, the owner of the mapping +was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation +map of the owner. Since the owner created the mapping, the owner owns all +the reservations associated with the mapping. Therefore, when a write fault +occurs and there is no page available, different action is taken for the owner +and non-owner of the reservation. + +In the case where the faulting task is not the owner, the fault will fail and +the task will typically receive a SIGBUS. + +If the owner is the faulting task, we want it to succeed since it owned the +original reservation. To accomplish this, the page is unmapped from the +non-owning task. In this way, the only reference is from the owning task. +In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer +of the non-owning task. The non-owning task may receive a SIGBUS if it later +faults on a non-present page. But, the original owner of the +mapping/reservation will behave as expected. + + +Reservation Map Modifications +----------------------------- +The following low level routines are used to make modifications to a +reservation map. Typically, these routines are not called directly. Rather, +a reservation map helper routine is called which calls one of these low level +routines. These low level routines are fairly well documented in the source +code (mm/hugetlb.c). These routines are: +long region_chg(struct resv_map *resv, long f, long t); +long region_add(struct resv_map *resv, long f, long t); +void region_abort(struct resv_map *resv, long f, long t); +long region_count(struct resv_map *resv, long f, long t); + +Operations on the reservation map typically involve two operations: +1) region_chg() is called to examine the reserve map and determine how + many pages in the specified range [f, t) are NOT currently represented. + + The calling code performs global checks and allocations to determine if + there are enough huge pages for the operation to succeed. + +2a) If the operation can succeed, region_add() is called to actually modify + the reservation map for the same range [f, t) previously passed to + region_chg(). +2b) If the operation can not succeed, region_abort is called for the same range + [f, t) to abort the operation. + +Note that this is a two step process where region_add() and region_abort() +are guaranteed to succeed after a prior call to region_chg() for the same +range. region_chg() is responsible for pre-allocating any data structures +necessary to ensure the subsequent operations (specifically region_add())) +will succeed. + +As mentioned above, region_chg() determines the number of pages in the range +which are NOT currently represented in the map. This number is returned to +the caller. region_add() returns the number of pages in the range added to +the map. In most cases, the return value of region_add() is the same as the +return value of region_chg(). However, in the case of shared mappings it is +possible for changes to the reservation map to be made between the calls to +region_chg() and region_add(). In this case, the return value of region_add() +will not match the return value of region_chg(). It is likely that in such +cases global counts and subpool accounting will be incorrect and in need of +adjustment. It is the responsibility of the caller to check for this condition +and make the appropriate adjustments. + +The routine region_del() is called to remove regions from a reservation map. +It is typically called in the following situations: +- When a file in the hugetlbfs filesystem is being removed, the inode will + be released and the reservation map freed. Before freeing the reservation + map, all the individual file_region structures must be freed. In this case + region_del is passed the range [0, LONG_MAX). +- When a hugetlbfs file is being truncated. In this case, all allocated pages + after the new file size must be freed. In addition, any file_region entries + in the reservation map past the new end of file must be deleted. In this + case, region_del is passed the range [new_end_of_file, LONG_MAX). +- When a hole is being punched in a hugetlbfs file. In this case, huge pages + are removed from the middle of the file one at a time. As the pages are + removed, region_del() is called to remove the corresponding entry from the + reservation map. In this case, region_del is passed the range + [page_idx, page_idx + 1). +In every case, region_del() will return the number of pages removed from the +reservation map. In VERY rare cases, region_del() can fail. This can only +happen in the hole punch case where it has to split an existing file_region +entry and can not allocate a new structure. In this error case, region_del() +will return -ENOMEM. The problem here is that the reservation map will +indicate that there is a reservation for the page. However, the subpool and +global reservation counts will not reflect the reservation. To handle this +situation, the routine hugetlb_fix_reserve_counts() is called to adjust the +counters so that they correspond with the reservation map entry that could +not be deleted. + +region_count() is called when unmapping a private huge page mapping. In +private mappings, the lack of a entry in the reservation map indicates that +a reservation exists. Therefore, by counting the number of entries in the +reservation map we know how many reservations were consumed and how many are +outstanding (outstanding = (end - start) - region_count(resv, start, end)). +Since the mapping is going away, the subpool and global reservation counts +are decremented by the number of outstanding reservations. + + +Reservation Map Helper Routines +------------------------------- +Several helper routines exist to query and modify the reservation maps. +These routines are only interested with reservations for a specific huge +page, so they just pass in an address instead of a range. In addition, +they pass in the associated VMA. From the VMA, the type of mapping (private +or shared) and the location of the reservation map (inode or VMA) can be +determined. These routines simply call the underlying routines described +in the section "Reservation Map Modifications". However, they do take into +account the 'opposite' meaning of reservation map entries for private and +shared mappings and hide this detail from the caller. + +long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +This routine calls region_chg() for the specified page. If no reservation +exists, 1 is returned. If a reservation exists, 0 is returned. + +long vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +This calls region_add() for the specified page. As in the case of region_chg +and region_add, this routine is to be called after a previous call to +vma_needs_reservation. It will add a reservation entry for the page. It +returns 1 if the reservation was added and 0 if not. The return value should +be compared with the return value of the previous call to +vma_needs_reservation. An unexpected difference indicates the reservation +map was modified between calls. + +void vma_end_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +This calls region_abort() for the specified page. As in the case of region_chg +and region_abort, this routine is to be called after a previous call to +vma_needs_reservation. It will abort/end the in progress reservation add +operation. + +long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +This is a special wrapper routine to help facilitate reservation cleanup +on error paths. It is only called from the routine restore_reserve_on_error(). +This routine is used in conjunction with vma_needs_reservation in an attempt +to add a reservation to the reservation map. It takes into account the +different reservation map semantics for private and shared mappings. Hence, +region_add is called for shared mappings (as an entry present in the map +indicates a reservation), and region_del is called for private mappings (as +the absence of an entry in the map indicates a reservation). See the section +"Reservation cleanup in error paths" for more information on what needs to +be done on error paths. + + +Reservation Cleanup in Error Paths +---------------------------------- +As mentioned in the section "Reservation Map Helper Routines", reservation +map modifications are performed in two steps. First vma_needs_reservation +is called before a page is allocated. If the allocation is successful, +then vma_commit_reservation is called. If not, vma_end_reservation is called. +Global and subpool reservation counts are adjusted based on success or failure +of the operation and all is well. + +Additionally, after a huge page is instantiated the PagePrivate flag is +cleared so that accounting when the page is ultimately freed is correct. + +However, there are several instances where errors are encountered after a huge +page is allocated but before it is instantiated. In this case, the page +allocation has consumed the reservation and made the appropriate subpool, +reservation map and global count adjustments. If the page is freed at this +time (before instantiation and clearing of PagePrivate), then free_huge_page +will increment the global reservation count. However, the reservation map +indicates the reservation was consumed. This resulting inconsistent state +will cause the 'leak' of a reserved huge page. The global reserve count will +be higher than it should and prevent allocation of a pre-allocated page. + +The routine restore_reserve_on_error() attempts to handle this situation. It +is fairly well documented. The intention of this routine is to restore +the reservation map to the way it was before the page allocation. In this +way, the state of the reservation map will correspond to the global reservation +count after the page is freed. + +The routine restore_reserve_on_error itself may encounter errors while +attempting to restore the reservation map entry. In this case, it will +simply clear the PagePrivate flag of the page. In this way, the global +reserve count will not be incremented when the page is freed. However, the +reservation map will continue to look as though the reservation was consumed. +A page can still be allocated for the address, but it will not use a reserved +page as originally intended. + +There is some code (most notably userfaultfd) which can not call +restore_reserve_on_error. In this case, it simply modifies the PagePrivate +so that a reservation will not be leaked when the huge page is freed. + + +Reservations and Memory Policy +------------------------------ +Per-node huge page lists existed in struct hstate when git was first used +to manage Linux code. The concept of reservations was added some time later. +When reservations were added, no attempt was made to take memory policy +into account. While cpusets are not exactly the same as memory policy, this +comment in hugetlb_acct_memory sums up the interaction between reservations +and cpusets/memory policy. + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + +Huge page reservations were added to prevent unexpected page allocation +failures (OOM) at page fault time. However, if an application makes use +of cpusets or memory policy there is no guarantee that huge pages will be +available on the required nodes. This is true even if there are a sufficient +number of global reservations. + + +Mike Kravetz, 7 April 2017 @@ -7,29 +7,12 @@ # 4) Check for missing system calls # 5) Generate constants.py (may need bounds.h) -# Default sed regexp - multiline due to syntax constraints -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef +include scripts/Makefile.asm-offsets # Use filechk to avoid rebuilds when a header changes, but the resulting file # does not define filechk_offsets - (set -e; \ - echo "#ifndef $2"; \ - echo "#define $2"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y); \ - echo ""; \ - echo "#endif" ) + $(call gen_header_from_asm_offsets,$2) endef ##### diff --git a/arch/Kconfig b/arch/Kconfig index 2ecfc5825fc0..6c00e5b00f8b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -2,7 +2,11 @@ # General architecture dependent options # +config CRASH_CORE + bool + config KEXEC_CORE + select CRASH_CORE bool config HAVE_IMA_KEXEC diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include <linux/personality.h> /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include <asm/page-nommu.h> diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 3686d6abafde..186ba553ff26 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -50,25 +50,12 @@ CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 # The gate DSO image is built using a special linker script. include $(src)/Makefile.gate +include $(srctree)/scripts/Makefile.asm-offsets + # Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config -define sed-y - "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}" -endef quiet_cmd_nr_irqs = GEN $@ define cmd_nr_irqs - (set -e; \ - echo "#ifndef __ASM_NR_IRQS_H__"; \ - echo "#define __ASM_NR_IRQS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ + $(call gen_header_from_asm_offsets,__ASM_NR_IRQS_H__) < $< > $@ endef # We use internal kbuild rules to avoid the "is up to date" message from make diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index 2955f359e2a7..75859a07d75b 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -27,28 +27,6 @@ static int kdump_freeze_monarch; static int kdump_on_init = 1; static int kdump_on_fatal_mca = 1; -static inline Elf64_Word -*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note *note = (struct elf_note *)buf; - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += (sizeof(*note) + 3)/4; - memcpy(buf, name, note->n_namesz); - buf += (note->n_namesz + 3)/4; - memcpy(buf, data, data_len); - buf += (data_len + 3)/4; - return buf; -} - -static void -final_note(void *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} - extern void ia64_dump_cpu_regs(void *); static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus); diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 9452b02ce079..313a88b2973f 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -618,7 +618,7 @@ static int mipspmu_event_init(struct perf_event *event) return -ENOENT; } - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 61169f201d04..2bfd49098c5e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -522,21 +522,23 @@ config RELOCATABLE_TEST relocation code. config CRASH_DUMP - bool "Build a kdump crash kernel" + bool "Build a dump capture kernel" depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP) select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE help - Build a kernel suitable for use as a kdump capture kernel. + Build a kernel suitable for use as a dump capture kernel. The same kernel binary can be used as production kernel and dump capture kernel. config FA_DUMP bool "Firmware-assisted dump" - depends on PPC64 && PPC_RTAS && CRASH_DUMP && KEXEC_CORE + depends on PPC64 && PPC_RTAS + select CRASH_CORE + select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with assistance from firmware. This approach does not use kexec, - instead firmware assists in booting the kdump kernel + instead firmware assists in booting the capture kernel while preserving memory contents. Firmware-assisted dump is meant to be a kdump replacement offering robustness and speed not possible without system firmware assistance. diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 0031806475f0..60b91084f33c 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -73,6 +73,8 @@ reg_entry++; \ }) +extern int crashing_cpu; + /* Kernel Dump section info */ struct fadump_section { __be32 request_flag; diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 47b63de81f9b..cbabb5adccd9 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -43,8 +43,6 @@ #define IPI_TIMEOUT 10000 #define REAL_MODE_TIMEOUT 10000 -/* This keeps a track of which one is the crashing cpu. */ -int crashing_cpu = -1; static int time_to_dump; #define CRASH_HANDLER_MAX 3 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 243dbef7e926..466569e26278 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -209,14 +209,20 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, */ static inline unsigned long fadump_calculate_reserve_size(void) { - unsigned long size; + int ret; + unsigned long long base, size; /* - * Check if the size is specified through fadump_reserve_mem= cmdline - * option. If yes, then use that. + * Check if the size is specified through crashkernel= cmdline + * option. If yes, then use that but ignore base as fadump + * reserves memory at end of RAM. */ - if (fw_dump.reserve_bootvar) + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &size, &base); + if (ret == 0 && size > 0) { + fw_dump.reserve_bootvar = (unsigned long)size; return fw_dump.reserve_bootvar; + } /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; @@ -371,15 +377,6 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -/* Look for fadump_reserve_mem= cmdline option */ -static int __init early_fadump_reserve_mem(char *p) -{ - if (p) - fw_dump.reserve_bootvar = memparse(p, &p); - return 0; -} -early_param("fadump_reserve_mem", early_fadump_reserve_mem); - static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc; @@ -527,34 +524,6 @@ fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) return reg_entry; } -static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type, - void *data, size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void fadump_final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -565,8 +534,8 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) * prstatus.pr_pid = ???? */ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); - buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); + buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); return buf; } @@ -707,7 +676,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) note_buf = fadump_regs_to_elf_notes(note_buf, ®s); } } - fadump_final_note(note_buf); + final_note(note_buf); if (fdh) { pr_debug("Updating elfcore header (%llx) with cpu notes\n", diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 5c10b5925ac2..69e077180db6 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -125,6 +125,11 @@ int ppc_do_canonicalize_irqs; EXPORT_SYMBOL(ppc_do_canonicalize_irqs); #endif +#ifdef CONFIG_CRASH_CORE +/* This keeps a track of which one is the crashing cpu. */ +int crashing_cpu = -1; +#endif + /* also used by kexec */ void machine_shutdown(void) { diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 9a4f279d25ca..ca960d0370d5 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -823,7 +823,7 @@ static int cpumsf_pmu_event_init(struct perf_event *event) } /* Check online status of the CPU to which the event is pinned */ - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 085c3b300d32..b41ee7a91fe9 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -351,6 +351,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/block/genhd.c b/block/genhd.c index 7421e4aec34b..a561a40c151b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -890,7 +890,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7be139f50a55..52a10d66b992 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/fs.h> #include <linux/bio.h> #include <linux/stat.h> @@ -210,7 +211,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, struct socket *sock = nbd->socks[index]->sock; int result; struct msghdr msg; - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; if (unlikely(!sock)) { dev_err_ratelimited(disk_to_dev(nbd->disk), @@ -221,7 +222,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, msg.msg_iter = *iter; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); do { sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; msg.msg_name = NULL; @@ -244,7 +245,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, *sent += result; } while (msg_data_left(&msg)); - current_restore_flags(pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); return result; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1710b06f04a7..783ac67273a3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -177,15 +177,17 @@ static bool page_same_filled(void *ptr, unsigned long *element) { unsigned int pos; unsigned long *page; + unsigned long val; page = (unsigned long *)ptr; + val = page[0]; - for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) { - if (page[pos] != page[pos + 1]) + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (val != page[pos]) return false; } - *element = page[pos]; + *element = val; return true; } diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index a95b5808d383..556a9dd8ac9f 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -551,7 +551,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, rc = __dax_dev_pud_fault(dax_dev, vmf); break; default: - return VM_FAULT_FALLBACK; + rc = VM_FAULT_SIGBUS; } srcu_read_unlock(&dax_srcu, id); diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index 498c0854305f..06c4974ee8dd 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -298,8 +298,11 @@ static void *qp_alloc_queue(u64 size, u32 flags) size_t pas_size; size_t vas_size; size_t queue_size = sizeof(*queue) + sizeof(*queue->kernel_if); - const u64 num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; + u64 num_pages; + if (size > SIZE_MAX - PAGE_SIZE) + return NULL; + num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / (sizeof(*queue->kernel_if->u.g.pas) + @@ -624,9 +627,12 @@ static struct vmci_queue *qp_host_alloc_queue(u64 size) { struct vmci_queue *queue; size_t queue_page_size; - const u64 num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; + u64 num_pages; const size_t queue_size = sizeof(*queue) + sizeof(*(queue->kernel_if)); + if (size > SIZE_MAX - PAGE_SIZE) + return NULL; + num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / sizeof(*queue->kernel_if->u.h.page)) return NULL; diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index cef818f535ed..03a0d057bf2f 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -40,6 +40,7 @@ #include <linux/list.h> #include <linux/random.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/seq_file.h> @@ -1368,31 +1369,18 @@ static int get_pages(struct nandsim *ns, struct file *file, size_t count, loff_t return 0; } -static int set_memalloc(void) -{ - if (current->flags & PF_MEMALLOC) - return 0; - current->flags |= PF_MEMALLOC; - return 1; -} - -static void clear_memalloc(int memalloc) -{ - if (memalloc) - current->flags &= ~PF_MEMALLOC; -} - static ssize_t read_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t pos) { ssize_t tx; - int err, memalloc; + int err; + unsigned int noreclaim_flag; err = get_pages(ns, file, count, pos); if (err) return err; - memalloc = set_memalloc(); + noreclaim_flag = memalloc_noreclaim_save(); tx = kernel_read(file, pos, buf, count); - clear_memalloc(memalloc); + memalloc_noreclaim_restore(noreclaim_flag); put_pages(ns); return tx; } @@ -1400,14 +1388,15 @@ static ssize_t read_file(struct nandsim *ns, struct file *file, void *buf, size_ static ssize_t write_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t pos) { ssize_t tx; - int err, memalloc; + int err; + unsigned int noreclaim_flag; err = get_pages(ns, file, count, pos); if (err) return err; - memalloc = set_memalloc(); + noreclaim_flag = memalloc_noreclaim_save(); tx = kernel_write(file, buf, count, pos); - clear_memalloc(memalloc); + memalloc_noreclaim_restore(noreclaim_flag); put_pages(ns); return tx; } diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index bbea8eac9abb..4842fc0e809d 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -30,6 +30,7 @@ #include <linux/types.h> #include <linux/inet.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include <linux/file.h> #include <linux/blkdev.h> #include <linux/delay.h> @@ -371,10 +372,10 @@ static inline int iscsi_sw_tcp_xmit_qlen(struct iscsi_conn *conn) static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task) { struct iscsi_conn *conn = task->conn; - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; int rc = 0; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); while (iscsi_sw_tcp_xmit_qlen(conn)) { rc = iscsi_sw_tcp_xmit(conn); @@ -387,7 +388,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task) rc = 0; } - current_restore_flags(pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); return rc; } diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 677f0ddc986c..3ffc1ce29023 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -372,7 +372,7 @@ static void moom_callback(struct work_struct *ignored) mutex_lock(&oom_lock); if (!out_of_memory(&oc)) - pr_info("OOM request ignored because killer is disabled\n"); + pr_info("OOM request ignored. No task eligible\n"); mutex_unlock(&oom_lock); } diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 150ce2abf6c8..d3eca879a0a8 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -243,11 +243,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list)); /* Get the physical addresses of the source buffer */ - down_read(¤t->mm->mmap_sem); - num_pinned = get_user_pages(param.local_vaddr - lb_offset, - num_pages, (param.source == -1) ? 0 : FOLL_WRITE, - pages, NULL); - up_read(¤t->mm->mmap_sem); + num_pinned = get_user_pages_unlocked(param.local_vaddr - lb_offset, + num_pages, pages, (param.source == -1) ? 0 : FOLL_WRITE); if (num_pinned != num_pages) { /* get_user_pages() failed */ diff --git a/firmware/Makefile b/firmware/Makefile index e297e1b52636..fa3e81c2a97b 100644 --- a/firmware/Makefile +++ b/firmware/Makefile @@ -176,7 +176,8 @@ quiet_cmd_fwbin = MK_FW $@ wordsize_deps := $(wildcard include/config/64bit.h include/config/32bit.h \ include/config/ppc32.h include/config/ppc64.h \ include/config/superh32.h include/config/superh64.h \ - include/config/x86_32.h include/config/x86_64.h) + include/config/x86_32.h include/config/x86_64.h \ + firmware/Makefile) $(patsubst %,$(obj)/%.gen.S, $(fw-shipped-y)): %: $(wordsize_deps) $(call cmd,fwbin,$(patsubst %.gen.S,%,$@)) diff --git a/fs/buffer.c b/fs/buffer.c index 9196f2a270da..2ba905716d34 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1938,7 +1938,7 @@ EXPORT_SYMBOL(page_zero_new_buffers); static void iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, - struct iomap *iomap) + const struct iomap *iomap) { loff_t offset = block << inode->i_blkbits; @@ -1991,7 +1991,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, } int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap) + get_block_t *get_block, const struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -555,21 +555,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { + struct inode *inode = mapping->host; struct page *page; int ret; /* Hole page already exists? Return it... */ if (!radix_tree_exceptional_entry(*entry)) { page = *entry; - goto out; + goto finish_fault; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - out: + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + +finish_fault: vmf->page = page; ret = finish_fault(vmf); vmf->page = NULL; @@ -577,8 +581,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry, if (!ret) { /* Grab reference for PTE that is now referencing the page */ get_page(page); - return VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; } +out: + trace_dax_load_hole(inode, vmf, ret); return ret; } @@ -847,6 +853,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); + trace_dax_writeback_one(mapping->host, index, dax.size >> PAGE_SHIFT); unmap: dax_unmap_atomic(bdev, &dax); put_locked_mapping_entry(mapping, index, entry); @@ -882,6 +889,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; + trace_dax_writeback_range(inode, start_index, end_index); + tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); @@ -902,10 +911,12 @@ int dax_writeback_mapping_range(struct address_space *mapping, ret = dax_writeback_one(bdev, mapping, indices[i], pvec.pages[i]); if (ret < 0) - return ret; + goto out; } } - return 0; +out: + trace_dax_writeback_range_done(inode, start_index, end_index); + return (ret < 0 ? ret : 0); } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); @@ -930,6 +941,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(ret); *entryp = ret; + trace_dax_insert_mapping(mapping->host, vmf, ret); return vm_insert_mixed(vma, vaddr, dax.pfn); } @@ -941,6 +953,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -950,6 +963,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) if (entry) put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); + trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); @@ -962,6 +976,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) */ finish_mkwrite_fault(vmf); put_locked_mapping_entry(mapping, index, entry); + trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -1002,14 +1017,14 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) +static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos) { return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); } static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + const struct iomap *iomap) { struct iov_iter *iter = data; loff_t end = pos + length, done = 0; @@ -1142,13 +1157,16 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, int vmf_ret = 0; void *entry; + trace_dax_pte_fault(inode, vmf, vmf_ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) - return VM_FAULT_SIGBUS; + if (pos >= i_size_read(inode)) { + vmf_ret = VM_FAULT_SIGBUS; + goto out; + } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; @@ -1159,8 +1177,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); - if (error) - return dax_fault_return(error); + if (error) { + vmf_ret = dax_fault_return(error); + goto out; + } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ goto finish_iomap; @@ -1244,6 +1264,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } +out: + trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } diff --git a/fs/internal.h b/fs/internal.h index 11c6d89dce9c..cef253ac176c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -42,7 +42,7 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) */ extern void guard_bio_eod(int rw, struct bio *bio); extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap); + get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c @@ -179,7 +179,7 @@ extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); * iomap support: */ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap); + void *data, const struct iomap *iomap); loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, const struct iomap_ops *ops, void *data, diff --git a/fs/iomap.c b/fs/iomap.c index 141c3cd55a8b..fb17dba3dac5 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -108,7 +108,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) static int iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, struct iomap *iomap) + struct page **pagep, const struct iomap *iomap) { pgoff_t index = pos >> PAGE_SHIFT; struct page *page; @@ -151,7 +151,7 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len, static loff_t iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + const struct iomap *iomap) { struct iov_iter *i = data; long status = 0; @@ -273,7 +273,7 @@ __iomap_read_page(struct inode *inode, loff_t offset) static loff_t iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + const struct iomap *iomap) { long status = 0; ssize_t written = 0; @@ -338,7 +338,7 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, EXPORT_SYMBOL_GPL(iomap_file_dirty); static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, - unsigned bytes, struct iomap *iomap) + unsigned bytes, const struct iomap *iomap) { struct page *page; int status; @@ -355,7 +355,7 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, } static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, - struct iomap *iomap) + const struct iomap *iomap) { sector_t sector = iomap->blkno + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); @@ -365,7 +365,7 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, - void *data, struct iomap *iomap) + void *data, const struct iomap *iomap) { bool *did_zero = data; loff_t written = 0; @@ -434,7 +434,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page); static loff_t iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, const struct iomap *iomap) { struct page *page = data; int ret; @@ -525,7 +525,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, static loff_t iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + const struct iomap *iomap) { struct fiemap_ctx *ctx = data; loff_t ret = length; @@ -710,7 +710,7 @@ static void iomap_dio_bio_end_io(struct bio *bio) } static blk_qc_t -iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, +iomap_dio_zero(struct iomap_dio *dio, const struct iomap *iomap, loff_t pos, unsigned len) { struct page *page = ZERO_PAGE(0); @@ -734,7 +734,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, static loff_t iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, const struct iomap *iomap) { struct iomap_dio *dio = data; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 92b255e1ba58..7fe5428dd170 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -43,6 +43,7 @@ #include <linux/backing-dev.h> #include <linux/bitops.h> #include <linux/ratelimit.h> +#include <linux/sched/mm.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -206,6 +207,14 @@ static int kjournald2(void *arg) wake_up(&journal->j_wait_done_commit); /* + * Make sure that no allocations from this kernel thread will ever + * recurse to the fs layer because we are responsible for the + * transaction commit and any fs involvement might get stuck waiting for + * the trasn. commit. + */ + memalloc_nofs_save(); + + /* * And now, wait forever for commit wakeup events. */ write_lock(&journal->j_state_lock); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5e659ee08d6a..9ee4832b6f8b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -29,6 +29,7 @@ #include <linux/backing-dev.h> #include <linux/bug.h> #include <linux/module.h> +#include <linux/sched/mm.h> #include <trace/events/jbd2.h> @@ -388,6 +389,11 @@ repeat: rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); + /* + * Ensure that no allocations done while the transaction is open are + * going to recurse back to the fs layer. + */ + handle->saved_alloc_context = memalloc_nofs_save(); return 0; } @@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); + return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: + /* + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); return err; } diff --git a/fs/nsfs.c b/fs/nsfs.c index 1656843e87d2..495f12b83a7b 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -195,9 +195,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, { struct ns_common *ns; int res = -ENOENT; + const char *name; ns = ns_ops->get(task); if (ns) { - res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + name = ns_ops->real_ns_name ? : ns_ops->name; + res = snprintf(buf, size, "%s:[%u]", name, ns->inum); ns_ops->put(ns); } return res; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f6e871760f8d..0da0332725aa 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2242,13 +2242,13 @@ unlock: spin_unlock(&o2hb_live_lock); } -static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, const char *page, size_t count) { unsigned long tmp; @@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, } -CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold, + &o2hb_heartbeat_group_attr_dead_threshold, &o2hb_heartbeat_group_attr_mode, NULL, }; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 5b51c31c892d..a4a6ba23b9a6 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = o2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; + setup_timer(&sc->sc_idle_timeout, o2net_idle_timer, + (unsigned long)sc); sclog(sc, "alloced\n"); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..f0072145eead 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2612,20 +2612,48 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + if (ret == -EEXIST) { + if (oldmle) + __dlm_put_mle(oldmle); + + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + mlog(0, "another process is already migrating it\n"); + goto fail; + } + + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again. Because it has been unhasded from the map + * in the function dlm_add_migration_mle. + * Otherwise the memory will be leaked. It will not be found again from + * the hash map. + */ + if (oldmle) { + /* master is known, detach if not already detached */ + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + + /* + * If the type of the mle is BLOCK, it should be put once for + * release. Otherwise a memory leak may be caused because + * oldmle has been unhashed from the hash map and it will not + * be found any more. + */ + if (oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + /* get an extra reference on the mle. * otherwise the assert_master from the new * master will destroy this. */ dlm_get_mle_inuse(mle); + mle_added = 1; + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); - if (ret == -EEXIST) { - mlog(0, "another process is already migrating it\n"); - goto fail; - } - mle_added = 1; - /* * set the MIGRATING flag and flush asts * if we fail after this we need to re-dirty the lockres @@ -2642,12 +2670,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (ret != -EEXIST && oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (ret < 0) { if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); @@ -3182,16 +3204,24 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, if (ret < 0) kmem_cache_free(dlm_mle_cache, mle); + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again because it has been unhashed from the map + * in the dlm_add_migration_mle(). + * Otherwise the memory will be leaked. It will not be found again from + * the hash map. + */ + if (oldmle) { + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); - if (oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (res) dlm_lockres_put(res); leave: diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..908b05942282 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2268,6 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct dlm_lock *lock, *next; unsigned int freed = 0; + struct list_head *queue = NULL; + int i; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2280,31 +2282,19 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ /* TODO: check pending_asts, pending_basts here */ - list_for_each_entry_safe(lock, next, &res->granted, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->converting, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->blocked, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* + * Can't schedule DLM_UNLOCK_FREE_LOCK: do + * manually + */ + dlm_lock_put(lock); + freed++; + } } } diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e3ac5c11780..ea1039d158bc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1358,6 +1358,48 @@ static const struct file_operations proc_fault_inject_operations = { .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; + +static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err; + unsigned int n; + + err = kstrtoint_from_user(buf, count, 0, &n); + if (err) + return err; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + task->fail_nth = n; + put_task_struct(task); + + return count; +} + +static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char numbuf[PROC_NUMBUF]; + ssize_t len; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); + len = simple_read_from_buffer(buf, count, ppos, numbuf, len); + put_task_struct(task); + + return len; +} + +static const struct file_operations proc_fail_nth_operations = { + .read = proc_fail_nth_read, + .write = proc_fail_nth_write, +}; #endif @@ -2922,6 +2964,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_ELF_CORE REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), @@ -3314,6 +3357,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2cc7a8030275..e250910cffc8 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) struct proc_inode *ei; struct inode *inode; - ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 766f0c637ad1..3803b24ca220 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = { #endif #ifdef CONFIG_PID_NS &pidns_operations, + &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f08bd31c1081..f0c8b33d99b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -441,6 +441,7 @@ struct mem_size_stats { unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; + unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; unsigned long swap; @@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; - if (PageAnon(page)) + if (PageAnon(page)) { mss->anonymous += size; + if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + mss->lazyfree += size; + } mss->resident += size; /* Accumulate the size in pages that have been accessed. */ @@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" + "LazyFree: %8lu kB\n" "AnonHugePages: %8lu kB\n" "ShmemPmdMapped: %8lu kB\n" "Shared_Hugetlb: %8lu kB\n" @@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.private_dirty >> 10, mss.referenced >> 10, mss.anonymous >> 10, + mss.lazyfree >> 10, mss.anonymous_thp >> 10, mss.shmem_thp >> 10, mss.shared_hugetlb >> 10, @@ -900,7 +906,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); + pmd_t pmd = *pmdp; + + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index aca73dd73906..e3c558d1b78c 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi) } static struct item_operations errcatch_ops = { - errcatch_bytes_number, - errcatch_decrement_key, - errcatch_is_left_mergeable, - errcatch_print_item, - errcatch_check_item, - - errcatch_create_vi, - errcatch_check_left, - errcatch_check_right, - errcatch_part_size, - errcatch_unit_num, - errcatch_print_vi + .bytes_number = errcatch_bytes_number, + .decrement_key = errcatch_decrement_key, + .is_left_mergeable = errcatch_is_left_mergeable, + .print_item = errcatch_print_item, + .check_item = errcatch_check_item, + + .create_vi = errcatch_create_vi, + .check_left = errcatch_check_left, + .check_right = errcatch_check_right, + .part_size = errcatch_part_size, + .unit_num = errcatch_unit_num, + .print_vi = errcatch_print_vi }; #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 70a5b55e0870..780fc8986dab 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { - unsigned noio_flag = 0; + unsigned nofs_flag = 0; void *ptr; gfp_t lflags; @@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * __vmalloc() will allocate data pages and auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context * here. Hence we need to tell memory reclaim that we are in such a - * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) - noio_flag = memalloc_noio_save(); + if (flags & KM_NOFS) + nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) - memalloc_noio_restore(noio_flag); + if (flags & KM_NOFS) + memalloc_nofs_restore(nofs_flag); return ptr; } diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index f0fc84fcaac2..d6ea520162b2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if (flags & KM_NOFS) lflags &= ~__GFP_FS; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 92aa20d56553..5392674bf893 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2886,7 +2886,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_FSTRANS; + unsigned long new_pflags = PF_MEMALLOC_NOFS; /* * we are in a transaction context here, but may also be doing work diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index eef453adbc06..907afa2dcd2d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return 0; } @@ -252,7 +252,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. * Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -1016,7 +1016,7 @@ xfs_do_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. */ - if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) goto redirty; /* diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b6208728ba39..ca09061369cb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -443,17 +443,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; - unsigned noio_flag; + unsigned nofs_flag; /* * vm_map_ram() will allocate auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we are likely to be under * GFP_NOFS context here. Hence we need to tell memory reclaim - * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * that we are in such a context via PF_MEMALLOC_NOFS to prevent * memory reclaim re-entering the filesystem here and * potentially deadlocking. */ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -461,7 +461,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); if (!bp->b_addr) return -ENOMEM; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index a280e126491f..8b89a45303dc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,7 +134,7 @@ xfs_trans_reserve( bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); /* * Attempt to reserve the needed disk blocks by decrementing @@ -144,7 +144,7 @@ xfs_trans_reserve( if (blocks > 0) { error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; } tp->t_blk_res += blocks; @@ -221,7 +221,7 @@ undo_blocks: tp->t_blk_res = 0; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return error; } @@ -936,7 +936,7 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -966,7 +966,7 @@ out_unreserve: if (commit_lsn == -1 && !error) error = -EIO; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); @@ -1020,7 +1020,7 @@ xfs_trans_cancel( xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 96f1e88b767c..a3ba193f042e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -40,9 +40,9 @@ extern int nr_cpu_ids; #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. */ -#define nr_cpumask_bits nr_cpu_ids +#define nr_cpumask_bits ((unsigned int)nr_cpu_ids) #else -#define nr_cpumask_bits NR_CPUS +#define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h new file mode 100644 index 000000000000..eb71a70ea2b5 --- /dev/null +++ b/include/linux/crash_core.h @@ -0,0 +1,71 @@ +#ifndef LINUX_CRASH_CORE_H +#define LINUX_CRASH_CORE_H + +#include <linux/linkage.h> +#include <linux/elfcore.h> +#include <linux/elf.h> + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; + +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) + +extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern size_t vmcoreinfo_size; +extern size_t vmcoreinfo_max_size; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); + +#endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/elf.h b/include/linux/elf.h index 20fa8d8ae313..ba069e8f4f78 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -29,6 +29,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_note elf32_note #define elf_addr_t Elf32_Off #define Elf_Half Elf32_Half +#define Elf_Word Elf32_Word #else @@ -39,6 +40,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_note elf64_note #define elf_addr_t Elf64_Off #define Elf_Half Elf64_Half +#define Elf_Word Elf64_Word #endif diff --git a/include/linux/gfp.h b/include/linux/gfp.h index db373b9d3223..2b1a44f5bdb6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -40,6 +40,11 @@ struct vm_area_struct; #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u +#ifdef CONFIG_LOCKDEP +#define ___GFP_NOLOCKDEP 0x4000000u +#else +#define ___GFP_NOLOCKDEP 0 +#endif /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -179,8 +184,11 @@ struct vm_area_struct; #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) +/* Disable lockdep for GFP context tracking */ +#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) + /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT 25 +#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* @@ -202,8 +210,16 @@ struct vm_area_struct; * * GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. * * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. * * GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. It is typically used by hardware @@ -297,8 +313,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the - * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long - * and there are 16 of them to cover all possible combinations of + * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT + * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 9d84942ae2e5..71fd92d81b26 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -8,8 +8,7 @@ #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ /* used by in-kernel data structures */ -struct kern_ipc_perm -{ +struct kern_ipc_perm { spinlock_t lock; bool deleted; int id; @@ -18,9 +17,9 @@ struct kern_ipc_perm kgid_t gid; kuid_t cuid; kgid_t cgid; - umode_t mode; + umode_t mode; unsigned long seq; void *security; -}; +} ____cacheline_aligned_in_smp; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index dfaa1f4dcb0c..606b6bce3a5b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -491,6 +491,8 @@ struct jbd2_journal_handle unsigned long h_start_jiffies; unsigned int h_requested_credits; + + unsigned int saved_alloc_context; }; diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 624215cebee5..36872fbb815d 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -1,6 +1,7 @@ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H +#include <linux/cache.h> #include <linux/math64.h> #include <linux/kernel.h> #include <linux/types.h> @@ -63,19 +64,13 @@ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -/* some arch's have a small-data section that can be accessed register-relative - * but that can only take up to, say, 4-byte variables. jiffies being part of - * an 8-byte variable may not be correctly accessed unless we force the issue - */ -#define __jiffy_data __attribute__((section(".data"))) - /* * The 64-bit value is not atomic - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. */ -extern u64 __jiffy_data jiffies_64; -extern unsigned long volatile __jiffy_data jiffies; +extern u64 __cacheline_aligned_in_smp jiffies_64; +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d419d0e51fe5..c9481ebcbc0c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -14,17 +14,15 @@ #if !defined(__ASSEMBLY__) +#include <linux/crash_core.h> #include <asm/io.h> #include <uapi/linux/kexec.h> #ifdef CONFIG_KEXEC_CORE #include <linux/list.h> -#include <linux/linkage.h> #include <linux/compat.h> #include <linux/ioport.h> -#include <linux/elfcore.h> -#include <linux/elf.h> #include <linux/module.h> #include <asm/kexec.h> @@ -62,19 +60,15 @@ #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif -#define KEXEC_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define KEXEC_CORE_NOTE_NAME "CORE" -#define KEXEC_CORE_NOTE_NAME_BYTES ALIGN(sizeof(KEXEC_CORE_NOTE_NAME), 4) -#define KEXEC_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) +#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME + /* * The per-cpu notes area is a list of notes terminated by a "NULL" * note header. For kdump, the code in vmcore.c runs in the context * of the second kernel to combine them into one note. */ #ifndef KEXEC_NOTE_BYTES -#define KEXEC_NOTE_BYTES ( (KEXEC_NOTE_HEAD_BYTES * 2) + \ - KEXEC_CORE_NOTE_NAME_BYTES + \ - KEXEC_CORE_NOTE_DESC_BYTES ) +#define KEXEC_NOTE_BYTES CRASH_CORE_NOTE_BYTES #endif /* @@ -256,33 +250,6 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -303,31 +270,15 @@ extern int kexec_load_disabled; #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS) -#define VMCOREINFO_BYTES (4096) -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES \ - + VMCOREINFO_NOTE_NAME_BYTES) - /* Location of a reserved region to hold the crash kernel. */ extern struct resource crashk_res; extern struct resource crashk_low_res; -typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4]; extern note_buf_t __percpu *crash_notes; -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -extern size_t vmcoreinfo_size; -extern size_t vmcoreinfo_max_size; /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e1cfda4bee58..78b44a024eaa 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -61,7 +61,7 @@ static inline void set_page_stable_node(struct page *page, struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); #else /* !CONFIG_KSM */ @@ -94,10 +94,9 @@ static inline int page_referenced_ksm(struct page *page, return 0; } -static inline int rmap_walk_ksm(struct page *page, +static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { - return 0; } static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bb7250c45cb8..899949bbb2f9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,48 +35,43 @@ struct page; struct mm_struct; struct kmem_cache; -/* - * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c, - * These two lists should keep in accord with each other. - */ -enum mem_cgroup_stat_index { - /* - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. - */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ - MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ - MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ - MEM_CGROUP_STAT_NSTATS, - /* default hierarchy stats */ - MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS, +/* Cgroup-specific page state, on top of universal node page state */ +enum memcg_stat_item { + MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS, + MEMCG_RSS, + MEMCG_RSS_HUGE, + MEMCG_SWAP, + MEMCG_SOCK, + /* XXX: why are these zone and not node counters? */ + MEMCG_KERNEL_STACK_KB, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, - MEMCG_SOCK, MEMCG_NR_STAT, }; +/* Cgroup-specific events, on top of universal VM events */ +enum memcg_event_item { + MEMCG_LOW = NR_VM_EVENT_ITEMS, + MEMCG_HIGH, + MEMCG_MAX, + MEMCG_OOM, + MEMCG_NR_EVENTS, +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int priority; unsigned int generation; }; -enum mem_cgroup_events_index { - MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ - MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ - MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ - MEM_CGROUP_EVENTS_NSTATS, - /* default hierarchy events */ - MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS, - MEMCG_HIGH, - MEMCG_MAX, - MEMCG_OOM, - MEMCG_NR_EVENTS, +#ifdef CONFIG_MEMCG + +#define MEM_CGROUP_ID_SHIFT 16 +#define MEM_CGROUP_ID_MAX USHRT_MAX + +struct mem_cgroup_id { + int id; + atomic_t ref; }; /* @@ -92,16 +87,6 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -#ifdef CONFIG_MEMCG - -#define MEM_CGROUP_ID_SHIFT 16 -#define MEM_CGROUP_ID_MAX USHRT_MAX - -struct mem_cgroup_id { - int id; - atomic_t ref; -}; - struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; @@ -283,17 +268,10 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -/** - * mem_cgroup_events - count memory events against a cgroup - * @memcg: the memory cgroup - * @idx: the event index - * @nr: the number of events to account for - */ -static inline void mem_cgroup_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx, - unsigned int nr) +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum memcg_event_item event) { - this_cpu_add(memcg->stat->events[idx], nr); + this_cpu_inc(memcg->stat->events[event]); cgroup_file_notify(&memcg->events_file); } @@ -494,8 +472,42 @@ extern int do_swap_account; void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + long val = 0; + int cpu; + + for_each_possible_cpu(cpu) + val += per_cpu(memcg->stat->count[idx], cpu); + + if (val < 0) + val = 0; + + return val; +} + +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val) +{ + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->count[idx], val); +} + +static inline void inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, 1); +} + +static inline void dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, -1); +} + /** - * mem_cgroup_update_page_stat - update page state statistics + * mod_memcg_page_state - update page state statistics * @page: the page * @idx: page state item to account * @val: number of pages (positive or negative) @@ -506,28 +518,28 @@ void unlock_page_memcg(struct page *page); * * lock_page(page) or lock_page_memcg(page) * if (TestClearPageState(page)) - * mem_cgroup_update_page_stat(page, state, -1); + * mod_memcg_page_state(page, state, -1); * unlock_page(page) or unlock_page_memcg(page) + * + * Kernel pages are an exception to this, since they'll never move. */ -static inline void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, int val) +static inline void mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, int val) { - VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page))); - if (page->mem_cgroup) - this_cpu_add(page->mem_cgroup->stat->count[idx], val); + mod_memcg_state(page->mem_cgroup, idx, val); } -static inline void mem_cgroup_inc_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) +static inline void inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { - mem_cgroup_update_page_stat(page, idx, 1); + mod_memcg_page_state(page, idx, 1); } -static inline void mem_cgroup_dec_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) +static inline void dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { - mem_cgroup_update_page_stat(page, idx, -1); + mod_memcg_page_state(page, idx, -1); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -544,20 +556,8 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) - goto out; - - switch (idx) { - case PGFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); - break; - case PGMAJFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); - break; - default: - BUG(); - } -out: + if (likely(memcg)) + this_cpu_inc(memcg->stat->events[idx]); rcu_read_unlock(); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -576,9 +576,8 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline void mem_cgroup_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx, - unsigned int nr) +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum memcg_event_item event) { } @@ -740,19 +739,41 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } -static inline void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, - int nr) +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + return 0; +} + +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, + int nr) +{ +} + +static inline void inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ +} + +static inline void dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ +} + +static inline void mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, + int nr) { } -static inline void mem_cgroup_inc_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) +static inline void inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { } -static inline void mem_cgroup_dec_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) +static inline void dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { } @@ -872,7 +893,7 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) * @val: number of pages (positive or negative) */ static inline void memcg_kmem_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, int val) + enum memcg_stat_item idx, int val) { if (memcg_kmem_enabled() && page->mem_cgroup) this_cpu_add(page->mem_cgroup->stat->count[idx], val); @@ -901,7 +922,7 @@ static inline void memcg_put_cache_ids(void) } static inline void memcg_kmem_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, int val) + enum memcg_stat_item idx, int val) { } #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa76b516fa47..48e24844b3c5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -33,8 +33,9 @@ extern char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); -extern int migrate_page(struct address_space *, - struct page *, struct page *, enum migrate_mode); +extern int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); diff --git a/include/linux/mm.h b/include/linux/mm.h index a835edd2db34..cbdbe0be3127 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2501,7 +2501,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; -extern struct page_ext_operations page_poisoning_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 51891fb0d3ce..c91b3bcd158f 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -394,18 +394,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pud; \ }) -#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ - pmd_t ___pmd; \ - \ - ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ - mmu_notifier_invalidate_range(__mm, ___haddr, \ - ___haddr + HPAGE_PMD_SIZE); \ - \ - ___pmd; \ -}) - /* * set_pte_at_notify() sets the pte _after_ running the notifier. * This is safe to start by updating the secondary MMUs, because the primary MMU @@ -489,7 +477,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush -#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8e02b3750fe0..ebaccd4e7d8c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,7 +35,7 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 -enum { +enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, @@ -74,6 +74,11 @@ extern char * const migratetype_names[MIGRATE_TYPES]; # define is_migrate_cma_page(_page) false #endif +static inline bool is_migrate_movable(int mt) +{ + return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; +} + #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) @@ -149,7 +154,6 @@ enum node_stat_item { NR_UNEVICTABLE, /* " " " " " */ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ - NR_PAGES_SCANNED, /* pages scanned since last reclaim */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, @@ -226,6 +230,8 @@ struct lruvec { struct zone_reclaim_stat reclaim_stat; /* Evictions & activations on the inactive file list */ atomic_long_t inactive_age; + /* Refaults at the time of last reclaim cycle */ + unsigned long refaults; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -630,6 +636,8 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_classzone_idx; + int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 047d64706f2a..d4cd2014fa6f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,10 +33,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); -int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, - int migratetype); + int migratetype, int *num_movable); /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. diff --git a/include/linux/printk.h b/include/linux/printk.h index 571257e0f53d..e10f27468322 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -198,7 +198,7 @@ extern void wake_up_klogd(void); char *log_buf_addr_get(void); u32 log_buf_len_get(void); -void log_buf_kexec_setup(void); +void log_buf_vmcoreinfo_setup(void); void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); @@ -246,7 +246,7 @@ static inline u32 log_buf_len_get(void) return 0; } -static inline void log_buf_kexec_setup(void) +static inline void log_buf_vmcoreinfo_setup(void) { } diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 12cb8bd81d2d..58ab28d81fc2 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -14,6 +14,7 @@ struct inode; struct proc_ns_operations { const char *name; + const char *real_ns_name; int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); @@ -26,6 +27,7 @@ extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; diff --git a/include/linux/reboot.h b/include/linux/reboot.h index a7ff409f386d..ecbf7b56b9db 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -5,6 +5,8 @@ #include <linux/notifier.h> #include <uapi/linux/reboot.h> +struct device; + #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN #define SYS_HALT 0x0002 /* Notify of system halt */ @@ -38,6 +40,8 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); +extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); + extern int register_restart_handler(struct notifier_block *); extern int unregister_restart_handler(struct notifier_block *); extern void do_kernel_restart(char *cmd); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 8c89e902df3e..43ef2c30cb0f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,19 +83,17 @@ struct anon_vma_chain { }; enum ttu_flags { - TTU_UNMAP = 1, /* unmap mode */ - TTU_MIGRATION = 2, /* migration mode */ - TTU_MUNLOCK = 4, /* munlock mode */ - TTU_LZFREE = 8, /* lazy free mode */ - TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */ - - TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ - TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ - TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ - TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible + TTU_MIGRATION = 0x1, /* migration mode */ + TTU_MUNLOCK = 0x2, /* munlock mode */ + + TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ + TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ + TTU_IGNORE_ACCESS = 0x10, /* don't age */ + TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ + TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ - TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock: + TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock: * caller holds it */ }; @@ -193,9 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) - -int try_to_unmap(struct page *, enum ttu_flags flags); +bool try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) @@ -239,7 +235,7 @@ int page_mkclean(struct page *); * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ -int try_to_munlock(struct page *); +void try_to_munlock(struct page *); void remove_migration_ptes(struct page *old, struct page *new, bool locked); @@ -261,15 +257,19 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; - int (*rmap_one)(struct page *page, struct vm_area_struct *vma, + /* + * Return false if page table scanning in rmap_walk should be stopped. + * Otherwise, return true. + */ + bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -int rmap_walk(struct page *page, struct rmap_walk_control *rwc); -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ @@ -285,7 +285,7 @@ static inline int page_referenced(struct page *page, int is_locked, return 0; } -#define try_to_unmap(page, refs) SWAP_FAIL +#define try_to_unmap(page, refs) false static inline int page_mkclean(struct page *page) { @@ -295,13 +295,4 @@ static inline int page_mkclean(struct page *page) #endif /* CONFIG_MMU */ -/* - * Return values of try_to_unmap - */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 -#define SWAP_MLOCK 3 -#define SWAP_LZFREE 4 - #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h index ea05f6c51413..84766bcdd01f 100644 --- a/include/linux/rodata_test.h +++ b/include/linux/rodata_test.h @@ -14,7 +14,6 @@ #define _RODATA_TEST_H #ifdef CONFIG_DEBUG_RODATA_TEST -extern const int rodata_test_data; void rodata_test(void); #else static inline void rodata_test(void) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f3698395628..f6c828ff13d5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -947,6 +947,7 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; + unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call @@ -1224,9 +1225,9 @@ extern struct pid *cad_pid; #define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ -#define PF_FSTRANS 0x00020000 /* Inside a filesystem transaction */ -#define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 830953ebb391..2b24a6974847 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk) return ret; } -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags - * __GFP_FS is also cleared as it implies __GFP_IO. +/* + * Applies per-task gfp context to the given allocation flags. + * PF_MEMALLOC_NOIO implies GFP_NOIO + * PF_MEMALLOC_NOFS implies GFP_NOFS */ -static inline gfp_t memalloc_noio_flags(gfp_t flags) +static inline gfp_t current_gfp_context(gfp_t flags) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precendence + */ if (unlikely(current->flags & PF_MEMALLOC_NOIO)) flags &= ~(__GFP_IO | __GFP_FS); + else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) + flags &= ~__GFP_FS; return flags; } @@ -171,4 +179,28 @@ static inline void memalloc_noio_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } +static inline unsigned int memalloc_nofs_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; + current->flags |= PF_MEMALLOC_NOFS; + return flags; +} + +static inline void memalloc_nofs_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; +} + +static inline unsigned int memalloc_noreclaim_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC; + current->flags |= PF_MEMALLOC; + return flags; +} + +static inline void memalloc_noreclaim_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/include/linux/sem.h b/include/linux/sem.h index 4fc222f8755d..9edec926e9d9 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -10,8 +10,7 @@ struct task_struct; /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { - struct kern_ipc_perm ____cacheline_aligned_in_smp - sem_perm; /* permissions .. see ipc.h */ + struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct list_head pending_alter; /* pending operations */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 45e91dd6716d..ba5882419a7d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); -extern void deactivate_page(struct page *page); +extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); @@ -411,9 +411,6 @@ struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); -extern int get_swap_slots(int n, swp_entry_t *slots); -extern void swapcache_free_batch(swp_entry_t *entries, int n); - #else /* CONFIG_SWAP */ #define swap_address_space(entry) (NULL) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a80b7b59cf33..d84ae90ccd5c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), FOR_ALL_ZONES(ALLOCSTALL), FOR_ALL_ZONES(PGSCAN_SKIP), - PGFREE, PGACTIVATE, PGDEACTIVATE, + PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, PGREFILL, diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index c566ddc87f73..08bb3ed18dcc 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -150,6 +150,136 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); +DECLARE_EVENT_CLASS(dax_pte_fault_class, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), + TP_ARGS(inode, vmf, result), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(pgoff_t, pgoff) + __field(dev_t, dev) + __field(unsigned int, flags) + __field(int, result) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->flags = vmf->flags; + __entry->pgoff = vmf->pgoff; + __entry->result = result; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx pgoff %#lx %s", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), + __entry->address, + __entry->pgoff, + __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE) + ) +) + +#define DEFINE_PTE_FAULT_EVENT(name) \ +DEFINE_EVENT(dax_pte_fault_class, name, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), \ + TP_ARGS(inode, vmf, result)) + +DEFINE_PTE_FAULT_EVENT(dax_pte_fault); +DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); +DEFINE_PTE_FAULT_EVENT(dax_load_hole); + +TRACE_EVENT(dax_insert_mapping, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), + TP_ARGS(inode, vmf, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(void *, radix_entry) + __field(dev_t, dev) + __field(int, write) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->write = vmf->flags & FAULT_FLAG_WRITE; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->write ? "write" : "read", + __entry->address, + (unsigned long)__entry->radix_entry + ) +) + +DECLARE_EVENT_CLASS(dax_writeback_range_class, + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index), + TP_ARGS(inode, start_index, end_index), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, start_index) + __field(pgoff_t, end_index) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start_index = start_index; + __entry->end_index = end_index; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx-%#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->start_index, + __entry->end_index + ) +) + +#define DEFINE_WRITEBACK_RANGE_EVENT(name) \ +DEFINE_EVENT(dax_writeback_range_class, name, \ + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index),\ + TP_ARGS(inode, start_index, end_index)) + +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range); +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range_done); + +TRACE_EVENT(dax_writeback_one, + TP_PROTO(struct inode *inode, pgoff_t pgoff, pgoff_t pglen), + TP_ARGS(inode, pgoff, pglen), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, pgoff) + __field(pgoff_t, pglen) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgoff = pgoff; + __entry->pglen = pglen; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx pglen %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->pglen + ) +) + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index e13d48058b8d..c6602bdd2a10 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -26,6 +26,10 @@ #include <linux/types.h> #include <linux/compiler.h> +#ifndef __KERNEL__ +#include <stddef.h> /* For size_t. */ +#endif + #define CTL_MAXNAME 10 /* how many path components do we allow in a call to sysctl? In other words, what is the largest acceptable value for the nlen diff --git a/init/do_mounts.h b/init/do_mounts.h index 067af1d9e8b6..282d65bfd674 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -19,29 +19,15 @@ static inline int create_dev(char *name, dev_t dev) return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); } -#if BITS_PER_LONG == 32 static inline u32 bstat(char *name) { - struct stat64 stat; - if (sys_stat64(name, &stat) != 0) + struct kstat stat; + if (vfs_stat(name, &stat) != 0) return 0; - if (!S_ISBLK(stat.st_mode)) + if (!S_ISBLK(stat.mode)) return 0; - if (stat.st_rdev != (u32)stat.st_rdev) - return 0; - return stat.st_rdev; -} -#else -static inline u32 bstat(char *name) -{ - struct stat stat; - if (sys_newstat(name, &stat) != 0) - return 0; - if (!S_ISBLK(stat.st_mode)) - return 0; - return stat.st_rdev; + return stat.rdev; } -#endif #ifdef CONFIG_BLK_DEV_RAM diff --git a/init/initramfs.c b/init/initramfs.c index 981f286c1d16..a5b686696535 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -312,10 +312,10 @@ static int __init maybe_link(void) static void __init clean_path(char *path, umode_t fmode) { - struct stat st; + struct kstat st; - if (!sys_newlstat(path, &st) && (st.st_mode ^ fmode) & S_IFMT) { - if (S_ISDIR(st.st_mode)) + if (!vfs_lstat(path, &st) && (st.mode ^ fmode) & S_IFMT) { + if (S_ISDIR(st.mode)) sys_rmdir(path); else sys_unlink(path); @@ -581,13 +581,13 @@ static void __init clean_rootfs(void) num = sys_getdents64(fd, dirp, BUF_SIZE); while (num > 0) { while (num > 0) { - struct stat st; + struct kstat st; int ret; - ret = sys_newlstat(dirp->d_name, &st); + ret = vfs_lstat(dirp->d_name, &st); WARN_ON_ONCE(ret); if (!ret) { - if (S_ISDIR(st.st_mode)) + if (S_ISDIR(st.mode)) sys_rmdir(dirp->d_name); else sys_unlink(dirp->d_name); @@ -611,7 +611,7 @@ static int __init populate_rootfs(void) char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic("%s", err); /* Failed to decompress INTERNAL initramfs */ - if (initrd_start) { + if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { #ifdef CONFIG_BLK_DEV_RAM int fd; printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); diff --git a/ipc/shm.c b/ipc/shm.c index 481d2a9c298a..34c4344e8d4b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1095,11 +1095,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long shmlba) { struct shmid_kernel *shp; - unsigned long addr; + unsigned long addr = (unsigned long)shmaddr; unsigned long size; struct file *file; int err; - unsigned long flags; + unsigned long flags = MAP_SHARED; unsigned long prot; int acc_mode; struct ipc_namespace *ns; @@ -1111,7 +1111,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, err = -EINVAL; if (shmid < 0) goto out; - else if ((addr = (ulong)shmaddr)) { + + if (addr) { if (addr & (shmlba - 1)) { /* * Round down to the nearest multiple of shmlba. @@ -1126,13 +1127,10 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, #endif goto out; } - flags = MAP_SHARED | MAP_FIXED; - } else { - if ((shmflg & SHM_REMAP)) - goto out; - flags = MAP_SHARED; - } + flags |= MAP_FIXED; + } else if ((shmflg & SHM_REMAP)) + goto out; if (shmflg & SHM_RDONLY) { prot = PROT_READ; diff --git a/kernel/Makefile b/kernel/Makefile index b302b4731d16..72aa080f91f0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c new file mode 100644 index 000000000000..fcbd568f1e95 --- /dev/null +++ b/kernel/crash_core.c @@ -0,0 +1,439 @@ +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/crash_core.h> +#include <linux/utsname.h> +#include <linux/vmalloc.h> + +#include <asm/page.h> +#include <asm/sections.h> + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *name, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); + + return buf; +} + +void final_note(Elf_Word *buf) +{ + memset(buf, 0, sizeof(struct elf_note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmap_area_list); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_dtor); + VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#endif + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/kernel/fork.c b/kernel/fork.c index a35ffd667581..1fbcd86772b5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -179,6 +179,24 @@ void __weak arch_release_thread_stack(unsigned long *stack) */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); + +static int free_vm_stack_cache(unsigned int cpu) +{ + struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + int i; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *vm_stack = cached_vm_stacks[i]; + + if (!vm_stack) + continue; + + vfree(vm_stack->addr); + cached_vm_stacks[i] = NULL; + } + + return 0; +} #endif static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) @@ -467,6 +485,11 @@ void __init fork_init(void) for (i = 0; i < UCOUNT_COUNTS; i++) { init_user_ns.ucount_max[i] = max_threads/2; } + +#ifdef CONFIG_VMAP_STACK + cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", + NULL, free_vm_stack_cache); +#endif } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -556,6 +579,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f0f8e2a8496f..751593ed7c0b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; +static bool hung_task_show_lock; static struct task_struct *watchdog_task; @@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_all_locks(); + hung_task_show_lock = true; } touch_nmi_watchdog(); if (sysctl_hung_task_panic) { + if (hung_task_show_lock) + debug_show_all_locks(); trigger_all_cpu_backtrace(); panic("hung_task: blocked tasks"); } @@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; + hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { if (!max_count--) @@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); + if (hung_task_show_lock) + debug_show_all_locks(); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/kcov.c b/kernel/kcov.c index 85e5546cd791..cd771993f96f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. - * The checks for whether we are in an interrupt are open-coded, because - * 1. We can't use in_interrupt() here, since it also returns true - * when we are inside local_bh_disable() section. - * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), - * since that leads to slower generated code (three separate tests, - * one for each of the flags). */ - if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET - | NMI_MASK))) + if (!t || !in_task()) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bfe62d5b3872..ae1a3ba24df5 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex); /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; @@ -996,34 +990,6 @@ unlock: return ret; } -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - void crash_save_cpu(struct pt_regs *regs, int cpu) { struct elf_prstatus prstatus; @@ -1085,403 +1051,6 @@ subsys_initcall(crash_notes_memory_init); /* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - -/* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0999679d6f26..23cd70651238 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_CRASH_CORE + static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_CORE */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = { &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, +#endif +#ifdef CONFIG_CRASH_CORE &vmcoreinfo_attr.attr, #endif #ifndef CONFIG_TINY_RCU diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 23ca53e8d3fb..c0e31bfee25c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/sched/task.h> +#include <linux/sched/mm.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/proc_fs.h> @@ -2876,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; + gfp_mask = current_gfp_context(gfp_mask); + /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; @@ -2885,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) + if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) return; /* @@ -2894,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; + /* Disable lockdep if explicitly requested */ + if (gfp_mask & __GFP_NOLOCKDEP) + return; + mark_held_locks(curr, RECLAIM_FS); } @@ -3947,7 +3954,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = gfp_mask; + current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); } EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index de461aa0bf9a..4dd02ff0b0bd 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -374,6 +374,20 @@ static struct ns_common *pidns_get(struct task_struct *task) return ns ? &ns->ns : NULL; } +static struct ns_common *pidns_for_children_get(struct task_struct *task) +{ + struct pid_namespace *ns = NULL; + + task_lock(task); + if (task->nsproxy) { + ns = task->nsproxy->pid_ns_for_children; + get_pid_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); @@ -443,6 +457,17 @@ const struct proc_ns_operations pidns_operations = { .get_parent = pidns_get_parent, }; +const struct proc_ns_operations pidns_for_children_operations = { + .name = "pid_for_children", + .real_ns_name = "pid", + .type = CLONE_NEWPID, + .get = pidns_for_children_get, + .put = pidns_put, + .install = pidns_install, + .owner = pidns_owner, + .get_parent = pidns_get_parent, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9d2e08213c80..480f13acea5c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,7 @@ #include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/syscalls.h> -#include <linux/kexec.h> +#include <linux/crash_core.h> #include <linux/kdb.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> @@ -1002,7 +1002,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_CORE /* * This appends the listed symbols to /proc/vmcore * @@ -1011,7 +1011,7 @@ const struct file_operations kmsg_fops = { * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ -void log_buf_kexec_setup(void) +void log_buf_vmcoreinfo_setup(void) { VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe94..e4ced883d8de 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c8714fcb53c..335973971a96 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2574,7 +2574,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - if (*lvalp > LONG_MAX / HZ) + if (*lvalp > INT_MAX / HZ) return 1; *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); } else { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8a5e44236f78..4559e914452b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -30,6 +30,7 @@ #include <linux/pid_namespace.h> #include <net/genetlink.h> #include <linux/atomic.h> +#include <linux/sched/cputime.h> /* * Maximum length of a cpumask that can be specified in @@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; + u64 delta, utime, stime; + u64 start_time; /* * Add additional stats from live tasks except zombie thread group @@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) memset(stats, 0, sizeof(*stats)); tsk = first; + start_time = ktime_get_ns(); do { if (tsk->exit_state) continue; @@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) */ delayacct_add_tsk(stats, tsk); + /* calculate task elapsed time in nsec */ + delta = start_time - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime += delta; + + task_cputime(tsk, &utime, &stime); + stats->ac_utime += div_u64(utime, NSEC_PER_USEC); + stats->ac_stime += div_u64(stime, NSEC_PER_USEC); + stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; } while_each_thread(first, tsk); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 22c75bcfb498..77fadface4f9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1719,19 +1719,21 @@ config LKDTM Documentation/fault-injection/provoke-crashes.txt config TEST_LIST_SORT - bool "Linked list sorting test" - depends on DEBUG_KERNEL + tristate "Linked list sorting test" + depends on DEBUG_KERNEL || m help Enable this to turn on 'list_sort()' function test. This test is - executed only once during system boot, so affects only boot time. + executed only once during system boot (so affects only boot time), + or at module load time. If unsure, say N. config TEST_SORT - bool "Array-based sort test" - depends on DEBUG_KERNEL + tristate "Array-based sort test" + depends on DEBUG_KERNEL || m help - This option enables the self-test function of 'sort()' at boot. + This option enables the self-test function of 'sort()' at boot, + or at module load time. If unsure, say N. diff --git a/lib/Makefile b/lib/Makefile index a155c73e3437..0166fbc0fa81 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o +obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_SORT) += test_sort.o diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 6a823a53e357..09ac73c177fd 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -56,7 +56,7 @@ static void fail_dump(struct fault_attr *attr) static bool fail_task(struct fault_attr *attr, struct task_struct *task) { - return !in_interrupt() && task->make_it_fail; + return in_task() && task->make_it_fail; } #define MAX_STACK_TRACE_DEPTH 32 @@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + if (in_task() && current->fail_nth) { + if (--current->fail_nth == 0) + goto fail; + return false; + } + /* No need to check any other properties if the probability is 0 */ if (attr->probability == 0) return false; @@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (!fail_stacktrace(attr)) return false; +fail: fail_dump(attr); if (atomic_read(&attr->times) != -1) diff --git a/lib/list_sort.c b/lib/list_sort.c index 3fe401067e20..9e9acc37652f 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,6 +1,3 @@ - -#define pr_fmt(fmt) "list_sort_test: " fmt - #include <linux/kernel.h> #include <linux/bug.h> #include <linux/compiler.h> @@ -145,149 +142,3 @@ void list_sort(void *priv, struct list_head *head, merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); } EXPORT_SYMBOL(list_sort); - -#ifdef CONFIG_TEST_LIST_SORT - -#include <linux/slab.h> -#include <linux/random.h> - -/* - * The pattern of set bits in the list length determines which cases - * are hit in list_sort(). - */ -#define TEST_LIST_LEN (512+128+2) /* not including head */ - -#define TEST_POISON1 0xDEADBEEF -#define TEST_POISON2 0xA324354C - -struct debug_el { - unsigned int poison1; - struct list_head list; - unsigned int poison2; - int value; - unsigned serial; -}; - -/* Array, containing pointers to all elements in the test list */ -static struct debug_el **elts __initdata; - -static int __init check(struct debug_el *ela, struct debug_el *elb) -{ - if (ela->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", ela->serial); - return -EINVAL; - } - if (elb->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", elb->serial); - return -EINVAL; - } - if (elts[ela->serial] != ela || elts[elb->serial] != elb) { - pr_err("error: phantom element\n"); - return -EINVAL; - } - if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - ela->poison1, ela->poison2); - return -EINVAL; - } - if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - elb->poison1, elb->poison2); - return -EINVAL; - } - return 0; -} - -static int __init cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct debug_el *ela, *elb; - - ela = container_of(a, struct debug_el, list); - elb = container_of(b, struct debug_el, list); - - check(ela, elb); - return ela->value - elb->value; -} - -static int __init list_sort_test(void) -{ - int i, count = 1, err = -ENOMEM; - struct debug_el *el; - struct list_head *cur; - LIST_HEAD(head); - - pr_debug("start testing list_sort()\n"); - - elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); - if (!elts) { - pr_err("error: cannot allocate memory\n"); - return err; - } - - for (i = 0; i < TEST_LIST_LEN; i++) { - el = kmalloc(sizeof(*el), GFP_KERNEL); - if (!el) { - pr_err("error: cannot allocate memory\n"); - goto exit; - } - /* force some equivalencies */ - el->value = prandom_u32() % (TEST_LIST_LEN / 3); - el->serial = i; - el->poison1 = TEST_POISON1; - el->poison2 = TEST_POISON2; - elts[i] = el; - list_add_tail(&el->list, &head); - } - - list_sort(NULL, &head, cmp); - - err = -EINVAL; - for (cur = head.next; cur->next != &head; cur = cur->next) { - struct debug_el *el1; - int cmp_result; - - if (cur->next->prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - cmp_result = cmp(NULL, cur, cur->next); - if (cmp_result > 0) { - pr_err("error: list is not sorted\n"); - goto exit; - } - - el = container_of(cur, struct debug_el, list); - el1 = container_of(cur->next, struct debug_el, list); - if (cmp_result == 0 && el->serial >= el1->serial) { - pr_err("error: order of equivalent elements not " - "preserved\n"); - goto exit; - } - - if (check(el, el1)) { - pr_err("error: element check failed\n"); - goto exit; - } - count++; - } - if (head.prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - - if (count != TEST_LIST_LEN) { - pr_err("error: bad list length %d", count); - goto exit; - } - - err = 0; -exit: - for (i = 0; i < TEST_LIST_LEN; i++) - kfree(elts[i]); - kfree(elts); - return err; -} -late_initcall(list_sort_test); -#endif /* CONFIG_TEST_LIST_SORT */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 691a9ad48497..898e87998417 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu) void __init radix_tree_init(void) { int ret; + + BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c new file mode 100644 index 000000000000..28e817387b04 --- /dev/null +++ b/lib/test_list_sort.c @@ -0,0 +1,150 @@ +#define pr_fmt(fmt) "list_sort_test: " fmt + +#include <linux/kernel.h> +#include <linux/list_sort.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/random.h> + +/* + * The pattern of set bits in the list length determines which cases + * are hit in list_sort(). + */ +#define TEST_LIST_LEN (512+128+2) /* not including head */ + +#define TEST_POISON1 0xDEADBEEF +#define TEST_POISON2 0xA324354C + +struct debug_el { + unsigned int poison1; + struct list_head list; + unsigned int poison2; + int value; + unsigned serial; +}; + +/* Array, containing pointers to all elements in the test list */ +static struct debug_el **elts __initdata; + +static int __init check(struct debug_el *ela, struct debug_el *elb) +{ + if (ela->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", ela->serial); + return -EINVAL; + } + if (elb->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", elb->serial); + return -EINVAL; + } + if (elts[ela->serial] != ela || elts[elb->serial] != elb) { + pr_err("error: phantom element\n"); + return -EINVAL; + } + if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); + return -EINVAL; + } + if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); + return -EINVAL; + } + return 0; +} + +static int __init cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct debug_el *ela, *elb; + + ela = container_of(a, struct debug_el, list); + elb = container_of(b, struct debug_el, list); + + check(ela, elb); + return ela->value - elb->value; +} + +static int __init list_sort_test(void) +{ + int i, count = 1, err = -ENOMEM; + struct debug_el *el; + struct list_head *cur; + LIST_HEAD(head); + + pr_debug("start testing list_sort()\n"); + + elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); + if (!elts) { + pr_err("error: cannot allocate memory\n"); + return err; + } + + for (i = 0; i < TEST_LIST_LEN; i++) { + el = kmalloc(sizeof(*el), GFP_KERNEL); + if (!el) { + pr_err("error: cannot allocate memory\n"); + goto exit; + } + /* force some equivalencies */ + el->value = prandom_u32() % (TEST_LIST_LEN / 3); + el->serial = i; + el->poison1 = TEST_POISON1; + el->poison2 = TEST_POISON2; + elts[i] = el; + list_add_tail(&el->list, &head); + } + + list_sort(NULL, &head, cmp); + + err = -EINVAL; + for (cur = head.next; cur->next != &head; cur = cur->next) { + struct debug_el *el1; + int cmp_result; + + if (cur->next->prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + cmp_result = cmp(NULL, cur, cur->next); + if (cmp_result > 0) { + pr_err("error: list is not sorted\n"); + goto exit; + } + + el = container_of(cur, struct debug_el, list); + el1 = container_of(cur->next, struct debug_el, list); + if (cmp_result == 0 && el->serial >= el1->serial) { + pr_err("error: order of equivalent elements not " + "preserved\n"); + goto exit; + } + + if (check(el, el1)) { + pr_err("error: element check failed\n"); + goto exit; + } + count++; + } + if (head.prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + + if (count != TEST_LIST_LEN) { + pr_err("error: bad list length %d", count); + goto exit; + } + + err = 0; +exit: + for (i = 0; i < TEST_LIST_LEN; i++) + kfree(elts[i]); + kfree(elts); + return err; +} +module_init(list_sort_test); +MODULE_LICENSE("GPL"); diff --git a/lib/test_sort.c b/lib/test_sort.c index 4db3911db50a..d389c1cc2f6c 100644 --- a/lib/test_sort.c +++ b/lib/test_sort.c @@ -1,11 +1,8 @@ #include <linux/sort.h> #include <linux/slab.h> -#include <linux/init.h> +#include <linux/module.h> -/* - * A simple boot-time regression test - * License: GPL - */ +/* a simple boot-time regression test */ #define TEST_LEN 1000 @@ -41,4 +38,6 @@ exit: kfree(a); return err; } -subsys_initcall(test_sort_init); + +module_init(test_sort_init); +MODULE_LICENSE("GPL"); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 176641cc549d..2d41de3f98a1 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1477,6 +1477,9 @@ int kptr_restrict __read_mostly; * by an extra set of alphanumeric characters that are extended format * specifiers. * + * Please update scripts/checkpatch.pl when adding/removing conversion + * characters. (Search for "check for vsprintf extension"). + * * Right now we handle: * * - 'F' For symbolic function descriptor pointers with offset diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c index 3fe6ce5b53e5..028943052926 100644 --- a/lib/zlib_inflate/inftrees.c +++ b/lib/zlib_inflate/inftrees.c @@ -109,7 +109,7 @@ int zlib_inflate_table(codetype type, unsigned short *lens, unsigned codes, *bits = 1; return 0; /* no symbols, but wait for decoding to report error */ } - for (min = 1; min <= MAXBITS; min++) + for (min = 1; min < MAXBITS; min++) if (count[min] != 0) break; if (root < min) root = min; diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 79d0fd13b5b3..5b0adf1435de 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_EXTENSION select PAGE_POISONING_NO_SANITY if HIBERNATION ---help--- Fill the pages with poison patterns after free_pages() and verify diff --git a/mm/compaction.c b/mm/compaction.c index 81e1eaa2a2cf..613c59e928cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,11 +89,6 @@ static void map_pages(struct list_head *list) list_splice(&tmp_list, list); } -static inline bool migrate_async_suitable(int migratetype) -{ - return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; -} - #ifdef CONFIG_COMPACTION int PageMovable(struct page *page) @@ -988,13 +983,26 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct compact_control *cc, +static bool suitable_migration_source(struct compact_control *cc, struct page *page) { - if (cc->ignore_block_suitable) + int block_mt; + + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; + block_mt = get_pageblock_migratetype(page); + + if (cc->migratetype == MIGRATE_MOVABLE) + return is_migrate_movable(block_mt); + else + return block_mt == cc->migratetype; +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct compact_control *cc, + struct page *page) +{ /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1006,8 +1014,11 @@ static bool suitable_migration_target(struct compact_control *cc, return false; } + if (cc->ignore_block_suitable) + return true; + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(get_pageblock_migratetype(page))) + if (is_migrate_movable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -1242,8 +1253,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * Async compaction is optimistic to see if the minimum amount * of work satisfies the allocation. */ - if (cc->mode == MIGRATE_ASYNC && - !migrate_async_suitable(get_pageblock_migratetype(page))) + if (!suitable_migration_source(cc, page)) continue; /* Perform the isolation */ @@ -1276,11 +1286,11 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result __compact_finished(struct zone *zone, + struct compact_control *cc) { unsigned int order; - unsigned long watermark; + const int migratetype = cc->migratetype; if (cc->contended || fatal_signal_pending(current)) return COMPACT_CONTENDED; @@ -1308,12 +1318,16 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; - /* Compaction run is not finished if the watermark is not met */ - watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; - - if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, - cc->alloc_flags)) - return COMPACT_CONTINUE; + if (cc->finishing_block) { + /* + * We have finished the pageblock, but better check again that + * we really succeeded. + */ + if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + cc->finishing_block = false; + else + return COMPACT_CONTINUE; + } /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { @@ -1335,20 +1349,40 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) - return COMPACT_SUCCESS; + true, &can_steal) != -1) { + + /* movable pages are OK in any pageblock */ + if (migratetype == MIGRATE_MOVABLE) + return COMPACT_SUCCESS; + + /* + * We are stealing for a non-movable allocation. Make + * sure we finish compacting the current pageblock + * first so it is as free as possible and we won't + * have to steal another one soon. This only applies + * to sync compaction, as async compaction operates + * on pageblocks of the same migratetype. + */ + if (cc->mode == MIGRATE_ASYNC || + IS_ALIGNED(cc->migrate_pfn, + pageblock_nr_pages)) { + return COMPACT_SUCCESS; + } + + cc->finishing_block = true; + return COMPACT_CONTINUE; + } } return COMPACT_NO_SUITABLE_PAGE; } static enum compact_result compact_finished(struct zone *zone, - struct compact_control *cc, - const int migratetype) + struct compact_control *cc) { int ret; - ret = __compact_finished(zone, cc, migratetype); + ret = __compact_finished(zone, cc); trace_mm_compaction_finished(zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; @@ -1481,9 +1515,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); - const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ @@ -1533,8 +1567,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro migrate_prep_local(); - while ((ret = compact_finished(zone, cc, migratetype)) == - COMPACT_CONTINUE) { + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { int err; switch (isolate_migratepages(zone, cc)) { diff --git a/mm/filemap.c b/mm/filemap.c index c5808b7a5fb1..d04e475bcf9e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2204,12 +2204,12 @@ int filemap_fault(struct vm_fault *vmf) struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; + pgoff_t max_off; struct page *page; - loff_t size; int ret = 0; - size = round_up(i_size_read(inode), PAGE_SIZE); - if (offset >= size >> PAGE_SHIFT) + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) return VM_FAULT_SIGBUS; /* @@ -2258,8 +2258,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - size = round_up(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= size >> PAGE_SHIFT)) { + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) { unlock_page(page); put_page(page); return VM_FAULT_SIGBUS; @@ -2325,7 +2325,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - loff_t size; + unsigned long max_idx; struct page *head, *page; rcu_read_lock(); @@ -2371,8 +2371,8 @@ repeat: if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = round_up(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= size >> PAGE_SHIFT) + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (page->index >= max_idx) goto unlock; if (file->f_ra.mmap_miss > 0) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fef4cf210cc7..b787c4cfda0e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1564,18 +1564,16 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ClearPageDirty(page); unlock_page(page); - if (PageActive(page)) - deactivate_page(page); - if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + + mark_page_lazyfree(page); ret = true; out: spin_unlock(ptl); @@ -1724,37 +1722,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - int ret = 0; + pmd_t entry; + bool preserve_write; + int ret; ptl = __pmd_trans_huge_lock(pmd, vma); - if (ptl) { - pmd_t entry; - bool preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; + if (!ptl) + return 0; - /* - * Avoid trapping faults against the zero page. The read-only - * data is likely to be read-cached on the local CPU and - * local/remote hits to the zero page are not interesting. - */ - if (prot_numa && is_huge_zero_pmd(*pmd)) { - spin_unlock(ptl); - return ret; - } + preserve_write = prot_numa && pmd_write(*pmd); + ret = 1; - if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); - entry = pmd_modify(entry, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); - ret = HPAGE_PMD_NR; - set_pmd_at(mm, addr, pmd, entry); - BUG_ON(vma_is_anonymous(vma) && !preserve_write && - pmd_write(entry)); - } - spin_unlock(ptl); - } + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) + goto unlock; + + if (prot_numa && pmd_protnone(*pmd)) + goto unlock; + + /* + * In case prot_numa, we are under down_read(mmap_sem). It's critical + * to not clear pmd intermittently to avoid race with MADV_DONTNEED + * which is also under down_read(mmap_sem): + * + * CPU0: CPU1: + * change_huge_pmd(prot_numa=1) + * pmdp_huge_get_and_clear_notify() + * madvise_dontneed() + * zap_pmd_range() + * pmd_trans_huge(*pmd) == 0 (without ptl) + * // skip the pmd + * set_pmd_at(); + * // pmd is re-established + * + * The race makes MADV_DONTNEED miss the huge pmd and don't clear it + * which may break userspace. + * + * pmdp_invalidate() is required to make sure we don't miss + * dirty/young flags set by hardware. + */ + entry = *pmd; + pmdp_invalidate(vma, addr, pmd); + + /* + * Recover dirty/young flags. It relies on pmdp_invalidate to not + * corrupt them. + */ + if (pmd_dirty(*pmd)) + entry = pmd_mkdirty(entry); + if (pmd_young(*pmd)) + entry = pmd_mkyoung(entry); + entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mk_savedwrite(entry); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); +unlock: + spin_unlock(ptl); return ret; } @@ -2114,15 +2144,15 @@ static void freeze_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - int ret; + bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_MIGRATION; - ret = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(ret, page); + unmap_success = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(!unmap_success, page); } static void unfreeze_page(struct page *page) @@ -2368,7 +2398,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); if (PageAnon(head)) { diff --git a/mm/internal.h b/mm/internal.h index 266efaeaa370..0e4f558412fb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -81,11 +81,16 @@ static inline void set_page_refcounted(struct page *page) extern unsigned long highest_memmap_pfn; /* + * Maximum number of reclaim retries without progress before the OOM + * killer is consider the only way forward. + */ +#define MAX_RECLAIM_RETRIES 16 + +/* * in mm/vmscan.c: */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: @@ -178,6 +183,7 @@ extern int user_min_free_kbytes; struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ + struct zone *zone; unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long total_migrate_scanned; @@ -185,17 +191,18 @@ struct compact_control { unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + int order; /* order a direct compactor needs */ + int migratetype; /* migratetype of direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ - int order; /* order a direct compactor needs */ - const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ - struct zone *zone; bool contended; /* Signal lock or sched contention */ + bool finishing_block; /* Finishing current pageblock */ }; unsigned long @@ -505,4 +512,14 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; +static inline bool is_migrate_highatomic(enum migratetype migratetype) +{ + return migratetype == MIGRATE_HIGHATOMIC; +} + +static inline bool is_migrate_highatomic_page(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 4b20061102f6..e4c2975fcae8 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -577,7 +577,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { - kasan_report_double_free(cache, object, shadow_byte); + kasan_report_double_free(cache, object, + __builtin_return_address(1)); return true; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index dd2dea8eb077..1229298cce64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -99,7 +99,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow); + void *ip); #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ab42a0803f16..beee0e980e2d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size) return first_bad_addr; } -static void print_error_description(struct kasan_access_info *info) +static bool addr_has_shadow(struct kasan_access_info *info) +{ + return (info->access_addr >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -98,12 +104,39 @@ static void print_error_description(struct kasan_access_info *info) break; } - pr_err("BUG: KASAN: %s in %pS at addr %p\n", - bug_type, (void *)info->ip, - info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); + return bug_type; +} + +const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +static const char *get_bug_type(struct kasan_access_info *info) +{ + if (addr_has_shadow(info)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = get_bug_type(info); + + pr_err("BUG: KASAN: %s in %pS\n", + bug_type, (void *)info->ip); + pr_err("%s of size %zu at addr %p by task %s/%d\n", + info->is_write ? "Write" : "Read", info->access_size, + info->access_addr, current->comm, task_pid_nr(current)); } static inline bool kernel_or_module_addr(const void *addr) @@ -144,9 +177,9 @@ static void kasan_end_report(unsigned long *flags) kasan_enable_current(); } -static void print_track(struct kasan_track *track) +static void print_track(struct kasan_track *track, const char *prefix) { - pr_err("PID = %u\n", track->pid); + pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { struct stack_trace trace; @@ -157,59 +190,84 @@ static void print_track(struct kasan_track *track) } } -static void kasan_object_err(struct kmem_cache *cache, void *object) +static struct page *addr_to_page(const void *addr) { - struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) + return virt_to_head_page(addr); + return NULL; +} - dump_stack(); - pr_err("Object at %p, in cache %s size: %d\n", object, cache->name, - cache->object_size); +static void describe_object_addr(struct kmem_cache *cache, void *object, + const void *addr) +{ + unsigned long access_addr = (unsigned long)addr; + unsigned long object_addr = (unsigned long)object; + const char *rel_type; + int rel_bytes; - if (!(cache->flags & SLAB_KASAN)) + pr_err("The buggy address belongs to the object at %p\n" + " which belongs to the cache %s of size %d\n", + object, cache->name, cache->object_size); + + if (!addr) return; - pr_err("Allocated:\n"); - print_track(&alloc_info->alloc_track); - pr_err("Freed:\n"); - print_track(&alloc_info->free_track); + if (access_addr < object_addr) { + rel_type = "to the left"; + rel_bytes = object_addr - access_addr; + } else if (access_addr >= object_addr + cache->object_size) { + rel_type = "to the right"; + rel_bytes = access_addr - (object_addr + cache->object_size); + } else { + rel_type = "inside"; + rel_bytes = access_addr - object_addr; + } + + pr_err("The buggy address is located %d bytes %s of\n" + " %d-byte region [%p, %p)\n", + rel_bytes, rel_type, cache->object_size, (void *)object_addr, + (void *)(object_addr + cache->object_size)); } -void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow) +static void describe_object(struct kmem_cache *cache, void *object, + const void *addr) { - unsigned long flags; + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - kasan_start_report(&flags); - pr_err("BUG: Double free or freeing an invalid pointer\n"); - pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); - kasan_object_err(cache, object); - kasan_end_report(&flags); + if (cache->flags & SLAB_KASAN) { + print_track(&alloc_info->alloc_track, "Allocated"); + pr_err("\n"); + print_track(&alloc_info->free_track, "Freed"); + pr_err("\n"); + } + + describe_object_addr(cache, object, addr); } -static void print_address_description(struct kasan_access_info *info) +static void print_address_description(void *addr) { - const void *addr = info->access_addr; + struct page *page = addr_to_page(addr); - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) { - struct page *page = virt_to_head_page(addr); - - if (PageSlab(page)) { - void *object; - struct kmem_cache *cache = page->slab_cache; - object = nearest_obj(cache, page, - (void *)info->access_addr); - kasan_object_err(cache, object); - return; - } - dump_page(page, "kasan: bad access detected"); + dump_stack(); + pr_err("\n"); + + if (page && PageSlab(page)) { + struct kmem_cache *cache = page->slab_cache; + void *object = nearest_obj(cache, page, addr); + + describe_object(cache, object, addr); } - if (kernel_or_module_addr(addr)) { - if (!init_task_stack_addr(addr)) - pr_err("Address belongs to variable %pS\n", addr); + if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { + pr_err("The buggy address belongs to the variable:\n"); + pr_err(" %pS\n", addr); + } + + if (page) { + pr_err("The buggy address belongs to the page:\n"); + dump_page(page, "kasan: bad access detected"); } - dump_stack(); } static bool row_is_guilty(const void *row, const void *guilty) @@ -264,31 +322,34 @@ static void print_shadow_for_address(const void *addr) } } +void kasan_report_double_free(struct kmem_cache *cache, void *object, + void *ip) +{ + unsigned long flags; + + kasan_start_report(&flags); + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); + pr_err("\n"); + print_address_description(object); + pr_err("\n"); + print_shadow_for_address(object); + kasan_end_report(&flags); +} + static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; - const char *bug_type; kasan_start_report(&flags); - if (info->access_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { - if ((unsigned long)info->access_addr < PAGE_SIZE) - bug_type = "null-ptr-deref"; - else if ((unsigned long)info->access_addr < TASK_SIZE) - bug_type = "user-memory-access"; - else - bug_type = "wild-memory-access"; - pr_err("BUG: KASAN: %s on address %p\n", - bug_type, info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, - task_pid_nr(current)); + print_error_description(info); + pr_err("\n"); + + if (!addr_has_shadow(info)) { dump_stack(); } else { - print_error_description(info); - print_address_description(info); + print_address_description((void *)info->access_addr); + pr_err("\n"); print_shadow_for_address(info->first_bad_addr); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ba40b7f673f4..7cb9c88bb4a3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - /* 0 stands for page_is_file_cache(page) == false */ - dec_node_page_state(page, NR_ISOLATED_ANON + 0); + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); unlock_page(page); putback_lru_page(page); } @@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); /* * We can do it before isolate_lru_page because the @@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { unlock_page(page); result = SCAN_PAGE_COUNT; goto out; @@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - /* 0 stands for page_is_file_cache(page) == false */ - inc_node_page_state(page, NR_ISOLATED_ANON + 0); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -1183,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; int search_new_forks = 0; VM_BUG_ON_PAGE(!PageKsm(page), page); @@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) stable_node = page_stable_node(page); if (!stable_node) - return ret; + return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; @@ -1978,23 +1977,20 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, - rmap_item->address, rwc->arg); - if (ret != SWAP_AGAIN) { + if (!rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } if (rwc->done && rwc->done(page)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; -out: - return ret; } #ifdef CONFIG_MIGRATION diff --git a/mm/madvise.c b/mm/madvise.c index 7a2abf0127ae..31da09412934 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, ptent = pte_mkold(ptent); ptent = pte_mkclean(ptent); set_pte_at(mm, addr, pte, ptent); - if (PageActive(page)) - deactivate_page(page); tlb_remove_tlb_entry(tlb, pte, addr); } + mark_page_lazyfree(page); } out: if (nr_swap) { @@ -606,34 +605,40 @@ static long madvise_remove(struct vm_area_struct *vma, /* * Error injection support for memory error handling. */ -static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) +static int madvise_inject_error(int behavior, + unsigned long start, unsigned long end) { - struct page *p; + struct page *page; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; + for (; start < end; start += PAGE_SIZE << - compound_order(compound_head(p))) { + compound_order(compound_head(page))) { int ret; - ret = get_user_pages_fast(start, 1, 0, &p); + ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; - if (PageHWPoison(p)) { - put_page(p); + if (PageHWPoison(page)) { + put_page(page); continue; } - if (bhv == MADV_SOFT_OFFLINE) { - pr_info("Soft offlining page %#lx at %#lx\n", - page_to_pfn(p), start); - ret = soft_offline_page(p, MF_COUNT_INCREASED); + + if (behavior == MADV_SOFT_OFFLINE) { + pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", + page_to_pfn(page), start); + + ret = soft_offline_page(page, MF_COUNT_INCREASED); if (ret) return ret; continue; } - pr_info("Injecting memory failure for page %#lx at %#lx\n", - page_to_pfn(p), start); - ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", + page_to_pfn(page), start); + + ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED); if (ret) return ret; } @@ -651,13 +656,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_FREE: - /* - * XXX: In this implementation, MADV_FREE works like - * MADV_DONTNEED on swapless system or full swap. - */ - if (get_nr_swap_pages() > 0) - return madvise_free(vma, prev, start, end); - /* passthrough */ + return madvise_free(vma, prev, start, end); case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: @@ -763,7 +762,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) - return madvise_hwpoison(behavior, start, start+len_in); + return madvise_inject_error(behavior, start, start + len_in); #endif if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd7541d7c11..ff73899af61a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -100,24 +100,7 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; } -static const char * const mem_cgroup_stat_names[] = { - "cache", - "rss", - "rss_huge", - "mapped_file", - "dirty", - "writeback", - "swap", -}; - -static const char * const mem_cgroup_events_names[] = { - "pgpgin", - "pgpgout", - "pgfault", - "pgmajfault", -}; - -static const char * const mem_cgroup_lru_names[] = { +static const char *const mem_cgroup_lru_names[] = { "inactive_anon", "active_anon", "inactive_file", @@ -568,32 +551,15 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) * common workload, threshold and synchronization as vmstat[] should be * implemented. */ -static unsigned long -mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) -{ - long val = 0; - int cpu; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->count[idx], cpu); - /* - * Summing races with updates, so val may be negative. Avoid exposing - * transient negative values. - */ - if (val < 0) - val = 0; - return val; -} -static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) +static unsigned long memcg_sum_events(struct mem_cgroup *memcg, + enum memcg_event_item event) { unsigned long val = 0; int cpu; for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->events[idx], cpu); + val += per_cpu(memcg->stat->events[event], cpu); return val; } @@ -606,23 +572,23 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, * counted as CACHE even if it's on ANON LRU. */ if (PageAnon(page)) - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], - nr_pages); - else - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], - nr_pages); + __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages); + else { + __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages); + if (PageSwapBacked(page)) + __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages); + } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], - nr_pages); + __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages); } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); + __this_cpu_inc(memcg->stat->events[PGPGIN]); else { - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); + __this_cpu_inc(memcg->stat->events[PGPGOUT]); nr_pages = -nr_pages; /* for event */ } @@ -1144,6 +1110,28 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } +unsigned int memcg1_stats[] = { + MEMCG_CACHE, + MEMCG_RSS, + MEMCG_RSS_HUGE, + NR_SHMEM, + NR_FILE_MAPPED, + NR_FILE_DIRTY, + NR_WRITEBACK, + MEMCG_SWAP, +}; + +static const char *const memcg1_stat_names[] = { + "cache", + "rss", + "rss_huge", + "shmem", + "mapped_file", + "dirty", + "writeback", + "swap", +}; + #define K(x) ((x) << (PAGE_SHIFT-10)) /** * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. @@ -1188,11 +1176,11 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont_cgroup_path(iter->css.cgroup); pr_cont(":"); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) continue; - pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], - K(mem_cgroup_read_stat(iter, i))); + pr_cont(" %s:%luKB", memcg1_stat_names[i], + K(memcg_page_state(iter, memcg1_stats[i]))); } for (i = 0; i < NR_LRU_LISTS; i++) @@ -1837,7 +1825,7 @@ static void reclaim_high(struct mem_cgroup *memcg, do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); + mem_cgroup_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); } while ((memcg = parent_mem_cgroup(memcg))); } @@ -1928,7 +1916,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + mem_cgroup_event(mem_over_limit, MEMCG_MAX); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -1971,7 +1959,7 @@ retry: if (fatal_signal_pending(current)) goto force; - mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + mem_cgroup_event(mem_over_limit, MEMCG_OOM); mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); @@ -2381,7 +2369,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE], HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -2391,7 +2379,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); + this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); } /** @@ -2725,7 +2713,7 @@ static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) for_each_mem_cgroup_tree(iter, memcg) { for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += mem_cgroup_read_stat(iter, i); + stat[i] += memcg_page_state(iter, i); } } @@ -2738,7 +2726,7 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events) for_each_mem_cgroup_tree(iter, memcg) { for (i = 0; i < MEMCG_NR_EVENTS; i++) - events[i] += mem_cgroup_read_events(iter, i); + events[i] += memcg_sum_events(iter, i); } } @@ -2750,13 +2738,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) struct mem_cgroup *iter; for_each_mem_cgroup_tree(iter, memcg) { - val += mem_cgroup_read_stat(iter, - MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_read_stat(iter, - MEM_CGROUP_STAT_RSS); + val += memcg_page_state(iter, MEMCG_CACHE); + val += memcg_page_state(iter, MEMCG_RSS); if (swap) - val += mem_cgroup_read_stat(iter, - MEM_CGROUP_STAT_SWAP); + val += memcg_page_state(iter, MEMCG_SWAP); } } else { if (!swap) @@ -3131,6 +3116,21 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) } #endif /* CONFIG_NUMA */ +/* Universal VM events cgroup1 shows, original sort order */ +unsigned int memcg1_events[] = { + PGPGIN, + PGPGOUT, + PGFAULT, + PGMAJFAULT, +}; + +static const char *const memcg1_event_names[] = { + "pgpgin", + "pgpgout", + "pgfault", + "pgmajfault", +}; + static int memcg_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -3138,22 +3138,20 @@ static int memcg_stat_show(struct seq_file *m, void *v) struct mem_cgroup *mi; unsigned int i; - BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != - MEM_CGROUP_STAT_NSTATS); - BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != - MEM_CGROUP_EVENTS_NSTATS); + BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], - mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + memcg_page_state(memcg, memcg1_stats[i]) * + PAGE_SIZE); } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) - seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], - mem_cgroup_read_events(memcg, i)); + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "%s %lu\n", memcg1_event_names[i], + memcg_sum_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], @@ -3171,23 +3169,23 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long long val = 0; - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val); + val += memcg_page_state(mi, memcg1_stats[i]) * + PAGE_SIZE; + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { unsigned long long val = 0; for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_read_events(mi, i); - seq_printf(m, "total_%s %llu\n", - mem_cgroup_events_names[i], val); + val += memcg_sum_events(mi, memcg1_events[i]); + seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); } for (i = 0; i < NR_LRU_LISTS; i++) { @@ -3652,10 +3650,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); /* this should eventually include NR_UNSTABLE_NFS */ - *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | (1 << LRU_ACTIVE_FILE)); *pheadroom = PAGE_COUNTER_MAX; @@ -4511,33 +4509,29 @@ static int mem_cgroup_move_account(struct page *page, spin_lock_irqsave(&from->move_lock, flags); if (!anon && page_mapped(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); + __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages); + __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages); } /* * move_lock grabbed above and caller set from->moving_account, so - * mem_cgroup_update_page_stat() will serialize updates to PageDirty. + * mod_memcg_page_state will serialize updates to PageDirty. * So mapping should be stable for dirty pages. */ if (!anon && PageDirty(page)) { struct address_space *mapping = page_mapping(page); if (mapping_cap_account_dirty(mapping)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], + __this_cpu_sub(from->stat->count[NR_FILE_DIRTY], nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], + __this_cpu_add(to->stat->count[NR_FILE_DIRTY], nr_pages); } } if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); + __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages); + __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages); } /* @@ -5154,7 +5148,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, continue; } - mem_cgroup_events(memcg, MEMCG_OOM, 1); + mem_cgroup_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; } @@ -5167,10 +5161,10 @@ static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); - seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); - seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); - seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); + seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW)); + seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); + seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); + seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); return 0; } @@ -5197,9 +5191,9 @@ static int memory_stat_show(struct seq_file *m, void *v) tree_events(memcg, events); seq_printf(m, "anon %llu\n", - (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); + (u64)stat[MEMCG_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + (u64)stat[MEMCG_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", @@ -5208,12 +5202,14 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + seq_printf(m, "shmem %llu\n", + (u64)stat[NR_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); + (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); + (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); + (u64)stat[NR_WRITEBACK] * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) { struct mem_cgroup *mi; @@ -5232,10 +5228,15 @@ static int memory_stat_show(struct seq_file *m, void *v) /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", - events[MEM_CGROUP_EVENTS_PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", - events[MEM_CGROUP_EVENTS_PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", events[PGFAULT]); + seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + + seq_printf(m, "workingset_refault %lu\n", + stat[WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + stat[WORKINGSET_ACTIVATE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + stat[WORKINGSET_NODERECLAIM]); return 0; } @@ -5476,8 +5477,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, unsigned long nr_kmem, - struct page *dummy_page) + unsigned long nr_kmem, unsigned long nr_huge, + unsigned long nr_shmem, struct page *dummy_page) { unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; @@ -5492,10 +5493,11 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, } local_irq_save(flags); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); - __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge); + __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem); + __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout); __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); local_irq_restore(flags); @@ -5507,6 +5509,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, static void uncharge_list(struct list_head *page_list) { struct mem_cgroup *memcg = NULL; + unsigned long nr_shmem = 0; unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; @@ -5539,9 +5542,9 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); - pgpgout = nr_anon = nr_file = - nr_huge = nr_kmem = 0; + nr_kmem, nr_huge, nr_shmem, page); + pgpgout = nr_anon = nr_file = nr_kmem = 0; + nr_huge = nr_shmem = 0; } memcg = page->mem_cgroup; } @@ -5555,8 +5558,11 @@ static void uncharge_list(struct list_head *page_list) } if (PageAnon(page)) nr_anon += nr_pages; - else + else { nr_file += nr_pages; + if (PageSwapBacked(page)) + nr_shmem += nr_pages; + } pgpgout++; } else { nr_kmem += 1 << compound_order(page); @@ -5568,7 +5574,7 @@ static void uncharge_list(struct list_head *page_list) if (memcg) uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); + nr_kmem, nr_huge, nr_shmem, page); } /** diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 27f7210e7fab..92865bb1816d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -322,7 +322,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, - int fail, struct page *page, unsigned long pfn, + bool fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -904,13 +904,13 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ -static int hwpoison_user_mappings(struct page *p, unsigned long pfn, +static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); - int ret; + bool unmap_success; int kill = 1, forcekill; struct page *hpage = *hpagep; @@ -919,20 +919,20 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * other types of pages. */ if (PageReserved(p) || PageSlab(p)) - return SWAP_SUCCESS; + return true; if (!(PageLRU(hpage) || PageHuge(p))) - return SWAP_SUCCESS; + return true; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ if (!page_mapped(hpage)) - return SWAP_SUCCESS; + return true; if (PageKsm(p)) { pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); - return SWAP_FAIL; + return false; } if (PageSwapCache(p)) { @@ -971,8 +971,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(hpage, ttu); - if (ret != SWAP_SUCCESS) + unmap_success = try_to_unmap(hpage, ttu); + if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); @@ -987,10 +987,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * any accesses to the poisoned memory. */ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); - kill_procs(&tokill, forcekill, trapno, - ret != SWAP_SUCCESS, p, pfn, flags); + kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags); - return ret; + return unmap_success; } static void set_page_hwpoison_huge_page(struct page *hpage) @@ -1230,8 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * When the raw error page is thp tail page, hpage points to the raw * page after thp split. */ - if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) - != SWAP_SUCCESS) { + if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; @@ -1543,8 +1541,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) if (ret == 1 && !PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ put_hwpoison_page(page); - pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", - pfn, page->flags); + pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n", + pfn, page->flags, &page->flags); return -EIO; } } @@ -1585,8 +1583,8 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - pr_info("soft offline: %#lx: migration failed %d, type %lx\n", - pfn, ret, page->flags); + pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", + pfn, ret, page->flags, &page->flags); /* * We know that soft_offline_huge_page() tries to migrate * only one hugepage pointed to by hpage, so we need not @@ -1677,14 +1675,14 @@ static int __soft_offline_page(struct page *page, int flags) if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - pr_info("soft offline: %#lx: migration failed %d, type %lx\n", - pfn, ret, page->flags); + pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", + pfn, ret, page->flags, &page->flags); if (ret > 0) ret = -EIO; } } else { - pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", - pfn, ret, page_count(page), page->flags); + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n", + pfn, ret, page_count(page), page->flags, &page->flags); } return ret; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6fa7208bcd56..b63d7d1239df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1208,7 +1208,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) arch_refresh_nodedata(nid, pgdat); } else { - /* Reset the nr_zones, order and classzone_idx before reuse */ + /* + * Reset the nr_zones, order and classzone_idx before reuse. + * Note that kswapd will init kswapd_classzone_idx properly + * when it starts in the near future. + */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; pgdat->kswapd_classzone_idx = 0; diff --git a/mm/migrate.c b/mm/migrate.c index ed97c2c14fa8..410b56bdabed 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, +static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { struct page_vma_mapped_walk pvmw = { @@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, update_mmu_cache(vma, pvmw.address, pvmw.pte); } - return SWAP_AGAIN; + return true; } /* @@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, { int z; - if (!pgdat_reclaimable(pgdat)) - return false; - for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; @@ -1947,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Prepare a page as a migration target */ __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); + if (PageSwapBacked(page)) + __SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; diff --git a/mm/mlock.c b/mm/mlock.c index 0dd9ca18e19e..c483c5c20b4b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) */ static void __munlock_isolated_page(struct page *page) { - int ret = SWAP_AGAIN; - /* * Optimization: if the page was mapped just once, that's our mapping * and we don't need to check all the other vmas. */ if (page_mapcount(page) > 1) - ret = try_to_munlock(page); + try_to_munlock(page); /* Did try_to_unlock() succeed or punt? */ - if (ret != SWAP_MLOCK) + if (!PageMlocked(page)) count_vm_event(UNEVICTABLE_PGMUNLOCKED); putback_lru_page(page); diff --git a/mm/mmap.c b/mm/mmap.c index bfbe8856d134..f82741e199c0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, struct user_struct *user = NULL; struct hstate *hs; - hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d083714a2bb9..04c9143a8625 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -685,6 +685,7 @@ void exit_oom_victim(void) void oom_killer_enable(void) { oom_killer_disabled = false; + pr_info("OOM killer enabled.\n"); } /** @@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout) oom_killer_enable(); return false; } + pr_info("OOM killer disabled.\n"); return true; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d8ac2a7fb9e7..2359608d2568 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) spin_lock_init(&dom->lock); - init_timer_deferrable(&dom->period_timer); - dom->period_timer.function = writeout_period; - dom->period_timer.data = (unsigned long)dom; + setup_deferrable_timer(&dom->period_timer, writeout_period, + (unsigned long)dom); dom->dirty_limit_tstamp = jiffies; @@ -2428,7 +2427,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inode_attach_wb(inode, page); wb = inode_to_wb(inode); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); + inc_memcg_page_state(page, NR_FILE_DIRTY); __inc_node_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); @@ -2450,7 +2449,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); + dec_memcg_page_state(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -2707,7 +2706,7 @@ int clear_page_dirty_for_io(struct page *page) */ wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); + dec_memcg_page_state(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -2754,7 +2753,7 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + dec_memcg_page_state(page, NR_WRITEBACK); dec_node_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); @@ -2809,7 +2808,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + inc_memcg_page_state(page, NR_WRITEBACK); inc_node_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f233f53d399a..abed5dc65bae 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1091,15 +1091,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned, flags; + unsigned long flags; bool isolated_pageblocks; spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - while (count) { struct page *page; struct list_head *list; @@ -1151,13 +1147,10 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned, flags; + unsigned long flags; + spin_lock_irqsave(&zone->lock, flags); __count_vm_events(PGFREE, 1 << order); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); @@ -1696,10 +1689,10 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(bool poisoned) +static inline bool free_pages_prezeroed(void) { return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled() && poisoned; + page_poisoning_enabled(); } #ifdef CONFIG_DEBUG_VM @@ -1753,17 +1746,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { int i; - bool poisoned = true; - - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; - if (poisoned) - poisoned &= page_is_poisoned(p); - } post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); @@ -1845,9 +1831,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, +static int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, - int migratetype) + int migratetype, int *num_movable) { struct page *page; unsigned int order; @@ -1864,6 +1850,9 @@ int move_freepages(struct zone *zone, VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif + if (num_movable) + *num_movable = 0; + for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -1874,6 +1863,15 @@ int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!PageBuddy(page)) { + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. + */ + if (num_movable && + (PageLRU(page) || __PageMovable(page))) + (*num_movable)++; + page++; continue; } @@ -1889,7 +1887,7 @@ int move_freepages(struct zone *zone, } int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int migratetype, int *num_movable) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -1906,7 +1904,8 @@ int move_freepages_block(struct zone *zone, struct page *page, if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype); + return move_freepages(zone, start_page, end_page, migratetype, + num_movable); } static void change_pageblock_range(struct page *pageblock_page, @@ -1956,28 +1955,79 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this - * pageblock and check whether half of pages are moved or not. If half of - * pages are moved, we can change migratetype of pageblock and permanently - * use it's pages as requested migratetype in the future. + * pageblock to our migratetype and determine how many already-allocated pages + * are there in the pageblock with a compatible migratetype. If at least half + * of pages are free or compatible, we can change migratetype of the pageblock + * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type) + int start_type, bool whole_block) { unsigned int current_order = page_order(page); - int pages; + struct free_area *area; + int free_pages, movable_pages, alike_pages; + int old_block_type; + + old_block_type = get_pageblock_migratetype(page); + + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ + if (is_migrate_highatomic(old_block_type)) + goto single_page; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return; + goto single_page; } - pages = move_freepages_block(zone, page, start_type); + /* We are not allowed to try stealing from the whole block */ + if (!whole_block) + goto single_page; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || + free_pages = move_freepages_block(zone, page, start_type, + &movable_pages); + /* + * Determine how many pages are compatible with our allocation. + * For movable allocation, it's the number of movable pages which + * we just obtained. For other types it's a bit more tricky. + */ + if (start_type == MIGRATE_MOVABLE) { + alike_pages = movable_pages; + } else { + /* + * If we are falling back a RECLAIMABLE or UNMOVABLE allocation + * to MOVABLE pageblock, consider all non-movable pages as + * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ + if (old_block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else + alike_pages = 0; + } + + /* moving whole block can fail due to zone boundary conditions */ + if (!free_pages) + goto single_page; + + /* + * If a sufficient number of pages in the block are either free or of + * comparable migratability as our allocation, claim the whole block. + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); + + return; + +single_page: + area = &zone->free_area[current_order]; + list_move(&page->lru, &area->free_list[start_type]); } /* @@ -2043,11 +2093,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_HIGHATOMIC && - !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) + && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); } out_unlock: @@ -2101,8 +2151,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. */ - if (get_pageblock_migratetype(page) == - MIGRATE_HIGHATOMIC) { + if (is_migrate_highatomic_page(page)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2125,7 +2174,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype, + NULL); if (ret) { spin_unlock_irqrestore(&zone->lock, flags); return ret; @@ -2137,8 +2187,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, return false; } -/* Remove an element from the buddy allocator from the fallback list */ -static inline struct page * +/* + * Try finding a free buddy page on the fallback list and put it on the free + * list of requested migratetype, possibly along with other pages from the same + * block, depending on fragmentation avoidance heuristics. Returns true if + * fallback was found so that __rmqueue_smallest() can grab it. + */ +static inline bool __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; @@ -2159,33 +2214,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && - get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) - steal_suitable_fallback(zone, page, start_migratetype); - /* Remove the page from the freelists */ - area->nr_free--; - list_del(&page->lru); - rmv_page_order(page); - - expand(zone, page, order, current_order, area, - start_migratetype); - /* - * The pcppage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * find_suitable_fallback(). This is OK as long as it does not - * differ for MIGRATE_CMA pageblocks. Those can be used as - * fallback only via special __rmqueue_cma_fallback() function - */ - set_pcppage_migratetype(page, start_migratetype); + steal_suitable_fallback(zone, page, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return page; + return true; } - return NULL; + return false; } /* @@ -2197,13 +2236,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); - if (!page) - page = __rmqueue_fallback(zone, order, migratetype); + if (!page && __rmqueue_fallback(zone, order, migratetype)) + goto retry; } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -2352,9 +2392,9 @@ static void drain_local_pages_wq(struct work_struct *work) * cpu which is allright but we also have to make sure to not move to * a different one. */ - preempt_disable(); + local_bh_disable(); drain_local_pages(NULL); - preempt_enable(); + local_bh_enable(); } /* @@ -2489,7 +2529,11 @@ void free_hot_cold_page(struct page *page, bool cold) unsigned long pfn = page_to_pfn(page); int migratetype; - if (in_interrupt()) { + /* + * Exclude (hard) IRQ and NMI context from using the pcplists. + * But allow softirq context, via disabling BH. + */ + if (in_irq() || irqs_disabled()) { __free_pages_ok(page, 0); return; } @@ -2499,12 +2543,12 @@ void free_hot_cold_page(struct page *page, bool cold) migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - preempt_disable(); + local_bh_disable(); /* * We only track unmovable, reclaimable and movable on pcp lists. * Free ISOLATE pages back to the allocator because they are being - * offlined but treat RESERVE as movable pages so we can get those + * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ @@ -2530,7 +2574,7 @@ void free_hot_cold_page(struct page *page, bool cold) } out: - preempt_enable(); + local_bh_enable(); } /* @@ -2615,7 +2659,7 @@ int __isolate_free_page(struct page *page, unsigned int order) for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && mt != MIGRATE_HIGHATOMIC) + && !is_migrate_highatomic(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -2655,7 +2699,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, { struct page *page; - VM_BUG_ON(in_interrupt()); + VM_BUG_ON(in_irq() || irqs_disabled()); do { if (list_empty(list)) { @@ -2688,7 +2732,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, bool cold = ((gfp_flags & __GFP_COLD) != 0); struct page *page; - preempt_disable(); + local_bh_disable(); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); @@ -2696,7 +2740,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); } - preempt_enable(); + local_bh_enable(); return page; } @@ -2712,7 +2756,11 @@ struct page *rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; - if (likely(order == 0) && !in_interrupt()) { + /* + * Exclude (hard) IRQ and NMI context from using the pcplists. + * But allow softirq context, via disabling BH. + */ + if (likely(order == 0) && !(in_irq() || irqs_disabled())) { page = rmqueue_pcplist(preferred_zone, zone, order, gfp_flags, migratetype); goto out; @@ -3114,8 +3162,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; pr_warn("%s: ", current->comm); @@ -3249,14 +3296,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned int noreclaim_flag; if (!order) return NULL; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3403,12 +3451,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, { struct reclaim_state reclaim_state; int progress; + unsigned int noreclaim_flag; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3418,7 +3467,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -3526,19 +3575,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) } /* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - -/* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. - * The reclaim feedback represented by did_some_progress (any progress during - * the last reclaim round) and no_progress_loops (number of reclaim rounds without - * any progress in a row) is considered as well as the reclaimable pages on the - * applicable zone list (with a backoff mechanism which is a function of - * no_progress_loops). + * + * We give up when we either have tried MAX_RECLAIM_RETRIES in a row + * without success, or when we couldn't even meet the watermark if we + * reclaimed all remaining pages on the LRU lists. * * Returns true if a retry is viable or false to enter the oom path. */ @@ -3583,13 +3625,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, bool wmark; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP((*no_progress_loops) * available, - MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); /* - * Would the allocation succeed if we reclaimed the whole - * available? + * Would the allocation succeed if we reclaimed all + * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, ac_classzone_idx(ac), alloc_flags, available); @@ -3640,6 +3680,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; @@ -3707,12 +3748,17 @@ retry_cpuset: /* * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. Don't try - * that for allocations that are allowed to ignore watermarks, as the - * ALLOC_NO_WATERMARKS attempt didn't yet happen. + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + * Don't try this for allocations that are allowed to ignore + * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && - !gfp_pfmemalloc_allowed(gfp_mask)) { + if (can_direct_reclaim && + (costly_order || + (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) + && !gfp_pfmemalloc_allowed(gfp_mask)) { page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, @@ -3724,7 +3770,7 @@ retry_cpuset: * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations */ - if (gfp_mask & __GFP_NORETRY) { + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If compaction is deferred for high-order allocations, * it is because sync compaction recently failed. If @@ -3775,7 +3821,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, + warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; @@ -3805,7 +3851,7 @@ retry: * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_REPEAT)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -3975,10 +4021,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Runtime PM, block IO and its error handling path can deadlock - * because I/O on the device might not complete. + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. */ - alloc_mask = memalloc_noio_flags(gfp_mask); + alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -4514,7 +4562,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif " writeback_tmp:%lukB" " unstable:%lukB" - " pages_scanned:%lu" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -4537,8 +4584,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), - node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? "yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { @@ -5747,6 +5794,11 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); + + /* If this node has no page within this zone, return 0. */ + if (zone_start_pfn == zone_end_pfn) + return 0; + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); /* @@ -7433,7 +7485,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = memalloc_noio_flags(gfp_mask), + .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/page_ext.c b/mm/page_ext.c index 121dcffc4ec1..88ccc044b09a 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,9 +59,6 @@ static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, -#ifdef CONFIG_PAGE_POISONING - &page_poisoning_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif @@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_idle.c b/mm/page_idle.c index b0ee56c56b58..1b0f48c62316 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn) return page; } -static int page_idle_clear_pte_refs_one(struct page *page, +static bool page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { @@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page, */ set_page_young(page); } - return SWAP_AGAIN; + return true; } static void page_idle_clear_pte_refs(struct page *page) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f4e17a57926a..5092e4ef00c8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -66,7 +66,8 @@ out: set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, + NULL); __mod_zone_freepage_state(zone, -nr_pages, migratetype); } @@ -88,7 +89,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!is_migrate_isolate_page(page)) goto out; /* @@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * pageblock scanning for freepage moving. */ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); @@ -205,7 +206,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!page || !is_migrate_isolate_page(page)) continue; unset_migratetype_isolate(page, migratetype); } @@ -262,7 +263,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (page && !is_migrate_isolate_page(page)) break; } page = __first_valid_page(start_pfn, end_pfn - start_pfn); diff --git a/mm/page_owner.c b/mm/page_owner.c index 60634dc53a88..c3cee247f2e6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } @@ -527,7 +527,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } diff --git a/mm/page_poison.c b/mm/page_poison.c index 2e647c65916b..be19e989ccff 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -6,7 +6,6 @@ #include <linux/poison.h> #include <linux/ratelimit.h> -static bool __page_poisoning_enabled __read_mostly; static bool want_page_poisoning __read_mostly; static int early_page_poison_param(char *buf) @@ -19,74 +18,21 @@ early_param("page_poison", early_page_poison_param); bool page_poisoning_enabled(void) { - return __page_poisoning_enabled; -} - -static bool need_page_poisoning(void) -{ - return want_page_poisoning; -} - -static void init_page_poisoning(void) -{ /* - * page poisoning is debug page alloc for some arches. If either - * of those options are enabled, enable poisoning + * Assumes that debug_pagealloc_enabled is set before + * free_all_bootmem. + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. */ - if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { - if (!want_page_poisoning && !debug_pagealloc_enabled()) - return; - } else { - if (!want_page_poisoning) - return; - } - - __page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -bool page_is_poisoned(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); + return (want_page_poisoning || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())); } static void poison_page(struct page *page) { void *addr = kmap_atomic(page); - set_page_poison(page); memset(addr, PAGE_POISON, PAGE_SIZE); kunmap_atomic(addr); } @@ -140,12 +86,13 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_is_poisoned(page)) - return; - addr = kmap_atomic(page); + /* + * Page poisoning when enabled poisons each and every page + * that is freed to buddy. Thus no extra check is done to + * see if a page was posioned. + */ check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); kunmap_atomic(addr); } diff --git a/mm/rmap.c b/mm/rmap.c index 1b776f793998..d405f0e0ee96 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -724,7 +724,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -static int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_referenced_arg *pra = arg; @@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ + return false; /* To break the loop */ } if (pvmw.pte) { @@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (!pra->mapcount) - return SWAP_SUCCESS; /* To break the loop */ + return false; /* To break the loop */ - return SWAP_AGAIN; + return true; } static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) @@ -812,7 +812,6 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int ret; int we_locked = 0; struct page_referenced_arg pra = { .mapcount = total_mapcount(page), @@ -846,7 +845,7 @@ int page_referenced(struct page *page, rwc.invalid_vma = invalid_page_referenced_vma; } - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); *vm_flags = pra.vm_flags; if (we_locked) @@ -855,7 +854,7 @@ int page_referenced(struct page *page, return pra.referenced; } -static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, +static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_vma_mapped_walk pvmw = { @@ -908,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } } - return SWAP_AGAIN; + return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) @@ -1159,7 +1158,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); + mod_memcg_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1199,7 +1198,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); + mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1288,15 +1287,10 @@ void page_remove_rmap(struct page *page, bool compound) */ } -struct rmap_private { - enum ttu_flags flags; - int lazyfreed; -}; - /* * @arg: enum ttu_flags will be passed to this argument */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1307,13 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, }; pte_t pteval; struct page *subpage; - int ret = SWAP_AGAIN; - struct rmap_private *rp = arg; - enum ttu_flags flags = rp->flags; + bool ret = true; + enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - return SWAP_AGAIN; + return true; if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, @@ -1336,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_MLOCK; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1354,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1424,18 +1417,34 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { + WARN_ON_ONCE(1); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + + /* MADV_FREE page check */ + if (!PageSwapBacked(page)) { + if (!PageDirty(page)) { + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } - if (!PageDirty(page) && (flags & TTU_LZFREE)) { - /* It's a freeable page by MADV_FREE */ - dec_mm_counter(mm, MM_ANONPAGES); - rp->lazyfreed++; - goto discard; + /* + * If the page was redirtied, it cannot be + * discarded. Remap the page to page table. + */ + set_pte_at(mm, address, pvmw.pte, pteval); + SetPageSwapBacked(page); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1492,24 +1501,14 @@ static int page_mapcount_is_zero(struct page *page) * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. - * Return values are: * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a mapping, try again later - * SWAP_FAIL - the page is unswappable - * SWAP_MLOCK - page is mlocked. + * If unmap is successful, return true. Otherwise, false. */ -int try_to_unmap(struct page *page, enum ttu_flags flags) +bool try_to_unmap(struct page *page, enum ttu_flags flags) { - int ret; - struct rmap_private rp = { - .flags = flags, - .lazyfreed = 0, - }; - struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)flags, .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; @@ -1526,16 +1525,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - ret = rmap_walk_locked(page, &rwc); + rmap_walk_locked(page, &rwc); else - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapcount(page)) { - ret = SWAP_SUCCESS; - if (rp.lazyfreed && !PageDirty(page)) - ret = SWAP_LZFREE; - } - return ret; + return !page_mapcount(page) ? true : false; } static int page_not_mapped(struct page *page) @@ -1550,34 +1544,22 @@ static int page_not_mapped(struct page *page) * Called from munlock code. Checks all of the VMAs mapping the page * to make sure nobody else has this page mlocked. The page will be * returned with PG_mlocked cleared if no other vmas have it mlocked. - * - * Return values are: - * - * SWAP_AGAIN - no vma is holding page mlocked, or, - * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem - * SWAP_FAIL - page cannot be located at present - * SWAP_MLOCK - page is now mlocked. */ -int try_to_munlock(struct page *page) -{ - int ret; - struct rmap_private rp = { - .flags = TTU_MUNLOCK, - .lazyfreed = 0, - }; +void try_to_munlock(struct page *page) +{ struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - ret = rmap_walk(page, &rwc); - return ret; + rmap_walk(page, &rwc); } void __put_anon_vma(struct anon_vma *anon_vma) @@ -1625,13 +1607,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; if (locked) { anon_vma = page_anon_vma(page); @@ -1641,7 +1622,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, anon_vma = rmap_walk_anon_lock(page, rwc); } if (!anon_vma) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1655,8 +1636,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1664,7 +1644,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (!locked) anon_vma_unlock_read(anon_vma); - return ret; } /* @@ -1680,13 +1659,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = page_mapping(page); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; - int ret = SWAP_AGAIN; /* * The page lock not only makes sure that page->mapping cannot @@ -1697,7 +1675,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1712,8 +1690,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; @@ -1722,28 +1699,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, done: if (!locked) i_mmap_unlock_read(mapping); - return ret; } -int rmap_walk(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk(struct page *page, struct rmap_walk_control *rwc) { if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rwc); + rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc, false); + rmap_walk_anon(page, rwc, false); else - return rmap_walk_file(page, rwc, false); + rmap_walk_file(page, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_PAGE(PageKsm(page), page); if (PageAnon(page)) - return rmap_walk_anon(page, rwc, true); + rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc, true); + rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 0fd21670b513..6bb4deb12e78 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,11 +9,12 @@ * as published by the Free Software Foundation; version 2 * of the License. */ +#define pr_fmt(fmt) "rodata_test: " fmt + #include <linux/uaccess.h> #include <asm/sections.h> const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); void rodata_test(void) { @@ -23,20 +24,20 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ if (!rodata_test_data) { - pr_err("rodata_test: test 1 fails (start data)\n"); + pr_err("test 1 fails (start data)\n"); return; } /* test 2: write to the variable; this should fault */ if (!probe_kernel_write((void *)&rodata_test_data, - (void *)&zero, sizeof(zero))) { - pr_err("rodata_test: test data was not read only\n"); + (void *)&zero, sizeof(zero))) { + pr_err("test data was not read only\n"); return; } /* test 3: check the value hasn't changed */ if (rodata_test_data == zero) { - pr_err("rodata_test: test data was changed\n"); + pr_err("test data was changed\n"); return; } @@ -44,13 +45,13 @@ void rodata_test(void) start = (unsigned long)__start_rodata; end = (unsigned long)__end_rodata; if (start & (PAGE_SIZE - 1)) { - pr_err("rodata_test: start of .rodata is not page size aligned\n"); + pr_err("start of .rodata is not page size aligned\n"); return; } if (end & (PAGE_SIZE - 1)) { - pr_err("rodata_test: end of .rodata is not page size aligned\n"); + pr_err("end of .rodata is not page size aligned\n"); return; } - pr_info("rodata_test: all tests were successful\n"); + pr_info("all tests were successful\n"); } diff --git a/mm/slab.h b/mm/slab.h index 9cfcf099709c..cb17d3e63a0d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -384,6 +384,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; page = virt_to_head_page(x); + BUG_ON(!PageSlab(page)); cachep = page->slab_cache; if (slab_equal_or_root(cachep, s)) return cachep; diff --git a/mm/sparse.c b/mm/sparse.c index db6bf3c97ea2..6903c8fc3085 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, unsigned long usemap_size(void) { - unsigned long size_bytes; - size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; - size_bytes = roundup(size_bytes, sizeof(unsigned long)); - return size_bytes; + return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/swap.c b/mm/swap.c index 5dabf444d724..361bdb1575ab 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -46,7 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); #endif @@ -561,20 +561,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + bool active = PageActive(page); - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + del_page_from_lru_list(page, lruvec, + LRU_INACTIVE_ANON + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); + /* + * lazyfree pages are clean anonymous pages. They have + * SwapBacked flag cleared to distinguish normal anonymous + * pages + */ + ClearPageSwapBacked(page); + add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); - __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); + __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, 1, 0); } } @@ -604,9 +611,9 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); activate_page_drain(cpu); } @@ -638,22 +645,22 @@ void deactivate_file_page(struct page *page) } /** - * deactivate_page - deactivate a page + * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate * - * deactivate_page() moves @page to the inactive list if @page was on the active - * list and was not an unevictable page. This is done to accelerate the reclaim - * of @page. + * mark_page_lazyfree() moves @page to the inactive file list. + * This is done to accelerate the reclaim of @page. */ -void deactivate_page(struct page *page) +void mark_page_lazyfree(struct page *page) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); + put_cpu_var(lru_lazyfree_pvecs); } } @@ -693,7 +700,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index b1ccb58ad397..aa1c415f4abd 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -241,8 +241,10 @@ int enable_swap_slots_cache(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", alloc_swap_slot_cache, free_slot_cache); - if (ret < 0) + if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " + "without swap slots cache.\n", __func__)) goto out_unlock; + swap_slot_cache_initialized = true; __reenable_swap_slots_cache(); out_unlock: diff --git a/mm/swap_state.c b/mm/swap_state.c index 473b71e052a8..7bfb9bd1ca21 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet, while - * the other end is scheduled away waiting on discard - * I/O completion at scan_swap_map(). - * - * In order to avoid turning this transitory state - * into a permanent loop around this -EEXIST case - * if !CONFIG_PREEMPT and the I/O completion happens - * to be waiting on the CPU waitqueue where we are now - * busy looping, we just conditionally invoke the - * scheduler here, if there are some more important - * tasks to run. + * has not been brought into the swapcache yet. */ cond_resched(); continue; diff --git a/mm/swapfile.c b/mm/swapfile.c index 178130880b90..fba7960bb026 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -37,6 +37,7 @@ #include <linux/swapfile.h> #include <linux/export.h> #include <linux/swap_slots.h> +#include <linux/sort.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -335,7 +336,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, ci_tail = ci + tail; spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); cluster_set_next(ci_tail, idx); - unlock_cluster(ci_tail); + spin_unlock(&ci_tail->lock); cluster_set_next_flag(&list->tail, idx, 0); } } @@ -672,6 +673,9 @@ checks: else goto done; } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); if (offset == si->lowest_bit) si->lowest_bit++; @@ -685,9 +689,6 @@ checks: plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); } - si->swap_map[offset] = usage; - inc_cluster_info_page(si, si->cluster_info, offset); - unlock_cluster(ci); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); @@ -1065,6 +1066,13 @@ void swapcache_free(swp_entry_t entry) } } +static int swp_entry_cmp(const void *ent1, const void *ent2) +{ + const swp_entry_t *e1 = ent1, *e2 = ent2; + + return (long)(swp_type(*e1) - swp_type(*e2)); +} + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; @@ -1075,6 +1083,10 @@ void swapcache_free_entries(swp_entry_t *entries, int n) prev = NULL; p = NULL; + + /* Sort swap entries by swap device, so each lock is only taken once. */ + if (nr_swapfiles > 1) + sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) @@ -1111,6 +1123,18 @@ int page_swapcount(struct page *page) return count; } +static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +{ + int count = 0; + pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci; + + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); + return count; +} + /* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, @@ -1119,17 +1143,11 @@ int page_swapcount(struct page *page) int __swp_swapcount(swp_entry_t entry) { int count = 0; - pgoff_t offset; struct swap_info_struct *si; - struct swap_cluster_info *ci; si = __swap_info_get(entry); - if (si) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); - count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); - } + if (si) + count = swap_swapcount(si, entry); return count; } @@ -1291,7 +1309,8 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page))) { + (!page_mapped(page) || mem_cgroup_swap_full(page)) && + !swap_swapcount(p, entry)) { delete_from_swap_cache(page); SetPageDirty(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index bc8031ef994d..5ebf468c5429 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -97,8 +97,13 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; - /* Can cgroups be reclaimed below their normal consumption range? */ - unsigned int may_thrash:1; + /* + * Cgroups are not reclaimed below their configured memory.low, + * unless we threaten to OOM. If any cgroups are skipped due to + * memory.low and nothing was reclaimed, go back for memory.low. + */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; unsigned int hibernation_mode:1; @@ -230,12 +235,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) return nr; } -bool pgdat_reclaimable(struct pglist_data *pgdat) -{ - return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < - pgdat_reclaimable_pages(pgdat) * 6; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector @@ -912,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ - if (!page_is_file_cache(page)) { + if (!page_is_file_cache(page) || + (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; return; @@ -972,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - bool lazyfree = false; - int ret = SWAP_SUCCESS; cond_resched(); @@ -988,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; if (unlikely(!page_evictable(page))) - goto cull_mlocked; + goto activate_locked; if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) + if ((page_mapped(page) || PageSwapCache(page)) && + !(PageAnon(page) && !PageSwapBacked(page))) sc->nr_scanned++; may_enter_fs = (sc->gfp_mask & __GFP_FS) || @@ -1120,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. + * Lazyfree page could be freed directly */ - if (PageAnon(page) && !PageSwapCache(page)) { + if (PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (!add_to_swap(page, page_list)) goto activate_locked; - lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1143,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page) && mapping) { - switch (ret = try_to_unmap(page, lazyfree ? - (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : - (ttu_flags | TTU_BATCH_FLUSH))) { - case SWAP_FAIL: + if (page_mapped(page)) { + if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { nr_unmap_fail++; goto activate_locked; - case SWAP_AGAIN: - goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; - case SWAP_LZFREE: - goto lazyfree; - case SWAP_SUCCESS: - ; /* try to free the page below */ } } @@ -1267,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } -lazyfree: - if (!mapping || !__remove_mapping(mapping, page, true)) - goto keep_locked; + if (PageAnon(page) && !PageSwapBacked(page)) { + /* follow __remove_mapping for reference */ + if (!page_ref_freeze(page, 1)) + goto keep_locked; + if (PageDirty(page)) { + page_ref_unfreeze(page, 1); + goto keep_locked; + } + count_vm_event(PGLAZYFREED); + } else if (!mapping || !__remove_mapping(mapping, page, true)) + goto keep_locked; /* * At this point, we have no other references and there is * no way to pick any more up (removed from LRU, removed @@ -1280,9 +1277,6 @@ lazyfree: */ __ClearPageLocked(page); free_it: - if (ret == SWAP_LZFREE) - count_vm_event(PGLAZYFREED); - nr_reclaimed++; /* @@ -1292,20 +1286,16 @@ free_it: list_add(&page->lru, &free_pages); continue; -cull_mlocked: - if (PageSwapCache(page)) - try_to_free_swap(page); - unlock_page(page); - list_add(&page->lru, &ret_pages); - continue; - activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && mem_cgroup_swap_full(page)) + if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || + PageMlocked(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); - SetPageActive(page); - pgactivate++; + if (!PageMlocked(page)) { + SetPageActive(page); + pgactivate++; + } keep_locked: unlock_page(page); keep: @@ -1354,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); + TTU_IGNORE_ACCESS, NULL, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1478,12 +1468,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0, total_skipped = 0; + unsigned long skipped = 0; unsigned long scan, nr_pages; LIST_HEAD(pages_skipped); for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src);) { + !list_empty(src); scan++) { struct page *page; page = lru_to_page(src); @@ -1497,12 +1487,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; } - /* - * Account for scanned and skipped separetly to avoid the pgdat - * being prematurely marked unreclaimable by pgdat_reclaimable. - */ - scan++; - switch (__isolate_lru_page(page, mode)) { case 0: nr_pages = hpage_nr_pages(page); @@ -1531,6 +1515,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (!list_empty(&pages_skipped)) { int zid; + list_splice(&pages_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; @@ -1538,17 +1523,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } - - /* - * Account skipped pages as a partial scan as the pgdat may be - * close to unreclaimable. If the LRU list is empty, account - * skipped pages as a full scan. - */ - total_skipped = list_empty(src) ? skipped : skipped >> 2; - - list_splice(&pages_skipped, src); } - *nr_scanned = scan + total_skipped; + *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); @@ -1750,7 +1726,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else @@ -1761,7 +1736,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false); spin_lock_irq(&pgdat->lru_lock); @@ -1953,8 +1928,6 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); __count_vm_events(PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -2033,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan, * Both inactive lists should also be large enough that each inactive * page has a chance to be referenced again before it is reclaimed. * + * If that fails and refaulting is observed, the inactive list grows. + * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages * on this LRU, maintained by the pageout code. A zone->inactive_ratio * of 3 means 3:1 or 25% of the pages are kept on the inactive list. @@ -2049,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool trace) + struct mem_cgroup *memcg, + struct scan_control *sc, bool actual_reclaim) { - unsigned long inactive_ratio; - unsigned long inactive, active; - enum lru_list inactive_lru = file * LRU_FILE; enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + enum lru_list inactive_lru = file * LRU_FILE; + unsigned long inactive, active; + unsigned long inactive_ratio; + unsigned long refaults; unsigned long gb; /* @@ -2067,27 +2045,42 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); + if (memcg) + refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); else - inactive_ratio = 1; + refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); - if (trace) - trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, - sc->reclaim_idx, - lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, - lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, - inactive_ratio, file); + /* + * When refaults are being observed, it means a new workingset + * is being established. Disable active list protection to get + * rid of the stale workingset quickly. + */ + if (file && actual_reclaim && lruvec->refaults != refaults) { + inactive_ratio = 0; + } else { + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; + } + + if (actual_reclaim) + trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, + lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, + lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, + inactive_ratio, file); return inactive * inactive_ratio < active; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct scan_control *sc) + struct lruvec *lruvec, struct mem_cgroup *memcg, + struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), + memcg, sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2123,30 +2116,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; - bool force_scan = false; unsigned long ap, fp; enum lru_list lru; - bool some_scanned; - int pass; - - /* - * If the zone or memcg is small, nr[l] can be 0. This - * results in no scanning on this priority and a potential - * priority drop. Global direct reclaim can go to the next - * zone and tends to have no problems. Global kswapd is for - * zone balancing and it needs to scan a minimum amount. When - * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount there as - * well. - */ - if (current_is_kswapd()) { - if (!pgdat_reclaimable(pgdat)) - force_scan = true; - if (!mem_cgroup_online(memcg)) - force_scan = true; - } - if (!global_reclaim(sc)) - force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { @@ -2218,7 +2189,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true, sc, false) && + if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2277,55 +2248,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fraction[1] = fp; denominator = ap + fp + 1; out: - some_scanned = false; - /* Only use force_scan on second pass. */ - for (pass = 0; !some_scanned && pass < 2; pass++) { - *lru_pages = 0; - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; - - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; - - if (!scan && pass && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); - - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: - /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. - */ - scan = div64_u64(scan * fraction[file], - denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) { - size = 0; - scan = 0; - } - break; - default: - /* Look ma, no brain */ - BUG(); - } + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - *lru_pages += size; - nr[lru] = scan; + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = size >> sc->priority; + /* + * If the cgroup's already been deleted, make sure to + * scrape out the remaining cache. + */ + if (!scan && !mem_cgroup_online(memcg)) + scan = min(size, SWAP_CLUSTER_MAX); + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: /* - * Skip the second pass and don't force_scan, - * if we found something to scan. + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. */ - some_scanned |= !!scan; + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) { + size = 0; + scan = 0; + } + break; + default: + /* Look ma, no brain */ + BUG(); } + + *lru_pages += size; + nr[lru] = scan; } } @@ -2376,7 +2340,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, sc); + lruvec, memcg, sc); } } @@ -2443,7 +2407,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, sc, true)) + if (inactive_list_is_low(lruvec, false, memcg, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2557,9 +2521,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long scanned; if (mem_cgroup_low(root, memcg)) { - if (!sc->may_thrash) + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; continue; - mem_cgroup_events(memcg, MEMCG_LOW, 1); + } + mem_cgroup_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; @@ -2604,22 +2570,38 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - nr_scanned, node_lru_pages); + /* + * Record the subtree's reclaim efficiency. The reclaimed + * pages from slab is excluded here because the corresponding + * scanned pages is not accounted. Moreover, freeing a page + * by slab shrinking depends on each slab's object population, + * making the cost model (i.e. scan:free) different from that + * of LRU. + */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); - if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2694,10 +2676,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. @@ -2752,6 +2730,25 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask = orig_mask; } +static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) +{ + struct mem_cgroup *memcg; + + memcg = mem_cgroup_iter(root_memcg, NULL, NULL); + do { + unsigned long refaults; + struct lruvec *lruvec; + + if (memcg) + refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); + else + refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); + + lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec->refaults = refaults; + } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); +} + /* * This is the main entry point to direct page reclaim. * @@ -2772,6 +2769,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; + pg_data_t *last_pgdat; + struct zoneref *z; + struct zone *zone; retry: delayacct_freepages_start(); @@ -2798,6 +2798,15 @@ retry: sc->may_writepage = 1; } while (--sc->priority >= 0); + last_pgdat = NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, + sc->nodemask) { + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + } + delayacct_freepages_end(); if (sc->nr_reclaimed) @@ -2808,16 +2817,17 @@ retry: return 1; /* Untapped cgroup reserves? Don't OOM, retry. */ - if (!sc->may_thrash) { + if (sc->memcg_low_skipped) { sc->priority = initial_priority; - sc->may_thrash = 1; + sc->memcg_low_reclaim = 1; + sc->memcg_low_skipped = 0; goto retry; } return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2825,10 +2835,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!managed_zone(zone) || - pgdat_reclaimable_pages(pgdat) == 0) + if (!managed_zone(zone)) + continue; + + if (!zone_reclaimable_pages(zone)) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2905,7 +2920,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2927,14 +2942,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -2950,7 +2965,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -3028,9 +3043,10 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, struct zonelist *zonelist; unsigned long nr_reclaimed; int nid; + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, @@ -3054,9 +3070,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3076,7 +3092,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, sc, true)) + if (inactive_list_is_low(lruvec, false, memcg, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -3084,22 +3100,44 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, int classzone_idx) +/* + * Returns true if there is an eligible zone balanced for the request order + * and classzone_idx + */ +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { - unsigned long mark = high_wmark_pages(zone); + int i; + unsigned long mark = -1; + struct zone *zone; - if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) - return false; + for (i = 0; i <= classzone_idx; i++) { + zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + mark = high_wmark_pages(zone); + if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + return true; + } /* - * If any eligible zone is balanced then the node is not considered - * to be congested or dirty + * If a node has no populated zone within classzone_idx, it does not + * need balancing by definition. This can happen if a zone-restricted + * allocation tries to wake a remote kswapd. */ - clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); - clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); - clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); + if (mark == -1) + return true; - return true; + return false; +} + +/* Clear pgdat state for congested, dirty or under writeback. */ +static void clear_pgdat_congested(pg_data_t *pgdat) +{ + clear_bit(PGDAT_CONGESTED, &pgdat->flags); + clear_bit(PGDAT_DIRTY, &pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } /* @@ -3110,11 +3148,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) { - int i; - /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3128,17 +3164,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; - if (!zone_balanced(zone, order, classzone_idx)) - return false; + if (pgdat_balanced(pgdat, order, classzone_idx)) { + clear_pgdat_congested(pgdat); + return true; } - return true; + return false; } /* @@ -3214,9 +3249,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = classzone_idx; /* @@ -3241,23 +3276,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) } /* - * Only reclaim if there are no eligible zones. Check from - * high to low zone as allocations prefer higher zones. - * Scanning from low to high zone would allow congestion to be - * cleared during a very small window when a small low - * zone was balanced even under extreme pressure when the - * overall node may be congested. Note that sc.reclaim_idx - * is not used as buffer_heads_over_limit may have adjusted - * it. + * Only reclaim if there are no eligible zones. Note that + * sc.reclaim_idx is not used as buffer_heads_over_limit may + * have adjusted it. */ - for (i = classzone_idx; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, sc.order, classzone_idx)) - goto out; - } + if (pgdat_balanced(pgdat, sc.order, classzone_idx)) + goto out; /* * Do some background aging of the anon list, to give @@ -3271,7 +3295,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) + if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ @@ -3295,7 +3319,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ @@ -3306,11 +3330,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: + snapshot_refaults(NULL, pgdat); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller @@ -3320,6 +3349,22 @@ out: return sc.order; } +/* + * pgdat->kswapd_classzone_idx is the highest zone index that a recent + * allocation request woke kswapd for. When kswapd has not woken recently, + * the value is MAX_NR_ZONES which is not a valid index. This compares a + * given classzone and returns it or the highest classzone index kswapd + * was recently woke for. + */ +static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, + enum zone_type classzone_idx) +{ + if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) + return classzone_idx; + + return max(pgdat->kswapd_classzone_idx, classzone_idx); +} + static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int classzone_idx) { @@ -3331,7 +3376,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - /* Try to sleep for a short interval */ + /* + * Try to sleep for a short interval. Note that kcompactd will only be + * woken if it is possible to sleep for a short interval. This is + * deliberate on the assumption that if reclaim cannot keep an + * eligible zone balanced that it's also unlikely that compaction will + * succeed. + */ if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* * Compaction records what page blocks it recently failed to @@ -3355,7 +3406,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * the previous request that slept prematurely. */ if (remaining) { - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); } @@ -3409,7 +3460,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ static int kswapd(void *p) { - unsigned int alloc_order, reclaim_order, classzone_idx; + unsigned int alloc_order, reclaim_order; + unsigned int classzone_idx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -3439,20 +3491,23 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - pgdat->kswapd_order = alloc_order = reclaim_order = 0; - pgdat->kswapd_classzone_idx = classzone_idx = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; for ( ; ; ) { bool ret; + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, classzone_idx); /* Read the new order and classzone_idx */ alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; + classzone_idx = kswapd_classzone_idx(pgdat, 0); pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; ret = try_to_freeze(); if (kthread_should_stop()) @@ -3478,9 +3533,6 @@ kswapd_try_sleep: reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; - - alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); @@ -3496,7 +3548,6 @@ kswapd_try_sleep: void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; - int z; if (!managed_zone(zone)) return; @@ -3504,22 +3555,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, + classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; - /* Only wake kswapd if all zones are unbalanced */ - for (z = 0; z <= classzone_idx; z++) { - zone = pgdat->node_zones + z; - if (!managed_zone(zone)) - continue; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; - if (zone_balanced(zone, order, classzone_idx)) - return; - } + if (pgdat_balanced(pgdat, order, classzone_idx)) + return; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3548,8 +3597,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; + unsigned int noreclaim_flag; - p->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3558,7 +3608,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; } @@ -3723,9 +3773,10 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in struct task_struct *p = current; struct reclaim_state reclaim_state; int classzone_idx = gfp_zone(gfp_mask); + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), @@ -3740,7 +3791,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * and we also need to be able to write out pages for RECLAIM_WRITE * and RECLAIM_UNMAP. */ - p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + noreclaim_flag = memalloc_noreclaim_save(); + p->flags |= PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3756,7 +3808,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + current->flags &= ~PF_SWAPWRITE; + memalloc_noreclaim_restore(noreclaim_flag); lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } @@ -3779,9 +3832,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 809025ed97ea..757be8303aa0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -954,7 +954,6 @@ const char * const vmstat_text[] = { "nr_unevictable", "nr_isolated_anon", "nr_isolated_file", - "nr_pages_scanned", "workingset_refault", "workingset_activate", "workingset_nodereclaim", @@ -992,6 +991,7 @@ const char * const vmstat_text[] = { "pgfree", "pgactivate", "pgdeactivate", + "pglazyfree", "pgfault", "pgmajfault", @@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg) { } -/* Walk all the zones in a node and print using a callback */ +/* + * Walk zones in a node and print using a callback. + * If @assert_populated is true, only use callback for zones that are populated. + */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + bool assert_populated, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, unsigned long flags; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) + if (assert_populated && !populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, frag_show_print); + walk_zones_in_node(m, pgdat, true, frag_show_print); return 0; } @@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print); return 0; } @@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print); return 0; } @@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1378,7 +1382,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1386,23 +1389,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], - zone_page_state(zone, i)); - seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) seq_printf(m, ", %ld", zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); + seq_putc(m, ')'); + + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; @@ -1425,19 +1433,22 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n node_unreclaimable: %u" "\n start_pfn: %lu" "\n node_inactive_ratio: %u", - !pgdat_reclaimable(zone->zone_pgdat), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn, zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); } /* - * Output information about zones in @pgdat. + * Output information about zones in @pgdat. All zones are printed regardless + * of whether they are populated or not: lowmem_reserve_ratio operates on the + * set of all zones and userspace would not be aware of such zones if they are + * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio). */ static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); return 0; } @@ -1586,22 +1597,9 @@ int vmstat_refresh(struct ctl_table *table, int write, for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { - switch (i) { - case NR_PAGES_SCANNED: - /* - * This is often seen to go negative in - * recent kernels, but not to go permanently - * negative. Whilst it would be nicer not to - * have exceptions, rooting them out would be - * another task, of rather low priority. - */ - break; - default: - pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i], val); - err = -EINVAL; - break; - } + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; } } if (err) @@ -1857,7 +1855,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, unusable_show_print); + walk_zones_in_node(m, pgdat, true, unusable_show_print); return 0; } @@ -1909,7 +1907,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, extfrag_show_print); + walk_zones_in_node(m, pgdat, true, extfrag_show_print); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index eda05c71fa49..b8c9ab678479 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -269,7 +269,6 @@ bool workingset_refault(void *shadow) lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); - rcu_read_unlock(); /* * The unsigned subtraction here gives an accurate distance @@ -290,11 +289,15 @@ bool workingset_refault(void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; inc_node_state(pgdat, WORKINGSET_REFAULT); + inc_memcg_state(memcg, WORKINGSET_REFAULT); if (refault_distance <= active_file) { inc_node_state(pgdat, WORKINGSET_ACTIVATE); + inc_memcg_state(memcg, WORKINGSET_ACTIVATE); + rcu_read_unlock(); return true; } + rcu_read_unlock(); return false; } @@ -472,6 +475,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); + inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); diff --git a/mm/z3fold.c b/mm/z3fold.c index f9492bccfd79..54f63c4a809a 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -185,6 +185,12 @@ static inline void z3fold_page_lock(struct z3fold_header *zhdr) spin_lock(&zhdr->page_lock); } +/* Try to lock a z3fold page */ +static inline int z3fold_page_trylock(struct z3fold_header *zhdr) +{ + return spin_trylock(&zhdr->page_lock); +} + /* Unlock a z3fold page */ static inline void z3fold_page_unlock(struct z3fold_header *zhdr) { @@ -385,7 +391,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, spin_lock(&pool->lock); zhdr = list_first_entry_or_null(&pool->unbuddied[i], struct z3fold_header, buddy); - if (!zhdr) { + if (!zhdr || !z3fold_page_trylock(zhdr)) { spin_unlock(&pool->lock); continue; } @@ -394,7 +400,6 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, spin_unlock(&pool->lock); page = virt_to_page(zhdr); - z3fold_page_lock(zhdr); if (zhdr->first_chunks == 0) { if (zhdr->middle_chunks != 0 && chunks >= zhdr->start_middle) diff --git a/net/core/dev.c b/net/core/dev.c index 3805b964448a..fc0f5f9dac0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -81,6 +81,7 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/mutex.h> #include <linux/string.h> #include <linux/mm.h> @@ -4227,7 +4228,7 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should @@ -4238,9 +4239,9 @@ static int __netif_receive_skb(struct sk_buff *skb) * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_core(skb, true); - current_restore_flags(pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_core(skb, false); diff --git a/net/core/sock.c b/net/core/sock.c index b5baeb9cb0fb..79c6aee6af9b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -102,6 +102,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> @@ -372,14 +373,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = sk->sk_backlog_rcv(sk, skb); - current_restore_flags(pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); return ret; } diff --git a/scripts/Makefile.asm-offsets b/scripts/Makefile.asm-offsets new file mode 100644 index 000000000000..4ba80ba29b82 --- /dev/null +++ b/scripts/Makefile.asm-offsets @@ -0,0 +1,22 @@ +# Default sed regexp - multiline due to syntax constraints +define sed-asm-offsets-to-c + "/^->/{s:->#\(.*\):/* \1 */:; \ + s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:->::; p;}" +endef + +define gen_header_from_asm_offsets + (set -e; \ + echo "#ifndef $1"; \ + echo "#define $1"; \ + echo "/*"; \ + echo " * DO NOT MODIFY."; \ + echo " *"; \ + echo " * This file was generated by Kbuild"; \ + echo " */"; \ + echo ""; \ + sed -ne $(sed-asm-offsets-to-c); \ + echo ""; \ + echo "#endif" ) +endef diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index baa3c7be04ad..b1befa2cec26 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2539,6 +2539,7 @@ sub process { # Check for git id commit length and improperly formed commit descriptions if ($in_commit_log && !$commit_log_possible_stack_dump && $line !~ /^\s*(?:Link|Patchwork|http|https|BugLink):/i && + $line !~ /^This reverts commit [0-9a-f]{7,40}/ && ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i || ($line =~ /(?:\s|^)[0-9a-f]{12,40}(?:[\s"'\(\[]|$)/i && $line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i && @@ -2628,8 +2629,8 @@ sub process { # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && - !($rawline =~ /^\s+\S/ || - $rawline =~ /^(commit\b|from\b|[\w-]+:).*$/i)) { + !($rawline =~ /^\s+(?:\S|$)/ || + $rawline =~ /^(?:commit\b|from\b|[\w-]+:)/i)) { $in_header_lines = 0; $in_commit_log = 1; $has_commit_log = 1; @@ -2757,13 +2758,6 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } -# discourage the addition of CONFIG_EXPERIMENTAL in Kconfig. - if ($realfile =~ /Kconfig/ && - $line =~ /.\s*depends on\s+.*\bEXPERIMENTAL\b/) { - WARN("CONFIG_EXPERIMENTAL", - "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); - } - # discourage the use of boolean for type definition attributes of Kconfig options if ($realfile =~ /Kconfig/ && $line =~ /^\+\s*\bboolean\b/) { @@ -3133,6 +3127,17 @@ sub process { # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# check if this appears to be the start function declaration, save the name + if ($sline =~ /^\+\{\s*$/ && + $prevline =~ /^\+(?:(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*)?($Ident)\(/) { + $context_function = $1; + } + +# check if this appears to be the end of function declaration + if ($sline =~ /^\+\}\s*$/) { + undef $context_function; + } + # check indentation of any line with a bare else # (but not if it is a multiple line "if (foo) return bar; else return baz;") # if the previous line is a break or return and is indented 1 tab more... @@ -3157,12 +3162,6 @@ sub process { } } -# discourage the addition of CONFIG_EXPERIMENTAL in #if(def). - if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { - WARN("CONFIG_EXPERIMENTAL", - "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); - } - # check for RCS/CVS revision markers if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) { WARN("CVS_KEYWORD", @@ -4851,8 +4850,10 @@ sub process { $dstat !~ /^\(\{/ && # ({... $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/) { - - if ($dstat =~ /;/) { + if ($dstat =~ /^\s*if\b/) { + ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE", + "Macros starting with if should be enclosed by a do - while loop to avoid possible if/else logic defects\n" . "$herectx"); + } elsif ($dstat =~ /;/) { ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE", "Macros with multiple statements should be enclosed in a do - while loop\n" . "$herectx"); } else { @@ -5174,14 +5175,16 @@ sub process { "break quoted strings at a space character\n" . $hereprev); } -#check for an embedded function name in a string when the function is known -# as part of a diff. This does not work for -f --file checking as it -#depends on patch context providing the function name +# check for an embedded function name in a string when the function is known +# This does not work very well for -f --file checking as it depends on patch +# context providing the function name or a single line form for in-file +# function declarations if ($line =~ /^\+.*$String/ && defined($context_function) && - get_quoted_string($line, $rawline) =~ /\b$context_function\b/) { + get_quoted_string($line, $rawline) =~ /\b$context_function\b/ && + length(get_quoted_string($line, $rawline)) != (length($context_function) + 2)) { WARN("EMBEDDED_FUNCTION_NAME", - "Prefer using \"%s\", __func__ to embedded function names\n" . $herecurr); + "Prefer using '\"%s...\", __func__' to using '$context_function', this function's name, in a string\n" . $herecurr); } # check for spaces before a quoted newline @@ -5676,6 +5679,32 @@ sub process { } } + # check for vsprintf extension %p<foo> misuses + if ($^V && $^V ge 5.10.0 && + defined $stat && + $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s && + $1 !~ /^_*volatile_*$/) { + my $bad_extension = ""; + my $lc = $stat =~ tr@\n@@; + $lc = $lc + $linenr; + for (my $count = $linenr; $count <= $lc; $count++) { + my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0)); + $fmt =~ s/%%//g; + if ($fmt =~ /(\%[\*\d\.]*p(?![\WFfSsBKRraEhMmIiUDdgVCbGN]).)/) { + $bad_extension = $1; + last; + } + } + if ($bad_extension ne "") { + my $stat_real = raw_line($linenr, 0); + for (my $count = $linenr + 1; $count <= $lc; $count++) { + $stat_real = $stat_real . "\n" . raw_line($count, 0); + } + WARN("VSPRINTF_POINTER_EXTENSION", + "Invalid vsprintf pointer extension '$bad_extension'\n" . "$here\n$stat_real\n"); + } + } + # Check for misused memsets if ($^V && $^V ge 5.10.0 && defined $stat && @@ -6066,11 +6095,11 @@ sub process { } # check for various structs that are normally const (ops, kgdb, device_tree) +# and avoid what seem like struct definitions 'struct foo {' if ($line !~ /\bconst\b/ && - $line =~ /\bstruct\s+($const_structs)\b/) { + $line =~ /\bstruct\s+($const_structs)\b(?!\s*\{)/) { WARN("CONST_STRUCT", - "struct $1 should normally be const\n" . - $herecurr); + "struct $1 should normally be const\n" . $herecurr); } # use of NR_CPUS is usually wrong diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 7986f4e0da12..7aad82406422 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -14,6 +14,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/of_fdt.h> /* We need to stringify expanded macros so that they can be parsed */ @@ -50,3 +51,9 @@ LX_VALUE(MNT_NOEXEC) LX_VALUE(MNT_NOATIME) LX_VALUE(MNT_NODIRATIME) LX_VALUE(MNT_RELATIME) + +/* linux/of_fdt.h> */ +LX_VALUE(OF_DT_HEADER) + +/* Kernel Configs */ +LX_CONFIG(CONFIG_OF) diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 38b1f09d1cd9..086d27223c0c 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -16,6 +16,7 @@ from linux import constants from linux import utils from linux import tasks from linux import lists +from struct import * class LxCmdLine(gdb.Command): @@ -195,3 +196,75 @@ values of that process namespace""" info_opts(MNT_INFO, m_flags))) LxMounts() + + +class LxFdtDump(gdb.Command): + """Output Flattened Device Tree header and dump FDT blob to the filename + specified as the command argument. Equivalent to + 'cat /proc/fdt > fdtdump.dtb' on a running target""" + + def __init__(self): + super(LxFdtDump, self).__init__("lx-fdtdump", gdb.COMMAND_DATA, + gdb.COMPLETE_FILENAME) + + def fdthdr_to_cpu(self, fdt_header): + + fdt_header_be = ">IIIIIII" + fdt_header_le = "<IIIIIII" + + if utils.get_target_endianness() == 1: + output_fmt = fdt_header_le + else: + output_fmt = fdt_header_be + + return unpack(output_fmt, pack(fdt_header_be, + fdt_header['magic'], + fdt_header['totalsize'], + fdt_header['off_dt_struct'], + fdt_header['off_dt_strings'], + fdt_header['off_mem_rsvmap'], + fdt_header['version'], + fdt_header['last_comp_version'])) + + def invoke(self, arg, from_tty): + + if not constants.LX_CONFIG_OF: + raise gdb.GdbError("Kernel not compiled with CONFIG_OF\n") + + if len(arg) == 0: + filename = "fdtdump.dtb" + else: + filename = arg + + py_fdt_header_ptr = gdb.parse_and_eval( + "(const struct fdt_header *) initial_boot_params") + py_fdt_header = py_fdt_header_ptr.dereference() + + fdt_header = self.fdthdr_to_cpu(py_fdt_header) + + if fdt_header[0] != constants.LX_OF_DT_HEADER: + raise gdb.GdbError("No flattened device tree magic found\n") + + gdb.write("fdt_magic: 0x{:02X}\n".format(fdt_header[0])) + gdb.write("fdt_totalsize: 0x{:02X}\n".format(fdt_header[1])) + gdb.write("off_dt_struct: 0x{:02X}\n".format(fdt_header[2])) + gdb.write("off_dt_strings: 0x{:02X}\n".format(fdt_header[3])) + gdb.write("off_mem_rsvmap: 0x{:02X}\n".format(fdt_header[4])) + gdb.write("version: {}\n".format(fdt_header[5])) + gdb.write("last_comp_version: {}\n".format(fdt_header[6])) + + inf = gdb.inferiors()[0] + fdt_buf = utils.read_memoryview(inf, py_fdt_header_ptr, + fdt_header[1]).tobytes() + + try: + f = open(filename, 'wb') + except: + raise gdb.GdbError("Could not open file to dump fdt") + + f.write(fdt_buf) + f.close() + + gdb.write("Dumped fdt blob to " + filename + "\n") + +LxFdtDump() diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index 19d9bcadc0cc..5858bebfaf32 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -7,28 +7,11 @@ modpost-objs := modpost.o file2alias.o sumversion.o devicetable-offsets-file := devicetable-offsets.h -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef +include $(srctree)/scripts/Makefile.asm-offsets quiet_cmd_offsets = GEN $@ define cmd_offsets - (set -e; \ - echo "#ifndef __DEVICETABLE_OFFSETS_H__"; \ - echo "#define __DEVICETABLE_OFFSETS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ + $(call gen_header_from_asm_offsets,__DEVICETABLE_OFFSETS_H__) < $< > $@ endef $(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 41642ba5e318..dba889004ea1 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -15,21 +15,14 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd -TEST_GEN_FILES += userfaultfd_hugetlb -TEST_GEN_FILES += userfaultfd_shmem TEST_GEN_FILES += mlock-random-test TEST_PROGS := run_vmtests include ../lib.mk -$(OUTPUT)/userfaultfd: LDLIBS += -lpthread ../../../../usr/include/linux/kernel.h - -$(OUTPUT)/userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h - $(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread - -$(OUTPUT)/userfaultfd_shmem: userfaultfd.c ../../../../usr/include/linux/kernel.h - $(CC) $(CFLAGS) -DSHMEM_TEST -O2 -o $@ $< -lpthread +$(OUTPUT)/userfaultfd: ../../../../usr/include/linux/kernel.h +$(OUTPUT)/userfaultfd: LDLIBS += -lpthread $(OUTPUT)/mlock-random-test: LDLIBS += -lcap diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index c92f6cf31d0a..3214a6456d13 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -95,7 +95,7 @@ echo " hugetlb regression testing." echo "--------------------" echo "running userfaultfd" echo "--------------------" -./userfaultfd 128 32 +./userfaultfd anon 128 32 if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -107,7 +107,7 @@ echo "----------------------------" echo "running userfaultfd_hugetlb" echo "----------------------------" # 258MB total huge pages == 128MB src and 128MB dst -./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file +./userfaultfd hugetlb 128 32 $mnt/ufd_test_file if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -119,7 +119,7 @@ rm -f $mnt/ufd_test_file echo "----------------------------" echo "running userfaultfd_shmem" echo "----------------------------" -./userfaultfd_shmem 128 32 +./userfaultfd shmem 128 32 if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index e9449c801888..1eae79ae5b4e 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -77,10 +77,13 @@ static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; #define BOUNCE_POLL (1<<3) static int bounces; -#ifdef HUGETLB_TEST +#define TEST_ANON 1 +#define TEST_HUGETLB 2 +#define TEST_SHMEM 3 +static int test_type; + static int huge_fd; static char *huge_fd_off0; -#endif static unsigned long long *count_verify; static int uffd, uffd_flags, finished, *pipefd; static char *area_src, *area_dst; @@ -102,14 +105,7 @@ pthread_attr_t attr; ~(unsigned long)(sizeof(unsigned long long) \ - 1))) -#if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) - -/* Anonymous memory */ -#define EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ - (1 << _UFFDIO_COPY) | \ - (1 << _UFFDIO_ZEROPAGE)) - -static int release_pages(char *rel_area) +static int anon_release_pages(char *rel_area) { int ret = 0; @@ -121,7 +117,7 @@ static int release_pages(char *rel_area) return ret; } -static void allocate_area(void **alloc_area) +static void anon_allocate_area(void **alloc_area) { if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) { fprintf(stderr, "out of memory\n"); @@ -129,14 +125,9 @@ static void allocate_area(void **alloc_area) } } -#else /* HUGETLB_TEST or SHMEM_TEST */ - -#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC - -#ifdef HUGETLB_TEST /* HugeTLB memory */ -static int release_pages(char *rel_area) +static int hugetlb_release_pages(char *rel_area) { int ret = 0; @@ -152,7 +143,7 @@ static int release_pages(char *rel_area) } -static void allocate_area(void **alloc_area) +static void hugetlb_allocate_area(void **alloc_area) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_HUGETLB, huge_fd, @@ -167,10 +158,8 @@ static void allocate_area(void **alloc_area) huge_fd_off0 = *alloc_area; } -#elif defined(SHMEM_TEST) - /* Shared memory */ -static int release_pages(char *rel_area) +static int shmem_release_pages(char *rel_area) { int ret = 0; @@ -182,7 +171,7 @@ static int release_pages(char *rel_area) return ret; } -static void allocate_area(void **alloc_area) +static void shmem_allocate_area(void **alloc_area) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); @@ -192,11 +181,35 @@ static void allocate_area(void **alloc_area) } } -#else /* SHMEM_TEST */ -#error "Undefined test type" -#endif /* HUGETLB_TEST */ - -#endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */ +struct uffd_test_ops { + unsigned long expected_ioctls; + void (*allocate_area)(void **alloc_area); + int (*release_pages)(char *rel_area); +}; + +#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ + (1 << _UFFDIO_COPY) | \ + (1 << _UFFDIO_ZEROPAGE)) + +static struct uffd_test_ops anon_uffd_test_ops = { + .expected_ioctls = ANON_EXPECTED_IOCTLS, + .allocate_area = anon_allocate_area, + .release_pages = anon_release_pages, +}; + +static struct uffd_test_ops shmem_uffd_test_ops = { + .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, + .allocate_area = shmem_allocate_area, + .release_pages = shmem_release_pages, +}; + +static struct uffd_test_ops hugetlb_uffd_test_ops = { + .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, + .allocate_area = hugetlb_allocate_area, + .release_pages = hugetlb_release_pages, +}; + +static struct uffd_test_ops *uffd_test_ops; static int my_bcmp(char *str1, char *str2, size_t n) { @@ -505,7 +518,7 @@ static int stress(unsigned long *userfaults) * UFFDIO_COPY without writing zero pages into area_dst * because the background threads already completed). */ - if (release_pages(area_src)) + if (uffd_test_ops->release_pages(area_src)) return 1; for (cpu = 0; cpu < nr_cpus; cpu++) { @@ -577,12 +590,12 @@ static int faulting_process(void) { unsigned long nr; unsigned long long count; + unsigned long split_nr_pages; -#ifndef HUGETLB_TEST - unsigned long split_nr_pages = (nr_pages + 1) / 2; -#else - unsigned long split_nr_pages = nr_pages; -#endif + if (test_type != TEST_HUGETLB) + split_nr_pages = (nr_pages + 1) / 2; + else + split_nr_pages = nr_pages; for (nr = 0; nr < split_nr_pages; nr++) { count = *area_count(area_dst, nr); @@ -594,7 +607,9 @@ static int faulting_process(void) } } -#ifndef HUGETLB_TEST + if (test_type == TEST_HUGETLB) + return 0; + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, area_src); if (area_dst == MAP_FAILED) @@ -610,7 +625,7 @@ static int faulting_process(void) } } - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; for (nr = 0; nr < nr_pages; nr++) { @@ -618,8 +633,6 @@ static int faulting_process(void) fprintf(stderr, "nr %lu is not zero\n", nr), exit(1); } -#endif /* HUGETLB_TEST */ - return 0; } @@ -627,7 +640,9 @@ static int uffdio_zeropage(int ufd, unsigned long offset) { struct uffdio_zeropage uffdio_zeropage; int ret; - unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE); + unsigned long has_zeropage; + + has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE); if (offset >= nr_pages * page_size) fprintf(stderr, "unexpected offset %lu\n", @@ -675,7 +690,7 @@ static int userfaultfd_zeropage_test(void) printf("testing UFFDIO_ZEROPAGE: "); fflush(stdout); - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; if (userfaultfd_open(0) < 0) @@ -686,7 +701,7 @@ static int userfaultfd_zeropage_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); - expected_ioctls = EXPECTED_IOCTLS; + expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) fprintf(stderr, @@ -716,7 +731,7 @@ static int userfaultfd_events_test(void) printf("testing events (fork, remap, remove): "); fflush(stdout); - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | @@ -731,7 +746,7 @@ static int userfaultfd_events_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); - expected_ioctls = EXPECTED_IOCTLS; + expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) fprintf(stderr, @@ -773,10 +788,10 @@ static int userfaultfd_stress(void) int err; unsigned long userfaults[nr_cpus]; - allocate_area((void **)&area_src); + uffd_test_ops->allocate_area((void **)&area_src); if (!area_src) return 1; - allocate_area((void **)&area_dst); + uffd_test_ops->allocate_area((void **)&area_dst); if (!area_dst) return 1; @@ -856,7 +871,7 @@ static int userfaultfd_stress(void) fprintf(stderr, "register failure\n"); return 1; } - expected_ioctls = EXPECTED_IOCTLS; + expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { fprintf(stderr, @@ -888,7 +903,7 @@ static int userfaultfd_stress(void) * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's * required to MADV_DONTNEED here. */ - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; /* bounce pass */ @@ -934,36 +949,6 @@ static int userfaultfd_stress(void) return userfaultfd_zeropage_test() || userfaultfd_events_test(); } -#ifndef HUGETLB_TEST - -int main(int argc, char **argv) -{ - if (argc < 3) - fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - page_size = sysconf(_SC_PAGE_SIZE); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) - fprintf(stderr, "Impossible to run this test\n"), exit(2); - nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / - nr_cpus; - if (!nr_pages_per_cpu) { - fprintf(stderr, "invalid MiB\n"); - fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); - } - bounces = atoi(argv[2]); - if (bounces <= 0) { - fprintf(stderr, "invalid bounces\n"); - fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); - } - nr_pages = nr_pages_per_cpu * nr_cpus; - printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - nr_pages, nr_pages_per_cpu); - return userfaultfd_stress(); -} - -#else /* HUGETLB_TEST */ - /* * Copied from mlock2-tests.c */ @@ -988,48 +973,78 @@ unsigned long default_huge_page_size(void) return hps; } -int main(int argc, char **argv) +static void set_test_type(const char *type) { - if (argc < 4) - fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"), - exit(1); - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - page_size = default_huge_page_size(); + if (!strcmp(type, "anon")) { + test_type = TEST_ANON; + uffd_test_ops = &anon_uffd_test_ops; + } else if (!strcmp(type, "hugetlb")) { + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + } else if (!strcmp(type, "shmem")) { + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; + } else { + fprintf(stderr, "Unknown test type: %s\n", type), exit(1); + } + + if (test_type == TEST_HUGETLB) + page_size = default_huge_page_size(); + else + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) - fprintf(stderr, "Unable to determine huge page size\n"), + fprintf(stderr, "Unable to determine page size\n"), exit(2); if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 > page_size) fprintf(stderr, "Impossible to run this test\n"), exit(2); - nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / +} + +int main(int argc, char **argv) +{ + if (argc < 4) + fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"), + exit(1); + + set_test_type(argv[1]); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / nr_cpus; if (!nr_pages_per_cpu) { fprintf(stderr, "invalid MiB\n"); fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); } - bounces = atoi(argv[2]); + + bounces = atoi(argv[3]); if (bounces <= 0) { fprintf(stderr, "invalid bounces\n"); fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); } nr_pages = nr_pages_per_cpu * nr_cpus; - huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755); - if (huge_fd < 0) { - fprintf(stderr, "Open of %s failed", argv[3]); - perror("open"); - exit(1); - } - if (ftruncate(huge_fd, 0)) { - fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); - perror("ftruncate"); - exit(1); + + if (test_type == TEST_HUGETLB) { + if (argc < 5) + fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"), + exit(1); + huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); + if (huge_fd < 0) { + fprintf(stderr, "Open of %s failed", argv[3]); + perror("open"); + exit(1); + } + if (ftruncate(huge_fd, 0)) { + fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); + perror("ftruncate"); + exit(1); + } } printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", nr_pages, nr_pages_per_cpu); return userfaultfd_stress(); } -#endif #else /* __NR_userfaultfd */ #warning "missing __NR_userfaultfd definition" diff --git a/usr/Kconfig b/usr/Kconfig index 6278f135256d..c0c48507e44e 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -21,6 +21,16 @@ config INITRAMFS_SOURCE If you are not sure, leave it blank. +config INITRAMFS_FORCE + bool "Ignore the initramfs passed by the bootloader" + depends on CMDLINE_EXTEND || CMDLINE_FORCE + help + This option causes the kernel to ignore the initramfs image + (or initrd image) passed to it by the bootloader. This is + analogous to CMDLINE_FORCE, which is found on some architectures, + and is useful if you cannot or don't want to change the image + your bootloader passes to the kernel. + config INITRAMFS_ROOT_UID int "User ID to map to 0 (user root)" depends on INITRAMFS_SOURCE!="" |