summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorAl Viro <viro@zeniv.linux.org.uk>2014-12-08 20:39:29 -0500
committerAl Viro <viro@zeniv.linux.org.uk>2014-12-08 20:39:29 -0500
commitba00410b8131b23edfb0e09f8b6dd26c8eb621fb (patch)
treec08504e4d2fa51ac91cef544f336d0169806c49f /mm
parent8ce74dd6057832618957fc2cbd38fa959c3a0a6c (diff)
parentaa583096d9767892983332e7c1a984bd17e3cd39 (diff)
Merge branch 'iov_iter' into for-next
Diffstat (limited to 'mm')
-rw-r--r--mm/balloon_compaction.c2
-rw-r--r--mm/bootmem.c9
-rw-r--r--mm/cma.c68
-rw-r--r--mm/compaction.c21
-rw-r--r--mm/huge_memory.c15
-rw-r--r--mm/internal.h25
-rw-r--r--mm/iov_iter.c1062
-rw-r--r--mm/memcontrol.c105
-rw-r--r--mm/memory.c1
-rw-r--r--mm/memory_hotplug.c31
-rw-r--r--mm/mmap.c8
-rw-r--r--mm/nobootmem.c8
-rw-r--r--mm/page-writeback.c43
-rw-r--r--mm/page_alloc.c68
-rw-r--r--mm/page_cgroup.c1
-rw-r--r--mm/page_isolation.c43
-rw-r--r--mm/rmap.c88
-rw-r--r--mm/slab_common.c14
-rw-r--r--mm/truncate.c6
19 files changed, 762 insertions, 856 deletions
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index b3cbe19f71b5..fcad8322ef36 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -68,11 +68,13 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
* to be released by the balloon driver.
*/
if (trylock_page(page)) {
+#ifdef CONFIG_BALLOON_COMPACTION
if (!PagePrivate(page)) {
/* raced with isolation */
unlock_page(page);
continue;
}
+#endif
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
balloon_page_delete(page);
__count_vm_event(BALLOON_DEFLATE);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8a000cebb0d7..477be696511d 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -243,13 +243,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
static int reset_managed_pages_done __initdata;
-static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
+void reset_node_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
- if (reset_managed_pages_done)
- return;
-
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
z->managed_pages = 0;
}
@@ -258,8 +255,12 @@ void __init reset_all_zones_managed_pages(void)
{
struct pglist_data *pgdat;
+ if (reset_managed_pages_done)
+ return;
+
for_each_online_pgdat(pgdat)
reset_node_managed_pages(pgdat);
+
reset_managed_pages_done = 1;
}
diff --git a/mm/cma.c b/mm/cma.c
index 963bc4add9af..fde706e1284f 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -124,6 +124,7 @@ static int __init cma_activate_area(struct cma *cma)
err:
kfree(cma->bitmap);
+ cma->count = 0;
return -EINVAL;
}
@@ -217,9 +218,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
phys_addr_t highmem_start = __pa(high_memory);
int ret = 0;
- pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
- __func__, (unsigned long)size, (unsigned long)base,
- (unsigned long)limit, (unsigned long)alignment);
+ pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
+ __func__, &size, &base, &limit, &alignment);
if (cma_area_count == ARRAY_SIZE(cma_areas)) {
pr_err("Not enough slots for CMA reserved regions!\n");
@@ -244,52 +244,72 @@ int __init cma_declare_contiguous(phys_addr_t base,
size = ALIGN(size, alignment);
limit &= ~(alignment - 1);
+ if (!base)
+ fixed = false;
+
/* size should be aligned with order_per_bit */
if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
return -EINVAL;
/*
- * adjust limit to avoid crossing low/high memory boundary for
- * automatically allocated regions
+ * If allocating at a fixed base the request region must not cross the
+ * low/high memory boundary.
*/
- if (((limit == 0 || limit > memblock_end) &&
- (memblock_end - size < highmem_start &&
- memblock_end > highmem_start)) ||
- (!fixed && limit > highmem_start && limit - size < highmem_start)) {
- limit = highmem_start;
- }
-
- if (fixed && base < highmem_start && base+size > highmem_start) {
+ if (fixed && base < highmem_start && base + size > highmem_start) {
ret = -EINVAL;
- pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n",
- (unsigned long)base, (unsigned long)highmem_start);
+ pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
+ &base, &highmem_start);
goto err;
}
+ /*
+ * If the limit is unspecified or above the memblock end, its effective
+ * value will be the memblock end. Set it explicitly to simplify further
+ * checks.
+ */
+ if (limit == 0 || limit > memblock_end)
+ limit = memblock_end;
+
/* Reserve memory */
- if (base && fixed) {
+ if (fixed) {
if (memblock_is_region_reserved(base, size) ||
memblock_reserve(base, size) < 0) {
ret = -EBUSY;
goto err;
}
} else {
- phys_addr_t addr = memblock_alloc_range(size, alignment, base,
- limit);
+ phys_addr_t addr = 0;
+
+ /*
+ * All pages in the reserved area must come from the same zone.
+ * If the requested region crosses the low/high memory boundary,
+ * try allocating from high memory first and fall back to low
+ * memory in case of failure.
+ */
+ if (base < highmem_start && limit > highmem_start) {
+ addr = memblock_alloc_range(size, alignment,
+ highmem_start, limit);
+ limit = highmem_start;
+ }
+
if (!addr) {
- ret = -ENOMEM;
- goto err;
- } else {
- base = addr;
+ addr = memblock_alloc_range(size, alignment, base,
+ limit);
+ if (!addr) {
+ ret = -ENOMEM;
+ goto err;
+ }
}
+
+ base = addr;
}
ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma);
if (ret)
goto err;
- pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M,
- (unsigned long)base);
+ pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
+ &base);
return 0;
err:
diff --git a/mm/compaction.c b/mm/compaction.c
index edba18aed173..f9792ba3537c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -479,6 +479,16 @@ isolate_freepages_range(struct compact_control *cc,
block_end_pfn = min(block_end_pfn, end_pfn);
+ /*
+ * pfn could pass the block_end_pfn if isolated freepage
+ * is more than pageblock order. In this case, we adjust
+ * scanning range to right one.
+ */
+ if (pfn >= block_end_pfn) {
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+ }
+
if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
break;
@@ -784,6 +794,9 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
cc->nr_migratepages = 0;
break;
}
+
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+ break;
}
acct_isolated(cc->zone, cc);
@@ -1026,8 +1039,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
acct_isolated(zone, cc);
- /* Record where migration scanner will be restarted */
- cc->migrate_pfn = low_pfn;
+ /*
+ * Record where migration scanner will be restarted. If we end up in
+ * the same pageblock as the free scanner, make the scanners fully
+ * meet so that compact_finished() terminates compaction.
+ */
+ cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 74c78aa8bc2f..de984159cf0b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -200,7 +200,7 @@ retry:
preempt_disable();
if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
preempt_enable();
- __free_page(zero_page);
+ __free_pages(zero_page, compound_order(zero_page));
goto retry;
}
@@ -232,7 +232,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
- __free_page(zero_page);
+ __free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
@@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- if (unlikely(khugepaged_enter(vma)))
+ if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE) &&
transparent_hugepage_use_zero_page()) {
@@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
* register it here without waiting a page fault that
* may not happen any time soon.
*/
- if (unlikely(khugepaged_enter_vma_merge(vma)))
+ if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
return -ENOMEM;
break;
case MADV_NOHUGEPAGE:
@@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm)
return 0;
}
-int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
unsigned long hstart, hend;
if (!vma->anon_vma)
@@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
+ VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
- return khugepaged_enter(vma);
+ return khugepaged_enter(vma, vm_flags);
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 829304090b90..a4f90ba7068e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -108,6 +108,31 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/page_alloc.c
*/
+
+/*
+ * Locate the struct page for both the matching buddy in our
+ * pair (buddy1) and the combined O(n+1) page they form (page).
+ *
+ * 1) Any buddy B1 will have an order O twin B2 which satisfies
+ * the following equation:
+ * B2 = B1 ^ (1 << O)
+ * For example, if the starting buddy (buddy2) is #8 its order
+ * 1 buddy is #10:
+ * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
+ *
+ * 2) Any buddy B will have an order O+1 parent P which
+ * satisfies the following equation:
+ * P = B & ~(1 << O)
+ *
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ */
+static inline unsigned long
+__find_buddy_index(unsigned long page_idx, unsigned int order)
+{
+ return page_idx ^ (1 << order);
+}
+
+extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index eafcf60f6b83..a1599ca4ab0e 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -3,95 +3,136 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-
-static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- left = __copy_to_user(buf, from, copy);
- copy -= left;
- skip += copy;
- from += copy;
- bytes -= copy;
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_to_user(buf, from, copy);
- copy -= left;
- skip = copy;
- from += copy;
- bytes -= copy;
- }
-
- if (skip == iov->iov_len) {
- iov++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-
-static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- left = __copy_from_user(to, buf, copy);
- copy -= left;
- skip += copy;
- to += copy;
- bytes -= copy;
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_from_user(to, buf, copy);
- copy -= left;
- skip = copy;
- to += copy;
- bytes -= copy;
- }
-
- if (skip == iov->iov_len) {
- iov++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
+#include <net/checksum.h>
+
+#define iterate_iovec(i, n, __v, __p, skip, STEP) { \
+ size_t left; \
+ size_t wanted = n; \
+ __p = i->iov; \
+ __v.iov_len = min(n, __p->iov_len - skip); \
+ if (likely(__v.iov_len)) { \
+ __v.iov_base = __p->iov_base + skip; \
+ left = (STEP); \
+ __v.iov_len -= left; \
+ skip += __v.iov_len; \
+ n -= __v.iov_len; \
+ } else { \
+ left = 0; \
+ } \
+ while (unlikely(!left && n)) { \
+ __p++; \
+ __v.iov_len = min(n, __p->iov_len); \
+ if (unlikely(!__v.iov_len)) \
+ continue; \
+ __v.iov_base = __p->iov_base; \
+ left = (STEP); \
+ __v.iov_len -= left; \
+ skip = __v.iov_len; \
+ n -= __v.iov_len; \
+ } \
+ n = wanted - n; \
+}
+
+#define iterate_kvec(i, n, __v, __p, skip, STEP) { \
+ size_t wanted = n; \
+ __p = i->kvec; \
+ __v.iov_len = min(n, __p->iov_len - skip); \
+ if (likely(__v.iov_len)) { \
+ __v.iov_base = __p->iov_base + skip; \
+ (void)(STEP); \
+ skip += __v.iov_len; \
+ n -= __v.iov_len; \
+ } \
+ while (unlikely(n)) { \
+ __p++; \
+ __v.iov_len = min(n, __p->iov_len); \
+ if (unlikely(!__v.iov_len)) \
+ continue; \
+ __v.iov_base = __p->iov_base; \
+ (void)(STEP); \
+ skip = __v.iov_len; \
+ n -= __v.iov_len; \
+ } \
+ n = wanted; \
+}
+
+#define iterate_bvec(i, n, __v, __p, skip, STEP) { \
+ size_t wanted = n; \
+ __p = i->bvec; \
+ __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \
+ if (likely(__v.bv_len)) { \
+ __v.bv_page = __p->bv_page; \
+ __v.bv_offset = __p->bv_offset + skip; \
+ (void)(STEP); \
+ skip += __v.bv_len; \
+ n -= __v.bv_len; \
+ } \
+ while (unlikely(n)) { \
+ __p++; \
+ __v.bv_len = min_t(size_t, n, __p->bv_len); \
+ if (unlikely(!__v.bv_len)) \
+ continue; \
+ __v.bv_page = __p->bv_page; \
+ __v.bv_offset = __p->bv_offset; \
+ (void)(STEP); \
+ skip = __v.bv_len; \
+ n -= __v.bv_len; \
+ } \
+ n = wanted; \
+}
+
+#define iterate_all_kinds(i, n, v, I, B, K) { \
+ size_t skip = i->iov_offset; \
+ if (unlikely(i->type & ITER_BVEC)) { \
+ const struct bio_vec *bvec; \
+ struct bio_vec v; \
+ iterate_bvec(i, n, v, bvec, skip, (B)) \
+ } else if (unlikely(i->type & ITER_KVEC)) { \
+ const struct kvec *kvec; \
+ struct kvec v; \
+ iterate_kvec(i, n, v, kvec, skip, (K)) \
+ } else { \
+ const struct iovec *iov; \
+ struct iovec v; \
+ iterate_iovec(i, n, v, iov, skip, (I)) \
+ } \
+}
+
+#define iterate_and_advance(i, n, v, I, B, K) { \
+ size_t skip = i->iov_offset; \
+ if (unlikely(i->type & ITER_BVEC)) { \
+ const struct bio_vec *bvec; \
+ struct bio_vec v; \
+ iterate_bvec(i, n, v, bvec, skip, (B)) \
+ if (skip == bvec->bv_len) { \
+ bvec++; \
+ skip = 0; \
+ } \
+ i->nr_segs -= bvec - i->bvec; \
+ i->bvec = bvec; \
+ } else if (unlikely(i->type & ITER_KVEC)) { \
+ const struct kvec *kvec; \
+ struct kvec v; \
+ iterate_kvec(i, n, v, kvec, skip, (K)) \
+ if (skip == kvec->iov_len) { \
+ kvec++; \
+ skip = 0; \
+ } \
+ i->nr_segs -= kvec - i->kvec; \
+ i->kvec = kvec; \
+ } else { \
+ const struct iovec *iov; \
+ struct iovec v; \
+ iterate_iovec(i, n, v, iov, skip, (I)) \
+ if (skip == iov->iov_len) { \
+ iov++; \
+ skip = 0; \
+ } \
+ i->nr_segs -= iov - i->iov; \
+ i->iov = iov; \
+ } \
+ i->count -= n; \
+ i->iov_offset = skip; \
}
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
@@ -256,134 +297,6 @@ done:
return wanted - bytes;
}
-static size_t zero_iovec(size_t bytes, struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- left = __clear_user(buf, copy);
- copy -= left;
- skip += copy;
- bytes -= copy;
-
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __clear_user(buf, copy);
- copy -= left;
- skip = copy;
- bytes -= copy;
- }
-
- if (skip == iov->iov_len) {
- iov++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- size_t copied = 0, left = 0;
-
- while (bytes) {
- char __user *buf = iov->iov_base + base;
- int copy = min(bytes, iov->iov_len - base);
-
- base = 0;
- left = __copy_from_user_inatomic(vaddr, buf, copy);
- copied += copy;
- bytes -= copy;
- vaddr += copy;
- iov++;
-
- if (unlikely(left))
- break;
- }
- return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied. If a fault is encountered then return the number of
- * bytes which were copied.
- */
-static size_t copy_from_user_atomic_iovec(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- kaddr = kmap_atomic(page);
- if (likely(i->nr_segs == 1)) {
- int left;
- char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
- copied = bytes - left;
- } else {
- copied = __iovec_copy_from_user_inatomic(kaddr + offset,
- i->iov, i->iov_offset, bytes);
- }
- kunmap_atomic(kaddr);
-
- return copied;
-}
-
-static void advance_iovec(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- if (likely(i->nr_segs == 1)) {
- i->iov_offset += bytes;
- i->count -= bytes;
- } else {
- const struct iovec *iov = i->iov;
- size_t base = i->iov_offset;
- unsigned long nr_segs = i->nr_segs;
-
- /*
- * The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments (without overruning the iovec).
- */
- while (bytes || unlikely(i->count && !iov->iov_len)) {
- int copy;
-
- copy = min(bytes, iov->iov_len - base);
- BUG_ON(!i->count || i->count < copy);
- i->count -= copy;
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- nr_segs--;
- base = 0;
- }
- }
- i->iov = iov;
- i->iov_offset = base;
- i->nr_segs = nr_segs;
- }
-}
-
/*
* Fault in the first iovec of the given iov_iter, to a maximum length
* of bytes. Returns 0 on success, or non-zero if the memory could not be
@@ -395,7 +308,7 @@ static void advance_iovec(struct iov_iter *i, size_t bytes)
*/
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
{
- if (!(i->type & ITER_BVEC)) {
+ if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
char __user *buf = i->iov->iov_base + i->iov_offset;
bytes = min(bytes, i->iov->iov_len - i->iov_offset);
return fault_in_pages_readable(buf, bytes);
@@ -404,136 +317,25 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);
-static unsigned long alignment_iovec(const struct iov_iter *i)
-{
- const struct iovec *iov = i->iov;
- unsigned long res;
- size_t size = i->count;
- size_t n;
-
- if (!size)
- return 0;
-
- res = (unsigned long)iov->iov_base + i->iov_offset;
- n = iov->iov_len - i->iov_offset;
- if (n >= size)
- return res | size;
- size -= n;
- res |= n;
- while (size > (++iov)->iov_len) {
- res |= (unsigned long)iov->iov_base | iov->iov_len;
- size -= iov->iov_len;
- }
- res |= (unsigned long)iov->iov_base | size;
- return res;
-}
-
void iov_iter_init(struct iov_iter *i, int direction,
const struct iovec *iov, unsigned long nr_segs,
size_t count)
{
/* It will get better. Eventually... */
- if (segment_eq(get_fs(), KERNEL_DS))
+ if (segment_eq(get_fs(), KERNEL_DS)) {
direction |= ITER_KVEC;
- i->type = direction;
- i->iov = iov;
+ i->type = direction;
+ i->kvec = (struct kvec *)iov;
+ } else {
+ i->type = direction;
+ i->iov = iov;
+ }
i->nr_segs = nr_segs;
i->iov_offset = 0;
i->count = count;
}
EXPORT_SYMBOL(iov_iter_init);
-static ssize_t get_pages_iovec(struct iov_iter *i,
- struct page **pages, size_t maxsize, unsigned maxpages,
- size_t *start)
-{
- size_t offset = i->iov_offset;
- const struct iovec *iov = i->iov;
- size_t len;
- unsigned long addr;
- int n;
- int res;
-
- len = iov->iov_len - offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- addr = (unsigned long)iov->iov_base + offset;
- len += *start = addr & (PAGE_SIZE - 1);
- if (len > maxpages * PAGE_SIZE)
- len = maxpages * PAGE_SIZE;
- addr &= ~(PAGE_SIZE - 1);
- n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
- res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
- if (unlikely(res < 0))
- return res;
- return (res == n ? len : res * PAGE_SIZE) - *start;
-}
-
-static ssize_t get_pages_alloc_iovec(struct iov_iter *i,
- struct page ***pages, size_t maxsize,
- size_t *start)
-{
- size_t offset = i->iov_offset;
- const struct iovec *iov = i->iov;
- size_t len;
- unsigned long addr;
- void *p;
- int n;
- int res;
-
- len = iov->iov_len - offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- addr = (unsigned long)iov->iov_base + offset;
- len += *start = addr & (PAGE_SIZE - 1);
- addr &= ~(PAGE_SIZE - 1);
- n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
-
- p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
- if (!p)
- p = vmalloc(n * sizeof(struct page *));
- if (!p)
- return -ENOMEM;
-
- res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
- if (unlikely(res < 0)) {
- kvfree(p);
- return res;
- }
- *pages = p;
- return (res == n ? len : res * PAGE_SIZE) - *start;
-}
-
-static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages)
-{
- size_t offset = i->iov_offset;
- size_t size = i->count;
- const struct iovec *iov = i->iov;
- int npages = 0;
- int n;
-
- for (n = 0; size && n < i->nr_segs; n++, iov++) {
- unsigned long addr = (unsigned long)iov->iov_base + offset;
- size_t len = iov->iov_len - offset;
- offset = 0;
- if (unlikely(!len)) /* empty segment */
- continue;
- if (len > size)
- len = size;
- npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE
- - addr / PAGE_SIZE;
- if (npages >= maxpages) /* don't bother going further */
- return maxpages;
- size -= len;
- offset = 0;
- }
- return min(npages, maxpages);
-}
-
static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
{
char *from = kmap_atomic(page);
@@ -555,293 +357,78 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
kunmap_atomic(addr);
}
-static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i)
+size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
{
- size_t skip, copy, wanted;
- const struct bio_vec *bvec;
-
+ char *from = addr;
if (unlikely(bytes > i->count))
bytes = i->count;
if (unlikely(!bytes))
return 0;
- wanted = bytes;
- bvec = i->bvec;
- skip = i->iov_offset;
- copy = min_t(size_t, bytes, bvec->bv_len - skip);
+ iterate_and_advance(i, bytes, v,
+ __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
+ v.iov_len),
+ memcpy_to_page(v.bv_page, v.bv_offset,
+ (from += v.bv_len) - v.bv_len, v.bv_len),
+ memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
+ )
- memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy);
- skip += copy;
- from += copy;
- bytes -= copy;
- while (bytes) {
- bvec++;
- copy = min(bytes, (size_t)bvec->bv_len);
- memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy);
- skip = copy;
- from += copy;
- bytes -= copy;
- }
- if (skip == bvec->bv_len) {
- bvec++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= bvec - i->bvec;
- i->bvec = bvec;
- i->iov_offset = skip;
- return wanted - bytes;
+ return bytes;
}
+EXPORT_SYMBOL(copy_to_iter);
-static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i)
+size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
- size_t skip, copy, wanted;
- const struct bio_vec *bvec;
-
+ char *to = addr;
if (unlikely(bytes > i->count))
bytes = i->count;
if (unlikely(!bytes))
return 0;
- wanted = bytes;
- bvec = i->bvec;
- skip = i->iov_offset;
-
- copy = min(bytes, bvec->bv_len - skip);
-
- memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy);
-
- to += copy;
- skip += copy;
- bytes -= copy;
-
- while (bytes) {
- bvec++;
- copy = min(bytes, (size_t)bvec->bv_len);
- memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy);
- skip = copy;
- to += copy;
- bytes -= copy;
- }
- if (skip == bvec->bv_len) {
- bvec++;
- skip = 0;
- }
- i->count -= wanted;
- i->nr_segs -= bvec - i->bvec;
- i->bvec = bvec;
- i->iov_offset = skip;
- return wanted;
-}
-
-static size_t copy_page_to_iter_bvec(struct page *page, size_t offset,
- size_t bytes, struct iov_iter *i)
-{
- void *kaddr = kmap_atomic(page);
- size_t wanted = copy_to_iter_bvec(kaddr + offset, bytes, i);
- kunmap_atomic(kaddr);
- return wanted;
-}
+ iterate_and_advance(i, bytes, v,
+ __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
+ v.iov_len),
+ memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len),
+ memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ )
-static size_t copy_page_from_iter_bvec(struct page *page, size_t offset,
- size_t bytes, struct iov_iter *i)
-{
- void *kaddr = kmap_atomic(page);
- size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i);
- kunmap_atomic(kaddr);
- return wanted;
+ return bytes;
}
+EXPORT_SYMBOL(copy_from_iter);
-static size_t zero_bvec(size_t bytes, struct iov_iter *i)
+size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
- size_t skip, copy, wanted;
- const struct bio_vec *bvec;
-
+ char *to = addr;
if (unlikely(bytes > i->count))
bytes = i->count;
if (unlikely(!bytes))
return 0;
- wanted = bytes;
- bvec = i->bvec;
- skip = i->iov_offset;
- copy = min_t(size_t, bytes, bvec->bv_len - skip);
+ iterate_and_advance(i, bytes, v,
+ __copy_from_user_nocache((to += v.iov_len) - v.iov_len,
+ v.iov_base, v.iov_len),
+ memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len),
+ memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ )
- memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy);
- skip += copy;
- bytes -= copy;
- while (bytes) {
- bvec++;
- copy = min(bytes, (size_t)bvec->bv_len);
- memzero_page(bvec->bv_page, bvec->bv_offset, copy);
- skip = copy;
- bytes -= copy;
- }
- if (skip == bvec->bv_len) {
- bvec++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= bvec - i->bvec;
- i->bvec = bvec;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-
-static size_t copy_from_user_bvec(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t left;
- const struct bio_vec *bvec;
- size_t base = i->iov_offset;
-
- kaddr = kmap_atomic(page);
- for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) {
- size_t copy = min(left, bvec->bv_len - base);
- if (!bvec->bv_len)
- continue;
- memcpy_from_page(kaddr + offset, bvec->bv_page,
- bvec->bv_offset + base, copy);
- offset += copy;
- left -= copy;
- }
- kunmap_atomic(kaddr);
return bytes;
}
-
-static void advance_bvec(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- if (likely(i->nr_segs == 1)) {
- i->iov_offset += bytes;
- i->count -= bytes;
- } else {
- const struct bio_vec *bvec = i->bvec;
- size_t base = i->iov_offset;
- unsigned long nr_segs = i->nr_segs;
-
- /*
- * The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments (without overruning the iovec).
- */
- while (bytes || unlikely(i->count && !bvec->bv_len)) {
- int copy;
-
- copy = min(bytes, bvec->bv_len - base);
- BUG_ON(!i->count || i->count < copy);
- i->count -= copy;
- bytes -= copy;
- base += copy;
- if (bvec->bv_len == base) {
- bvec++;
- nr_segs--;
- base = 0;
- }
- }
- i->bvec = bvec;
- i->iov_offset = base;
- i->nr_segs = nr_segs;
- }
-}
-
-static unsigned long alignment_bvec(const struct iov_iter *i)
-{
- const struct bio_vec *bvec = i->bvec;
- unsigned long res;
- size_t size = i->count;
- size_t n;
-
- if (!size)
- return 0;
-
- res = bvec->bv_offset + i->iov_offset;
- n = bvec->bv_len - i->iov_offset;
- if (n >= size)
- return res | size;
- size -= n;
- res |= n;
- while (size > (++bvec)->bv_len) {
- res |= bvec->bv_offset | bvec->bv_len;
- size -= bvec->bv_len;
- }
- res |= bvec->bv_offset | size;
- return res;
-}
-
-static ssize_t get_pages_bvec(struct iov_iter *i,
- struct page **pages, size_t maxsize, unsigned maxpages,
- size_t *start)
-{
- const struct bio_vec *bvec = i->bvec;
- size_t len = bvec->bv_len - i->iov_offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- /* can't be more than PAGE_SIZE */
- *start = bvec->bv_offset + i->iov_offset;
-
- get_page(*pages = bvec->bv_page);
-
- return len;
-}
-
-static ssize_t get_pages_alloc_bvec(struct iov_iter *i,
- struct page ***pages, size_t maxsize,
- size_t *start)
-{
- const struct bio_vec *bvec = i->bvec;
- size_t len = bvec->bv_len - i->iov_offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- *start = bvec->bv_offset + i->iov_offset;
-
- *pages = kmalloc(sizeof(struct page *), GFP_KERNEL);
- if (!*pages)
- return -ENOMEM;
-
- get_page(**pages = bvec->bv_page);
-
- return len;
-}
-
-static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages)
-{
- size_t offset = i->iov_offset;
- size_t size = i->count;
- const struct bio_vec *bvec = i->bvec;
- int npages = 0;
- int n;
-
- for (n = 0; size && n < i->nr_segs; n++, bvec++) {
- size_t len = bvec->bv_len - offset;
- offset = 0;
- if (unlikely(!len)) /* empty segment */
- continue;
- if (len > size)
- len = size;
- npages++;
- if (npages >= maxpages) /* don't bother going further */
- return maxpages;
- size -= len;
- offset = 0;
- }
- return min(npages, maxpages);
-}
+EXPORT_SYMBOL(copy_from_iter_nocache);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
- if (i->type & ITER_BVEC)
- return copy_page_to_iter_bvec(page, offset, bytes, i);
- else
+ if (i->type & (ITER_BVEC|ITER_KVEC)) {
+ void *kaddr = kmap_atomic(page);
+ size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
+ kunmap_atomic(kaddr);
+ return wanted;
+ } else
return copy_page_to_iter_iovec(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_to_iter);
@@ -849,57 +436,53 @@ EXPORT_SYMBOL(copy_page_to_iter);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
- if (i->type & ITER_BVEC)
- return copy_page_from_iter_bvec(page, offset, bytes, i);
- else
+ if (i->type & (ITER_BVEC|ITER_KVEC)) {
+ void *kaddr = kmap_atomic(page);
+ size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
+ kunmap_atomic(kaddr);
+ return wanted;
+ } else
return copy_page_from_iter_iovec(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_from_iter);
-size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
+size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
- if (i->type & ITER_BVEC)
- return copy_to_iter_bvec(addr, bytes, i);
- else
- return copy_to_iter_iovec(addr, bytes, i);
-}
-EXPORT_SYMBOL(copy_to_iter);
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
-size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
-{
- if (i->type & ITER_BVEC)
- return copy_from_iter_bvec(addr, bytes, i);
- else
- return copy_from_iter_iovec(addr, bytes, i);
-}
-EXPORT_SYMBOL(copy_from_iter);
+ if (unlikely(!bytes))
+ return 0;
-size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
-{
- if (i->type & ITER_BVEC) {
- return zero_bvec(bytes, i);
- } else {
- return zero_iovec(bytes, i);
- }
+ iterate_and_advance(i, bytes, v,
+ __clear_user(v.iov_base, v.iov_len),
+ memzero_page(v.bv_page, v.bv_offset, v.bv_len),
+ memset(v.iov_base, 0, v.iov_len)
+ )
+
+ return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);
size_t iov_iter_copy_from_user_atomic(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
{
- if (i->type & ITER_BVEC)
- return copy_from_user_bvec(page, i, offset, bytes);
- else
- return copy_from_user_atomic_iovec(page, i, offset, bytes);
+ char *kaddr = kmap_atomic(page), *p = kaddr + offset;
+ iterate_all_kinds(i, bytes, v,
+ __copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
+ v.iov_base, v.iov_len),
+ memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
+ v.bv_offset, v.bv_len),
+ memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+ )
+ kunmap_atomic(kaddr);
+ return bytes;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
void iov_iter_advance(struct iov_iter *i, size_t size)
{
- if (i->type & ITER_BVEC)
- advance_bvec(i, size);
- else
- advance_iovec(i, size);
+ iterate_and_advance(i, size, v, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -911,18 +494,39 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
if (i->nr_segs == 1)
return i->count;
else if (i->type & ITER_BVEC)
- return min(i->count, i->iov->iov_len - i->iov_offset);
- else
return min(i->count, i->bvec->bv_len - i->iov_offset);
+ else
+ return min(i->count, i->iov->iov_len - i->iov_offset);
}
EXPORT_SYMBOL(iov_iter_single_seg_count);
+void iov_iter_kvec(struct iov_iter *i, int direction,
+ const struct kvec *iov, unsigned long nr_segs,
+ size_t count)
+{
+ BUG_ON(!(direction & ITER_KVEC));
+ i->type = direction;
+ i->kvec = (struct kvec *)iov;
+ i->nr_segs = nr_segs;
+ i->iov_offset = 0;
+ i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_kvec);
+
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
- if (i->type & ITER_BVEC)
- return alignment_bvec(i);
- else
- return alignment_iovec(i);
+ unsigned long res = 0;
+ size_t size = i->count;
+
+ if (!size)
+ return 0;
+
+ iterate_all_kinds(i, size, v,
+ (res |= (unsigned long)v.iov_base | v.iov_len, 0),
+ res |= v.bv_offset | v.bv_len,
+ res |= (unsigned long)v.iov_base | v.iov_len
+ )
+ return res;
}
EXPORT_SYMBOL(iov_iter_alignment);
@@ -930,29 +534,207 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
{
- if (i->type & ITER_BVEC)
- return get_pages_bvec(i, pages, maxsize, maxpages, start);
- else
- return get_pages_iovec(i, pages, maxsize, maxpages, start);
+ if (maxsize > i->count)
+ maxsize = i->count;
+
+ if (!maxsize)
+ return 0;
+
+ iterate_all_kinds(i, maxsize, v, ({
+ unsigned long addr = (unsigned long)v.iov_base;
+ size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+ int n;
+ int res;
+
+ if (len > maxpages * PAGE_SIZE)
+ len = maxpages * PAGE_SIZE;
+ addr &= ~(PAGE_SIZE - 1);
+ n = DIV_ROUND_UP(len, PAGE_SIZE);
+ res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
+ if (unlikely(res < 0))
+ return res;
+ return (res == n ? len : res * PAGE_SIZE) - *start;
+ 0;}),({
+ /* can't be more than PAGE_SIZE */
+ *start = v.bv_offset;
+ get_page(*pages = v.bv_page);
+ return v.bv_len;
+ }),({
+ return -EFAULT;
+ })
+ )
+ return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages);
+static struct page **get_pages_array(size_t n)
+{
+ struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
+ if (!p)
+ p = vmalloc(n * sizeof(struct page *));
+ return p;
+}
+
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
struct page ***pages, size_t maxsize,
size_t *start)
{
- if (i->type & ITER_BVEC)
- return get_pages_alloc_bvec(i, pages, maxsize, start);
- else
- return get_pages_alloc_iovec(i, pages, maxsize, start);
+ struct page **p;
+
+ if (maxsize > i->count)
+ maxsize = i->count;
+
+ if (!maxsize)
+ return 0;
+
+ iterate_all_kinds(i, maxsize, v, ({
+ unsigned long addr = (unsigned long)v.iov_base;
+ size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+ int n;
+ int res;
+
+ addr &= ~(PAGE_SIZE - 1);
+ n = DIV_ROUND_UP(len, PAGE_SIZE);
+ p = get_pages_array(n);
+ if (!p)
+ return -ENOMEM;
+ res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
+ if (unlikely(res < 0)) {
+ kvfree(p);
+ return res;
+ }
+ *pages = p;
+ return (res == n ? len : res * PAGE_SIZE) - *start;
+ 0;}),({
+ /* can't be more than PAGE_SIZE */
+ *start = v.bv_offset;
+ *pages = p = get_pages_array(1);
+ if (!p)
+ return -ENOMEM;
+ get_page(*p = v.bv_page);
+ return v.bv_len;
+ }),({
+ return -EFAULT;
+ })
+ )
+ return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);
+size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
+ struct iov_iter *i)
+{
+ char *to = addr;
+ __wsum sum, next;
+ size_t off = 0;
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ sum = *csum;
+ iterate_and_advance(i, bytes, v, ({
+ int err = 0;
+ next = csum_and_copy_from_user(v.iov_base,
+ (to += v.iov_len) - v.iov_len,
+ v.iov_len, 0, &err);
+ if (!err) {
+ sum = csum_block_add(sum, next, off);
+ off += v.iov_len;
+ }
+ err ? v.iov_len : 0;
+ }), ({
+ char *p = kmap_atomic(v.bv_page);
+ next = csum_partial_copy_nocheck(p + v.bv_offset,
+ (to += v.bv_len) - v.bv_len,
+ v.bv_len, 0);
+ kunmap_atomic(p);
+ sum = csum_block_add(sum, next, off);
+ off += v.bv_len;
+ }),({
+ next = csum_partial_copy_nocheck(v.iov_base,
+ (to += v.iov_len) - v.iov_len,
+ v.iov_len, 0);
+ sum = csum_block_add(sum, next, off);
+ off += v.iov_len;
+ })
+ )
+ *csum = sum;
+ return bytes;
+}
+EXPORT_SYMBOL(csum_and_copy_from_iter);
+
+size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum,
+ struct iov_iter *i)
+{
+ char *from = addr;
+ __wsum sum, next;
+ size_t off = 0;
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ sum = *csum;
+ iterate_and_advance(i, bytes, v, ({
+ int err = 0;
+ next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
+ v.iov_base,
+ v.iov_len, 0, &err);
+ if (!err) {
+ sum = csum_block_add(sum, next, off);
+ off += v.iov_len;
+ }
+ err ? v.iov_len : 0;
+ }), ({
+ char *p = kmap_atomic(v.bv_page);
+ next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len,
+ p + v.bv_offset,
+ v.bv_len, 0);
+ kunmap_atomic(p);
+ sum = csum_block_add(sum, next, off);
+ off += v.bv_len;
+ }),({
+ next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len,
+ v.iov_base,
+ v.iov_len, 0);
+ sum = csum_block_add(sum, next, off);
+ off += v.iov_len;
+ })
+ )
+ *csum = sum;
+ return bytes;
+}
+EXPORT_SYMBOL(csum_and_copy_to_iter);
+
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
- if (i->type & ITER_BVEC)
- return iov_iter_npages_bvec(i, maxpages);
- else
- return iov_iter_npages_iovec(i, maxpages);
+ size_t size = i->count;
+ int npages = 0;
+
+ if (!size)
+ return 0;
+
+ iterate_all_kinds(i, size, v, ({
+ unsigned long p = (unsigned long)v.iov_base;
+ npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
+ - p / PAGE_SIZE;
+ if (npages >= maxpages)
+ return maxpages;
+ 0;}),({
+ npages++;
+ if (npages >= maxpages)
+ return maxpages;
+ }),({
+ unsigned long p = (unsigned long)v.iov_base;
+ npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
+ - p / PAGE_SIZE;
+ if (npages >= maxpages)
+ return maxpages;
+ })
+ )
+ return npages;
}
EXPORT_SYMBOL(iov_iter_npages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c3385181b16..ee48428cf8e3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
* start move here.
*/
-/* for quick checking without looking up memcg */
-atomic_t memcg_moving __read_mostly;
-
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
- atomic_inc(&memcg_moving);
atomic_inc(&memcg->moving_account);
synchronize_rcu();
}
@@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
* Now, mem_cgroup_clear_mc() may call this function with NULL.
* We check NULL in callee rather than caller.
*/
- if (memcg) {
- atomic_dec(&memcg_moving);
+ if (memcg)
atomic_dec(&memcg->moving_account);
- }
}
/*
@@ -2204,41 +2198,52 @@ cleanup:
return true;
}
-/*
- * Used to update mapped file or writeback or other statistics.
+/**
+ * mem_cgroup_begin_page_stat - begin a page state statistics transaction
+ * @page: page that is going to change accounted state
+ * @locked: &memcg->move_lock slowpath was taken
+ * @flags: IRQ-state flags for &memcg->move_lock
*
- * Notes: Race condition
+ * This function must mark the beginning of an accounted page state
+ * change to prevent double accounting when the page is concurrently
+ * being moved to another memcg:
*
- * Charging occurs during page instantiation, while the page is
- * unmapped and locked in page migration, or while the page table is
- * locked in THP migration. No race is possible.
+ * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ * if (TestClearPageState(page))
+ * mem_cgroup_update_page_stat(memcg, state, -1);
+ * mem_cgroup_end_page_stat(memcg, locked, flags);
*
- * Uncharge happens to pages with zero references, no race possible.
+ * The RCU lock is held throughout the transaction. The fast path can
+ * get away without acquiring the memcg->move_lock (@locked is false)
+ * because page moving starts with an RCU grace period.
*
- * Charge moving between groups is protected by checking mm->moving
- * account and taking the move_lock in the slowpath.
+ * The RCU lock also protects the memcg from being freed when the page
+ * state that is going to change is the only thing preventing the page
+ * from being uncharged. E.g. end-writeback clearing PageWriteback(),
+ * which allows migration to go ahead and uncharge the page before the
+ * account transaction might be complete.
*/
-
-void __mem_cgroup_begin_update_page_stat(struct page *page,
- bool *locked, unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
+ bool *locked,
+ unsigned long *flags)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc;
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
pc = lookup_page_cgroup(page);
again:
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
- return;
- /*
- * If this memory cgroup is not under account moving, we don't
- * need to take move_lock_mem_cgroup(). Because we already hold
- * rcu_read_lock(), any calls to move_account will be delayed until
- * rcu_read_unlock().
- */
- VM_BUG_ON(!rcu_read_lock_held());
+ return NULL;
+
+ *locked = false;
if (atomic_read(&memcg->moving_account) <= 0)
- return;
+ return memcg;
move_lock_mem_cgroup(memcg, flags);
if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
@@ -2246,36 +2251,40 @@ again:
goto again;
}
*locked = true;
+
+ return memcg;
}
-void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+/**
+ * mem_cgroup_end_page_stat - finish a page state statistics transaction
+ * @memcg: the memcg that was accounted against
+ * @locked: value received from mem_cgroup_begin_page_stat()
+ * @flags: value received from mem_cgroup_begin_page_stat()
+ */
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
+ unsigned long flags)
{
- struct page_cgroup *pc = lookup_page_cgroup(page);
+ if (memcg && locked)
+ move_unlock_mem_cgroup(memcg, &flags);
- /*
- * It's guaranteed that pc->mem_cgroup never changes while
- * lock is held because a routine modifies pc->mem_cgroup
- * should take move_lock_mem_cgroup().
- */
- move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+ rcu_read_unlock();
}
-void mem_cgroup_update_page_stat(struct page *page,
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx, int val)
{
- struct mem_cgroup *memcg;
- struct page_cgroup *pc = lookup_page_cgroup(page);
- unsigned long uninitialized_var(flags);
-
- if (mem_cgroup_disabled())
- return;
-
VM_BUG_ON(!rcu_read_lock_held());
- memcg = pc->mem_cgroup;
- if (unlikely(!memcg || !PageCgroupUsed(pc)))
- return;
- this_cpu_add(memcg->stat->count[idx], val);
+ if (memcg)
+ this_cpu_add(memcg->stat->count[idx], val);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 1cc6bfbd872e..3e503831e042 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1147,6 +1147,7 @@ again:
print_bad_pte(vma, addr, ptent, page);
if (unlikely(!__tlb_remove_page(tlb, page))) {
force_flush = 1;
+ addr += PAGE_SIZE;
break;
}
continue;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 29d8693d0c61..1bf4807cb21e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/tlbflush.h>
@@ -1066,6 +1067,16 @@ out:
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
+static void reset_node_present_pages(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ z->present_pages = 0;
+
+ pgdat->node_present_pages = 0;
+}
+
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
@@ -1096,6 +1107,21 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
build_all_zonelists(pgdat, NULL);
mutex_unlock(&zonelists_mutex);
+ /*
+ * zone->managed_pages is set to an approximate value in
+ * free_area_init_core(), which will cause
+ * /sys/device/system/node/nodeX/meminfo has wrong data.
+ * So reset it to 0 before any memory is onlined.
+ */
+ reset_node_managed_pages(pgdat);
+
+ /*
+ * When memory is hot-added, all the memory is in offline state. So
+ * clear all zones' present_pages because they will be updated in
+ * online_pages() and offline_pages().
+ */
+ reset_node_present_pages(pgdat);
+
return pgdat;
}
@@ -1912,7 +1938,6 @@ void try_offline_node(int nid)
unsigned long start_pfn = pgdat->node_start_pfn;
unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
unsigned long pfn;
- struct page *pgdat_page = virt_to_page(pgdat);
int i;
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -1941,10 +1966,6 @@ void try_offline_node(int nid)
node_set_offline(nid);
unregister_one_node(nid);
- if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
- /* node data is allocated from boot memory */
- return;
-
/* free waittable in each zone */
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7f855206e7fb..87e82b38453c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1080,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
end, prev->vm_pgoff, NULL);
if (err)
return NULL;
- khugepaged_enter_vma_merge(prev);
+ khugepaged_enter_vma_merge(prev, vm_flags);
return prev;
}
@@ -1099,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
next->vm_pgoff - pglen, NULL);
if (err)
return NULL;
- khugepaged_enter_vma_merge(area);
+ khugepaged_enter_vma_merge(area, vm_flags);
return area;
}
@@ -2208,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
vma_unlock_anon_vma(vma);
- khugepaged_enter_vma_merge(vma);
+ khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(vma->vm_mm);
return error;
}
@@ -2277,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
}
vma_unlock_anon_vma(vma);
- khugepaged_enter_vma_merge(vma);
+ khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(vma->vm_mm);
return error;
}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 7c7ab32ee503..90b50468333e 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -145,12 +145,10 @@ static unsigned long __init free_low_memory_core_early(void)
static int reset_managed_pages_done __initdata;
-static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
+void reset_node_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
- if (reset_managed_pages_done)
- return;
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
z->managed_pages = 0;
}
@@ -159,8 +157,12 @@ void __init reset_all_zones_managed_pages(void)
{
struct pglist_data *pgdat;
+ if (reset_managed_pages_done)
+ return;
+
for_each_online_pgdat(pgdat)
reset_node_managed_pages(pgdat);
+
reset_managed_pages_done = 1;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ff24c9d83112..19ceae87522d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2116,23 +2116,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
EXPORT_SYMBOL(account_page_dirtied);
/*
- * Helper function for set_page_writeback family.
- *
- * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
- * while calling this function.
- * See test_set_page_writeback for example.
- *
- * NOTE: Unlike account_page_dirtied this does not rely on being atomic
- * wrt interrupts.
- */
-void account_page_writeback(struct page *page)
-{
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
- inc_zone_page_state(page, NR_WRITEBACK);
-}
-EXPORT_SYMBOL(account_page_writeback);
-
-/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
@@ -2344,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- int ret;
- bool locked;
unsigned long memcg_flags;
+ struct mem_cgroup *memcg;
+ bool locked;
+ int ret;
- mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2369,22 +2353,23 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- int ret;
- bool locked;
unsigned long memcg_flags;
+ struct mem_cgroup *memcg;
+ bool locked;
+ int ret;
- mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2410,9 +2395,11 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
} else {
ret = TestSetPageWriteback(page);
}
- if (!ret)
- account_page_writeback(page);
- mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
+ if (!ret) {
+ mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITEBACK);
+ }
+ mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9cd36b822444..616a2c956b4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -467,29 +467,6 @@ static inline void rmv_page_order(struct page *page)
}
/*
- * Locate the struct page for both the matching buddy in our
- * pair (buddy1) and the combined O(n+1) page they form (page).
- *
- * 1) Any buddy B1 will have an order O twin B2 which satisfies
- * the following equation:
- * B2 = B1 ^ (1 << O)
- * For example, if the starting buddy (buddy2) is #8 its order
- * 1 buddy is #10:
- * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
- *
- * 2) Any buddy B will have an order O+1 parent P which
- * satisfies the following equation:
- * P = B & ~(1 << O)
- *
- * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
- */
-static inline unsigned long
-__find_buddy_index(unsigned long page_idx, unsigned int order)
-{
- return page_idx ^ (1 << order);
-}
-
-/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
* (a) the buddy is not in a hole &&
@@ -569,6 +546,7 @@ static inline void __free_one_page(struct page *page,
unsigned long combined_idx;
unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
+ int max_order = MAX_ORDER;
VM_BUG_ON(!zone_is_initialized(zone));
@@ -577,13 +555,24 @@ static inline void __free_one_page(struct page *page,
return;
VM_BUG_ON(migratetype == -1);
+ if (is_migrate_isolate(migratetype)) {
+ /*
+ * We restrict max order of merging to prevent merge
+ * between freepages on isolate pageblock and normal
+ * pageblock. Without this, pageblock isolation
+ * could cause incorrect freepage accounting.
+ */
+ max_order = min(MAX_ORDER, pageblock_order + 1);
+ } else {
+ __mod_zone_freepage_state(zone, 1 << order, migratetype);
+ }
- page_idx = pfn & ((1 << MAX_ORDER) - 1);
+ page_idx = pfn & ((1 << max_order) - 1);
VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
- while (order < MAX_ORDER-1) {
+ while (order < max_order - 1) {
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
@@ -594,9 +583,11 @@ static inline void __free_one_page(struct page *page,
*/
if (page_is_guard(buddy)) {
clear_page_guard_flag(buddy);
- set_page_private(page, 0);
- __mod_zone_freepage_state(zone, 1 << order,
- migratetype);
+ set_page_private(buddy, 0);
+ if (!is_migrate_isolate(migratetype)) {
+ __mod_zone_freepage_state(zone, 1 << order,
+ migratetype);
+ }
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
@@ -715,14 +706,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
mt = get_freepage_migratetype(page);
+ if (unlikely(has_isolate_pageblock(zone)))
+ mt = get_pageblock_migratetype(page);
+
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
- if (likely(!is_migrate_isolate_page(page))) {
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
- if (is_migrate_cma(mt))
- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
- }
} while (--to_free && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
@@ -739,9 +728,11 @@ static void free_one_page(struct zone *zone,
if (nr_scanned)
__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ if (unlikely(has_isolate_pageblock(zone) ||
+ is_migrate_isolate(migratetype))) {
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ }
__free_one_page(page, pfn, zone, order, migratetype);
- if (unlikely(!is_migrate_isolate(migratetype)))
- __mod_zone_freepage_state(zone, 1 << order, migratetype);
spin_unlock(&zone->lock);
}
@@ -1484,7 +1475,7 @@ void split_page(struct page *page, unsigned int order)
}
EXPORT_SYMBOL_GPL(split_page);
-static int __isolate_free_page(struct page *page, unsigned int order)
+int __isolate_free_page(struct page *page, unsigned int order)
{
unsigned long watermark;
struct zone *zone;
@@ -6408,13 +6399,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
- pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
- outer_start, end);
+ pr_info("%s: [%lx, %lx) PFNs busy\n",
+ __func__, outer_start, end);
ret = -EBUSY;
goto done;
}
-
/* Grab isolated pages from freelists. */
outer_end = isolate_freepages_range(&cc, outer_start, end);
if (!outer_end) {
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3708264d2833..5331c2bd85a2 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -171,6 +171,7 @@ static void free_page_cgroup(void *addr)
sizeof(struct page_cgroup) * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
+ kmemleak_free(addr);
free_pages_exact(addr, table_size);
}
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index d1473b2e9481..c8778f7e208e 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -60,6 +60,7 @@ out:
int migratetype = get_pageblock_migratetype(page);
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ zone->nr_isolate_pageblock++;
nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
__mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -75,16 +76,54 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
unsigned long flags, nr_pages;
+ struct page *isolated_page = NULL;
+ unsigned int order;
+ unsigned long page_idx, buddy_idx;
+ struct page *buddy;
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
goto out;
- nr_pages = move_freepages_block(zone, page, migratetype);
- __mod_zone_freepage_state(zone, nr_pages, migratetype);
+
+ /*
+ * Because freepage with more than pageblock_order on isolated
+ * pageblock is restricted to merge due to freepage counting problem,
+ * it is possible that there is free buddy page.
+ * move_freepages_block() doesn't care of merge so we need other
+ * approach in order to merge them. Isolation and free will make
+ * these pages to be merged.
+ */
+ if (PageBuddy(page)) {
+ order = page_order(page);
+ if (order >= pageblock_order) {
+ page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
+
+ if (!is_migrate_isolate_page(buddy)) {
+ __isolate_free_page(page, order);
+ set_page_refcounted(page);
+ isolated_page = page;
+ }
+ }
+ }
+
+ /*
+ * If we isolate freepage with more than pageblock_order, there
+ * should be no freepage in the range, so we could avoid costly
+ * pageblock scanning for freepage moving.
+ */
+ if (!isolated_page) {
+ nr_pages = move_freepages_block(zone, page, migratetype);
+ __mod_zone_freepage_state(zone, nr_pages, migratetype);
+ }
set_pageblock_migratetype(page, migratetype);
+ zone->nr_isolate_pageblock--;
out:
spin_unlock_irqrestore(&zone->lock, flags);
+ if (isolated_page)
+ __free_pages(isolated_page, order);
}
static inline struct page *
diff --git a/mm/rmap.c b/mm/rmap.c
index 116a5053415b..19886fb2f13a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1042,15 +1042,46 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- bool locked;
+ struct mem_cgroup *memcg;
unsigned long flags;
+ bool locked;
- mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
+ mem_cgroup_end_page_stat(memcg, locked, flags);
+}
+
+static void page_remove_file_rmap(struct page *page)
+{
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+ bool locked;
+
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ goto out;
+
+ /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ if (unlikely(PageHuge(page)))
+ goto out;
+
+ /*
+ * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+ * these counters are not modified in interrupt context, and
+ * pte lock(a spinlock) is held, which implies preemption disabled.
+ */
+ __dec_zone_page_state(page, NR_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
+out:
+ mem_cgroup_end_page_stat(memcg, locked, flags);
}
/**
@@ -1061,46 +1092,33 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
- bool anon = PageAnon(page);
- bool locked;
- unsigned long flags;
-
- /*
- * The anon case has no mem_cgroup page_stat to update; but may
- * uncharge_page() below, where the lock ordering can deadlock if
- * we hold the lock against page_stat move: so avoid it on anon.
- */
- if (!anon)
- mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ if (!PageAnon(page)) {
+ page_remove_file_rmap(page);
+ return;
+ }
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
+ return;
+
+ /* Hugepages are not counted in NR_ANON_PAGES for now. */
+ if (unlikely(PageHuge(page)))
+ return;
/*
- * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
- * and not charged by memcg for now.
- *
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
- * these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- if (unlikely(PageHuge(page)))
- goto out;
- if (anon) {
- if (PageTransHuge(page))
- __dec_zone_page_state(page,
- NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- -hpage_nr_pages(page));
- } else {
- __dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
- }
+ if (PageTransHuge(page))
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+ -hpage_nr_pages(page));
+
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1110,10 +1128,6 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
- return;
-out:
- if (!anon)
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
/*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3a6e0cfdf03a..dcdab81bd240 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -93,16 +93,6 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
s->object_size);
continue;
}
-
-#if !defined(CONFIG_SLUB)
- if (!strcmp(s->name, name)) {
- pr_err("%s (%s): Cache name already exists.\n",
- __func__, name);
- dump_stack();
- s = NULL;
- return -EINVAL;
- }
-#endif
}
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
@@ -269,6 +259,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
if (s->size - size >= sizeof(void *))
continue;
+ if (IS_ENABLED(CONFIG_SLAB) && align &&
+ (align > s->align || s->align % align))
+ continue;
+
return s;
}
return NULL;
diff --git a/mm/truncate.c b/mm/truncate.c
index 261eaf6e5a19..f1e4d6052369 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -715,8 +715,9 @@ EXPORT_SYMBOL(truncate_pagecache);
* necessary) to @newsize. It will be typically be called from the filesystem's
* setattr function when ATTR_SIZE is passed in.
*
- * Must be called with inode_mutex held and before all filesystem specific
- * block truncation has been performed.
+ * Must be called with a lock serializing truncates and writes (generally
+ * i_mutex but e.g. xfs uses a different lock) and before all filesystem
+ * specific block truncation has been performed.
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
@@ -755,7 +756,6 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
struct page *page;
pgoff_t index;
- WARN_ON(!mutex_is_locked(&inode->i_mutex));
WARN_ON(to > inode->i_size);
if (from >= to || bsize == PAGE_CACHE_SIZE)