diff options
Diffstat (limited to 'fs/dax.c')
-rw-r--r-- | fs/dax.c | 243 |
1 files changed, 76 insertions, 167 deletions
@@ -66,7 +66,7 @@ static int dax_is_pte_entry(void *entry) static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_HZP; + return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) @@ -206,7 +206,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, for (;;) { entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || + if (!entry || + WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; @@ -241,14 +242,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, } static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) + pgoff_t index) { - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } + dax_unlock_mapping_entry(mapping, index); } /* @@ -258,7 +254,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, static void put_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { - if (!radix_tree_exceptional_entry(entry)) + if (!entry) return; /* We have to wake up next waiter for the radix tree entry lock */ @@ -266,15 +262,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, } /* - * Find radix tree entry at given index. If it points to a page, return with - * the page locked. If it points to the exceptional entry, return with the - * radix tree entry locked. If the radix tree doesn't contain given index, - * create empty exceptional entry for the index and return with it locked. + * Find radix tree entry at given index. If it points to an exceptional entry, + * return it with the radix tree entry locked. If the radix tree doesn't + * contain given index, create an empty exceptional entry for the index and + * return with it locked. * * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return an error. This error will - * happen if there are any 4k entries (either zero pages or DAX entries) - * within the 2MiB range that we are requesting. + * happen if there are any 4k entries within the 2MiB range that we are + * requesting. * * We always favor 4k entries over 2MiB entries. There isn't a flow where we * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB @@ -301,18 +297,21 @@ restart: spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { + entry = ERR_PTR(-EIO); + goto out_unlock; + } + if (entry) { if (size_flag & RADIX_DAX_PMD) { - if (!radix_tree_exceptional_entry(entry) || - dax_is_pte_entry(entry)) { + if (dax_is_pte_entry(entry)) { put_unlocked_mapping_entry(mapping, index, entry); entry = ERR_PTR(-EEXIST); goto out_unlock; } } else { /* trying to grab a PTE entry */ - if (radix_tree_exceptional_entry(entry) && - dax_is_pmd_entry(entry) && + if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { pmd_downgrade = true; @@ -346,7 +345,7 @@ restart: mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); if (err) { if (pmd_downgrade) - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } spin_lock_irq(&mapping->tree_lock); @@ -396,21 +395,6 @@ restart: spin_unlock_irq(&mapping->tree_lock); return entry; } - /* Normal page in radix tree? */ - if (!radix_tree_exceptional_entry(entry)) { - struct page *page = entry; - - get_page(page); - spin_unlock_irq(&mapping->tree_lock); - lock_page(page); - /* Page got truncated? Retry... */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto restart; - } - return page; - } entry = lock_slot(mapping, slot); out_unlock: spin_unlock_irq(&mapping->tree_lock); @@ -426,7 +410,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) + if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || @@ -508,47 +492,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; - int error = 0; - bool hole_fill = false; void *new_entry; pgoff_t index = vmf->pgoff; if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - /* Replacing hole page with block mapping? */ - if (!radix_tree_exceptional_entry(entry)) { - hole_fill = true; - /* - * Unmap the page now before we remove it from page cache below. - * The page is locked so it cannot be faulted in again. - */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); - if (error) - return ERR_PTR(error); - } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { - /* replacing huge zero page with PMD block mapping */ - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + /* we are replacing a zero page with block mapping */ + if (dax_is_pmd_entry(entry)) + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, + PMD_SIZE, 0); + else /* pte entry */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); new_entry = dax_radix_locked_entry(sector, flags); - if (hole_fill) { - __delete_from_page_cache(entry, NULL); - /* Drop pagecache reference */ - put_page(entry); - error = __radix_tree_insert(page_tree, index, - dax_radix_order(new_entry), new_entry); - if (error) { - new_entry = ERR_PTR(error); - goto unlock; - } - mapping->nrexceptional++; - } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* * Only swap our new entry into the radix tree if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -565,23 +529,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, new_entry, NULL, NULL); + entry = new_entry; } + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); - unlock: + spin_unlock_irq(&mapping->tree_lock); - if (hole_fill) { - radix_tree_preload_end(); - /* - * We don't need hole page anymore, it has been replaced with - * locked radix tree entry now. - */ - if (mapping->a_ops->freepage) - mapping->a_ops->freepage(entry); - unlock_page(entry); - put_page(entry); - } - return new_entry; + return entry; } static inline unsigned long @@ -683,7 +638,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ - if (!entry2 || !radix_tree_exceptional_entry(entry2)) + if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. We have to @@ -755,7 +710,7 @@ static int dax_writeback_one(struct block_device *bdev, trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); dax_unlock: dax_read_unlock(id); - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ret; put_unlocked: @@ -830,11 +785,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void **entryp, + sector_t sector, size_t size, void *entry, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - void *entry = *entryp; void *ret, *kaddr; pgoff_t pgoff; int id, rc; @@ -855,87 +809,44 @@ static int dax_insert_mapping(struct address_space *mapping, ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); - *entryp = ret; trace_dax_insert_mapping(mapping->host, vmf, ret); - return vm_insert_mixed(vma, vaddr, pfn); -} - -/** - * dax_pfn_mkwrite - handle first write to DAX page - * @vmf: The description of the fault - */ -int dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - void *entry, **slot; - pgoff_t index = vmf->pgoff; - - spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, &slot); - if (!entry || !radix_tree_exceptional_entry(entry)) { - if (entry) - put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); - trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; - } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - /* - * If we race with somebody updating the PTE and finish_mkwrite_fault() - * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry - * the fault in either case. - */ - finish_mkwrite_fault(vmf); - put_locked_mapping_entry(mapping, index, entry); - trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; + if (vmf->flags & FAULT_FLAG_WRITE) + return vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + return vm_insert_mixed(vma, vaddr, pfn); } -EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); /* - * The user has performed a load from a hole in the file. Allocating - * a new page in the file would cause excessive storage usage for - * workloads with sparse files. We allocate a page cache page instead. - * We'll kick it out of the page cache if it's ever written to, - * otherwise it will simply fall out of the page cache under memory - * pressure without ever having been dirtied. + * The user has performed a load from a hole in the file. Allocating a new + * page in the file would cause excessive storage usage for workloads with + * sparse files. Instead we insert a read-only mapping of the 4k zero page. + * If this page is ever written to we will re-fault and change the mapping to + * point to real DAX storage instead. */ -static int dax_load_hole(struct address_space *mapping, void **entry, +static int dax_load_hole(struct address_space *mapping, void *entry, struct vm_fault *vmf) { struct inode *inode = mapping->host; - struct page *page; - int ret; - - /* Hole page already exists? Return it... */ - if (!radix_tree_exceptional_entry(*entry)) { - page = *entry; - goto finish_fault; - } + unsigned long vaddr = vmf->address; + int ret = VM_FAULT_NOPAGE; + struct page *zero_page; + void *entry2; - /* This will replace locked radix tree entry with a hole page */ - page = find_or_create_page(mapping, vmf->pgoff, - vmf->gfp_mask | __GFP_ZERO); - if (!page) { + zero_page = ZERO_PAGE(0); + if (unlikely(!zero_page)) { ret = VM_FAULT_OOM; goto out; } -finish_fault: - vmf->page = page; - ret = finish_fault(vmf); - vmf->page = NULL; - *entry = page; - if (!ret) { - /* Grab reference for PTE that is now referencing the page */ - get_page(page); - ret = VM_FAULT_NOPAGE; + entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_ZERO_PAGE); + if (IS_ERR(entry2)) { + ret = VM_FAULT_SIGBUS; + goto out; } + + vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); out: trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1223,7 +1134,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, &entry, vmf->vma, vmf); + sector, PAGE_SIZE, entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1231,7 +1142,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - vmf_ret = dax_load_hole(mapping, &entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1258,7 +1169,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + put_locked_mapping_entry(mapping, vmf->pgoff); out: trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; @@ -1272,7 +1183,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void **entryp) + loff_t pos, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1303,11 +1214,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, goto unlock_fallback; dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; - *entryp = ret; trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, @@ -1321,7 +1231,7 @@ fallback: } static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void **entryp) + void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1336,11 +1246,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, if (unlikely(!zero_page)) goto fallback; - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, - RADIX_DAX_PMD | RADIX_DAX_HZP); + ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); if (IS_ERR(ret)) goto fallback; - *entryp = ret; ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1416,10 +1325,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, goto fallback; /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. + * grab_mapping_entry() will make sure we get a 2MiB empty entry, a + * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page + * is already in the tree, for instance), it will return -EEXIST and + * we just fall back to 4k entries. */ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); if (IS_ERR(entry)) @@ -1452,13 +1361,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, &entry); + result = dax_pmd_load_hole(vmf, &iomap, entry); break; default: WARN_ON_ONCE(1); @@ -1481,7 +1390,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); + put_locked_mapping_entry(mapping, pgoff); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); |