diff options
40 files changed, 1673 insertions, 796 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 426115f7cb63..84b1b21b08ae 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2,6 +2,7 @@ config ARM bool default y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 259c0ca9c99a..e356357d86bb 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -15,20 +15,6 @@ config ARM_PTDUMP kernel. If in doubt, say "N" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - depends on MMU - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. - - If this option is switched on, the /dev/mem file only allows - userspace access to memory mapped peripherals. - - If in doubt, say Y. - # RMK wants arm kernels compiled with frame pointers or stack unwinding. # If you know what you are doing and are willing to live without stack # traces, you can get a slightly smaller kernel by setting this option to diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d6ebffdc6bb1..4d5b416e2e4b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -3,6 +3,7 @@ config ARM64 select ACPI_CCA_REQUIRED if ACPI select ACPI_GENERIC_GSI if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index 04fb73b973f1..e13c4bf84d9e 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -14,20 +14,6 @@ config ARM64_PTDUMP kernel. If in doubt, say "N" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - depends on MMU - help - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. - - If this option is switched on, the /dev/mem file only allows - userspace access to memory mapped peripherals. - - If in doubt, say Y. - config PID_IN_CONTEXTIDR bool "Write the current PID to the CONTEXTIDR register" help diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 34aa19352dc1..03bfd6bf03e7 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -10,6 +10,7 @@ config FRV select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_WANT_IPC_PARSE_VERSION select OLD_SIGSUSPEND3 select OLD_SIGACTION diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 9e44bbd8051e..836ac5a963c8 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -13,6 +13,7 @@ config M32R select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_ATOMIC64 + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_USES_GETTIMEOFFSET select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index db49e0d796b1..85eabc49de61 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -159,6 +159,7 @@ config PPC select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB select ARCH_HAS_DMA_SET_COHERENT_MASK + select ARCH_HAS_DEVMEM_IS_ALLOWED select HAVE_ARCH_SECCOMP_FILTER config GENERIC_CSUM diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 3a510f4a6b68..a0e44a9c456f 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -335,18 +335,6 @@ config PPC_EARLY_DEBUG_CPM_ADDR platform probing is done, all platforms selected must share the same address. -config STRICT_DEVMEM - def_bool y - prompt "Filter access to /dev/mem" - help - This option restricts access to /dev/mem. If this option is - disabled, you allow userspace access to all memory, including - kernel and userspace memory. Accidental memory access is likely - to be disastrous. - Memory access is required for experts who want to debug the kernel. - - If you are unsure, say Y. - config FAIL_IOMMU bool "Fault-injection capability for IOMMU" depends on FAULT_INJECTION diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 6d0ae7d30724..24490344c30f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -66,6 +66,7 @@ config S390 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SG_CHAIN diff --git a/arch/s390/Kconfig.debug b/arch/s390/Kconfig.debug index c56878e1245f..26c5d5beb4be 100644 --- a/arch/s390/Kconfig.debug +++ b/arch/s390/Kconfig.debug @@ -5,18 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config STRICT_DEVMEM - def_bool y - prompt "Filter access to /dev/mem" - ---help--- - This option restricts access to /dev/mem. If this option is - disabled, you allow userspace access to all memory, including - kernel and userspace memory. Accidental memory access is likely - to be disastrous. - Memory access is required for experts who want to debug the kernel. - - If you are unsure, say Y. - config S390_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 8ec7a4599c08..be05b4ae02b6 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -19,6 +19,7 @@ config TILE select VIRT_TO_BUS select SYS_HYPERVISOR select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA @@ -116,9 +117,6 @@ config ARCH_DISCONTIGMEM_DEFAULT config TRACE_IRQFLAGS_SUPPORT def_bool y -config STRICT_DEVMEM - def_bool y - # SMP is required for Tilera Linux. config SMP def_bool y diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index c9faddc61100..5dc4c0a43ccd 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -1,5 +1,6 @@ config UNICORE32 def_bool y + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_MEMBLOCK diff --git a/arch/unicore32/Kconfig.debug b/arch/unicore32/Kconfig.debug index 1a3626239843..f075bbe1d46f 100644 --- a/arch/unicore32/Kconfig.debug +++ b/arch/unicore32/Kconfig.debug @@ -2,20 +2,6 @@ menu "Kernel hacking" source "lib/Kconfig.debug" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - depends on MMU - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. - - If this option is switched on, the /dev/mem file only allows - userspace access to memory mapped peripherals. - - If in doubt, say Y. - config EARLY_PRINTK def_bool DEBUG_OCD help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ccfededfe470..5d2293417946 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 110253ce83af..9b18ed97a8a2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. Note that with PAT support - enabled, even in this case there are restrictions on /dev/mem - use due to the cache aliasing requirements. - - If this option is switched on, the /dev/mem file only allows - userspace access to PCI space and the BIOS code and data regions. - This is sufficient for dosemu and X and all common users of - /dev/mem. - - If in doubt, say Y. - config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y diff --git a/block/Makefile b/block/Makefile index 00ecc97629db..db5f622c9d67 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ - partitions/ + badblocks.o partitions/ obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o diff --git a/block/badblocks.c b/block/badblocks.c new file mode 100644 index 000000000000..7be53cb1cc3c --- /dev/null +++ b/block/badblocks.c @@ -0,0 +1,585 @@ +/* + * Bad block management + * + * - Heavily based on MD badblocks code from Neil Brown + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/badblocks.h> +#include <linux/seqlock.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/slab.h> + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad block which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + /* This could still be the one, earlier ranges + * could not. + */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_check); + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: weather to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it. + * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 0; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal + */ + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range + */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' + */ + if (bb->count >= MAX_BADBLOCKS) { + /* No room for more */ + rv = 1; + break; + } else { + int this_sectors = sectors; + + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * 1: failed to clear badblocks + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. + */ + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_clear); + +/** + * ack_all_badblocks() - Acknowledge all bad blocks in a list. + * @bb: the badblocks structure that holds all badblock information + * + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(ack_all_badblocks); + +/** + * badblocks_show() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of returned data + */ +ssize_t badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} +EXPORT_SYMBOL_GPL(badblocks_show); + +/** + * badblocks_store() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @len: length of data received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of the buffer processed or -ve error. + */ +ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, + int unack) +{ + unsigned long long sector; + int length; + char newline; + + switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (badblocks_set(bb, sector, length, !unack)) + return -ENOSPC; + else + return len; +} +EXPORT_SYMBOL_GPL(badblocks_store); + +static int __badblocks_init(struct device *dev, struct badblocks *bb, + int enable) +{ + bb->dev = dev; + bb->count = 0; + if (enable) + bb->shift = 0; + else + bb->shift = -1; + if (dev) + bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); + else + bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->page) { + bb->shift = -1; + return -ENOMEM; + } + seqlock_init(&bb->lock); + + return 0; +} + +/** + * badblocks_init() - initialize the badblocks structure + * @bb: the badblocks structure that holds all badblock information + * @enable: weather to enable badblocks accounting + * + * Return: + * 0: success + * -ve errno: on error + */ +int badblocks_init(struct badblocks *bb, int enable) +{ + return __badblocks_init(NULL, bb, enable); +} +EXPORT_SYMBOL_GPL(badblocks_init); + +int devm_init_badblocks(struct device *dev, struct badblocks *bb) +{ + if (!bb) + return -EINVAL; + return __badblocks_init(dev, bb, 1); +} +EXPORT_SYMBOL_GPL(devm_init_badblocks); + +/** + * badblocks_exit() - free the badblocks structure + * @bb: the badblocks structure that holds all badblock information + */ +void badblocks_exit(struct badblocks *bb) +{ + if (!bb) + return; + if (bb->dev) + devm_kfree(bb->dev, bb->page); + else + kfree(bb->page); + bb->page = NULL; +} +EXPORT_SYMBOL_GPL(badblocks_exit); diff --git a/block/genhd.c b/block/genhd.c index e5cafa51567c..5aaeb2ad0fd3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -20,6 +20,7 @@ #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> +#include <linux/badblocks.h> #include "blk.h" @@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk) } EXPORT_SYMBOL(del_gendisk); +/* sysfs access to bad-blocks list. */ +static ssize_t disk_badblocks_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return sprintf(page, "\n"); + + return badblocks_show(disk->bb, page, 0); +} + +static ssize_t disk_badblocks_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t len) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return -ENXIO; + + return badblocks_store(disk->bb, page, len, 0); +} + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, + disk_badblocks_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_badblocks.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/block/ioctl.c b/block/ioctl.c index 0918aed2d847..2c84683aada5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,6 +4,7 @@ #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> +#include <linux/badblocks.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> @@ -406,6 +407,71 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } +#ifdef CONFIG_FS_DAX +bool blkdev_dax_capable(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + + if (!disk->fops->direct_access) + return false; + + /* + * If the partition is not aligned on a page boundary, we can't + * do dax I/O to it. + */ + if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) + || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + return false; + + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + + return true; +} + +static int blkdev_daxset(struct block_device *bdev, unsigned long argp) +{ + unsigned long arg; + int rc = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (get_user(arg, (int __user *)(argp))) + return -EFAULT; + arg = !!arg; + if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) + return 0; + + if (arg) + arg = S_DAX; + + if (arg && !blkdev_dax_capable(bdev)) + return -ENOTTY; + + mutex_lock(&bdev->bd_inode->i_mutex); + if (bdev->bd_map_count == 0) + inode_set_flags(bdev->bd_inode, arg, S_DAX); + else + rc = -EBUSY; + mutex_unlock(&bdev->bd_inode->i_mutex); + return rc; +} +#else +static int blkdev_daxset(struct block_device *bdev, int arg) +{ + if (arg) + return -ENOTTY; + return 0; +} +#endif + static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -568,6 +634,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKDAXSET: + return blkdev_daxset(bdev, arg); + case BLKDAXGET: + return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); + break; case IOC_PR_REGISTER: return blkdev_pr_register(bdev, argp); case IOC_PR_RESERVE: diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index aa45d4802707..ad6d8c6b777e 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/mutex.h> #include <linux/ndctl.h> +#include <linux/delay.h> #include <linux/list.h> #include <linux/acpi.h> #include <linux/sort.h> @@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus, /* devm will free nfit_blk */ } +static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_cap *cmd, u64 addr, u64 length) +{ + cmd->address = addr; + cmd->length = length; + + return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd, + sizeof(*cmd)); +} + +static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_start *cmd, u64 addr, u64 length) +{ + int rc; + + cmd->address = addr; + cmd->length = length; + cmd->type = ND_ARS_PERSISTENT; + + while (1) { + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd, + sizeof(*cmd)); + if (rc) + return rc; + switch (cmd->status) { + case 0: + return 0; + case 1: + /* ARS unsupported, but we should never get here */ + return 0; + case 2: + return -EINVAL; + case 3: + /* ARS is in progress */ + msleep(1000); + break; + default: + return -ENXIO; + } + } +} + +static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc, + struct nd_cmd_ars_status *cmd) +{ + int rc; + + while (1) { + rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd, + sizeof(*cmd)); + if (rc || cmd->status & 0xffff) + return -ENXIO; + + /* Check extended status (Upper two bytes) */ + switch (cmd->status >> 16) { + case 0: + return 0; + case 1: + /* ARS is in progress */ + msleep(1000); + break; + case 2: + /* No ARS performed for the current boot */ + return 0; + default: + return -ENXIO; + } + } +} + +static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus, + struct nd_cmd_ars_status *ars_status, u64 start) +{ + int rc; + u32 i; + + /* + * The address field returned by ars_status should be either + * less than or equal to the address we last started ARS for. + * The (start, length) returned by ars_status should also have + * non-zero overlap with the range we started ARS for. + * If this is not the case, bail. + */ + if (ars_status->address > start || + (ars_status->address + ars_status->length < start)) + return -ENXIO; + + for (i = 0; i < ars_status->num_records; i++) { + rc = nvdimm_bus_add_poison(nvdimm_bus, + ars_status->records[i].err_address, + ars_status->records[i].length); + if (rc) + return rc; + } + + return 0; +} + +static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc, + struct nd_region_desc *ndr_desc) +{ + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; + struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus; + struct nd_cmd_ars_status *ars_status = NULL; + struct nd_cmd_ars_start *ars_start = NULL; + struct nd_cmd_ars_cap *ars_cap = NULL; + u64 start, len, cur, remaining; + int rc; + + ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL); + if (!ars_cap) + return -ENOMEM; + + start = ndr_desc->res->start; + len = ndr_desc->res->end - ndr_desc->res->start + 1; + + rc = ars_get_cap(nd_desc, ars_cap, start, len); + if (rc) + goto out; + + /* + * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in + * extended status is not set, skip this but continue initialization + */ + if ((ars_cap->status & 0xffff) || + !(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) { + dev_warn(acpi_desc->dev, + "ARS unsupported (status: 0x%x), won't create an error list\n", + ars_cap->status); + goto out; + } + + /* + * Check if a full-range ARS has been run. If so, use those results + * without having to start a new ARS. + */ + ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status), + GFP_KERNEL); + if (!ars_status) { + rc = -ENOMEM; + goto out; + } + + rc = ars_get_status(nd_desc, ars_status); + if (rc) + goto out; + + if (ars_status->address <= start && + (ars_status->address + ars_status->length >= start + len)) { + rc = ars_status_process_records(nvdimm_bus, ars_status, start); + goto out; + } + + /* + * ARS_STATUS can overflow if the number of poison entries found is + * greater than the maximum buffer size (ars_cap->max_ars_out) + * To detect overflow, check if the length field of ars_status + * is less than the length we supplied. If so, process the + * error entries we got, adjust the start point, and start again + */ + ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL); + if (!ars_start) + return -ENOMEM; + + cur = start; + remaining = len; + do { + u64 done, end; + + rc = ars_do_start(nd_desc, ars_start, cur, remaining); + if (rc) + goto out; + + rc = ars_get_status(nd_desc, ars_status); + if (rc) + goto out; + + rc = ars_status_process_records(nvdimm_bus, ars_status, cur); + if (rc) + goto out; + + end = min(cur + remaining, + ars_status->address + ars_status->length); + done = end - cur; + cur += done; + remaining -= done; + } while (remaining); + + out: + kfree(ars_cap); + kfree(ars_start); + kfree(ars_status); + return rc; +} + static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, @@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, nvdimm_bus = acpi_desc->nvdimm_bus; if (nfit_spa_type(spa) == NFIT_SPA_PM) { + rc = acpi_nfit_find_poison(acpi_desc, ndr_desc); + if (rc) { + dev_err(acpi_desc->dev, + "error while performing ARS to find poison: %d\n", + rc); + return rc; + } if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc)) return -ENOMEM; } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 61aacab424cf..31b595479aa5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,6 +34,7 @@ #include <linux/kthread.h> #include <linux/blkdev.h> +#include <linux/badblocks.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/fs.h> @@ -710,8 +711,7 @@ void md_rdev_clear(struct md_rdev *rdev) put_page(rdev->bb_page); rdev->bb_page = NULL; } - kfree(rdev->badblocks.page); - rdev->badblocks.page = NULL; + badblocks_exit(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); @@ -1361,8 +1361,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) return cpu_to_le32(csum); } -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; @@ -1487,8 +1485,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (md_set_badblocks(&rdev->badblocks, - sector, count, 1) == 0) + if (badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -2320,7 +2317,7 @@ repeat: rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } clear_bit(Blocked, &rdev->flags); @@ -2446,7 +2443,7 @@ repeat: clear_bit(Blocked, &rdev->flags); if (any_badblocks_changed) - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } @@ -3054,11 +3051,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_ static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack); -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); - +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ static ssize_t bb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 0); @@ -3173,14 +3176,7 @@ int md_rdev_init(struct md_rdev *rdev) * This reserves the space even on arrays where it cannot * be used - I wonder if that matters */ - rdev->badblocks.count = 0; - rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ - rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); - seqlock_init(&rdev->badblocks.lock); - if (rdev->badblocks.page == NULL) - return -ENOMEM; - - return 0; + return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); /* @@ -8489,254 +8485,9 @@ void md_finish_reshape(struct mddev *mddev) } EXPORT_SYMBOL(md_finish_reshape); -/* Bad block management. - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. - * - * Locking of the bad-block table uses a seqlock so md_is_badblock - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. - * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. - * We return - * 0 if there are no known bad blocks in the range - * 1 if there are known bad block which are all acknowledged - * -1 if there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. - */ -int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - int hi; - int lo; - u64 *p = bb->page; - int rv; - sector_t target = s + sectors; - unsigned seq; - - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<<bb->shift) - 1; - target >>= bb->shift; - sectors = target - s; - } - /* 'target' is now the first block after the bad range */ - -retry: - seq = read_seqbegin(&bb->lock); - lo = 0; - rv = 0; - hi = bb->count; - - /* Binary search between lo and hi for 'target' - * i.e. for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. */ - lo = mid; - else - /* This and later ranges are definitely out. */ - hi = mid; - } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. - */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; - } - } - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return rv; -} -EXPORT_SYMBOL_GPL(md_is_badblock); - -/* - * Add a range of bad blocks to the table. - * This might extend the table, or might contract it - * if two adjacent ranges can be merged. - * We binary-search to find the 'insertion' point, then - * decide how best to handle it. - */ -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) -{ - u64 *p; - int lo, hi; - int rv = 1; - unsigned long flags; - - if (bb->shift < 0) - /* badblocks are disabled */ - return 0; - - if (bb->shift) { - /* round the start down, and the end up */ - sector_t next = s + sectors; - s >>= bb->shift; - next += (1<<bb->shift) - 1; - next >>= bb->shift; - sectors = next - s; - } - - write_seqlock_irqsave(&bb->lock, flags); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; - } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; - } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - } - } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. - * Maybe we can merge with the start of that range */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - lo = hi; - hi++; - } - } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; - } - } - while (sectors) { - /* didn't merge (it all). - * Need to add a range just before 'hi' */ - if (bb->count >= MD_MAX_BADBLOCKS) { - /* No room for more */ - rv = 0; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; - - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } - } - - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - write_sequnlock_irqrestore(&bb->lock, flags); - - return rv; -} +/* Bad block management */ +/* Returns 1 on success, 0 on failure */ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8745,114 +8496,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - rv = md_set_badblocks(&rdev->badblocks, - s, sectors, 0); - if (rv) { + rv = badblocks_set(&rdev->badblocks, s, sectors, 0); + if (rv == 0) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); - } - return rv; + return 1; + } else + return 0; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -/* - * Remove a range of bad blocks from the table. - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - */ -static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) -{ - u64 *p; - int lo, hi; - sector_t target = s + sectors; - int rv = 0; - - if (bb->shift > 0) { - /* When clearing we round the start up and the end down. - * This should not matter as the shift should align with - * the block size and no rounding should ever be needed. - * However it is better the think a block is bad when it - * isn't than to think a block is not bad when it is. - */ - s += (1<<bb->shift) - 1; - s >>= bb->shift; - target >>= bb->shift; - sectors = target - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; - } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. - */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MD_MAX_BADBLOCKS) { - rv = -ENOSPC; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. */ - break; - } - lo--; - } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded - */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); - } - } - - bb->changed = 1; -out: - write_sequnlock_irq(&bb->lock); - return rv; -} - int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8860,133 +8516,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - return md_clear_badblocks(&rdev->badblocks, + return badblocks_clear(&rdev->badblocks, s, sectors); } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); -/* - * Acknowledge all bad blocks in a list. - * This only succeeds if ->changed is clear. It is used by - * in-kernel metadata updates - */ -void md_ack_all_badblocks(struct badblocks *bb) -{ - if (bb->page == NULL || bb->changed) - /* no point even trying */ - return; - write_seqlock_irq(&bb->lock); - - if (bb->changed == 0 && bb->unacked_exist) { - u64 *p = bb->page; - int i; - for (i = 0; i < bb->count ; i++) { - if (!BB_ACK(p[i])) { - sector_t start = BB_OFFSET(p[i]); - int len = BB_LEN(p[i]); - p[i] = BB_MAKE(start, len, 1); - } - } - bb->unacked_exist = 0; - } - write_sequnlock_irq(&bb->lock); -} -EXPORT_SYMBOL_GPL(md_ack_all_badblocks); - -/* sysfs access to bad-blocks list. - * We present two files. - * 'bad-blocks' lists sector numbers and lengths of ranges that - * are recorded as bad. The list is truncated to fit within - * the one-page limit of sysfs. - * Writing "sector length" to this file adds an acknowledged - * bad block list. - * 'unacknowledged-bad-blocks' lists bad blocks that have not yet - * been acknowledged. Writing to this file adds bad blocks - * without acknowledging them. This is largely for testing. - */ - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack) -{ - size_t len; - int i; - u64 *p = bb->page; - unsigned seq; - - if (bb->shift < 0) - return 0; - -retry: - seq = read_seqbegin(&bb->lock); - - len = 0; - i = 0; - - while (len < PAGE_SIZE && i < bb->count) { - sector_t s = BB_OFFSET(p[i]); - unsigned int length = BB_LEN(p[i]); - int ack = BB_ACK(p[i]); - i++; - - if (unack && ack) - continue; - - len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", - (unsigned long long)s << bb->shift, - length << bb->shift); - } - if (unack && len == 0) - bb->unacked_exist = 0; - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return len; -} - -#define DO_DEBUG 1 - -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) -{ - unsigned long long sector; - int length; - char newline; -#ifdef DO_DEBUG - /* Allow clearing via sysfs *only* for testing/debugging. - * Normally only a successful write may clear a badblock - */ - int clear = 0; - if (page[0] == '-') { - clear = 1; - page++; - } -#endif /* DO_DEBUG */ - - switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { - case 3: - if (newline != '\n') - return -EINVAL; - case 2: - if (length <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - -#ifdef DO_DEBUG - if (clear) { - md_clear_badblocks(bb, sector, length); - return len; - } -#endif /* DO_DEBUG */ - if (md_set_badblocks(bb, sector, length, !unack)) - return len; - else - return -ENOSPC; -} - static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { diff --git a/drivers/md/md.h b/drivers/md/md.h index ca0b643fe3c1..75b9aaacb03f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -17,6 +17,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/badblocks.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mm.h> @@ -28,13 +29,6 @@ #define MaxSector (~(sector_t)0) -/* Bad block numbers are stored sorted in a single page. - * 64bits is used for each block or extent. - * 54 bits are sector number, 9 bits are extent size, - * 1 bit is an 'acknowledged' flag. - */ -#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) - /* * MD's 'extended' device */ @@ -117,22 +111,7 @@ struct md_rdev { struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ - struct badblocks { - int count; /* count of bad blocks */ - int unacked_exist; /* there probably are unacknowledged - * bad blocks. This is only cleared - * when a read discovers none - */ - int shift; /* shift from sectors to block size - * a -ve shift means badblocks are - * disabled.*/ - u64 *page; /* badblock list */ - int changed; - seqlock_t lock; - - sector_t sector; - sector_t size; /* in sectors */ - } badblocks; + struct badblocks badblocks; }; enum flag_bits { Faulty, /* device is known to have a fault */ @@ -185,22 +164,11 @@ enum flag_bits { */ }; -#define BB_LEN_MASK (0x00000000000001FFULL) -#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) -#define BB_ACK_MASK (0x8000000000000000ULL) -#define BB_MAX_LEN 512 -#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) -#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) -#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) -#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) - -extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { if (unlikely(rdev->badblocks.count)) { - int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, + int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, sectors, first_bad, bad_sectors); if (rv) @@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); -extern void md_ack_all_badblocks(struct badblocks *bb); - struct md_cluster_info; struct mddev { diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 82c49bb87055..2e2832b83c93 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -11,6 +11,7 @@ * General Public License for more details. */ #include <linux/libnvdimm.h> +#include <linux/badblocks.h> #include <linux/export.h> #include <linux/module.h> #include <linux/blkdev.h> @@ -325,6 +326,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, if (!nvdimm_bus) return NULL; INIT_LIST_HEAD(&nvdimm_bus->list); + INIT_LIST_HEAD(&nvdimm_bus->poison_list); init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); @@ -359,6 +361,172 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(__nvdimm_bus_register); +static void set_badblock(struct badblocks *bb, sector_t s, int num) +{ + dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n", + (u64) s * 512, (u64) num * 512); + /* this isn't an error as the hardware will still throw an exception */ + if (badblocks_set(bb, s, num, 1)) + dev_info_once(bb->dev, "%s: failed for sector %llx\n", + __func__, (u64) s); +} + +/** + * __add_badblock_range() - Convert a physical address range to bad sectors + * @bb: badblocks instance to populate + * @ns_offset: namespace offset where the error range begins (in bytes) + * @len: number of bytes of poison to be added + * + * This assumes that the range provided with (ns_offset, len) is within + * the bounds of physical addresses for this namespace, i.e. lies in the + * interval [ns_start, ns_start + ns_size) + */ +static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) +{ + const unsigned int sector_size = 512; + sector_t start_sector; + u64 num_sectors; + u32 rem; + + start_sector = div_u64(ns_offset, sector_size); + num_sectors = div_u64_rem(len, sector_size, &rem); + if (rem) + num_sectors++; + + if (unlikely(num_sectors > (u64)INT_MAX)) { + u64 remaining = num_sectors; + sector_t s = start_sector; + + while (remaining) { + int done = min_t(u64, remaining, INT_MAX); + + set_badblock(bb, s, done); + remaining -= done; + s += done; + } + } else + set_badblock(bb, start_sector, num_sectors); +} + +/** + * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks + * @ndns: the namespace containing poison ranges + * @bb: badblocks instance to populate + * @offset: offset at the start of the namespace before 'sector 0' + * + * The poison list generated during NFIT initialization may contain multiple, + * possibly overlapping ranges in the SPA (System Physical Address) space. + * Compare each of these ranges to the namespace currently being initialized, + * and add badblocks to the gendisk for all matching sub-ranges + */ +void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns, + struct badblocks *bb, resource_size_t offset) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + struct nvdimm_bus *nvdimm_bus; + struct list_head *poison_list; + u64 ns_start, ns_end, ns_size; + struct nd_poison *pl; + + ns_size = nvdimm_namespace_capacity(ndns) - offset; + ns_start = nsio->res.start + offset; + ns_end = nsio->res.end; + + nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent); + poison_list = &nvdimm_bus->poison_list; + if (list_empty(poison_list)) + return; + + list_for_each_entry(pl, poison_list, list) { + u64 pl_end = pl->start + pl->length - 1; + + /* Discard intervals with no intersection */ + if (pl_end < ns_start) + continue; + if (pl->start > ns_end) + continue; + /* Deal with any overlap after start of the namespace */ + if (pl->start >= ns_start) { + u64 start = pl->start; + u64 len; + + if (pl_end <= ns_end) + len = pl->length; + else + len = ns_start + ns_size - pl->start; + __add_badblock_range(bb, start - ns_start, len); + continue; + } + /* Deal with overlap for poison starting before the namespace */ + if (pl->start < ns_start) { + u64 len; + + if (pl_end < ns_end) + len = pl->start + pl->length - ns_start; + else + len = ns_size; + __add_badblock_range(bb, 0, len); + } + } +} +EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison); + +static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +{ + struct nd_poison *pl; + + pl = kzalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + + pl->start = addr; + pl->length = length; + list_add_tail(&pl->list, &nvdimm_bus->poison_list); + + return 0; +} + +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +{ + struct nd_poison *pl; + + if (list_empty(&nvdimm_bus->poison_list)) + return __add_poison(nvdimm_bus, addr, length); + + /* + * There is a chance this is a duplicate, check for those first. + * This will be the common case as ARS_STATUS returns all known + * errors in the SPA space, and we can't query it per region + */ + list_for_each_entry(pl, &nvdimm_bus->poison_list, list) + if (pl->start == addr) { + /* If length has changed, update this list entry */ + if (pl->length != length) + pl->length = length; + return 0; + } + + /* + * If not a duplicate or a simple length update, add the entry as is, + * as any overlapping ranges will get resolved when the list is consumed + * and converted to badblocks + */ + return __add_poison(nvdimm_bus, addr, length); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); + +static void free_poison_list(struct list_head *poison_list) +{ + struct nd_poison *pl, *next; + + list_for_each_entry_safe(pl, next, poison_list, list) { + list_del(&pl->list); + kfree(pl); + } + list_del_init(poison_list); +} + static int child_unregister(struct device *dev, void *data) { /* @@ -385,6 +553,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); + free_poison_list(&nvdimm_bus->poison_list); nvdimm_bus_destroy_ndctl(nvdimm_bus); device_unregister(&nvdimm_bus->dev); diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 0955b2cb10fe..8ebfcaae3f5a 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -77,6 +77,59 @@ static bool is_namespace_io(struct device *dev) return dev ? dev->type == &namespace_io_device_type : false; } +static int is_uuid_busy(struct device *dev, void *data) +{ + u8 *uuid1 = data, *uuid2 = NULL; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid2 = nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid2 = nsblk->uuid; + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + uuid2 = nd_btt->uuid; + } else if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + uuid2 = nd_pfn->uuid; + } + + if (uuid2 && memcmp(uuid1, uuid2, NSLABEL_UUID_LEN) == 0) + return -EBUSY; + + return 0; +} + +static int is_namespace_uuid_busy(struct device *dev, void *data) +{ + if (is_nd_pmem(dev) || is_nd_blk(dev)) + return device_for_each_child(dev, data, is_uuid_busy); + return 0; +} + +/** + * nd_is_uuid_unique - verify that no other namespace has @uuid + * @dev: any device on a nvdimm_bus + * @uuid: uuid to check + */ +bool nd_is_uuid_unique(struct device *dev, u8 *uuid) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev)); + if (device_for_each_child(&nvdimm_bus->dev, uuid, + is_namespace_uuid_busy) != 0) + return false; + return true; +} + bool pmem_should_map_pages(struct device *dev) { struct nd_region *nd_region = to_nd_region(dev->parent); @@ -104,20 +157,10 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, struct nd_region *nd_region = to_nd_region(ndns->dev.parent); const char *suffix = NULL; - if (ndns->claim) { - if (is_nd_btt(ndns->claim)) - suffix = "s"; - else if (is_nd_pfn(ndns->claim)) - suffix = "m"; - else - dev_WARN_ONCE(&ndns->dev, 1, - "unknown claim type by %s\n", - dev_name(ndns->claim)); - } + if (ndns->claim && is_nd_btt(ndns->claim)) + suffix = "s"; if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) { - if (!suffix && pmem_should_map_pages(&ndns->dev)) - suffix = "m"; sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : ""); } else if (is_namespace_blk(&ndns->dev)) { struct nd_namespace_blk *nsblk; @@ -791,6 +834,15 @@ static void nd_namespace_pmem_set_size(struct nd_region *nd_region, res->end = nd_region->ndr_start + size - 1; } +static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where) +{ + if (!uuid) { + dev_dbg(dev, "%s: uuid not set\n", where); + return true; + } + return false; +} + static ssize_t __size_store(struct device *dev, unsigned long long val) { resource_size_t allocated = 0, available = 0; @@ -820,8 +872,12 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) * We need a uuid for the allocation-label and dimm(s) on which * to store the label. */ - if (!uuid || nd_region->ndr_mappings == 0) + if (uuid_not_set(uuid, dev, __func__)) return -ENXIO; + if (nd_region->ndr_mappings == 0) { + dev_dbg(dev, "%s: not associated with dimm(s)\n", __func__); + return -ENXIO; + } div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder); if (remainder) { @@ -1211,6 +1267,29 @@ static ssize_t holder_show(struct device *dev, } static DEVICE_ATTR_RO(holder); +static ssize_t mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + struct device *claim; + char *mode; + ssize_t rc; + + device_lock(dev); + claim = ndns->claim; + if (pmem_should_map_pages(dev) || (claim && is_nd_pfn(claim))) + mode = "memory"; + else if (claim && is_nd_btt(claim)) + mode = "safe"; + else + mode = "raw"; + rc = sprintf(buf, "%s\n", mode); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(mode); + static ssize_t force_raw_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -1234,6 +1313,7 @@ static DEVICE_ATTR_RW(force_raw); static struct attribute *nd_namespace_attributes[] = { &dev_attr_nstype.attr, &dev_attr_size.attr, + &dev_attr_mode.attr, &dev_attr_uuid.attr, &dev_attr_holder.attr, &dev_attr_resource.attr, @@ -1267,7 +1347,8 @@ static umode_t namespace_visible(struct kobject *kobj, if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr || a == &dev_attr_holder.attr - || a == &dev_attr_force_raw.attr) + || a == &dev_attr_force_raw.attr + || a == &dev_attr_mode.attr) return a->mode; return 0; @@ -1343,14 +1424,19 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) struct nd_namespace_pmem *nspm; nspm = to_nd_namespace_pmem(&ndns->dev); - if (!nspm->uuid) { - dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__); + if (uuid_not_set(nspm->uuid, &ndns->dev, __func__)) return ERR_PTR(-ENODEV); - } } else if (is_namespace_blk(&ndns->dev)) { struct nd_namespace_blk *nsblk; nsblk = to_nd_namespace_blk(&ndns->dev); + if (uuid_not_set(nsblk->uuid, &ndns->dev, __func__)) + return ERR_PTR(-ENODEV); + if (!nsblk->lbasize) { + dev_dbg(&ndns->dev, "%s: sector size not set\n", + __func__); + return ERR_PTR(-ENODEV); + } if (!nd_namespace_blk_validate(nsblk)) return ERR_PTR(-ENODEV); } @@ -1689,6 +1775,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region) nd_device_register(nd_region->ns_seed); } +void nd_region_create_pfn_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->pfn_seed = nd_pfn_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->pfn_seed) + dev_err(&nd_region->dev, "failed to create pfn namespace\n"); +} + void nd_region_create_btt_seed(struct nd_region *nd_region) { WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 159aed532042..1d1500f3d8b5 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -30,6 +30,7 @@ struct nvdimm_bus { struct list_head list; struct device dev; int id, probe_active; + struct list_head poison_list; struct mutex reconfig_mutex; }; @@ -52,6 +53,7 @@ void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); struct nd_region; void nd_region_create_blk_seed(struct nd_region *nd_region); void nd_region_create_btt_seed(struct nd_region *nd_region); +void nd_region_create_pfn_seed(struct nd_region *nd_region); void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 417e521d299c..ba1633b9da31 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -29,13 +29,12 @@ enum { ND_MAX_LANES = 256, SECTOR_SHIFT = 9, INT_LBASIZE_ALIGNMENT = 64, -#if IS_ENABLED(CONFIG_NVDIMM_PFN) - ND_PFN_ALIGN = PAGES_PER_SECTION * PAGE_SIZE, - ND_PFN_MASK = ND_PFN_ALIGN - 1, -#else - ND_PFN_ALIGN = 0, - ND_PFN_MASK = 0, -#endif +}; + +struct nd_poison { + u64 start; + u64 length; + struct list_head list; }; struct nvdimm_drvdata { @@ -153,6 +152,7 @@ struct nd_pfn { int id; u8 *uuid; struct device dev; + unsigned long align; unsigned long npfns; enum nd_pfn_mode mode; struct nd_pfn_sb *pfn_sb; @@ -262,6 +262,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); +void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns, + struct badblocks *bb, resource_size_t offset); int nd_blk_region_init(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 71805a1aa0f3..f9b674bc49db 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -103,6 +103,52 @@ static ssize_t mode_store(struct device *dev, } static DEVICE_ATTR_RW(mode); +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + return sprintf(buf, "%lx\n", nd_pfn->align); +} + +static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf) +{ + unsigned long val; + int rc; + + rc = kstrtoul(buf, 0, &val); + if (rc) + return rc; + + if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G) + return -EINVAL; + + if (nd_pfn->dev.driver) + return -EBUSY; + else + nd_pfn->align = val; + + return 0; +} + +static ssize_t align_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = __align_store(nd_pfn, buf); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(align); + static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -164,6 +210,7 @@ static struct attribute *nd_pfn_attributes[] = { &dev_attr_mode.attr, &dev_attr_namespace.attr, &dev_attr_uuid.attr, + &dev_attr_align.attr, NULL, }; @@ -179,7 +226,6 @@ static const struct attribute_group *nd_pfn_attribute_groups[] = { }; static struct device *__nd_pfn_create(struct nd_region *nd_region, - u8 *uuid, enum nd_pfn_mode mode, struct nd_namespace_common *ndns) { struct nd_pfn *nd_pfn; @@ -199,10 +245,8 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region, return NULL; } - nd_pfn->mode = mode; - if (uuid) - uuid = kmemdup(uuid, 16, GFP_KERNEL); - nd_pfn->uuid = uuid; + nd_pfn->mode = PFN_MODE_NONE; + nd_pfn->align = HPAGE_SIZE; dev = &nd_pfn->dev; dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id); dev->parent = &nd_region->dev; @@ -220,8 +264,7 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region, struct device *nd_pfn_create(struct nd_region *nd_region) { - struct device *dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, - NULL); + struct device *dev = __nd_pfn_create(nd_region, NULL); if (dev) __nd_device_register(dev); @@ -230,10 +273,11 @@ struct device *nd_pfn_create(struct nd_region *nd_region) int nd_pfn_validate(struct nd_pfn *nd_pfn) { - struct nd_namespace_common *ndns = nd_pfn->ndns; - struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; - struct nd_namespace_io *nsio; u64 checksum, offset; + struct nd_namespace_io *nsio; + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + struct nd_namespace_common *ndns = nd_pfn->ndns; + const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev); if (!pfn_sb || !ndns) return -ENODEV; @@ -241,10 +285,6 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) if (!is_nd_pmem(nd_pfn->dev.parent)) return -ENODEV; - /* section alignment for simple hotplug */ - if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN) - return -ENODEV; - if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb))) return -ENXIO; @@ -257,6 +297,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) return -ENODEV; pfn_sb->checksum = cpu_to_le64(checksum); + if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0) + return -ENODEV; + switch (le32_to_cpu(pfn_sb->mode)) { case PFN_MODE_RAM: break; @@ -278,6 +321,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) return -EINVAL; } + if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) { + dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n", + nd_pfn->align, nvdimm_namespace_capacity(ndns)); + return -EINVAL; + } + /* * These warnings are verbose because they can only trigger in * the case where the physical address alignment of the @@ -286,17 +335,19 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) */ offset = le64_to_cpu(pfn_sb->dataoff); nsio = to_nd_namespace_io(&ndns->dev); - if (nsio->res.start & ND_PFN_MASK) { - dev_err(&nd_pfn->dev, - "init failed: %s not section aligned\n", - dev_name(&ndns->dev)); - return -EBUSY; - } else if (offset >= resource_size(&nsio->res)) { + if (offset >= resource_size(&nsio->res)) { dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n", dev_name(&ndns->dev)); return -EBUSY; } + nd_pfn->align = 1UL << ilog2(offset); + if (!is_power_of_2(offset) || offset < PAGE_SIZE) { + dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n", + offset); + return -ENXIO; + } + return 0; } EXPORT_SYMBOL(nd_pfn_validate); @@ -313,7 +364,7 @@ int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata) return -ENODEV; nvdimm_bus_lock(&ndns->dev); - dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, ndns); + dev = __nd_pfn_create(nd_region, ndns); nvdimm_bus_unlock(&ndns->dev); if (!dev) return -ENOMEM; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 8ee79893d2f5..b493ff3fccb2 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -23,6 +23,7 @@ #include <linux/module.h> #include <linux/memory_hotplug.h> #include <linux/moduleparam.h> +#include <linux/badblocks.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/pmem.h> @@ -41,11 +42,25 @@ struct pmem_device { phys_addr_t data_offset; void __pmem *virt_addr; size_t size; + struct badblocks bb; }; static int pmem_major; -static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, +static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) +{ + if (bb->count) { + sector_t first_bad; + int num_bad; + + return !!badblocks_check(bb, sector, len / 512, &first_bad, + &num_bad); + } + + return false; +} + +static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, int rw, sector_t sector) { @@ -54,6 +69,8 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, void __pmem *pmem_addr = pmem->virt_addr + pmem_off; if (rw == READ) { + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) + return -EIO; memcpy_from_pmem(mem + off, pmem_addr, len); flush_dcache_page(page); } else { @@ -62,10 +79,12 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, } kunmap_atomic(mem); + return 0; } static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { + int rc = 0; bool do_acct; unsigned long start; struct bio_vec bvec; @@ -74,9 +93,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) struct pmem_device *pmem = bdev->bd_disk->private_data; do_acct = nd_iostat_start(bio, &start); - bio_for_each_segment(bvec, bio, iter) - pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, - bio_data_dir(bio), iter.bi_sector); + bio_for_each_segment(bvec, bio, iter) { + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, + bvec.bv_offset, bio_data_dir(bio), + iter.bi_sector); + if (rc) { + bio->bi_error = rc; + break; + } + } if (do_acct) nd_iostat_end(bio, start); @@ -91,13 +116,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct page *page, int rw) { struct pmem_device *pmem = bdev->bd_disk->private_data; + int rc; - pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); + rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); if (rw & WRITE) wmb_pmem(); - page_endio(page, rw & WRITE, 0); - return 0; + /* + * The ->rw_page interface is subtle and tricky. The core + * retries on any error, so we can only invoke page_endio() in + * the successful completion case. Otherwise, we'll see crashes + * caused by double completion. + */ + if (rc == 0) + page_endio(page, rw & WRITE, 0); + + return rc; } static long pmem_direct_access(struct block_device *bdev, sector_t sector, @@ -195,7 +229,12 @@ static int pmem_attach_disk(struct device *dev, disk->driverfs_dev = dev; set_capacity(disk, (pmem->size - pmem->data_offset) / 512); pmem->pmem_disk = disk; + devm_exit_badblocks(dev, &pmem->bb); + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset); + disk->bb = &pmem->bb; add_disk(disk); revalidate_disk(disk); @@ -212,9 +251,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns, return -EFAULT; } - if (rw == READ) + if (rw == READ) { + unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); + + if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align))) + return -EIO; memcpy_from_pmem(buf, pmem->virt_addr + offset, size); - else { + } else { memcpy_to_pmem(pmem->virt_addr + offset, buf, size); wmb_pmem(); } @@ -238,14 +281,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) nd_pfn->pfn_sb = pfn_sb; rc = nd_pfn_validate(nd_pfn); - if (rc == 0 || rc == -EBUSY) + if (rc == -ENODEV) + /* no info block, do init */; + else return rc; - /* section alignment for simple hotplug */ - if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN - || pmem->phys_addr & ND_PFN_MASK) - return -ENODEV; - nd_region = to_nd_region(nd_pfn->dev.parent); if (nd_region->ro) { dev_info(&nd_pfn->dev, @@ -263,9 +303,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * ->direct_access() to those that are included in the memmap. */ if (nd_pfn->mode == PFN_MODE_PMEM) - offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE); + offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align); else if (nd_pfn->mode == PFN_MODE_RAM) - offset = SZ_8K; + offset = ALIGN(SZ_8K, nd_pfn->align); else goto err; @@ -275,6 +315,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) pfn_sb->npfns = cpu_to_le64(npfns); memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN); memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); + memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); pfn_sb->version_major = cpu_to_le16(1); checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); @@ -326,21 +367,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) if (rc) return rc; - if (PAGE_SIZE != SZ_4K) { - dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n"); - return -ENXIO; - } - if (nsio->res.start & ND_PFN_MASK) { - dev_err(dev, "%s not memory hotplug section aligned\n", - dev_name(&ndns->dev)); - return -ENXIO; - } - pfn_sb = nd_pfn->pfn_sb; offset = le64_to_cpu(pfn_sb->dataoff); nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode); if (nd_pfn->mode == PFN_MODE_RAM) { - if (offset != SZ_8K) + if (offset < SZ_8K) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); altmap = NULL; @@ -389,6 +420,9 @@ static int nd_pmem_probe(struct device *dev) pmem->ndns = ndns; dev_set_drvdata(dev, pmem); ndns->rw_bytes = pmem_rw_bytes; + if (devm_init_badblocks(dev, &pmem->bb)) + return -ENOMEM; + nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); if (is_nd_btt(dev)) return nvdimm_namespace_attach_btt(ndns); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 529f3f02e7b2..139bf71ca549 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -134,62 +134,6 @@ int nd_region_to_nstype(struct nd_region *nd_region) } EXPORT_SYMBOL(nd_region_to_nstype); -static int is_uuid_busy(struct device *dev, void *data) -{ - struct nd_region *nd_region = to_nd_region(dev->parent); - u8 *uuid = data; - - switch (nd_region_to_nstype(nd_region)) { - case ND_DEVICE_NAMESPACE_PMEM: { - struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - - if (!nspm->uuid) - break; - if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0) - return -EBUSY; - break; - } - case ND_DEVICE_NAMESPACE_BLK: { - struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); - - if (!nsblk->uuid) - break; - if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0) - return -EBUSY; - break; - } - default: - break; - } - - return 0; -} - -static int is_namespace_uuid_busy(struct device *dev, void *data) -{ - if (is_nd_pmem(dev) || is_nd_blk(dev)) - return device_for_each_child(dev, data, is_uuid_busy); - return 0; -} - -/** - * nd_is_uuid_unique - verify that no other namespace has @uuid - * @dev: any device on a nvdimm_bus - * @uuid: uuid to check - */ -bool nd_is_uuid_unique(struct device *dev, u8 *uuid) -{ - struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); - - if (!nvdimm_bus) - return false; - WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev)); - if (device_for_each_child(&nvdimm_bus->dev, uuid, - is_namespace_uuid_busy) != 0) - return false; - return true; -} - static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -406,6 +350,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) struct nd_interleave_set *nd_set = nd_region->nd_set; int type = nd_region_to_nstype(nd_region); + if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr) + return 0; + if (a != &dev_attr_set_cookie.attr && a != &dev_attr_available_size.attr) return a->mode; @@ -487,6 +434,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, nd_region_create_blk_seed(nd_region); nvdimm_bus_unlock(dev); } + if (is_nd_pfn(dev) && probe) { + nd_region = to_nd_region(dev->parent); + nvdimm_bus_lock(dev); + if (nd_region->pfn_seed == dev) + nd_region_create_pfn_seed(nd_region); + nvdimm_bus_unlock(dev); + } } void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev) diff --git a/fs/block_dev.c b/fs/block_dev.c index 01b8e0d4b4ff..d878e4860fb7 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } +static struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct inode *inode = bdev_file_inode(file); if (IS_DAX(inode)) return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, @@ -338,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) { - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t retval; mutex_lock(&bd_inode->i_mutex); @@ -349,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *bd_inode = filp->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); int error; @@ -1224,8 +1229,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + if (!blkdev_dax_capable(bdev)) + bdev->bd_inode->i_flags &= ~S_DAX; + } /* * If the device is invalidated, rescan partition @@ -1239,6 +1247,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) else if (ret == -ENOMEDIUM) invalidate_partitions(disk, bdev); } + if (ret) goto out_clear; } else { @@ -1259,12 +1268,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - /* - * If the partition is not aligned on a page - * boundary, we can't do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) || - (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + if (!blkdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { @@ -1599,14 +1603,14 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); blkdev_put(bdev, filp->f_mode); return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - struct block_device *bdev = I_BDEV(file->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); fmode_t mode = file->f_mode; /* @@ -1631,7 +1635,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; ssize_t ret; @@ -1663,7 +1667,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter); ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; @@ -1702,13 +1706,101 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; +#ifdef CONFIG_FS_DAX +/* + * In the raw block case we do not need to contend with truncation nor + * unwritten file extents. Without those concerns there is no need for + * additional locking beyond the mmap_sem context that these routines + * are already executing under. + * + * Note, there is no protection if the block device is dynamically + * resized (partition grow/shrink) during a fault. A stable block device + * size is already not enforced in the blkdev_direct_IO path. + * + * For DAX, it is the responsibility of the block device driver to + * ensure the whole-disk device size is stable while requests are in + * flight. + * + * Finally, unlike the filemap_page_mkwrite() case there is no + * filesystem superblock to sync against freezing. We still include a + * pfn_mkwrite callback for dax drivers to receive write fault + * notifications. + */ +static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return __dax_fault(vma, vmf, blkdev_get_block, NULL); +} + +static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); +} + +static void blkdev_vm_open(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + mutex_unlock(&bd_inode->i_mutex); +} + +static void blkdev_vm_close(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count--; + mutex_unlock(&bd_inode->i_mutex); +} + +static const struct vm_operations_struct blkdev_dax_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = blkdev_dax_fault, + .pmd_fault = blkdev_dax_pmd_fault, + .pfn_mkwrite = blkdev_dax_fault, +}; + +static const struct vm_operations_struct blkdev_default_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, +}; + +static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(bd_inode); + + file_accessed(file); + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + if (IS_DAX(bd_inode)) { + vma->vm_ops = &blkdev_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + } else { + vma->vm_ops = &blkdev_default_vm_ops; + } + mutex_unlock(&bd_inode->i_mutex); + + return 0; +} +#else +#define blkdev_mmap generic_file_mmap +#endif + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = generic_file_mmap, + .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h new file mode 100644 index 000000000000..c3bdf8c59480 --- /dev/null +++ b/include/linux/badblocks.h @@ -0,0 +1,65 @@ +#ifndef _LINUX_BADBLOCKS_H +#define _LINUX_BADBLOCKS_H + +#include <linux/seqlock.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/stddef.h> +#include <linux/types.h> + +#define BB_LEN_MASK (0x00000000000001FFULL) +#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) +#define BB_ACK_MASK (0x8000000000000000ULL) +#define BB_MAX_LEN 512 +#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) +#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) +#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) +#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + +/* Bad block numbers are stored sorted in a single page. + * 64bits is used for each block or extent. + * 54 bits are sector number, 9 bits are extent size, + * 1 bit is an 'acknowledged' flag. + */ +#define MAX_BADBLOCKS (PAGE_SIZE/8) + +struct badblocks { + struct device *dev; /* set by devm_init_badblocks */ + int count; /* count of bad blocks */ + int unacked_exist; /* there probably are unacknowledged + * bad blocks. This is only cleared + * when a read discovers none + */ + int shift; /* shift from sectors to block size + * a -ve shift means badblocks are + * disabled.*/ + u64 *page; /* badblock list */ + int changed; + seqlock_t lock; + sector_t sector; + sector_t size; /* in sectors */ +}; + +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged); +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors); +void ack_all_badblocks(struct badblocks *bb); +ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); +ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, + int unack); +int badblocks_init(struct badblocks *bb, int enable); +void badblocks_exit(struct badblocks *bb); +struct device; +int devm_init_badblocks(struct device *dev, struct badblocks *bb); +static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb) +{ + if (bb->dev != dev) { + dev_WARN_ONCE(dev, 1, "%s: badblocks instance not associated\n", + __func__); + return; + } + badblocks_exit(bb); +} +#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 731262c3fbb7..eb73d74ed992 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -483,6 +483,9 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; +#ifdef CONFIG_FS_DAX + int bd_map_count; +#endif }; /* @@ -2280,6 +2283,14 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); +#ifdef CONFIG_FS_DAX +extern bool blkdev_dax_capable(struct block_device *bdev); +#else +static inline bool blkdev_dax_capable(struct block_device *bdev) +{ + return false; +} +#endif extern struct super_block *blockdev_superblock; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 847cc1d91634..5c706765404a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -162,6 +162,7 @@ struct disk_part_tbl { }; struct disk_events; +struct badblocks; #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -213,6 +214,7 @@ struct gendisk { struct kobject integrity_kobj; #endif /* CONFIG_BLK_DEV_INTEGRITY */ int node_id; + struct badblocks *bb; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 3f021dc5da8c..bed40dff0e86 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } +int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc, struct module *module); #define nvdimm_bus_register(parent, desc) \ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b38e647664a0..b8a5b3b86ad6 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -188,6 +188,8 @@ struct inodes_stat_t { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) +#define BLKDAXSET _IO(0x12,128) +#define BLKDAXGET _IO(0x12,129) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ diff --git a/kernel/resource.c b/kernel/resource.c index f150dbbe6f62..09c0597840b0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr) break; if (p->end < addr) continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { + /* + * A resource is exclusive if IORESOURCE_EXCLUSIVE is set + * or CONFIG_IO_STRICT_DEVMEM is enabled and the + * resource is busy. + */ + if ((p->flags & IORESOURCE_BUSY) == 0) + continue; + if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) + || p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e2a236a7341f..ee1ac1cc082c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1886,3 +1886,42 @@ source "samples/Kconfig" source "lib/Kconfig.kgdb" +config ARCH_HAS_DEVMEM_IS_ALLOWED + bool + +config STRICT_DEVMEM + bool "Filter access to /dev/mem" + depends on MMU + depends on ARCH_HAS_DEVMEM_IS_ALLOWED + default y if TILE || PPC + ---help--- + If this option is disabled, you allow userspace (root) access to all + of memory, including kernel and userspace memory. Accidental + access to this is obviously disastrous, but specific access can + be used by people debugging the kernel. Note that with PAT support + enabled, even in this case there are restrictions on /dev/mem + use due to the cache aliasing requirements. + + If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem + file only allows userspace access to PCI space and the BIOS code and + data regions. This is sufficient for dosemu and X and all common + users of /dev/mem. + + If in doubt, say Y. + +config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM + default STRICT_DEVMEM + ---help--- + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that + range. Accidental access to this is obviously disastrous, but + specific access can be used by people debugging kernel drivers. + + If this option is switched on, the /dev/mem file only allows + userspace access to *idle* io-memory ranges (see /proc/iomem) This + may break traditional users of /dev/mem (dosemu, legacy X, etc...) + if the driver using a given range cannot be disabled. + + If in doubt, say Y. diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index 38b00ecb2ed5..a34bfd0c8928 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -9,6 +9,8 @@ ldflags-y += --wrap=memunmap ldflags-y += --wrap=__devm_request_region ldflags-y += --wrap=__request_region ldflags-y += --wrap=__release_region +ldflags-y += --wrap=devm_memremap_pages +ldflags-y += --wrap=phys_to_pfn_t DRIVERS := ../../../drivers NVDIMM_SRC := $(DRIVERS)/nvdimm diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index b7251314bbc0..7ec7df9e7fc7 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/io.h> +#include <linux/mm.h> #include "nfit_test.h" static LIST_HEAD(iomap_head); @@ -41,7 +42,7 @@ void nfit_test_teardown(void) } EXPORT_SYMBOL(nfit_test_teardown); -static struct nfit_test_resource *get_nfit_res(resource_size_t resource) +static struct nfit_test_resource *__get_nfit_res(resource_size_t resource) { struct iomap_ops *ops; @@ -51,14 +52,22 @@ static struct nfit_test_resource *get_nfit_res(resource_size_t resource) return NULL; } -void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, - void __iomem *(*fallback_fn)(resource_size_t, unsigned long)) +static struct nfit_test_resource *get_nfit_res(resource_size_t resource) { - struct nfit_test_resource *nfit_res; + struct nfit_test_resource *res; rcu_read_lock(); - nfit_res = get_nfit_res(offset); + res = __get_nfit_res(resource); rcu_read_unlock(); + + return res; +} + +void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, + void __iomem *(*fallback_fn)(resource_size_t, unsigned long)) +{ + struct nfit_test_resource *nfit_res = get_nfit_res(offset); + if (nfit_res) return (void __iomem *) nfit_res->buf + offset - nfit_res->res->start; @@ -68,11 +77,8 @@ void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, void __iomem *__wrap_devm_ioremap_nocache(struct device *dev, resource_size_t offset, unsigned long size) { - struct nfit_test_resource *nfit_res; + struct nfit_test_resource *nfit_res = get_nfit_res(offset); - rcu_read_lock(); - nfit_res = get_nfit_res(offset); - rcu_read_unlock(); if (nfit_res) return (void __iomem *) nfit_res->buf + offset - nfit_res->res->start; @@ -83,25 +89,58 @@ EXPORT_SYMBOL(__wrap_devm_ioremap_nocache); void *__wrap_devm_memremap(struct device *dev, resource_size_t offset, size_t size, unsigned long flags) { - struct nfit_test_resource *nfit_res; + struct nfit_test_resource *nfit_res = get_nfit_res(offset); - rcu_read_lock(); - nfit_res = get_nfit_res(offset); - rcu_read_unlock(); if (nfit_res) return nfit_res->buf + offset - nfit_res->res->start; return devm_memremap(dev, offset, size, flags); } EXPORT_SYMBOL(__wrap_devm_memremap); +#ifdef __HAVE_ARCH_PTE_DEVMAP +#include <linux/memremap.h> +#include <linux/pfn_t.h> + +void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res, + struct percpu_ref *ref, struct vmem_altmap *altmap) +{ + resource_size_t offset = res->start; + struct nfit_test_resource *nfit_res = get_nfit_res(offset); + + if (nfit_res) + return nfit_res->buf + offset - nfit_res->res->start; + return devm_memremap_pages(dev, res, ref, altmap); +} +EXPORT_SYMBOL(__wrap_devm_memremap_pages); + +pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +{ + struct nfit_test_resource *nfit_res = get_nfit_res(addr); + + if (nfit_res) + flags &= ~PFN_MAP; + return phys_to_pfn_t(addr, flags); +} +EXPORT_SYMBOL(__wrap_phys_to_pfn_t); +#else +/* to be removed post 4.5-rc1 */ +void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res) +{ + resource_size_t offset = res->start; + struct nfit_test_resource *nfit_res = get_nfit_res(offset); + + if (nfit_res) + return nfit_res->buf + offset - nfit_res->res->start; + return devm_memremap_pages(dev, res); +} +EXPORT_SYMBOL(__wrap_devm_memremap_pages); +#endif + void *__wrap_memremap(resource_size_t offset, size_t size, unsigned long flags) { - struct nfit_test_resource *nfit_res; + struct nfit_test_resource *nfit_res = get_nfit_res(offset); - rcu_read_lock(); - nfit_res = get_nfit_res(offset); - rcu_read_unlock(); if (nfit_res) return nfit_res->buf + offset - nfit_res->res->start; return memremap(offset, size, flags); @@ -110,11 +149,8 @@ EXPORT_SYMBOL(__wrap_memremap); void __wrap_devm_memunmap(struct device *dev, void *addr) { - struct nfit_test_resource *nfit_res; + struct nfit_test_resource *nfit_res = get_nfit_res((long) addr); - rcu_read_lock(); - nfit_res = get_nfit_res((unsigned long) addr); - rcu_read_unlock(); if (nfit_res) return; return devm_memunmap(dev, addr); @@ -135,11 +171,7 @@ EXPORT_SYMBOL(__wrap_ioremap_wc); void __wrap_iounmap(volatile void __iomem *addr) { - struct nfit_test_resource *nfit_res; - - rcu_read_lock(); - nfit_res = get_nfit_res((unsigned long) addr); - rcu_read_unlock(); + struct nfit_test_resource *nfit_res = get_nfit_res((long) addr); if (nfit_res) return; return iounmap(addr); @@ -148,11 +180,8 @@ EXPORT_SYMBOL(__wrap_iounmap); void __wrap_memunmap(void *addr) { - struct nfit_test_resource *nfit_res; + struct nfit_test_resource *nfit_res = get_nfit_res((long) addr); - rcu_read_lock(); - nfit_res = get_nfit_res((unsigned long) addr); - rcu_read_unlock(); if (nfit_res) return; return memunmap(addr); @@ -166,9 +195,7 @@ static struct resource *nfit_test_request_region(struct device *dev, struct nfit_test_resource *nfit_res; if (parent == &iomem_resource) { - rcu_read_lock(); nfit_res = get_nfit_res(start); - rcu_read_unlock(); if (nfit_res) { struct resource *res = nfit_res->res + 1; @@ -218,9 +245,7 @@ void __wrap___release_region(struct resource *parent, resource_size_t start, struct nfit_test_resource *nfit_res; if (parent == &iomem_resource) { - rcu_read_lock(); nfit_res = get_nfit_res(start); - rcu_read_unlock(); if (nfit_res) { struct resource *res = nfit_res->res + 1; diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 51cf8256c6cd..90bd2ea41032 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -248,6 +248,8 @@ static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd, nd_cmd->out_length = 256; nd_cmd->num_records = 0; + nd_cmd->address = 0; + nd_cmd->length = -1ULL; nd_cmd->status = 0; return 0; @@ -1088,6 +1090,8 @@ static void nfit_test1_setup(struct nfit_test *t) struct acpi_nfit_memory_map *memdev; struct acpi_nfit_control_region *dcr; struct acpi_nfit_system_address *spa; + struct nvdimm_bus_descriptor *nd_desc; + struct acpi_nfit_desc *acpi_desc; offset = 0; /* spa0 (flat range with no bdw aliasing) */ @@ -1135,6 +1139,13 @@ static void nfit_test1_setup(struct nfit_test *t) dcr->command_size = 0; dcr->status_offset = 0; dcr->status_size = 0; + + acpi_desc = &t->acpi_desc; + set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en); + nd_desc = &acpi_desc->nd_desc; + nd_desc->ndctl = nfit_test_ctl; } static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, |