From ad5fb870c486d932a1749d7853dd70f436a7e03f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 3 Apr 2015 12:05:28 -0400 Subject: e820, efi: add ACPI 6.0 persistent memory types ACPI 6.0 formalizes e820-type-7 and efi-type-14 as persistent memory. Mark it "reserved" and allow it to be claimed by a persistent memory device driver. This definition is in addition to the Linux kernel's existing type-12 definition that was recently added in support of shipping platforms with NVDIMM support that predate ACPI 6.0 (which now classifies type-12 as OEM reserved). Note, /proc/iomem can be consulted for differentiating legacy "Persistent Memory (legacy)" E820_PRAM vs standard "Persistent Memory" E820_PMEM. Cc: Boaz Harrosh Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jens Axboe Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Thomas Gleixner Acked-by: Jeff Moyer Acked-by: Andy Lutomirski Reviewed-by: Ross Zwisler Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- arch/arm64/kernel/efi.c | 1 + arch/ia64/kernel/efi.c | 4 ++++ arch/x86/boot/compressed/eboot.c | 4 ++++ arch/x86/include/uapi/asm/e820.h | 1 + arch/x86/kernel/e820.c | 28 ++++++++++++++++++++++++---- arch/x86/platform/efi/efi.c | 3 +++ 6 files changed, 37 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index ab21e0d58278..9d4aa18f2a82 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -158,6 +158,7 @@ static __init int is_reserve_region(efi_memory_desc_t *md) case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: return 0; default: break; diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index c52d7540dc05..5f6be9dd6968 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -1223,6 +1223,10 @@ efi_initialize_iomem_resources(struct resource *code_resource, flags |= IORESOURCE_DISABLED; break; + case EFI_PERSISTENT_MEMORY: + name = "Persistent Memory"; + break; + case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: case EFI_RUNTIME_SERVICES_DATA: diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 48304b89b601..2c82bd150d43 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1224,6 +1224,10 @@ static efi_status_t setup_e820(struct boot_params *params, e820_type = E820_NVS; break; + case EFI_PERSISTENT_MEMORY: + e820_type = E820_PMEM; + break; + default: continue; } diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 960a8a9dc4ab..0f457e6eab18 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -32,6 +32,7 @@ #define E820_ACPI 3 #define E820_NVS 4 #define E820_UNUSABLE 5 +#define E820_PMEM 7 /* * This is a non-standardized way to represent ADR or NVDIMM regions that diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e2ce85db2283..c857d53269dd 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -149,6 +149,7 @@ static void __init e820_print_type(u32 type) case E820_UNUSABLE: printk(KERN_CONT "unusable"); break; + case E820_PMEM: case E820_PRAM: printk(KERN_CONT "persistent (type %u)", type); break; @@ -918,11 +919,32 @@ static inline const char *e820_type_to_string(int e820_type) case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; case E820_UNUSABLE: return "Unusable memory"; - case E820_PRAM: return "Persistent RAM"; + case E820_PRAM: return "Persistent Memory (legacy)"; + case E820_PMEM: return "Persistent Memory"; default: return "reserved"; } } +static bool do_mark_busy(u32 type, struct resource *res) +{ + /* this is the legacy bios/dos rom-shadow + mmio region */ + if (res->start < (1ULL<<20)) + return true; + + /* + * Treat persistent memory like device memory, i.e. reserve it + * for exclusive use of a driver + */ + switch (type) { + case E820_RESERVED: + case E820_PRAM: + case E820_PMEM: + return false; + default: + return true; + } +} + /* * Mark e820 reserved areas as busy for the resource manager. */ @@ -952,9 +974,7 @@ void __init e820_reserve_resources(void) * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (((e820.map[i].type != E820_RESERVED) && - (e820.map[i].type != E820_PRAM)) || - res->start < (1ULL<<20)) { + if (do_mark_busy(e820.map[i].type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 02744df576d5..fe01ae37a2a4 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -153,6 +153,9 @@ static void __init do_add_efi_memmap(void) case EFI_UNUSABLE_MEMORY: e820_type = E820_UNUSABLE; break; + case EFI_PERSISTENT_MEMORY: + e820_type = E820_PMEM; + break; default: /* * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE -- cgit v1.2.3 From 9f53f9fa4ad1d8bddd4d14359cdabc531aedffe8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 9 Jun 2015 15:33:45 -0400 Subject: libnvdimm, pmem: add libnvdimm support to the pmem driver nd_pmem attaches to persistent memory regions and namespaces emitted by the libnvdimm subsystem, and, same as the original pmem driver, presents the system-physical-address range as a block device. The existing e820-type-12 to pmem setup is converted to an nvdimm_bus that emits an nd_namespace_io device. Note that the X in 'pmemX' is now derived from the parent region. This provides some stability to the pmem devices names from boot-to-boot. The minor numbers are also more predictable by passing 0 to alloc_disk(). Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Signed-off-by: Ross Zwisler Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- arch/x86/Kconfig | 3 ++ arch/x86/kernel/pmem.c | 92 ++++++++++++++++++++++++++++++++------------------ drivers/nvdimm/pmem.c | 68 ++++++++++++++++++------------------- 3 files changed, 96 insertions(+), 67 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 226d5696e1d1..1a2cbf641667 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1424,6 +1424,9 @@ source "mm/Kconfig" config X86_PMEM_LEGACY bool "Support non-standard NVDIMMs and ADR protected memory" + depends on PHYS_ADDR_T_64BIT + depends on BLK_DEV + select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used by the Intel Sandy Bridge-EP reference BIOS as protected memory. diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 3420c874ddc5..0f4ef472ab9e 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -1,53 +1,81 @@ /* * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. */ -#include #include -#include +#include +#include #include -#include -#include -static __init void register_pmem_device(struct resource *res) +static void e820_pmem_release(struct device *dev) { - struct platform_device *pdev; - int error; + struct nvdimm_bus *nvdimm_bus = dev->platform_data; - pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO); - if (!pdev) - return; + if (nvdimm_bus) + nvdimm_bus_unregister(nvdimm_bus); +} - error = platform_device_add_resources(pdev, res, 1); - if (error) - goto out_put_pdev; +static struct platform_device e820_pmem = { + .name = "e820_pmem", + .id = -1, + .dev = { + .release = e820_pmem_release, + }, +}; - error = platform_device_add(pdev); - if (error) - goto out_put_pdev; - return; +static const struct attribute_group *e820_pmem_attribute_groups[] = { + &nvdimm_bus_attribute_group, + NULL, +}; -out_put_pdev: - dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n"); - platform_device_put(pdev); -} +static const struct attribute_group *e820_pmem_region_attribute_groups[] = { + &nd_region_attribute_group, + &nd_device_attribute_group, + NULL, +}; -static __init int register_pmem_devices(void) +static __init int register_e820_pmem(void) { - int i; + static struct nvdimm_bus_descriptor nd_desc; + struct device *dev = &e820_pmem.dev; + struct nvdimm_bus *nvdimm_bus; + int rc, i; + + rc = platform_device_register(&e820_pmem); + if (rc) + return rc; + + nd_desc.attr_groups = e820_pmem_attribute_groups; + nd_desc.provider_name = "e820"; + nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); + if (!nvdimm_bus) + goto err; + dev->platform_data = nvdimm_bus; for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; + struct resource res = { + .flags = IORESOURCE_MEM, + .start = ei->addr, + .end = ei->addr + ei->size - 1, + }; + struct nd_region_desc ndr_desc; + + if (ei->type != E820_PRAM) + continue; - if (ei->type == E820_PRAM) { - struct resource res = { - .flags = IORESOURCE_MEM, - .start = ei->addr, - .end = ei->addr + ei->size - 1, - }; - register_pmem_device(&res); - } + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.res = &res; + ndr_desc.attr_groups = e820_pmem_region_attribute_groups; + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + goto err; } return 0; + + err: + dev_err(dev, "failed to register legacy persistent memory ranges\n"); + platform_device_unregister(&e820_pmem); + return -ENXIO; } -device_initcall(register_pmem_devices); +device_initcall(register_e820_pmem); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index eabf4a8d0085..d46975ed9e40 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -1,7 +1,7 @@ /* * Persistent Memory Driver * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014-2015, Intel Corporation. * Copyright (c) 2015, Christoph Hellwig . * Copyright (c) 2015, Boaz Harrosh . * @@ -23,8 +23,8 @@ #include #include #include - -#define PMEM_MINORS 16 +#include +#include "nd.h" struct pmem_device { struct request_queue *pmem_queue; @@ -37,7 +37,6 @@ struct pmem_device { }; static int pmem_major; -static atomic_t pmem_index; static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, int rw, @@ -118,11 +117,12 @@ static const struct block_device_operations pmem_fops = { .direct_access = pmem_direct_access, }; -static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res) +static struct pmem_device *pmem_alloc(struct device *dev, + struct resource *res, int id) { struct pmem_device *pmem; struct gendisk *disk; - int idx, err; + int err; err = -ENOMEM; pmem = kzalloc(sizeof(*pmem), GFP_KERNEL); @@ -134,7 +134,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res) err = -EINVAL; if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) { - dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size); + dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", + &pmem->phys_addr, pmem->size); goto out_free_dev; } @@ -155,19 +156,17 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res) blk_queue_max_hw_sectors(pmem->pmem_queue, 1024); blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); - disk = alloc_disk(PMEM_MINORS); + disk = alloc_disk(0); if (!disk) goto out_free_queue; - idx = atomic_inc_return(&pmem_index) - 1; - disk->major = pmem_major; - disk->first_minor = PMEM_MINORS * idx; + disk->first_minor = 0; disk->fops = &pmem_fops; disk->private_data = pmem; disk->queue = pmem->pmem_queue; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "pmem%d", idx); + sprintf(disk->disk_name, "pmem%d", id); disk->driverfs_dev = dev; set_capacity(disk, pmem->size >> 9); pmem->pmem_disk = disk; @@ -198,42 +197,38 @@ static void pmem_free(struct pmem_device *pmem) kfree(pmem); } -static int pmem_probe(struct platform_device *pdev) +static int nd_pmem_probe(struct device *dev) { + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); struct pmem_device *pmem; - struct resource *res; - - if (WARN_ON(pdev->num_resources > 1)) - return -ENXIO; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENXIO; - pmem = pmem_alloc(&pdev->dev, res); + pmem = pmem_alloc(dev, &nsio->res, nd_region->id); if (IS_ERR(pmem)) return PTR_ERR(pmem); - platform_set_drvdata(pdev, pmem); + dev_set_drvdata(dev, pmem); return 0; } -static int pmem_remove(struct platform_device *pdev) +static int nd_pmem_remove(struct device *dev) { - struct pmem_device *pmem = platform_get_drvdata(pdev); + struct pmem_device *pmem = dev_get_drvdata(dev); pmem_free(pmem); return 0; } -static struct platform_driver pmem_driver = { - .probe = pmem_probe, - .remove = pmem_remove, - .driver = { - .owner = THIS_MODULE, - .name = "pmem", +MODULE_ALIAS("pmem"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO); +static struct nd_device_driver nd_pmem_driver = { + .probe = nd_pmem_probe, + .remove = nd_pmem_remove, + .drv = { + .name = "nd_pmem", }, + .type = ND_DRIVER_NAMESPACE_IO, }; static int __init pmem_init(void) @@ -244,16 +239,19 @@ static int __init pmem_init(void) if (pmem_major < 0) return pmem_major; - error = platform_driver_register(&pmem_driver); - if (error) + error = nd_driver_register(&nd_pmem_driver); + if (error) { unregister_blkdev(pmem_major, "pmem"); - return error; + return error; + } + + return 0; } module_init(pmem_init); static void pmem_exit(void) { - platform_driver_unregister(&pmem_driver); + driver_unregister(&nd_pmem_driver.drv); unregister_blkdev(pmem_major, "pmem"); } module_exit(pmem_exit); -- cgit v1.2.3 From 41d7a6d637e1440f5410cb43c25a3c41255540c5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:33 -0600 Subject: libnvdimm: Set numa_node to NVDIMM devices ACPI NFIT table has System Physical Address Range Structure entries that describe a proximity ID of each range when ACPI_NFIT_PROXIMITY_VALID is set in the flags. Change acpi_nfit_register_region() to map a proximity ID to its node ID, and set it to a new numa_node field of nd_region_desc, which is then conveyed to the nd_region device. The device core arranges for btt and namespace devices to inherit their node from their parent region. Signed-off-by: Toshi Kani [djbw: move set_dev_node() from region.c to bus.c] Signed-off-by: Dan Williams --- arch/x86/kernel/pmem.c | 1 + drivers/acpi/nfit.c | 6 ++++++ drivers/nvdimm/bus.c | 6 ++++++ drivers/nvdimm/nd.h | 2 +- drivers/nvdimm/region_devs.c | 1 + include/linux/libnvdimm.h | 1 + 6 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 0f4ef472ab9e..64f90f53bb85 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -67,6 +67,7 @@ static __init int register_e820_pmem(void) memset(&ndr_desc, 0, sizeof(ndr_desc)); ndr_desc.res = &res; ndr_desc.attr_groups = e820_pmem_region_attribute_groups; + ndr_desc.numa_node = NUMA_NO_NODE; if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) goto err; } diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 1f6f1b1a54f4..d96c8fe974dd 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -1392,6 +1392,12 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ndr_desc->res = &res; ndr_desc->provider_data = nfit_spa; ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; + if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) + ndr_desc->numa_node = acpi_map_pxm_to_online_node( + spa->proximity_domain); + else + ndr_desc->numa_node = NUMA_NO_NODE; + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nd_mapping *nd_mapping; diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index ec59f1f26d95..205344643852 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -48,6 +48,12 @@ static int to_nd_device_type(struct device *dev) static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env) { + /* + * Ensure that region devices always have their numa node set as + * early as possible. + */ + if (is_nd_pmem(dev) || is_nd_blk(dev)) + set_dev_node(dev, to_nd_region(dev)->numa_node); return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT, to_nd_device_type(dev)); } diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 48b09a210689..c41f53e74277 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -97,7 +97,7 @@ struct nd_region { u16 ndr_mappings; u64 ndr_size; u64 ndr_start; - int id, num_lanes, ro; + int id, num_lanes, ro, numa_node; void *provider_data; struct nd_interleave_set *nd_set; struct nd_percpu_lane __percpu *lane; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 482ee3e4e04a..a5233422f9dc 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -736,6 +736,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->nd_set = ndr_desc->nd_set; nd_region->num_lanes = ndr_desc->num_lanes; nd_region->ro = ro; + nd_region->numa_node = ndr_desc->numa_node; ida_init(&nd_region->ns_ida); ida_init(&nd_region->btt_ida); dev = &nd_region->dev; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index dc799a29ed1a..30b3deaafd51 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -89,6 +89,7 @@ struct nd_region_desc { struct nd_interleave_set *nd_set; void *provider_data; int num_lanes; + int numa_node; }; struct nvdimm_bus; -- cgit v1.2.3 From 61031952f4c89dba1065f7a5b9419badb112554c Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 03:08:39 -0400 Subject: arch, x86: pmem api for ensuring durability of persistent memory updates Based on an original patch by Ross Zwisler [1]. Writes to persistent memory have the potential to be posted to cpu cache, cpu write buffers, and platform write buffers (memory controller) before being committed to persistent media. Provide apis, memcpy_to_pmem(), wmb_pmem(), and memremap_pmem(), to write data to pmem and assert that it is durable in PMEM (a persistent linear address range). A '__pmem' attribute is added so sparse can track proper usage of pointers to pmem. This continues the status quo of pmem being x86 only for 4.2, but reworks to ioremap, and wider implementation of memremap() will enable other archs in 4.3. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Ross Zwisler [djbw: various reworks] Signed-off-by: Dan Williams --- arch/x86/Kconfig | 1 + arch/x86/include/asm/cacheflush.h | 72 ++++++++++++++++++ arch/x86/include/asm/io.h | 6 ++ drivers/nvdimm/pmem.c | 33 ++++---- include/linux/compiler.h | 2 + include/linux/pmem.h | 153 ++++++++++++++++++++++++++++++++++++++ lib/Kconfig | 3 + 7 files changed, 257 insertions(+), 13 deletions(-) create mode 100644 include/linux/pmem.h (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1a2cbf641667..62564ddf7f78 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,6 +27,7 @@ config X86 select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_PMEM_API select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 47c8e32f621a..ec23bb753a3e 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -4,6 +4,7 @@ /* Caches aren't brain-dead on the intel. */ #include #include +#include /* * The set_memory_* API can be used to change various attributes of a virtual @@ -104,4 +105,75 @@ static inline int rodata_test(void) } #endif +#ifdef ARCH_HAS_NOCACHE_UACCESS + +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + int unwritten; + + /* + * We are copying between two kernel buffers, if + * __copy_from_user_inatomic_nocache() returns an error (page + * fault) we would have already reported a general protection fault + * before the WARN+BUG. + */ + unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, + (void __user *) src, n); + if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, unwritten)) + BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ + /* + * wmb() to 'sfence' all previous writes such that they are + * architecturally visible to 'pcommit'. Note, that we've + * already arranged for pmem writes to avoid the cache via + * arch_memcpy_to_pmem(). + */ + wmb(); + pcommit_sfence(); +} + +static inline bool __arch_has_wmb_pmem(void) +{ +#ifdef CONFIG_X86_64 + /* + * We require that wmb() be an 'sfence', that is only guaranteed on + * 64-bit builds + */ + return static_cpu_has(X86_FEATURE_PCOMMIT); +#else + return false; +#endif +} +#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */ +extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); +extern void arch_wmb_pmem(void); + +static inline bool __arch_has_wmb_pmem(void) +{ + return false; +} +#endif + #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 34a5b93704d3..c60c3f3b0183 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -247,6 +247,12 @@ static inline void flush_write_buffers(void) #endif } +static inline void __pmem *arch_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + return (void __force __pmem *) ioremap_cache(offset, size); +} + #endif /* __KERNEL__ */ extern void native_io_delay(void); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 42b766f33e59..ade9eb917a4d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "nd.h" @@ -32,7 +33,7 @@ struct pmem_device { /* One contiguous memory region per device */ phys_addr_t phys_addr; - void *virt_addr; + void __pmem *virt_addr; size_t size; }; @@ -44,13 +45,14 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, { void *mem = kmap_atomic(page); size_t pmem_off = sector << 9; + void __pmem *pmem_addr = pmem->virt_addr + pmem_off; if (rw == READ) { - memcpy(mem + off, pmem->virt_addr + pmem_off, len); + memcpy_from_pmem(mem + off, pmem_addr, len); flush_dcache_page(page); } else { flush_dcache_page(page); - memcpy(pmem->virt_addr + pmem_off, mem + off, len); + memcpy_to_pmem(pmem_addr, mem + off, len); } kunmap_atomic(mem); @@ -71,6 +73,10 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio) bio_data_dir(bio), iter.bi_sector); if (do_acct) nd_iostat_end(bio, start); + + if (bio_data_dir(bio)) + wmb_pmem(); + bio_endio(bio, 0); } @@ -94,7 +100,8 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector, if (!pmem) return -ENODEV; - *kaddr = pmem->virt_addr + offset; + /* FIXME convert DAX to comprehend that this mapping has a lifetime */ + *kaddr = (void __force *) pmem->virt_addr + offset; *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; return pmem->size - offset; @@ -118,6 +125,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, pmem->phys_addr = res->start; pmem->size = resource_size(res); + if (!arch_has_pmem_api()) + dev_warn(dev, "unable to guarantee persistence of writes\n"); if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) { dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", @@ -126,11 +135,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, return ERR_PTR(-EBUSY); } - /* - * Map the memory as non-cachable, as we can't write back the contents - * of the CPU caches in case of a crash. - */ - pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size); + pmem->virt_addr = memremap_pmem(pmem->phys_addr, pmem->size); if (!pmem->virt_addr) { release_mem_region(pmem->phys_addr, pmem->size); kfree(pmem); @@ -195,16 +200,18 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns, } if (rw == READ) - memcpy(buf, pmem->virt_addr + offset, size); - else - memcpy(pmem->virt_addr + offset, buf, size); + memcpy_from_pmem(buf, pmem->virt_addr + offset, size); + else { + memcpy_to_pmem(pmem->virt_addr + offset, buf, size); + wmb_pmem(); + } return 0; } static void pmem_free(struct pmem_device *pmem) { - iounmap(pmem->virt_addr); + memunmap_pmem(pmem->virt_addr); release_mem_region(pmem->phys_addr, pmem->size); kfree(pmem); } diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 867722591be2..9a528d945498 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -21,6 +21,7 @@ # define __rcu __attribute__((noderef, address_space(4))) #else # define __rcu +# define __pmem __attribute__((noderef, address_space(5))) #endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); @@ -42,6 +43,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __cond_lock(x,c) (c) # define __percpu # define __rcu +# define __pmem #endif /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ diff --git a/include/linux/pmem.h b/include/linux/pmem.h new file mode 100644 index 000000000000..f6481a0b1d4f --- /dev/null +++ b/include/linux/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __PMEM_H__ +#define __PMEM_H__ + +#include + +#ifdef CONFIG_ARCH_HAS_PMEM_API +#include +#else +static inline void arch_wmb_pmem(void) +{ + BUG(); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + return false; +} + +static inline void __pmem *arch_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + return NULL; +} + +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + BUG(); +} +#endif + +/* + * Architectures that define ARCH_HAS_PMEM_API must provide + * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(), + * arch_wmb_pmem(), and __arch_has_wmb_pmem(). + */ + +static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) +{ + memcpy(dst, (void __force const *) src, size); +} + +static inline void memunmap_pmem(void __pmem *addr) +{ + iounmap((void __force __iomem *) addr); +} + +/** + * arch_has_wmb_pmem - true if wmb_pmem() ensures durability + * + * For a given cpu implementation within an architecture it is possible + * that wmb_pmem() resolves to a nop. In the case this returns + * false, pmem api users are unable to ensure durability and may want to + * fall back to a different data consistency model, or otherwise notify + * the user. + */ +static inline bool arch_has_wmb_pmem(void) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)) + return __arch_has_wmb_pmem(); + return false; +} + +static inline bool arch_has_pmem_api(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem(); +} + +/* + * These defaults seek to offer decent performance and minimize the + * window between i/o completion and writes being durable on media. + * However, it is undefined / architecture specific whether + * default_memremap_pmem + default_memcpy_to_pmem is sufficient for + * making data durable relative to i/o completion. + */ +static void default_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t size) +{ + memcpy((void __force *) dst, src, size); +} + +static void __pmem *default_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + /* TODO: convert to ioremap_wt() */ + return (void __pmem __force *)ioremap_nocache(offset, size); +} + +/** + * memremap_pmem - map physical persistent memory for pmem api + * @offset: physical address of persistent memory + * @size: size of the mapping + * + * Establish a mapping of the architecture specific memory type expected + * by memcpy_to_pmem() and wmb_pmem(). For example, it may be + * the case that an uncacheable or writethrough mapping is sufficient, + * or a writeback mapping provided memcpy_to_pmem() and + * wmb_pmem() arrange for the data to be written through the + * cache to persistent media. + */ +static inline void __pmem *memremap_pmem(resource_size_t offset, + unsigned long size) +{ + if (arch_has_pmem_api()) + return arch_memremap_pmem(offset, size); + return default_memremap_pmem(offset, size); +} + +/** + * memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Perform a memory copy that results in the destination of the copy + * being effectively evicted from, or never written to, the processor + * cache hierarchy after the copy completes. After memcpy_to_pmem() + * data may still reside in cpu or platform buffers, so this operation + * must be followed by a wmb_pmem(). + */ +static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n) +{ + if (arch_has_pmem_api()) + arch_memcpy_to_pmem(dst, src, n); + else + default_memcpy_to_pmem(dst, src, n); +} + +/** + * wmb_pmem - synchronize writes to persistent memory + * + * After a series of memcpy_to_pmem() operations this drains data from + * cpu write buffers and any platform (memory controller) buffers to + * ensure that written data is durable on persistent memory media. + */ +static inline void wmb_pmem(void) +{ + if (arch_has_pmem_api()) + arch_wmb_pmem(); +} +#endif /* __PMEM_H__ */ diff --git a/lib/Kconfig b/lib/Kconfig index 601965a948e8..d27c13a91c28 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -522,4 +522,7 @@ source "lib/fonts/Kconfig" config ARCH_HAS_SG_CHAIN def_bool n +config ARCH_HAS_PMEM_API + bool + endmenu -- cgit v1.2.3