diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-14 09:57:24 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-14 09:57:24 -0700 |
commit | d5660df4a555a98154da850fb61f118269d0a283 (patch) | |
tree | b2c5f3a15c300499df930321c32fd7d288467d6b /drivers | |
parent | b5fc7a89e58bcc059a3d5e4db79c481fb437de59 (diff) | |
parent | f1f4f3ab54e9a52c7610c998ff8255f019742e67 (diff) |
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
"181 patches.
Subsystems affected by this patch series: kbuild, scripts, ntfs,
ocfs2, vfs, mm (slab, slub, kmemleak, dax, debug, pagecache, fadvise,
gup, swap, memremap, memcg, selftests, pagemap, mincore, hmm, dma,
memory-failure, vmallo and migration)"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (181 commits)
mm/migrate: remove obsolete comment about device public
mm/migrate: remove cpages-- in migrate_vma_finalize()
mm, oom_adj: don't loop through tasks in __set_oom_adj when not necessary
memblock: use separate iterators for memory and reserved regions
memblock: implement for_each_reserved_mem_region() using __next_mem_region()
memblock: remove unused memblock_mem_size()
x86/setup: simplify reserve_crashkernel()
x86/setup: simplify initrd relocation and reservation
arch, drivers: replace for_each_membock() with for_each_mem_range()
arch, mm: replace for_each_memblock() with for_each_mem_pfn_range()
memblock: reduce number of parameters in for_each_mem_range()
memblock: make memblock_debug and related functionality private
memblock: make for_each_memblock_type() iterator private
mircoblaze: drop unneeded NUMA and sparsemem initializations
riscv: drop unneeded node initialization
h8300, nds32, openrisc: simplify detection of memory extents
arm64: numa: simplify dummy_numa_init()
arm, xtensa: simplify initialization of high memory pages
dma-contiguous: simplify cma_early_percent_memory()
KVM: PPC: Book3S HV: simplify kvm_cma_reserve()
...
Diffstat (limited to 'drivers')
29 files changed, 1526 insertions, 428 deletions
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 2c32cfb72370..134bcb40b2af 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -24,8 +24,15 @@ #include <linux/mutex.h> #include <linux/node.h> #include <linux/sysfs.h> +#include <linux/dax.h> static u8 hmat_revision; +static int hmat_disable __initdata; + +void __init disable_hmat(void) +{ + hmat_disable = 1; +} static LIST_HEAD(targets); static LIST_HEAD(initiators); @@ -634,66 +641,6 @@ static void hmat_register_target_perf(struct memory_target *target) node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); } -static void hmat_register_target_device(struct memory_target *target, - struct resource *r) -{ - /* define a clean / non-busy resource for the platform device */ - struct resource res = { - .start = r->start, - .end = r->end, - .flags = IORESOURCE_MEM, - }; - struct platform_device *pdev; - struct memregion_info info; - int rc, id; - - rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, - IORES_DESC_SOFT_RESERVED); - if (rc != REGION_INTERSECTS) - return; - - id = memregion_alloc(GFP_KERNEL); - if (id < 0) { - pr_err("memregion allocation failure for %pr\n", &res); - return; - } - - pdev = platform_device_alloc("hmem", id); - if (!pdev) { - pr_err("hmem device allocation failure for %pr\n", &res); - goto out_pdev; - } - - pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm); - info = (struct memregion_info) { - .target_node = acpi_map_pxm_to_node(target->memory_pxm), - }; - rc = platform_device_add_data(pdev, &info, sizeof(info)); - if (rc < 0) { - pr_err("hmem memregion_info allocation failure for %pr\n", &res); - goto out_pdev; - } - - rc = platform_device_add_resources(pdev, &res, 1); - if (rc < 0) { - pr_err("hmem resource allocation failure for %pr\n", &res); - goto out_resource; - } - - rc = platform_device_add(pdev); - if (rc < 0) { - dev_err(&pdev->dev, "device add failed for %pr\n", &res); - goto out_resource; - } - - return; - -out_resource: - put_device(&pdev->dev); -out_pdev: - memregion_free(id); -} - static void hmat_register_target_devices(struct memory_target *target) { struct resource *res; @@ -705,8 +652,11 @@ static void hmat_register_target_devices(struct memory_target *target) if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM)) return; - for (res = target->memregions.child; res; res = res->sibling) - hmat_register_target_device(target, res); + for (res = target->memregions.child; res; res = res->sibling) { + int target_nid = acpi_map_pxm_to_node(target->memory_pxm); + + hmem_register_device(target_nid, res); + } } static void hmat_register_target(struct memory_target *target) @@ -814,7 +764,7 @@ static __init int hmat_init(void) enum acpi_hmat_type i; acpi_status status; - if (srat_disabled()) + if (srat_disabled() || hmat_disable) return 0; status = acpi_get_table(ACPI_SIG_SRAT, 0, &tbl); diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 15bbaab8500b..1b0ae0a1959b 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -27,7 +27,12 @@ static int node_to_pxm_map[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = PXM_INVAL }; unsigned char acpi_srat_revision __initdata; -int acpi_numa __initdata; +static int acpi_numa __initdata; + +void __init disable_srat(void) +{ + acpi_numa = -1; +} int pxm_to_node(int pxm) { @@ -163,7 +168,7 @@ static int __init slit_valid(struct acpi_table_slit *slit) void __init bad_srat(void) { pr_err("SRAT: SRAT not used.\n"); - acpi_numa = -1; + disable_srat(); } int __init srat_disabled(void) diff --git a/drivers/base/core.c b/drivers/base/core.c index f90e9f77bf8c..0bad9e6066c3 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3324,7 +3324,7 @@ struct device *device_find_child_by_name(struct device *parent, klist_iter_init(&parent->p->klist_children, &i); while ((child = next_device(&i))) - if (!strcmp(dev_name(child), name) && get_device(child)) + if (sysfs_streq(dev_name(child), name) && get_device(child)) break; klist_iter_exit(&i); return child; diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c index 5b2a11a88951..2519ceede64b 100644 --- a/drivers/bus/mvebu-mbus.c +++ b/drivers/bus/mvebu-mbus.c @@ -610,23 +610,23 @@ static unsigned int armada_xp_mbus_win_remap_offset(int win) static void __init mvebu_mbus_find_bridge_hole(uint64_t *start, uint64_t *end) { - struct memblock_region *r; - uint64_t s = 0; + phys_addr_t reg_start, reg_end; + uint64_t i, s = 0; - for_each_memblock(memory, r) { + for_each_mem_range(i, ®_start, ®_end) { /* * This part of the memory is above 4 GB, so we don't * care for the MBus bridge hole. */ - if (r->base >= 0x100000000ULL) + if (reg_start >= 0x100000000ULL) continue; /* * The MBus bridge hole is at the end of the RAM under * the 4 GB limit. */ - if (r->base + r->size > s) - s = r->base + r->size; + if (reg_end > s) + s = reg_end; } *start = s; diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 3b6c06f07326..567428e10b7b 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -35,6 +35,7 @@ config DEV_DAX_PMEM config DEV_DAX_HMEM tristate "HMEM DAX: direct access to 'specific purpose' memory" depends on EFI_SOFT_RESERVE + select NUMA_KEEP_MEMINFO if (NUMA && X86) default DEV_DAX help EFI 2.8 platforms, and others, may advertise 'specific purpose' @@ -48,6 +49,11 @@ config DEV_DAX_HMEM Say M if unsure. +config DEV_DAX_HMEM_DEVICES + depends on NUMA_KEEP_MEMINFO # for phys_to_target_node() + depends on DEV_DAX_HMEM && DAX=y + def_bool y + config DEV_DAX_KMEM tristate "KMEM DAX: volatile-use of persistent memory" default DEV_DAX diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 80065b38b3c4..9d4ba672d305 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -2,11 +2,10 @@ obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o -obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o dax-y := super.o dax-y += bus.o device_dax-y := device.o -dax_hmem-y := hmem.o obj-y += pmem/ +obj-y += hmem/ diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index df238c8b6ef2..27513d311242 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -6,6 +6,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/dax.h> +#include <linux/io.h> #include "dax-private.h" #include "bus.h" @@ -130,10 +131,63 @@ ATTRIBUTE_GROUPS(dax_drv); static int dax_bus_match(struct device *dev, struct device_driver *drv); +static bool is_static(struct dax_region *dax_region) +{ + return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; +} + +static u64 dev_dax_size(struct dev_dax *dev_dax) +{ + u64 size = 0; + int i; + + device_lock_assert(&dev_dax->dev); + + for (i = 0; i < dev_dax->nr_range; i++) + size += range_len(&dev_dax->ranges[i].range); + + return size; +} + +static int dax_bus_probe(struct device *dev) +{ + struct dax_device_driver *dax_drv = to_dax_drv(dev->driver); + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + int rc; + + if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0) + return -ENXIO; + + rc = dax_drv->probe(dev_dax); + + if (rc || is_static(dax_region)) + return rc; + + /* + * Track new seed creation only after successful probe of the + * previous seed. + */ + if (dax_region->seed == dev) + dax_region->seed = NULL; + + return 0; +} + +static int dax_bus_remove(struct device *dev) +{ + struct dax_device_driver *dax_drv = to_dax_drv(dev->driver); + struct dev_dax *dev_dax = to_dev_dax(dev); + + return dax_drv->remove(dev_dax); +} + static struct bus_type dax_bus_type = { .name = "dax", .uevent = dax_bus_uevent, .match = dax_bus_match, + .probe = dax_bus_probe, + .remove = dax_bus_remove, .drv_groups = dax_drv_groups, }; @@ -176,18 +230,269 @@ static ssize_t region_size_show(struct device *dev, static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, region_size_show, NULL); -static ssize_t align_show(struct device *dev, +static ssize_t region_align_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dax_region *dax_region = dev_get_drvdata(dev); return sprintf(buf, "%u\n", dax_region->align); } -static DEVICE_ATTR_RO(align); +static struct device_attribute dev_attr_region_align = + __ATTR(align, 0400, region_align_show, NULL); + +#define for_each_dax_region_resource(dax_region, res) \ + for (res = (dax_region)->res.child; res; res = res->sibling) + +static unsigned long long dax_region_avail_size(struct dax_region *dax_region) +{ + resource_size_t size = resource_size(&dax_region->res); + struct resource *res; + + device_lock_assert(dax_region->dev); + + for_each_dax_region_resource(dax_region, res) + size -= resource_size(res); + return size; +} + +static ssize_t available_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + unsigned long long size; + + device_lock(dev); + size = dax_region_avail_size(dax_region); + device_unlock(dev); + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(available_size); + +static ssize_t seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct device *seed; + ssize_t rc; + + if (is_static(dax_region)) + return -EINVAL; + + device_lock(dev); + seed = dax_region->seed; + rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(seed); + +static ssize_t create_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct device *youngest; + ssize_t rc; + + if (is_static(dax_region)) + return -EINVAL; + + device_lock(dev); + youngest = dax_region->youngest; + rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : ""); + device_unlock(dev); + + return rc; +} + +static ssize_t create_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + unsigned long long avail; + ssize_t rc; + int val; + + if (is_static(dax_region)) + return -EINVAL; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + if (val != 1) + return -EINVAL; + + device_lock(dev); + avail = dax_region_avail_size(dax_region); + if (avail == 0) + rc = -ENOSPC; + else { + struct dev_dax_data data = { + .dax_region = dax_region, + .size = 0, + .id = -1, + }; + struct dev_dax *dev_dax = devm_create_dev_dax(&data); + + if (IS_ERR(dev_dax)) + rc = PTR_ERR(dev_dax); + else { + /* + * In support of crafting multiple new devices + * simultaneously multiple seeds can be created, + * but only the first one that has not been + * successfully bound is tracked as the region + * seed. + */ + if (!dax_region->seed) + dax_region->seed = &dev_dax->dev; + dax_region->youngest = &dev_dax->dev; + rc = len; + } + } + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(create); + +void kill_dev_dax(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + + kill_dax(dax_dev); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); +} +EXPORT_SYMBOL_GPL(kill_dev_dax); + +static void free_dev_dax_ranges(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + int i; + + device_lock_assert(dax_region->dev); + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + __release_region(&dax_region->res, range->start, + range_len(range)); + } + dev_dax->nr_range = 0; +} + +static void unregister_dev_dax(void *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + dev_dbg(dev, "%s\n", __func__); + + kill_dev_dax(dev_dax); + free_dev_dax_ranges(dev_dax); + device_del(dev); + put_device(dev); +} + +/* a return value >= 0 indicates this invocation invalidated the id */ +static int __free_dev_dax_id(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; + int rc = dev_dax->id; + + device_lock_assert(dev); + + if (is_static(dax_region) || dev_dax->id < 0) + return -1; + ida_free(&dax_region->ida, dev_dax->id); + dev_dax->id = -1; + return rc; +} + +static int free_dev_dax_id(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + int rc; + + device_lock(dev); + rc = __free_dev_dax_id(dev_dax); + device_unlock(dev); + return rc; +} + +static ssize_t delete_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dax_region *dax_region = dev_get_drvdata(dev); + struct dev_dax *dev_dax; + struct device *victim; + bool do_del = false; + int rc; + + if (is_static(dax_region)) + return -EINVAL; + + victim = device_find_child_by_name(dax_region->dev, buf); + if (!victim) + return -ENXIO; + + device_lock(dev); + device_lock(victim); + dev_dax = to_dev_dax(victim); + if (victim->driver || dev_dax_size(dev_dax)) + rc = -EBUSY; + else { + /* + * Invalidate the device so it does not become active + * again, but always preserve device-id-0 so that + * /sys/bus/dax/ is guaranteed to be populated while any + * dax_region is registered. + */ + if (dev_dax->id > 0) { + do_del = __free_dev_dax_id(dev_dax) >= 0; + rc = len; + if (dax_region->seed == victim) + dax_region->seed = NULL; + if (dax_region->youngest == victim) + dax_region->youngest = NULL; + } else + rc = -EBUSY; + } + device_unlock(victim); + + /* won the race to invalidate the device, clean it up */ + if (do_del) + devm_release_action(dev, unregister_dev_dax, victim); + device_unlock(dev); + put_device(victim); + + return rc; +} +static DEVICE_ATTR_WO(delete); + +static umode_t dax_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct dax_region *dax_region = dev_get_drvdata(dev); + + if (is_static(dax_region)) + if (a == &dev_attr_available_size.attr + || a == &dev_attr_create.attr + || a == &dev_attr_seed.attr + || a == &dev_attr_delete.attr) + return 0; + return a->mode; +} static struct attribute *dax_region_attributes[] = { + &dev_attr_available_size.attr, &dev_attr_region_size.attr, - &dev_attr_align.attr, + &dev_attr_region_align.attr, + &dev_attr_create.attr, + &dev_attr_seed.attr, + &dev_attr_delete.attr, &dev_attr_id.attr, NULL, }; @@ -195,6 +500,7 @@ static struct attribute *dax_region_attributes[] = { static const struct attribute_group dax_region_attribute_group = { .name = "dax_region", .attrs = dax_region_attributes, + .is_visible = dax_region_visible, }; static const struct attribute_group *dax_region_attribute_groups[] = { @@ -226,8 +532,8 @@ static void dax_region_unregister(void *region) } struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, int target_node, unsigned int align, - unsigned long long pfn_flags) + struct range *range, int target_node, unsigned int align, + unsigned long flags) { struct dax_region *dax_region; @@ -241,8 +547,8 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, return NULL; } - if (!IS_ALIGNED(res->start, align) - || !IS_ALIGNED(resource_size(res), align)) + if (!IS_ALIGNED(range->start, align) + || !IS_ALIGNED(range_len(range), align)) return NULL; dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); @@ -250,13 +556,18 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, return NULL; dev_set_drvdata(parent, dax_region); - memcpy(&dax_region->res, res, sizeof(*res)); - dax_region->pfn_flags = pfn_flags; kref_init(&dax_region->kref); dax_region->id = region_id; dax_region->align = align; dax_region->dev = parent; dax_region->target_node = target_node; + ida_init(&dax_region->ida); + dax_region->res = (struct resource) { + .start = range->start, + .end = range->end, + .flags = IORESOURCE_MEM | flags, + }; + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { kfree(dax_region); return NULL; @@ -269,45 +580,631 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, } EXPORT_SYMBOL_GPL(alloc_dax_region); +static void dax_mapping_release(struct device *dev) +{ + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + + ida_free(&dev_dax->ida, mapping->id); + kfree(mapping); +} + +static void unregister_dax_mapping(void *data) +{ + struct device *dev = data; + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + struct dax_region *dax_region = dev_dax->region; + + dev_dbg(dev, "%s\n", __func__); + + device_lock_assert(dax_region->dev); + + dev_dax->ranges[mapping->range_id].mapping = NULL; + mapping->range_id = -1; + + device_del(dev); + put_device(dev); +} + +static struct dev_dax_range *get_dax_range(struct device *dev) +{ + struct dax_mapping *mapping = to_dax_mapping(dev); + struct dev_dax *dev_dax = to_dev_dax(dev->parent); + struct dax_region *dax_region = dev_dax->region; + + device_lock(dax_region->dev); + if (mapping->range_id < 0) { + device_unlock(dax_region->dev); + return NULL; + } + + return &dev_dax->ranges[mapping->range_id]; +} + +static void put_dax_range(struct dev_dax_range *dax_range) +{ + struct dax_mapping *mapping = dax_range->mapping; + struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent); + struct dax_region *dax_region = dev_dax->region; + + device_unlock(dax_region->dev); +} + +static ssize_t start_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#llx\n", dax_range->range.start); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(start, 0400, start_show, NULL); + +static ssize_t end_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#llx\n", dax_range->range.end); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(end, 0400, end_show, NULL); + +static ssize_t pgoff_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax_range *dax_range; + ssize_t rc; + + dax_range = get_dax_range(dev); + if (!dax_range) + return -ENXIO; + rc = sprintf(buf, "%#lx\n", dax_range->pgoff); + put_dax_range(dax_range); + + return rc; +} +static DEVICE_ATTR(page_offset, 0400, pgoff_show, NULL); + +static struct attribute *dax_mapping_attributes[] = { + &dev_attr_start.attr, + &dev_attr_end.attr, + &dev_attr_page_offset.attr, + NULL, +}; + +static const struct attribute_group dax_mapping_attribute_group = { + .attrs = dax_mapping_attributes, +}; + +static const struct attribute_group *dax_mapping_attribute_groups[] = { + &dax_mapping_attribute_group, + NULL, +}; + +static struct device_type dax_mapping_type = { + .release = dax_mapping_release, + .groups = dax_mapping_attribute_groups, +}; + +static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id) +{ + struct dax_region *dax_region = dev_dax->region; + struct dax_mapping *mapping; + struct device *dev; + int rc; + + device_lock_assert(dax_region->dev); + + if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver, + "region disabled\n")) + return -ENXIO; + + mapping = kzalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return -ENOMEM; + mapping->range_id = range_id; + mapping->id = ida_alloc(&dev_dax->ida, GFP_KERNEL); + if (mapping->id < 0) { + kfree(mapping); + return -ENOMEM; + } + dev_dax->ranges[range_id].mapping = mapping; + dev = &mapping->dev; + device_initialize(dev); + dev->parent = &dev_dax->dev; + dev->type = &dax_mapping_type; + dev_set_name(dev, "mapping%d", mapping->id); + rc = device_add(dev); + if (rc) { + put_device(dev); + return rc; + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_mapping, + dev); + if (rc) + return rc; + return 0; +} + +static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, + resource_size_t size) +{ + struct dax_region *dax_region = dev_dax->region; + struct resource *res = &dax_region->res; + struct device *dev = &dev_dax->dev; + struct dev_dax_range *ranges; + unsigned long pgoff = 0; + struct resource *alloc; + int i, rc; + + device_lock_assert(dax_region->dev); + + /* handle the seed alloc special case */ + if (!size) { + if (dev_WARN_ONCE(dev, dev_dax->nr_range, + "0-size allocation must be first\n")) + return -EBUSY; + /* nr_range == 0 is elsewhere special cased as 0-size device */ + return 0; + } + + ranges = krealloc(dev_dax->ranges, sizeof(*ranges) + * (dev_dax->nr_range + 1), GFP_KERNEL); + if (!ranges) + return -ENOMEM; + + alloc = __request_region(res, start, size, dev_name(dev), 0); + if (!alloc) { + /* + * If this was an empty set of ranges nothing else + * will release @ranges, so do it now. + */ + if (!dev_dax->nr_range) { + kfree(ranges); + ranges = NULL; + } + dev_dax->ranges = ranges; + return -ENOMEM; + } + + for (i = 0; i < dev_dax->nr_range; i++) + pgoff += PHYS_PFN(range_len(&ranges[i].range)); + dev_dax->ranges = ranges; + ranges[dev_dax->nr_range++] = (struct dev_dax_range) { + .pgoff = pgoff, + .range = { + .start = alloc->start, + .end = alloc->end, + }, + }; + + dev_dbg(dev, "alloc range[%d]: %pa:%pa\n", dev_dax->nr_range - 1, + &alloc->start, &alloc->end); + /* + * A dev_dax instance must be registered before mapping device + * children can be added. Defer to devm_create_dev_dax() to add + * the initial mapping device. + */ + if (!device_is_registered(&dev_dax->dev)) + return 0; + + rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1); + if (rc) { + dev_dbg(dev, "delete range[%d]: %pa:%pa\n", dev_dax->nr_range - 1, + &alloc->start, &alloc->end); + dev_dax->nr_range--; + __release_region(res, alloc->start, resource_size(alloc)); + return rc; + } + + return 0; +} + +static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size) +{ + int last_range = dev_dax->nr_range - 1; + struct dev_dax_range *dax_range = &dev_dax->ranges[last_range]; + struct dax_region *dax_region = dev_dax->region; + bool is_shrink = resource_size(res) > size; + struct range *range = &dax_range->range; + struct device *dev = &dev_dax->dev; + int rc; + + device_lock_assert(dax_region->dev); + + if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n")) + return -EINVAL; + + rc = adjust_resource(res, range->start, size); + if (rc) + return rc; + + *range = (struct range) { + .start = range->start, + .end = range->start + size - 1, + }; + + dev_dbg(dev, "%s range[%d]: %#llx:%#llx\n", is_shrink ? "shrink" : "extend", + last_range, (unsigned long long) range->start, + (unsigned long long) range->end); + + return 0; +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_dax *dev_dax = to_dev_dax(dev); - unsigned long long size = resource_size(&dev_dax->region->res); + unsigned long long size; + + device_lock(dev); + size = dev_dax_size(dev_dax); + device_unlock(dev); return sprintf(buf, "%llu\n", size); } -static DEVICE_ATTR_RO(size); -static int dev_dax_target_node(struct dev_dax *dev_dax) +static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size) +{ + /* + * The minimum mapping granularity for a device instance is a + * single subsection, unless the arch says otherwise. + */ + return IS_ALIGNED(size, max_t(unsigned long, dev_dax->align, memremap_compat_align())); +} + +static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size) { + resource_size_t to_shrink = dev_dax_size(dev_dax) - size; struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; + int i; + + for (i = dev_dax->nr_range - 1; i >= 0; i--) { + struct range *range = &dev_dax->ranges[i].range; + struct dax_mapping *mapping = dev_dax->ranges[i].mapping; + struct resource *adjust = NULL, *res; + resource_size_t shrink; + + shrink = min_t(u64, to_shrink, range_len(range)); + if (shrink >= range_len(range)) { + devm_release_action(dax_region->dev, + unregister_dax_mapping, &mapping->dev); + __release_region(&dax_region->res, range->start, + range_len(range)); + dev_dax->nr_range--; + dev_dbg(dev, "delete range[%d]: %#llx:%#llx\n", i, + (unsigned long long) range->start, + (unsigned long long) range->end); + to_shrink -= shrink; + if (!to_shrink) + break; + continue; + } + + for_each_dax_region_resource(dax_region, res) + if (strcmp(res->name, dev_name(dev)) == 0 + && res->start == range->start) { + adjust = res; + break; + } + + if (dev_WARN_ONCE(dev, !adjust || i != dev_dax->nr_range - 1, + "failed to find matching resource\n")) + return -ENXIO; + return adjust_dev_dax_range(dev_dax, adjust, range_len(range) + - shrink); + } + return 0; +} - return dax_region->target_node; +/* + * Only allow adjustments that preserve the relative pgoff of existing + * allocations. I.e. the dev_dax->ranges array is ordered by increasing pgoff. + */ +static bool adjust_ok(struct dev_dax *dev_dax, struct resource *res) +{ + struct dev_dax_range *last; + int i; + + if (dev_dax->nr_range == 0) + return false; + if (strcmp(res->name, dev_name(&dev_dax->dev)) != 0) + return false; + last = &dev_dax->ranges[dev_dax->nr_range - 1]; + if (last->range.start != res->start || last->range.end != res->end) + return false; + for (i = 0; i < dev_dax->nr_range - 1; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + + if (dax_range->pgoff > last->pgoff) + return false; + } + + return true; } -static ssize_t target_node_show(struct device *dev, +static ssize_t dev_dax_resize(struct dax_region *dax_region, + struct dev_dax *dev_dax, resource_size_t size) +{ + resource_size_t avail = dax_region_avail_size(dax_region), to_alloc; + resource_size_t dev_size = dev_dax_size(dev_dax); + struct resource *region_res = &dax_region->res; + struct device *dev = &dev_dax->dev; + struct resource *res, *first; + resource_size_t alloc = 0; + int rc; + + if (dev->driver) + return -EBUSY; + if (size == dev_size) + return 0; + if (size > dev_size && size - dev_size > avail) + return -ENOSPC; + if (size < dev_size) + return dev_dax_shrink(dev_dax, size); + + to_alloc = size - dev_size; + if (dev_WARN_ONCE(dev, !alloc_is_aligned(dev_dax, to_alloc), + "resize of %pa misaligned\n", &to_alloc)) + return -ENXIO; + + /* + * Expand the device into the unused portion of the region. This + * may involve adjusting the end of an existing resource, or + * allocating a new resource. + */ +retry: + first = region_res->child; + if (!first) + return alloc_dev_dax_range(dev_dax, dax_region->res.start, to_alloc); + + rc = -ENOSPC; + for (res = first; res; res = res->sibling) { + struct resource *next = res->sibling; + + /* space at the beginning of the region */ + if (res == first && res->start > dax_region->res.start) { + alloc = min(res->start - dax_region->res.start, to_alloc); + rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, alloc); + break; + } + + alloc = 0; + /* space between allocations */ + if (next && next->start > res->end + 1) + alloc = min(next->start - (res->end + 1), to_alloc); + + /* space at the end of the region */ + if (!alloc && !next && res->end < region_res->end) + alloc = min(region_res->end - res->end, to_alloc); + + if (!alloc) + continue; + + if (adjust_ok(dev_dax, res)) { + rc = adjust_dev_dax_range(dev_dax, res, resource_size(res) + alloc); + break; + } + rc = alloc_dev_dax_range(dev_dax, res->end + 1, alloc); + break; + } + if (rc) + return rc; + to_alloc -= alloc; + if (to_alloc) + goto retry; + return 0; +} + +static ssize_t size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + ssize_t rc; + unsigned long long val; + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + + rc = kstrtoull(buf, 0, &val); + if (rc) + return rc; + + if (!alloc_is_aligned(dev_dax, val)) { + dev_dbg(dev, "%s: size: %lld misaligned\n", __func__, val); + return -EINVAL; + } + + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return -ENXIO; + } + device_lock(dev); + rc = dev_dax_resize(dax_region, dev_dax, val); + device_unlock(dev); + device_unlock(dax_region->dev); + + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_RW(size); + +static ssize_t range_parse(const char *opt, size_t len, struct range *range) +{ + unsigned long long addr = 0; + char *start, *end, *str; + ssize_t rc = EINVAL; + + str = kstrdup(opt, GFP_KERNEL); + if (!str) + return rc; + + end = str; + start = strsep(&end, "-"); + if (!start || !end) + goto err; + + rc = kstrtoull(start, 16, &addr); + if (rc) + goto err; + range->start = addr; + + rc = kstrtoull(end, 16, &addr); + if (rc) + goto err; + range->end = addr; + +err: + kfree(str); + return rc; +} + +static ssize_t mapping_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + size_t to_alloc; + struct range r; + ssize_t rc; + + rc = range_parse(buf, len, &r); + if (rc) + return rc; + + rc = -ENXIO; + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return rc; + } + device_lock(dev); + + to_alloc = range_len(&r); + if (alloc_is_aligned(dev_dax, to_alloc)) + rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc); + device_unlock(dev); + device_unlock(dax_region->dev); + + return rc == 0 ? len : rc; +} +static DEVICE_ATTR_WO(mapping); + +static ssize_t align_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_dax *dev_dax = to_dev_dax(dev); - return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax)); + return sprintf(buf, "%d\n", dev_dax->align); } -static DEVICE_ATTR_RO(target_node); -static unsigned long long dev_dax_resource(struct dev_dax *dev_dax) +static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax) { + resource_size_t dev_size = dev_dax_size(dev_dax); + struct device *dev = &dev_dax->dev; + int i; + + if (dev_size > 0 && !alloc_is_aligned(dev_dax, dev_size)) { + dev_dbg(dev, "%s: align %u invalid for size %pa\n", + __func__, dev_dax->align, &dev_size); + return -EINVAL; + } + + for (i = 0; i < dev_dax->nr_range; i++) { + size_t len = range_len(&dev_dax->ranges[i].range); + + if (!alloc_is_aligned(dev_dax, len)) { + dev_dbg(dev, "%s: align %u invalid for range %d\n", + __func__, dev_dax->align, i); + return -EINVAL; + } + } + + return 0; +} + +static ssize_t align_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_region *dax_region = dev_dax->region; + unsigned long val, align_save; + ssize_t rc; - return dax_region->res.start; + rc = kstrtoul(buf, 0, &val); + if (rc) + return -ENXIO; + + if (!dax_align_valid(val)) + return -EINVAL; + + device_lock(dax_region->dev); + if (!dax_region->dev->driver) { + device_unlock(dax_region->dev); + return -ENXIO; + } + + device_lock(dev); + if (dev->driver) { + rc = -EBUSY; + goto out_unlock; + } + + align_save = dev_dax->align; + dev_dax->align = val; + rc = dev_dax_validate_align(dev_dax); + if (rc) + dev_dax->align = align_save; +out_unlock: + device_unlock(dev); + device_unlock(dax_region->dev); + return rc == 0 ? len : rc; } +static DEVICE_ATTR_RW(align); + +static int dev_dax_target_node(struct dev_dax *dev_dax) +{ + struct dax_region *dax_region = dev_dax->region; + + return dax_region->target_node; +} + +static ssize_t target_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax)); +} +static DEVICE_ATTR_RO(target_node); static ssize_t resource_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + unsigned long long start; + + if (dev_dax->nr_range < 1) + start = dax_region->res.start; + else + start = dev_dax->ranges[0].range.start; - return sprintf(buf, "%#llx\n", dev_dax_resource(dev_dax)); + return sprintf(buf, "%#llx\n", start); } static DEVICE_ATTR(resource, 0400, resource_show, NULL); @@ -333,18 +1230,26 @@ static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; if (a == &dev_attr_target_node.attr && dev_dax_target_node(dev_dax) < 0) return 0; if (a == &dev_attr_numa_node.attr && !IS_ENABLED(CONFIG_NUMA)) return 0; + if (a == &dev_attr_mapping.attr && is_static(dax_region)) + return 0; + if ((a == &dev_attr_align.attr || + a == &dev_attr_size.attr) && is_static(dax_region)) + return 0444; return a->mode; } static struct attribute *dev_dax_attributes[] = { &dev_attr_modalias.attr, &dev_attr_size.attr, + &dev_attr_mapping.attr, &dev_attr_target_node.attr, + &dev_attr_align.attr, &dev_attr_resource.attr, &dev_attr_numa_node.attr, NULL, @@ -360,24 +1265,17 @@ static const struct attribute_group *dax_attribute_groups[] = { NULL, }; -void kill_dev_dax(struct dev_dax *dev_dax) -{ - struct dax_device *dax_dev = dev_dax->dax_dev; - struct inode *inode = dax_inode(dax_dev); - - kill_dax(dax_dev); - unmap_mapping_range(inode->i_mapping, 0, 0, 1); -} -EXPORT_SYMBOL_GPL(kill_dev_dax); - static void dev_dax_release(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_region *dax_region = dev_dax->region; struct dax_device *dax_dev = dev_dax->dax_dev; - dax_region_put(dax_region); put_dax(dax_dev); + free_dev_dax_id(dev_dax); + dax_region_put(dax_region); + kfree(dev_dax->ranges); + kfree(dev_dax->pgmap); kfree(dev_dax); } @@ -386,35 +1284,61 @@ static const struct device_type dev_dax_type = { .groups = dax_attribute_groups, }; -static void unregister_dev_dax(void *dev) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - - dev_dbg(dev, "%s\n", __func__); - - kill_dev_dax(dev_dax); - device_del(dev); - put_device(dev); -} - -struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, - struct dev_pagemap *pgmap, enum dev_dax_subsys subsys) +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) { + struct dax_region *dax_region = data->dax_region; struct device *parent = dax_region->dev; struct dax_device *dax_dev; struct dev_dax *dev_dax; struct inode *inode; struct device *dev; - int rc = -ENOMEM; - - if (id < 0) - return ERR_PTR(-EINVAL); + int rc; dev_dax = kzalloc(sizeof(*dev_dax), GFP_KERNEL); if (!dev_dax) return ERR_PTR(-ENOMEM); - memcpy(&dev_dax->pgmap, pgmap, sizeof(*pgmap)); + if (is_static(dax_region)) { + if (dev_WARN_ONCE(parent, data->id < 0, + "dynamic id specified to static region\n")) { + rc = -EINVAL; + goto err_id; + } + + dev_dax->id = data->id; + } else { + if (dev_WARN_ONCE(parent, data->id >= 0, + "static id specified to dynamic region\n")) { + rc = -EINVAL; + goto err_id; + } + + rc = ida_alloc(&dax_region->ida, GFP_KERNEL); + if (rc < 0) + goto err_id; + dev_dax->id = rc; + } + + dev_dax->region = dax_region; + dev = &dev_dax->dev; + device_initialize(dev); + dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); + + rc = alloc_dev_dax_range(dev_dax, dax_region->res.start, data->size); + if (rc) + goto err_range; + + if (data->pgmap) { + dev_WARN_ONCE(parent, !is_static(dax_region), + "custom dev_pagemap requires a static dax_region\n"); + + dev_dax->pgmap = kmemdup(data->pgmap, + sizeof(struct dev_pagemap), GFP_KERNEL); + if (!dev_dax->pgmap) { + rc = -ENOMEM; + goto err_pgmap; + } + } /* * No 'host' or dax_operations since there is no access to this @@ -423,30 +1347,26 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); if (IS_ERR(dax_dev)) { rc = PTR_ERR(dax_dev); - goto err; + goto err_alloc_dax; } /* a device_dax instance is dead while the driver is not attached */ kill_dax(dax_dev); - /* from here on we're committed to teardown via dax_dev_release() */ - dev = &dev_dax->dev; - device_initialize(dev); - dev_dax->dax_dev = dax_dev; - dev_dax->region = dax_region; dev_dax->target_node = dax_region->target_node; + dev_dax->align = dax_region->align; + ida_init(&dev_dax->ida); kref_get(&dax_region->kref); inode = dax_inode(dax_dev); dev->devt = inode->i_rdev; - if (subsys == DEV_DAX_BUS) + if (data->subsys == DEV_DAX_BUS) dev->bus = &dax_bus_type; else dev->class = dax_class; dev->parent = parent; dev->type = &dev_dax_type; - dev_set_name(dev, "dax%d.%d", dax_region->id, id); rc = device_add(dev); if (rc) { @@ -459,14 +1379,27 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, if (rc) return ERR_PTR(rc); + /* register mapping device for the initial allocation range */ + if (dev_dax->nr_range && range_len(&dev_dax->ranges[0].range)) { + rc = devm_register_dax_mapping(dev_dax, 0); + if (rc) + return ERR_PTR(rc); + } + return dev_dax; - err: +err_alloc_dax: + kfree(dev_dax->pgmap); +err_pgmap: + free_dev_dax_ranges(dev_dax); +err_range: + free_dev_dax_id(dev_dax); +err_id: kfree(dev_dax); return ERR_PTR(rc); } -EXPORT_SYMBOL_GPL(__devm_create_dev_dax); +EXPORT_SYMBOL_GPL(devm_create_dev_dax); static int match_always_count; diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 9e4eba67e8b9..72b92f95509f 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -3,29 +3,33 @@ #ifndef __DAX_BUS_H__ #define __DAX_BUS_H__ #include <linux/device.h> +#include <linux/range.h> struct dev_dax; struct resource; struct dax_device; struct dax_region; void dax_region_put(struct dax_region *dax_region); + +#define IORESOURCE_DAX_STATIC (1UL << 0) struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, int target_node, unsigned int align, - unsigned long long flags); + struct range *range, int target_node, unsigned int align, + unsigned long flags); enum dev_dax_subsys { - DEV_DAX_BUS, + DEV_DAX_BUS = 0, /* zeroed dev_dax_data picks this by default */ DEV_DAX_CLASS, }; -struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, - struct dev_pagemap *pgmap, enum dev_dax_subsys subsys); +struct dev_dax_data { + struct dax_region *dax_region; + struct dev_pagemap *pgmap; + enum dev_dax_subsys subsys; + resource_size_t size; + int id; +}; -static inline struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, - int id, struct dev_pagemap *pgmap) -{ - return __devm_create_dev_dax(dax_region, id, pgmap, DEV_DAX_BUS); -} +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); /* to be deleted when DEV_DAX_CLASS is removed */ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys); @@ -34,6 +38,8 @@ struct dax_device_driver { struct device_driver drv; struct list_head ids; int match_always; + int (*probe)(struct dev_dax *dev); + int (*remove)(struct dev_dax *dev); }; int __dax_driver_register(struct dax_device_driver *dax_drv, @@ -44,7 +50,7 @@ void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); #if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) -int dev_dax_probe(struct device *dev); +int dev_dax_probe(struct dev_dax *dev_dax); #endif /* diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 16850d5388ab..1c974b7caae6 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -7,6 +7,7 @@ #include <linux/device.h> #include <linux/cdev.h> +#include <linux/idr.h> /* private routines between core files */ struct dax_device; @@ -22,8 +23,10 @@ void dax_bus_exit(void); * @kref: to pin while other agents have a need to do lookups * @dev: parent device backing this region * @align: allocation and mapping alignment for child dax devices - * @res: physical address range of the region - * @pfn_flags: identify whether the pfns are paged back or not + * @ida: instance id allocator + * @res: resource tree to track instance allocations + * @seed: allow userspace to find the first unbound seed device + * @youngest: allow userspace to find the most recently created device */ struct dax_region { int id; @@ -31,8 +34,16 @@ struct dax_region { struct kref kref; struct device *dev; unsigned int align; + struct ida ida; struct resource res; - unsigned long long pfn_flags; + struct device *seed; + struct device *youngest; +}; + +struct dax_mapping { + struct device dev; + int range_id; + int id; }; /** @@ -41,22 +52,57 @@ struct dax_region { * @region - parent region * @dax_dev - core dax functionality * @target_node: effective numa node if dev_dax memory range is onlined + * @id: ida allocated id + * @ida: mapping id allocator * @dev - device core * @pgmap - pgmap for memmap setup / lifetime (driver owned) - * @dax_mem_res: physical address range of hotadded DAX memory - * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed() + * @nr_range: size of @ranges + * @ranges: resource-span + pgoff tuples for the instance */ struct dev_dax { struct dax_region *region; struct dax_device *dax_dev; + unsigned int align; int target_node; + int id; + struct ida ida; struct device dev; - struct dev_pagemap pgmap; - struct resource *dax_kmem_res; + struct dev_pagemap *pgmap; + int nr_range; + struct dev_dax_range { + unsigned long pgoff; + struct range range; + struct dax_mapping *mapping; + } *ranges; }; static inline struct dev_dax *to_dev_dax(struct device *dev) { return container_of(dev, struct dev_dax, dev); } + +static inline struct dax_mapping *to_dax_mapping(struct device *dev) +{ + return container_of(dev, struct dax_mapping, dev); +} + +phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline bool dax_align_valid(unsigned long align) +{ + if (align == PUD_SIZE && IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + return true; + if (align == PMD_SIZE && has_transparent_hugepage()) + return true; + if (align == PAGE_SIZE) + return true; + return false; +} +#else +static inline bool dax_align_valid(unsigned long align) +{ + return align == PAGE_SIZE; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1e89513f3c59..25e0b84a4296 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -17,7 +17,6 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { - struct dax_region *dax_region = dev_dax->region; struct device *dev = &dev_dax->dev; unsigned long mask; @@ -32,7 +31,7 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, return -EINVAL; } - mask = dax_region->align - 1; + mask = dev_dax->align - 1; if (vma->vm_start & mask || vma->vm_end & mask) { dev_info_ratelimited(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", @@ -41,14 +40,6 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, return -EINVAL; } - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV - && (vma->vm_flags & VM_DONTCOPY) == 0) { - dev_info_ratelimited(dev, - "%s: %s: fail, dax range requires MADV_DONTFORK\n", - current->comm, func); - return -EINVAL; - } - if (!vma_is_dax(vma)) { dev_info_ratelimited(dev, "%s: %s: fail, vma is not DAX capable\n", @@ -63,15 +54,22 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) { - struct resource *res = &dev_dax->region->res; - phys_addr_t phys; - - phys = pgoff * PAGE_SIZE + res->start; - if (phys >= res->start && phys <= res->end) { - if (phys + size - 1 <= res->end) + int i; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + unsigned long long pgoff_end; + phys_addr_t phys; + + pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1; + if (pgoff < dax_range->pgoff || pgoff > pgoff_end) + continue; + phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start; + if (phys + size - 1 <= range->end) return phys; + break; } - return -1; } @@ -79,21 +77,19 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf, pfn_t *pfn) { struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; phys_addr_t phys; unsigned int fault_size = PAGE_SIZE; if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dev_dax->region; - if (dax_region->align > PAGE_SIZE) { + if (dev_dax->align > PAGE_SIZE) { dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", - dax_region->align, fault_size); + dev_dax->align, fault_size); return VM_FAULT_SIGBUS; } - if (fault_size != dax_region->align) + if (fault_size != dev_dax->align) return VM_FAULT_SIGBUS; phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); @@ -102,7 +98,7 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); } @@ -112,7 +108,6 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, { unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; unsigned int fault_size = PMD_SIZE; @@ -120,22 +115,15 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dev_dax->region; - if (dax_region->align > PMD_SIZE) { + if (dev_dax->align > PMD_SIZE) { dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", - dax_region->align, fault_size); - return VM_FAULT_SIGBUS; - } - - /* dax pmd mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "region lacks devmap flags\n"); + dev_dax->align, fault_size); return VM_FAULT_SIGBUS; } - if (fault_size < dax_region->align) + if (fault_size < dev_dax->align) return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) + else if (fault_size > dev_dax->align) return VM_FAULT_FALLBACK; /* if we are outside of the VMA */ @@ -150,7 +138,7 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } @@ -161,7 +149,6 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, { unsigned long pud_addr = vmf->address & PUD_MASK; struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; unsigned int fault_size = PUD_SIZE; @@ -170,22 +157,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dev_dax->region; - if (dax_region->align > PUD_SIZE) { + if (dev_dax->align > PUD_SIZE) { dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", - dax_region->align, fault_size); - return VM_FAULT_SIGBUS; - } - - /* dax pud mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "region lacks devmap flags\n"); + dev_dax->align, fault_size); return VM_FAULT_SIGBUS; } - if (fault_size < dax_region->align) + if (fault_size < dev_dax->align) return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) + else if (fault_size > dev_dax->align) return VM_FAULT_FALLBACK; /* if we are outside of the VMA */ @@ -200,7 +180,7 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } @@ -280,9 +260,8 @@ static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) { struct file *filp = vma->vm_file; struct dev_dax *dev_dax = filp->private_data; - struct dax_region *dax_region = dev_dax->region; - if (!IS_ALIGNED(addr, dax_region->align)) + if (!IS_ALIGNED(addr, dev_dax->align)) return -EINVAL; return 0; } @@ -291,9 +270,8 @@ static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) { struct file *filp = vma->vm_file; struct dev_dax *dev_dax = filp->private_data; - struct dax_region *dax_region = dev_dax->region; - return dax_region->align; + return dev_dax->align; } static const struct vm_operations_struct dax_vm_ops = { @@ -332,13 +310,11 @@ static unsigned long dax_get_unmapped_area(struct file *filp, { unsigned long off, off_end, off_align, len_align, addr_align, align; struct dev_dax *dev_dax = filp ? filp->private_data : NULL; - struct dax_region *dax_region; if (!dev_dax || addr) goto out; - dax_region = dev_dax->region; - align = dax_region->align; + align = dev_dax->align; off = pgoff << PAGE_SHIFT; off_end = off + len; off_align = round_up(off, align); @@ -412,25 +388,45 @@ static void dev_dax_kill(void *dev_dax) kill_dev_dax(dev_dax); } -int dev_dax_probe(struct device *dev) +int dev_dax_probe(struct dev_dax *dev_dax) { - struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_device *dax_dev = dev_dax->dax_dev; - struct resource *res = &dev_dax->region->res; + struct device *dev = &dev_dax->dev; + struct dev_pagemap *pgmap; struct inode *inode; struct cdev *cdev; void *addr; - int rc; + int rc, i; - /* 1:1 map region resource range to device-dax instance range */ - if (!devm_request_mem_region(dev, res->start, resource_size(res), - dev_name(dev))) { - dev_warn(dev, "could not reserve region %pR\n", res); - return -EBUSY; + pgmap = dev_dax->pgmap; + if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1, + "static pgmap / multi-range device conflict\n")) + return -EINVAL; + + if (!pgmap) { + pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range) + * (dev_dax->nr_range - 1), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + pgmap->nr_range = dev_dax->nr_range; + } + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + if (!devm_request_mem_region(dev, range->start, + range_len(range), dev_name(dev))) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", + i, range->start, range->end); + return -EBUSY; + } + /* don't update the range for static pgmap */ + if (!dev_dax->pgmap) + pgmap->ranges[i] = *range; } - dev_dax->pgmap.type = MEMORY_DEVICE_GENERIC; - addr = devm_memremap_pages(dev, &dev_dax->pgmap); + pgmap->type = MEMORY_DEVICE_GENERIC; + addr = devm_memremap_pages(dev, pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); @@ -456,17 +452,15 @@ int dev_dax_probe(struct device *dev) } EXPORT_SYMBOL_GPL(dev_dax_probe); -static int dev_dax_remove(struct device *dev) +static int dev_dax_remove(struct dev_dax *dev_dax) { /* all probe actions are unwound by devm */ return 0; } static struct dax_device_driver device_dax_driver = { - .drv = { - .probe = dev_dax_probe, - .remove = dev_dax_remove, - }, + .probe = dev_dax_probe, + .remove = dev_dax_remove, .match_always = 1, }; diff --git a/drivers/dax/hmem/Makefile b/drivers/dax/hmem/Makefile new file mode 100644 index 000000000000..57377b4c3d47 --- /dev/null +++ b/drivers/dax/hmem/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o +obj-$(CONFIG_DEV_DAX_HMEM_DEVICES) += device_hmem.o + +device_hmem-y := device.o +dax_hmem-y := hmem.o diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c new file mode 100644 index 000000000000..cb6401c9e9a4 --- /dev/null +++ b/drivers/dax/hmem/device.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/platform_device.h> +#include <linux/memregion.h> +#include <linux/module.h> +#include <linux/dax.h> +#include <linux/mm.h> + +static bool nohmem; +module_param_named(disable, nohmem, bool, 0444); + +void hmem_register_device(int target_nid, struct resource *r) +{ + /* define a clean / non-busy resource for the platform device */ + struct resource res = { + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, + }; + struct platform_device *pdev; + struct memregion_info info; + int rc, id; + + if (nohmem) + return; + + rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + if (rc != REGION_INTERSECTS) + return; + + id = memregion_alloc(GFP_KERNEL); + if (id < 0) { + pr_err("memregion allocation failure for %pr\n", &res); + return; + } + + pdev = platform_device_alloc("hmem", id); + if (!pdev) { + pr_err("hmem device allocation failure for %pr\n", &res); + goto out_pdev; + } + + pdev->dev.numa_node = numa_map_to_online_node(target_nid); + info = (struct memregion_info) { + .target_node = target_nid, + }; + rc = platform_device_add_data(pdev, &info, sizeof(info)); + if (rc < 0) { + pr_err("hmem memregion_info allocation failure for %pr\n", &res); + goto out_pdev; + } + + rc = platform_device_add_resources(pdev, &res, 1); + if (rc < 0) { + pr_err("hmem resource allocation failure for %pr\n", &res); + goto out_resource; + } + + rc = platform_device_add(pdev); + if (rc < 0) { + dev_err(&pdev->dev, "device add failed for %pr\n", &res); + goto out_resource; + } + + return; + +out_resource: + put_device(&pdev->dev); +out_pdev: + memregion_free(id); +} + +static __init int hmem_register_one(struct resource *res, void *data) +{ + /* + * If the resource is not a top-level resource it was already + * assigned to a device by the HMAT parsing. + */ + if (res->parent != &iomem_resource) { + pr_info("HMEM: skip %pr, already claimed\n", res); + return 0; + } + + hmem_register_device(phys_to_target_node(res->start), res); + + return 0; +} + +static __init int hmem_init(void) +{ + walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED, + IORESOURCE_MEM, 0, -1, NULL, hmem_register_one); + return 0; +} + +/* + * As this is a fallback for address ranges unclaimed by the ACPI HMAT + * parsing it must be at an initcall level greater than hmat_init(). + */ +late_initcall(hmem_init); diff --git a/drivers/dax/hmem.c b/drivers/dax/hmem/hmem.c index fe7214daf62e..1bf040dbc834 100644 --- a/drivers/dax/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -3,30 +3,39 @@ #include <linux/memregion.h> #include <linux/module.h> #include <linux/pfn_t.h> -#include "bus.h" +#include "../bus.h" + +static bool region_idle; +module_param_named(region_idle, region_idle, bool, 0644); static int dax_hmem_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct dev_pagemap pgmap = { }; struct dax_region *dax_region; struct memregion_info *mri; + struct dev_dax_data data; struct dev_dax *dev_dax; struct resource *res; + struct range range; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) return -ENOMEM; mri = dev->platform_data; - memcpy(&pgmap.res, res, sizeof(*res)); - - dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node, - PMD_SIZE, PFN_DEV|PFN_MAP); + range.start = res->start; + range.end = res->end; + dax_region = alloc_dax_region(dev, pdev->id, &range, mri->target_node, + PMD_SIZE, 0); if (!dax_region) return -ENOMEM; - dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap); + data = (struct dev_dax_data) { + .dax_region = dax_region, + .id = -1, + .size = region_idle ? 0 : resource_size(res), + }; + dev_dax = devm_create_dev_dax(&data); if (IS_ERR(dev_dax)) return PTR_ERR(dev_dax); diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 275aa5f87399..6c933f2b604e 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -19,17 +19,28 @@ static const char *kmem_name; /* Set if any memory will remain added when the driver will be unloaded. */ static bool any_hotremove_failed; -int dev_dax_kmem_probe(struct device *dev) +static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) { - struct dev_dax *dev_dax = to_dev_dax(dev); - struct resource *res = &dev_dax->region->res; - resource_size_t kmem_start; - resource_size_t kmem_size; - resource_size_t kmem_end; - struct resource *new_res; - const char *new_res_name; + struct dev_dax_range *dax_range = &dev_dax->ranges[i]; + struct range *range = &dax_range->range; + + /* memory-block align the hotplug range */ + r->start = ALIGN(range->start, memory_block_size_bytes()); + r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1; + if (r->start >= r->end) { + r->start = range->start; + r->end = range->end; + return -ENOSPC; + } + return 0; +} + +static int dev_dax_kmem_probe(struct dev_dax *dev_dax) +{ + struct device *dev = &dev_dax->dev; + int i, mapped = 0; + char *res_name; int numa_node; - int rc; /* * Ensure good NUMA information for the persistent memory. @@ -39,68 +50,80 @@ int dev_dax_kmem_probe(struct device *dev) */ numa_node = dev_dax->target_node; if (numa_node < 0) { - dev_warn(dev, "rejecting DAX region %pR with invalid node: %d\n", - res, numa_node); + dev_warn(dev, "rejecting DAX region with invalid node: %d\n", + numa_node); return -EINVAL; } - /* Hotplug starting at the beginning of the next block: */ - kmem_start = ALIGN(res->start, memory_block_size_bytes()); - - kmem_size = resource_size(res); - /* Adjust the size down to compensate for moving up kmem_start: */ - kmem_size -= kmem_start - res->start; - /* Align the size down to cover only complete blocks: */ - kmem_size &= ~(memory_block_size_bytes() - 1); - kmem_end = kmem_start + kmem_size; - - new_res_name = kstrdup(dev_name(dev), GFP_KERNEL); - if (!new_res_name) + res_name = kstrdup(dev_name(dev), GFP_KERNEL); + if (!res_name) return -ENOMEM; - /* Region is permanently reserved if hotremove fails. */ - new_res = request_mem_region(kmem_start, kmem_size, new_res_name); - if (!new_res) { - dev_warn(dev, "could not reserve region [%pa-%pa]\n", - &kmem_start, &kmem_end); - kfree(new_res_name); - return -EBUSY; + for (i = 0; i < dev_dax->nr_range; i++) { + struct resource *res; + struct range range; + int rc; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) { + dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n", + i, range.start, range.end); + continue; + } + + /* Region is permanently reserved if hotremove fails. */ + res = request_mem_region(range.start, range_len(&range), res_name); + if (!res) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n", + i, range.start, range.end); + /* + * Once some memory has been onlined we can't + * assume that it can be un-onlined safely. + */ + if (mapped) + continue; + kfree(res_name); + return -EBUSY; + } + + /* + * Set flags appropriate for System RAM. Leave ..._BUSY clear + * so that add_memory() can add a child resource. Do not + * inherit flags from the parent since it may set new flags + * unknown to us that will break add_memory() below. + */ + res->flags = IORESOURCE_SYSTEM_RAM; + + /* + * Ensure that future kexec'd kernels will not treat + * this as RAM automatically. + */ + rc = add_memory_driver_managed(numa_node, range.start, + range_len(&range), kmem_name); + + if (rc) { + dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", + i, range.start, range.end); + release_mem_region(range.start, range_len(&range)); + if (mapped) + continue; + kfree(res_name); + return rc; + } + mapped++; } - /* - * Set flags appropriate for System RAM. Leave ..._BUSY clear - * so that add_memory() can add a child resource. Do not - * inherit flags from the parent since it may set new flags - * unknown to us that will break add_memory() below. - */ - new_res->flags = IORESOURCE_SYSTEM_RAM; - - /* - * Ensure that future kexec'd kernels will not treat this as RAM - * automatically. - */ - rc = add_memory_driver_managed(numa_node, new_res->start, - resource_size(new_res), kmem_name); - if (rc) { - release_resource(new_res); - kfree(new_res); - kfree(new_res_name); - return rc; - } - dev_dax->dax_kmem_res = new_res; + dev_set_drvdata(dev, res_name); return 0; } #ifdef CONFIG_MEMORY_HOTREMOVE -static int dev_dax_kmem_remove(struct device *dev) +static int dev_dax_kmem_remove(struct dev_dax *dev_dax) { - struct dev_dax *dev_dax = to_dev_dax(dev); - struct resource *res = dev_dax->dax_kmem_res; - resource_size_t kmem_start = res->start; - resource_size_t kmem_size = resource_size(res); - const char *res_name = res->name; - int rc; + int i, success = 0; + struct device *dev = &dev_dax->dev; + const char *res_name = dev_get_drvdata(dev); /* * We have one shot for removing memory, if some memory blocks were not @@ -108,25 +131,36 @@ static int dev_dax_kmem_remove(struct device *dev) * there is no way to hotremove this memory until reboot because device * unbind will succeed even if we return failure. */ - rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size); - if (rc) { + for (i = 0; i < dev_dax->nr_range; i++) { + struct range range; + int rc; + + rc = dax_kmem_range(dev_dax, i, &range); + if (rc) + continue; + + rc = remove_memory(dev_dax->target_node, range.start, + range_len(&range)); + if (rc == 0) { + release_mem_region(range.start, range_len(&range)); + success++; + continue; + } any_hotremove_failed = true; dev_err(dev, - "DAX region %pR cannot be hotremoved until the next reboot\n", - res); - return rc; + "mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n", + i, range.start, range.end); } - /* Release and free dax resources */ - release_resource(res); - kfree(res); - kfree(res_name); - dev_dax->dax_kmem_res = NULL; + if (success >= dev_dax->nr_range) { + kfree(res_name); + dev_set_drvdata(dev, NULL); + } return 0; } #else -static int dev_dax_kmem_remove(struct device *dev) +static int dev_dax_kmem_remove(struct dev_dax *dev_dax) { /* * Without hotremove purposely leak the request_mem_region() for the @@ -141,10 +175,8 @@ static int dev_dax_kmem_remove(struct device *dev) #endif /* CONFIG_MEMORY_HOTREMOVE */ static struct dax_device_driver device_dax_kmem_driver = { - .drv = { - .probe = dev_dax_kmem_probe, - .remove = dev_dax_kmem_remove, - }, + .probe = dev_dax_kmem_probe, + .remove = dev_dax_kmem_remove, }; static int __init dax_kmem_init(void) diff --git a/drivers/dax/pmem/compat.c b/drivers/dax/pmem/compat.c index d7b15e6f30c5..863c114fd88c 100644 --- a/drivers/dax/pmem/compat.c +++ b/drivers/dax/pmem/compat.c @@ -22,7 +22,7 @@ static int dax_pmem_compat_probe(struct device *dev) return -ENOMEM; device_lock(&dev_dax->dev); - rc = dev_dax_probe(&dev_dax->dev); + rc = dev_dax_probe(dev_dax); device_unlock(&dev_dax->dev); devres_close_group(&dev_dax->dev, dev_dax); diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c index 2bedf8414fff..62b26bfceab1 100644 --- a/drivers/dax/pmem/core.c +++ b/drivers/dax/pmem/core.c @@ -9,11 +9,12 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) { - struct resource res; + struct range range; int rc, id, region_id; resource_size_t offset; struct nd_pfn_sb *pfn_sb; struct dev_dax *dev_dax; + struct dev_dax_data data; struct nd_namespace_io *nsio; struct dax_region *dax_region; struct dev_pagemap pgmap = { }; @@ -49,16 +50,23 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) if (rc != 2) return ERR_PTR(-EINVAL); - /* adjust the dax_region resource to the start of data */ - memcpy(&res, &pgmap.res, sizeof(res)); - res.start += offset; - dax_region = alloc_dax_region(dev, region_id, &res, + /* adjust the dax_region range to the start of data */ + range = pgmap.range; + range.start += offset, + dax_region = alloc_dax_region(dev, region_id, &range, nd_region->target_node, le32_to_cpu(pfn_sb->align), - PFN_DEV|PFN_MAP); + IORESOURCE_DAX_STATIC); if (!dax_region) return ERR_PTR(-ENOMEM); - dev_dax = __devm_create_dev_dax(dax_region, id, &pgmap, subsys); + data = (struct dev_dax_data) { + .dax_region = dax_region, + .id = id, + .pgmap = &pgmap, + .subsys = subsys, + .size = range_len(&range), + }; + dev_dax = devm_create_dev_dax(&data); /* child dev_dax instances now own the lifetime of the dax_region */ dax_region_put(dax_region); diff --git a/drivers/firmware/efi/x86_fake_mem.c b/drivers/firmware/efi/x86_fake_mem.c index e5d6d5a1b240..0bafcc1bb0f6 100644 --- a/drivers/firmware/efi/x86_fake_mem.c +++ b/drivers/firmware/efi/x86_fake_mem.c @@ -38,7 +38,7 @@ void __init efi_fake_memmap_early(void) m_start = mem->range.start; m_end = mem->range.end; for_each_efi_memory_desc(md) { - u64 start, end; + u64 start, end, size; if (md->type != EFI_CONVENTIONAL_MEMORY) continue; @@ -58,11 +58,17 @@ void __init efi_fake_memmap_early(void) */ start = max(start, m_start); end = min(end, m_end); + size = end - start + 1; if (end <= start) continue; - e820__range_update(start, end - start + 1, E820_TYPE_RAM, - E820_TYPE_SOFT_RESERVED); + + /* + * Ensure each efi_fake_mem instance results in + * a unique e820 resource + */ + e820__range_remove(start, size, E820_TYPE_RAM, 1); + e820__range_add(start, size, E820_TYPE_SOFT_RESERVED); e820__update_table(e820_table); } } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 38113d3c0138..75e8b71c18b9 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -258,8 +258,8 @@ shmem_writeback(struct drm_i915_gem_object *obj) for (i = 0; i < obj->base.size >> PAGE_SHIFT; i++) { struct page *page; - page = find_lock_entry(mapping, i); - if (!page || xa_is_value(page)) + page = find_lock_page(mapping, i); + if (!page) continue; if (!page_mapped(page) && clear_page_dirty_for_io(page)) { diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 4e8112fde3e6..a13c6215bba8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -101,7 +101,7 @@ unsigned long nouveau_dmem_page_addr(struct page *page) { struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page); unsigned long off = (page_to_pfn(page) << PAGE_SHIFT) - - chunk->pagemap.res.start; + chunk->pagemap.range.start; return chunk->bo->offset + off; } @@ -249,7 +249,9 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) chunk->drm = drm; chunk->pagemap.type = MEMORY_DEVICE_PRIVATE; - chunk->pagemap.res = *res; + chunk->pagemap.range.start = res->start; + chunk->pagemap.range.end = res->end; + chunk->pagemap.nr_range = 1; chunk->pagemap.ops = &nouveau_dmem_pagemap_ops; chunk->pagemap.owner = drm->dev; @@ -273,7 +275,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage) list_add(&chunk->list, &drm->dmem->chunks); mutex_unlock(&drm->dmem->mutex); - pfn_first = chunk->pagemap.res.start >> PAGE_SHIFT; + pfn_first = chunk->pagemap.range.start >> PAGE_SHIFT; page = pfn_to_page(pfn_first); spin_lock(&drm->dmem->lock); for (i = 0; i < DMEM_CHUNK_NPAGES - 1; ++i, ++page) { @@ -294,8 +296,7 @@ out_bo_unpin: out_bo_free: nouveau_bo_ref(NULL, &chunk->bo); out_release: - release_mem_region(chunk->pagemap.res.start, - resource_size(&chunk->pagemap.res)); + release_mem_region(chunk->pagemap.range.start, range_len(&chunk->pagemap.range)); out_free: kfree(chunk); out: @@ -382,8 +383,8 @@ nouveau_dmem_fini(struct nouveau_drm *drm) nouveau_bo_ref(NULL, &chunk->bo); list_del(&chunk->list); memunmap_pages(&chunk->pagemap); - release_mem_region(chunk->pagemap.res.start, - resource_size(&chunk->pagemap.res)); + release_mem_region(chunk->pagemap.range.start, + range_len(&chunk->pagemap.range)); kfree(chunk); } diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0418071a3724..46d885575601 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2198,7 +2198,7 @@ static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size) addr_end = addr + size - 1; - for_each_reserved_mem_region(i, &start, &end) { + for_each_reserved_mem_range(i, &start, &end) { if (addr >= start && addr_end <= end) return true; } diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c index b9eeefa27e3a..aaf6e215a8c6 100644 --- a/drivers/nvdimm/badrange.c +++ b/drivers/nvdimm/badrange.c @@ -211,7 +211,7 @@ static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len) } static void badblocks_populate(struct badrange *badrange, - struct badblocks *bb, const struct resource *res) + struct badblocks *bb, const struct range *range) { struct badrange_entry *bre; @@ -222,34 +222,34 @@ static void badblocks_populate(struct badrange *badrange, u64 bre_end = bre->start + bre->length - 1; /* Discard intervals with no intersection */ - if (bre_end < res->start) + if (bre_end < range->start) continue; - if (bre->start > res->end) + if (bre->start > range->end) continue; /* Deal with any overlap after start of the namespace */ - if (bre->start >= res->start) { + if (bre->start >= range->start) { u64 start = bre->start; u64 len; - if (bre_end <= res->end) + if (bre_end <= range->end) len = bre->length; else - len = res->start + resource_size(res) + len = range->start + range_len(range) - bre->start; - __add_badblock_range(bb, start - res->start, len); + __add_badblock_range(bb, start - range->start, len); continue; } /* * Deal with overlap for badrange starting before * the namespace. */ - if (bre->start < res->start) { + if (bre->start < range->start) { u64 len; - if (bre_end < res->end) - len = bre->start + bre->length - res->start; + if (bre_end < range->end) + len = bre->start + bre->length - range->start; else - len = resource_size(res); + len = range_len(range); __add_badblock_range(bb, 0, len); } } @@ -267,7 +267,7 @@ static void badblocks_populate(struct badrange *badrange, * and add badblocks entries for all matching sub-ranges */ void nvdimm_badblocks_populate(struct nd_region *nd_region, - struct badblocks *bb, const struct resource *res) + struct badblocks *bb, const struct range *range) { struct nvdimm_bus *nvdimm_bus; @@ -279,7 +279,7 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region, nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); nvdimm_bus_lock(&nvdimm_bus->dev); - badblocks_populate(&nvdimm_bus->badrange, bb, res); + badblocks_populate(&nvdimm_bus->badrange, bb, range); nvdimm_bus_unlock(&nvdimm_bus->dev); } EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 22d865ba6353..5a7c80053c62 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -303,13 +303,16 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, resource_size_t size) { - struct resource *res = &nsio->res; struct nd_namespace_common *ndns = &nsio->common; + struct range range = { + .start = nsio->res.start, + .end = nsio->res.end, + }; nsio->size = size; - if (!devm_request_mem_region(dev, res->start, size, + if (!devm_request_mem_region(dev, range.start, size, dev_name(&ndns->dev))) { - dev_warn(dev, "could not reserve region %pR\n", res); + dev_warn(dev, "could not reserve region %pR\n", &nsio->res); return -EBUSY; } @@ -317,9 +320,9 @@ int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio, if (devm_init_badblocks(dev, &nsio->bb)) return -ENOMEM; nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb, - &nsio->res); + &range); - nsio->addr = devm_memremap(dev, res->start, size, ARCH_MEMREMAP_PMEM); + nsio->addr = devm_memremap(dev, range.start, size, ARCH_MEMREMAP_PMEM); return PTR_ERR_OR_ZERO(nsio->addr); } diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 72740108ba42..696b55556d4d 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -377,8 +377,9 @@ int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); unsigned int pmem_sector_size(struct nd_namespace_common *ndns); +struct range; void nvdimm_badblocks_populate(struct nd_region *nd_region, - struct badblocks *bb, const struct resource *res); + struct badblocks *bb, const struct range *range); int devm_namespace_enable(struct device *dev, struct nd_namespace_common *ndns, resource_size_t size); void devm_namespace_disable(struct device *dev, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 3e11ef8d3f5b..b499df630d4d 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -672,7 +672,7 @@ static unsigned long init_altmap_reserve(resource_size_t base) static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) { - struct resource *res = &pgmap->res; + struct range *range = &pgmap->range; struct vmem_altmap *altmap = &pgmap->altmap; struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; u64 offset = le64_to_cpu(pfn_sb->dataoff); @@ -689,16 +689,17 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) .end_pfn = PHYS_PFN(end), }; - memcpy(res, &nsio->res, sizeof(*res)); - res->start += start_pad; - res->end -= end_trunc; - + *range = (struct range) { + .start = nsio->res.start + start_pad, + .end = nsio->res.end - end_trunc, + }; + pgmap->nr_range = 1; if (nd_pfn->mode == PFN_MODE_RAM) { if (offset < reserve) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); } else if (nd_pfn->mode == PFN_MODE_PMEM) { - nd_pfn->npfns = PHYS_PFN((resource_size(res) - offset)); + nd_pfn->npfns = PHYS_PFN((range_len(range) - offset)); if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) dev_info(&nd_pfn->dev, "number of pfns truncated from %lld to %ld\n", diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index c86a0ceaece6..875076b0ea6c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -375,7 +375,7 @@ static int pmem_attach_disk(struct device *dev, struct nd_region *nd_region = to_nd_region(dev->parent); int nid = dev_to_node(dev), fua; struct resource *res = &nsio->res; - struct resource bb_res; + struct range bb_range; struct nd_pfn *nd_pfn = NULL; struct dax_device *dax_dev; struct nd_pfn_sb *pfn_sb; @@ -434,24 +434,27 @@ static int pmem_attach_disk(struct device *dev, pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); pmem->pfn_pad = resource_size(res) - - resource_size(&pmem->pgmap.res); + range_len(&pmem->pgmap.range); pmem->pfn_flags |= PFN_MAP; - memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); - bb_res.start += pmem->data_offset; + bb_range = pmem->pgmap.range; + bb_range.start += pmem->data_offset; } else if (pmem_should_map_pages(dev)) { - memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); + pmem->pgmap.range.start = res->start; + pmem->pgmap.range.end = res->end; + pmem->pgmap.nr_range = 1; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; - memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); + bb_range = pmem->pgmap.range; } else { if (devm_add_action_or_reset(dev, pmem_release_queue, &pmem->pgmap)) return -ENOMEM; addr = devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); - memcpy(&bb_res, &nsio->res, sizeof(bb_res)); + bb_range.start = res->start; + bb_range.end = res->end; } if (IS_ERR(addr)) @@ -480,7 +483,7 @@ static int pmem_attach_disk(struct device *dev, / 512); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; - nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res); + nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range); disk->bb = &pmem->bb; if (is_nvdimm_sync(nd_region)) @@ -591,8 +594,8 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) resource_size_t offset = 0, end_trunc = 0; struct nd_namespace_common *ndns; struct nd_namespace_io *nsio; - struct resource res; struct badblocks *bb; + struct range range; struct kernfs_node *bb_state; if (event != NVDIMM_REVALIDATE_POISON) @@ -628,9 +631,9 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) nsio = to_nd_namespace_io(&ndns->dev); } - res.start = nsio->res.start + offset; - res.end = nsio->res.end - end_trunc; - nvdimm_badblocks_populate(nd_region, bb, &res); + range.start = nsio->res.start + offset; + range.end = nsio->res.end - end_trunc; + nvdimm_badblocks_populate(nd_region, bb, &range); if (bb_state) sysfs_notify_dirent(bb_state); } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 0f6978e72e7c..bfce87ed72ab 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -35,7 +35,10 @@ static int nd_region_probe(struct device *dev) return rc; if (is_memory(&nd_region->dev)) { - struct resource ndr_res; + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + nd_region->ndr_size - 1, + }; if (devm_init_badblocks(dev, &nd_region->bb)) return -ENODEV; @@ -44,9 +47,7 @@ static int nd_region_probe(struct device *dev) if (!nd_region->bb_state) dev_warn(&nd_region->dev, "'badblocks' notification disabled\n"); - ndr_res.start = nd_region->ndr_start; - ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1; - nvdimm_badblocks_populate(nd_region, &nd_region->bb, &ndr_res); + nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range); } rc = nd_region_register_namespaces(nd_region, &err); @@ -121,14 +122,16 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event) { if (event == NVDIMM_REVALIDATE_POISON) { struct nd_region *nd_region = to_nd_region(dev); - struct resource res; if (is_memory(&nd_region->dev)) { - res.start = nd_region->ndr_start; - res.end = nd_region->ndr_start + - nd_region->ndr_size - 1; + struct range range = { + .start = nd_region->ndr_start, + .end = nd_region->ndr_start + + nd_region->ndr_size - 1, + }; + nvdimm_badblocks_populate(nd_region, - &nd_region->bb, &res); + &nd_region->bb, &range); if (nd_region->bb_state) sysfs_notify_dirent(nd_region->bb_state); } diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index f357f9a32b3a..9d53c16b7329 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -185,9 +185,9 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, return -ENOMEM; pgmap = &p2p_pgmap->pgmap; - pgmap->res.start = pci_resource_start(pdev, bar) + offset; - pgmap->res.end = pgmap->res.start + size - 1; - pgmap->res.flags = pci_resource_flags(pdev, bar); + pgmap->range.start = pci_resource_start(pdev, bar) + offset; + pgmap->range.end = pgmap->range.start + size - 1; + pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; p2p_pgmap->provider = pdev; @@ -202,13 +202,13 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr, pci_bus_address(pdev, bar) + offset, - resource_size(&pgmap->res), dev_to_node(&pdev->dev), + range_len(&pgmap->range), dev_to_node(&pdev->dev), pgmap->ref); if (error) goto pages_free; - pci_info(pdev, "added peer-to-peer DMA memory %pR\n", - &pgmap->res); + pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n", + pgmap->range.start, pgmap->range.end); return 0; diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index c08512fcea90..834b7c13ef3d 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -36,18 +36,10 @@ enum virtio_mem_mb_state { VIRTIO_MEM_MB_STATE_OFFLINE, /* Partially plugged, fully added to Linux, offline. */ VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL, - /* Fully plugged, fully added to Linux, online (!ZONE_MOVABLE). */ + /* Fully plugged, fully added to Linux, online. */ VIRTIO_MEM_MB_STATE_ONLINE, - /* Partially plugged, fully added to Linux, online (!ZONE_MOVABLE). */ + /* Partially plugged, fully added to Linux, online. */ VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL, - /* - * Fully plugged, fully added to Linux, online (ZONE_MOVABLE). - * We are not allowed to allocate (unplug) parts of this block that - * are not movable (similar to gigantic pages). We will never allow - * to online OFFLINE_PARTIAL to ZONE_MOVABLE (as they would contain - * unmovable parts). - */ - VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE, VIRTIO_MEM_MB_STATE_COUNT }; @@ -526,21 +518,10 @@ static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id) } static int virtio_mem_notify_going_online(struct virtio_mem *vm, - unsigned long mb_id, - enum zone_type zone) + unsigned long mb_id) { switch (virtio_mem_mb_get_state(vm, mb_id)) { case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - /* - * We won't allow to online a partially plugged memory block - * to the MOVABLE zone - it would contain unmovable parts. - */ - if (zone == ZONE_MOVABLE) { - dev_warn_ratelimited(&vm->vdev->dev, - "memory block has holes, MOVABLE not supported\n"); - return NOTIFY_BAD; - } - return NOTIFY_OK; case VIRTIO_MEM_MB_STATE_OFFLINE: return NOTIFY_OK; default: @@ -560,7 +541,6 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm, VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); break; case VIRTIO_MEM_MB_STATE_ONLINE: - case VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE: virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_OFFLINE); break; @@ -579,24 +559,17 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm, virtio_mem_retry(vm); } -static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id, - enum zone_type zone) +static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) { unsigned long nb_offline; switch (virtio_mem_mb_get_state(vm, mb_id)) { case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - BUG_ON(zone == ZONE_MOVABLE); virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); break; case VIRTIO_MEM_MB_STATE_OFFLINE: - if (zone == ZONE_MOVABLE) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE); - else - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE); + virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE); break; default: BUG(); @@ -675,7 +648,6 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, const unsigned long start = PFN_PHYS(mhp->start_pfn); const unsigned long size = PFN_PHYS(mhp->nr_pages); const unsigned long mb_id = virtio_mem_phys_to_mb_id(start); - enum zone_type zone; int rc = NOTIFY_OK; if (!virtio_mem_overlaps_range(vm, start, size)) @@ -717,8 +689,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; } vm->hotplug_active = true; - zone = page_zonenum(pfn_to_page(mhp->start_pfn)); - rc = virtio_mem_notify_going_online(vm, mb_id, zone); + rc = virtio_mem_notify_going_online(vm, mb_id); break; case MEM_OFFLINE: virtio_mem_notify_offline(vm, mb_id); @@ -726,8 +697,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, mutex_unlock(&vm->hotplug_mutex); break; case MEM_ONLINE: - zone = page_zonenum(pfn_to_page(mhp->start_pfn)); - virtio_mem_notify_online(vm, mb_id, zone); + virtio_mem_notify_online(vm, mb_id); vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; @@ -1906,8 +1876,7 @@ static void virtio_mem_remove(struct virtio_device *vdev) if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] || vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] || vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_MOVABLE]) { + vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) { dev_warn(&vdev->dev, "device still has system memory added\n"); } else { virtio_mem_delete_resource(vm); diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c index 3b98dc921426..8c512ea550bb 100644 --- a/drivers/xen/unpopulated-alloc.c +++ b/drivers/xen/unpopulated-alloc.c @@ -18,27 +18,38 @@ static unsigned int list_count; static int fill_list(unsigned int nr_pages) { struct dev_pagemap *pgmap; + struct resource *res; void *vaddr; unsigned int i, alloc_pages = round_up(nr_pages, PAGES_PER_SECTION); - int ret; + int ret = -ENOMEM; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); if (!pgmap) - return -ENOMEM; + goto err_pgmap; pgmap->type = MEMORY_DEVICE_GENERIC; - pgmap->res.name = "Xen scratch"; - pgmap->res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->name = "Xen scratch"; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - ret = allocate_resource(&iomem_resource, &pgmap->res, + ret = allocate_resource(&iomem_resource, res, alloc_pages * PAGE_SIZE, 0, -1, PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); if (ret < 0) { pr_err("Cannot allocate new IOMEM resource\n"); - kfree(pgmap); - return ret; + goto err_resource; } + pgmap->range = (struct range) { + .start = res->start, + .end = res->end, + }; + pgmap->nr_range = 1; + pgmap->owner = res; + #ifdef CONFIG_XEN_HAVE_PVMMU /* * memremap will build page tables for the new memory so @@ -50,14 +61,13 @@ static int fill_list(unsigned int nr_pages) * conflict with any devices. */ if (!xen_feature(XENFEAT_auto_translated_physmap)) { - xen_pfn_t pfn = PFN_DOWN(pgmap->res.start); + xen_pfn_t pfn = PFN_DOWN(res->start); for (i = 0; i < alloc_pages; i++) { if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { pr_warn("set_phys_to_machine() failed, no memory added\n"); - release_resource(&pgmap->res); - kfree(pgmap); - return -ENOMEM; + ret = -ENOMEM; + goto err_memremap; } } } @@ -66,9 +76,8 @@ static int fill_list(unsigned int nr_pages) vaddr = memremap_pages(pgmap, NUMA_NO_NODE); if (IS_ERR(vaddr)) { pr_err("Cannot remap memory range\n"); - release_resource(&pgmap->res); - kfree(pgmap); - return PTR_ERR(vaddr); + ret = PTR_ERR(vaddr); + goto err_memremap; } for (i = 0; i < alloc_pages; i++) { @@ -80,6 +89,14 @@ static int fill_list(unsigned int nr_pages) } return 0; + +err_memremap: + release_resource(res); +err_resource: + kfree(pgmap); +err_pgmap: + kfree(res); + return ret; } /** |