diff options
256 files changed, 2723 insertions, 2275 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index f5306ee40ea9..0b302a11718a 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -98,11 +98,10 @@ Mount Options size. rsize=X - Specify the maximum read size in bytes. By default there is no - maximum. + Specify the maximum read size in bytes. Default: 64 MB. rasize=X - Specify the maximum readahead. + Specify the maximum readahead. Default: 8 MB. mount_timeout=X Specify the timeout value for mount (in seconds), in the case diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt index d918d268cd72..51cf6fa5591f 100644 --- a/Documentation/x86/intel_rdt_ui.txt +++ b/Documentation/x86/intel_rdt_ui.txt @@ -212,3 +212,117 @@ Finally we move core 4-7 over to the new group and make sure that the kernel and the tasks running there get 50% of the cache. # echo C0 > p0/cpus + +4) Locking between applications + +Certain operations on the resctrl filesystem, composed of read/writes +to/from multiple files, must be atomic. + +As an example, the allocation of an exclusive reservation of L3 cache +involves: + + 1. Read the cbmmasks from each directory + 2. Find a contiguous set of bits in the global CBM bitmask that is clear + in any of the directory cbmmasks + 3. Create a new directory + 4. Set the bits found in step 2 to the new directory "schemata" file + +If two applications attempt to allocate space concurrently then they can +end up allocating the same bits so the reservations are shared instead of +exclusive. + +To coordinate atomic operations on the resctrlfs and to avoid the problem +above, the following locking procedure is recommended: + +Locking is based on flock, which is available in libc and also as a shell +script command + +Write lock: + + A) Take flock(LOCK_EX) on /sys/fs/resctrl + B) Read/write the directory structure. + C) funlock + +Read lock: + + A) Take flock(LOCK_SH) on /sys/fs/resctrl + B) If success read the directory structure. + C) funlock + +Example with bash: + +# Atomically read directory structure +$ flock -s /sys/fs/resctrl/ find /sys/fs/resctrl + +# Read directory contents and create new subdirectory + +$ cat create-dir.sh +find /sys/fs/resctrl/ > output.txt +mask = function-of(output.txt) +mkdir /sys/fs/resctrl/newres/ +echo mask > /sys/fs/resctrl/newres/schemata + +$ flock /sys/fs/resctrl/ ./create-dir.sh + +Example with C: + +/* + * Example code do take advisory locks + * before accessing resctrl filesystem + */ +#include <sys/file.h> +#include <stdlib.h> + +void resctrl_take_shared_lock(int fd) +{ + int ret; + + /* take shared lock on resctrl filesystem */ + ret = flock(fd, LOCK_SH); + if (ret) { + perror("flock"); + exit(-1); + } +} + +void resctrl_take_exclusive_lock(int fd) +{ + int ret; + + /* release lock on resctrl filesystem */ + ret = flock(fd, LOCK_EX); + if (ret) { + perror("flock"); + exit(-1); + } +} + +void resctrl_release_lock(int fd) +{ + int ret; + + /* take shared lock on resctrl filesystem */ + ret = flock(fd, LOCK_UN); + if (ret) { + perror("flock"); + exit(-1); + } +} + +void main(void) +{ + int fd, ret; + + fd = open("/sys/fs/resctrl", O_DIRECTORY); + if (fd == -1) { + perror("open"); + exit(-1); + } + resctrl_take_shared_lock(fd); + /* code to read directory contents */ + resctrl_release_lock(fd); + + resctrl_take_exclusive_lock(fd); + /* code to read and write directory contents */ + resctrl_release_lock(fd); +} @@ -910,6 +910,18 @@ mod_sign_cmd = true endif export mod_sign_cmd +ifdef CONFIG_STACK_VALIDATION + has_libelf := $(call try-run,\ + echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0) + ifeq ($(has_libelf),1) + objtool_target := tools/objtool FORCE + else + $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + SKIP_STACK_VALIDATION := 1 + export SKIP_STACK_VALIDATION + endif +endif + ifeq ($(KBUILD_EXTMOD),) core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ @@ -1037,18 +1049,6 @@ prepare0: archprepare gcc-plugins # All the preparing.. prepare: prepare0 prepare-objtool -ifdef CONFIG_STACK_VALIDATION - has_libelf := $(call try-run,\ - echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0) - ifeq ($(has_libelf),1) - objtool_target := tools/objtool FORCE - else - $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel") - SKIP_STACK_VALIDATION := 1 - export SKIP_STACK_VALIDATION - endif -endif - PHONY += prepare-objtool prepare-objtool: $(objtool_target) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fda6a46d27cf..0d4e71b42c77 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2,6 +2,7 @@ config ARM bool default y select ARCH_CLOCKSOURCE_DATA + select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_SET_MEMORY diff --git a/arch/arm/Kconfig-nommu b/arch/arm/Kconfig-nommu index aed66d5df7f1..b7576349528c 100644 --- a/arch/arm/Kconfig-nommu +++ b/arch/arm/Kconfig-nommu @@ -34,8 +34,7 @@ config PROCESSOR_ID used instead of the auto-probing which utilizes the register. config REMAP_VECTORS_TO_RAM - bool 'Install vectors to the beginning of RAM' if DRAM_BASE - depends on DRAM_BASE + bool 'Install vectors to the beginning of RAM' help The kernel needs to change the hardware exception vectors. In nommu mode, the hardware exception vectors are normally diff --git a/arch/arm/boot/compressed/decompress.c b/arch/arm/boot/compressed/decompress.c index a0765e7ed6c7..ea7832702a8f 100644 --- a/arch/arm/boot/compressed/decompress.c +++ b/arch/arm/boot/compressed/decompress.c @@ -32,6 +32,7 @@ extern void error(char *); /* Not needed, but used in some headers pulled in by decompressors */ extern char * strstr(const char * s1, const char *s2); +extern size_t strlen(const char *s); #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c index a923524d1040..cf062472e07b 100644 --- a/arch/arm/common/mcpm_entry.c +++ b/arch/arm/common/mcpm_entry.c @@ -144,7 +144,7 @@ extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER]; void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr) { - unsigned long val = ptr ? virt_to_phys(ptr) : 0; + unsigned long val = ptr ? __pa_symbol(ptr) : 0; mcpm_entry_vectors[cluster][cpu] = val; sync_cache_w(&mcpm_entry_vectors[cluster][cpu]); } @@ -299,8 +299,8 @@ void mcpm_cpu_power_down(void) * the kernel as if the power_up method just had deasserted reset * on the CPU. */ - phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset); - phys_reset(virt_to_phys(mcpm_entry_point)); + phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset); + phys_reset(__pa_symbol(mcpm_entry_point)); /* should never get here */ BUG(); @@ -388,8 +388,8 @@ static int __init nocache_trampoline(unsigned long _arg) __mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN); __mcpm_cpu_down(cpu, cluster); - phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset); - phys_reset(virt_to_phys(mcpm_entry_point)); + phys_reset = (phys_reset_t)(unsigned long)__pa_symbol(cpu_reset); + phys_reset(__pa_symbol(mcpm_entry_point)); BUG(); } @@ -449,7 +449,7 @@ int __init mcpm_sync_init( sync_cache_w(&mcpm_sync); if (power_up_setup) { - mcpm_power_up_setup_phys = virt_to_phys(power_up_setup); + mcpm_power_up_setup_phys = __pa_symbol(power_up_setup); sync_cache_w(&mcpm_power_up_setup_phys); } diff --git a/arch/arm/include/asm/hardware/cache-uniphier.h b/arch/arm/include/asm/hardware/cache-uniphier.h index eaa60da7dac3..0ef42ae75b6c 100644 --- a/arch/arm/include/asm/hardware/cache-uniphier.h +++ b/arch/arm/include/asm/hardware/cache-uniphier.h @@ -16,7 +16,7 @@ #ifndef __CACHE_UNIPHIER_H #define __CACHE_UNIPHIER_H -#include <linux/types.h> +#include <linux/errno.h> #ifdef CONFIG_CACHE_UNIPHIER int uniphier_cache_init(void); diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 76cbd9c674df..1f54e4e98c1e 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -83,8 +83,15 @@ #define IOREMAP_MAX_ORDER 24 #endif +#define VECTORS_BASE UL(0xffff0000) + #else /* CONFIG_MMU */ +#ifndef __ASSEMBLY__ +extern unsigned long vectors_base; +#define VECTORS_BASE vectors_base +#endif + /* * The limitation of user task size can grow up to the end of free ram region. * It is difficult to define and perhaps will never meet the original meaning @@ -111,6 +118,13 @@ #endif /* !CONFIG_MMU */ +#ifdef CONFIG_XIP_KERNEL +#define KERNEL_START _sdata +#else +#define KERNEL_START _stext +#endif +#define KERNEL_END _end + /* * We fix the TCM memories max 32 KiB ITCM resp DTCM at these * locations @@ -206,7 +220,7 @@ extern const void *__pv_table_begin, *__pv_table_end; : "r" (x), "I" (__PV_BITS_31_24) \ : "cc") -static inline phys_addr_t __virt_to_phys(unsigned long x) +static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x) { phys_addr_t t; @@ -238,7 +252,7 @@ static inline unsigned long __phys_to_virt(phys_addr_t x) #define PHYS_OFFSET PLAT_PHYS_OFFSET #define PHYS_PFN_OFFSET ((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT)) -static inline phys_addr_t __virt_to_phys(unsigned long x) +static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x) { return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET; } @@ -254,6 +268,16 @@ static inline unsigned long __phys_to_virt(phys_addr_t x) ((((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT) + \ PHYS_PFN_OFFSET) +#define __pa_symbol_nodebug(x) __virt_to_phys_nodebug((x)) + +#ifdef CONFIG_DEBUG_VIRTUAL +extern phys_addr_t __virt_to_phys(unsigned long x); +extern phys_addr_t __phys_addr_symbol(unsigned long x); +#else +#define __virt_to_phys(x) __virt_to_phys_nodebug(x) +#define __phys_addr_symbol(x) __pa_symbol_nodebug(x) +#endif + /* * These are *only* valid on the kernel direct mapped RAM memory. * Note: Drivers should NOT use these. They are the wrong @@ -276,6 +300,7 @@ static inline void *phys_to_virt(phys_addr_t x) * Drivers should NOT use these either. */ #define __pa(x) __virt_to_phys((unsigned long)(x)) +#define __pa_symbol(x) __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0)) #define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x))) #define pfn_to_kaddr(pfn) __va((phys_addr_t)(pfn) << PAGE_SHIFT) diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index add094d09e3e..302240c19a5a 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -63,9 +63,9 @@ typedef pte_t *pte_addr_t; /* * Mark the prot value as uncacheable and unbufferable. */ -#define pgprot_noncached(prot) __pgprot(0) -#define pgprot_writecombine(prot) __pgprot(0) -#define pgprot_dmacoherent(prot) __pgprot(0) +#define pgprot_noncached(prot) (prot) +#define pgprot_writecombine(prot) (prot) +#define pgprot_dmacoherent(prot) (prot) /* diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index 6b4eb27b8758..2e21e08de747 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -152,11 +152,6 @@ __after_proc_init: #ifdef CONFIG_CPU_ICACHE_DISABLE bic r0, r0, #CR_I #endif -#ifdef CONFIG_CPU_HIGH_VECTOR - orr r0, r0, #CR_V -#else - bic r0, r0, #CR_V -#endif mcr p15, 0, r0, c1, c0, 0 @ write control reg #elif defined (CONFIG_CPU_V7M) /* For V7M systems we want to modify the CCR similarly to the SCTLR */ diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 4f14b5ce6535..80254b47dc34 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -155,8 +155,17 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, break; case R_ARM_PREL31: - offset = *(u32 *)loc + sym->st_value - loc; - *(u32 *)loc = offset & 0x7fffffff; + offset = (*(s32 *)loc << 1) >> 1; /* sign extend */ + offset += sym->st_value - loc; + if (offset >= 0x40000000 || offset < -0x40000000) { + pr_err("%s: section %u reloc %u sym '%s': relocation %u out of range (%#lx -> %#x)\n", + module->name, relindex, i, symname, + ELF32_R_TYPE(rel->r_info), loc, + sym->st_value); + return -ENOEXEC; + } + *(u32 *)loc &= 0x80000000; + *(u32 *)loc |= offset & 0x7fffffff; break; case R_ARM_MOVW_ABS_NC: diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 34e3f3c45634..f4e54503afa9 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -81,7 +81,7 @@ __setup("fpe=", fpe_setup); extern void init_default_cache_policy(unsigned long); extern void paging_init(const struct machine_desc *desc); extern void early_paging_init(const struct machine_desc *); -extern void sanity_check_meminfo(void); +extern void adjust_lowmem_bounds(void); extern enum reboot_mode reboot_mode; extern void setup_dma_zone(const struct machine_desc *desc); @@ -1093,8 +1093,14 @@ void __init setup_arch(char **cmdline_p) setup_dma_zone(mdesc); xen_early_init(); efi_init(); - sanity_check_meminfo(); + /* + * Make sure the calculation for lowmem/highmem is set appropriately + * before reserving/allocating any mmeory + */ + adjust_lowmem_bounds(); arm_memblock_init(mdesc); + /* Memory may have been removed so recalculate the bounds. */ + adjust_lowmem_bounds(); early_ioremap_reset(); diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index c6514ce0fcbc..5a07c5a4b894 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -251,7 +251,7 @@ void __cpu_die(unsigned int cpu) pr_err("CPU%u: cpu didn't die\n", cpu); return; } - pr_notice("CPU%u: shutdown\n", cpu); + pr_debug("CPU%u: shutdown\n", cpu); /* * platform_cpu_kill() is generally expected to do the powering off diff --git a/arch/arm/mach-alpine/platsmp.c b/arch/arm/mach-alpine/platsmp.c index dd77ea25e7ca..6dc6d491f88a 100644 --- a/arch/arm/mach-alpine/platsmp.c +++ b/arch/arm/mach-alpine/platsmp.c @@ -27,7 +27,7 @@ static int alpine_boot_secondary(unsigned int cpu, struct task_struct *idle) { phys_addr_t addr; - addr = virt_to_phys(secondary_startup); + addr = __pa_symbol(secondary_startup); if (addr > (phys_addr_t)(uint32_t)(-1)) { pr_err("FAIL: resume address over 32bit (%pa)", &addr); diff --git a/arch/arm/mach-axxia/platsmp.c b/arch/arm/mach-axxia/platsmp.c index ffbd71d45008..502e3df69f69 100644 --- a/arch/arm/mach-axxia/platsmp.c +++ b/arch/arm/mach-axxia/platsmp.c @@ -25,7 +25,7 @@ static void write_release_addr(u32 release_phys) { u32 *virt = (u32 *) phys_to_virt(release_phys); - writel_relaxed(virt_to_phys(secondary_startup), virt); + writel_relaxed(__pa_symbol(secondary_startup), virt); /* Make sure this store is visible to other CPUs */ smp_wmb(); __cpuc_flush_dcache_area(virt, sizeof(u32)); diff --git a/arch/arm/mach-bcm/bcm63xx_smp.c b/arch/arm/mach-bcm/bcm63xx_smp.c index 9b6727ed68cd..f5fb10b4376f 100644 --- a/arch/arm/mach-bcm/bcm63xx_smp.c +++ b/arch/arm/mach-bcm/bcm63xx_smp.c @@ -135,7 +135,7 @@ static int bcm63138_smp_boot_secondary(unsigned int cpu, } /* Write the secondary init routine to the BootLUT reset vector */ - val = virt_to_phys(secondary_startup); + val = __pa_symbol(secondary_startup); writel_relaxed(val, bootlut_base + BOOTLUT_RESET_VECT); /* Power up the core, will jump straight to its reset vector when we diff --git a/arch/arm/mach-bcm/platsmp-brcmstb.c b/arch/arm/mach-bcm/platsmp-brcmstb.c index 40dc8448445e..12379960e982 100644 --- a/arch/arm/mach-bcm/platsmp-brcmstb.c +++ b/arch/arm/mach-bcm/platsmp-brcmstb.c @@ -151,7 +151,7 @@ static void brcmstb_cpu_boot(u32 cpu) * Set the reset vector to point to the secondary_startup * routine */ - cpu_set_boot_addr(cpu, virt_to_phys(secondary_startup)); + cpu_set_boot_addr(cpu, __pa_symbol(secondary_startup)); /* Unhalt the cpu */ cpu_rst_cfg_set(cpu, 0); diff --git a/arch/arm/mach-bcm/platsmp.c b/arch/arm/mach-bcm/platsmp.c index 3ac3a9bc663c..582886d0d02f 100644 --- a/arch/arm/mach-bcm/platsmp.c +++ b/arch/arm/mach-bcm/platsmp.c @@ -116,7 +116,7 @@ static int nsp_write_lut(unsigned int cpu) return -ENOMEM; } - secondary_startup_phy = virt_to_phys(secondary_startup); + secondary_startup_phy = __pa_symbol(secondary_startup); BUG_ON(secondary_startup_phy > (phys_addr_t)U32_MAX); writel_relaxed(secondary_startup_phy, sku_rom_lut); @@ -189,7 +189,7 @@ static int kona_boot_secondary(unsigned int cpu, struct task_struct *idle) * Secondary cores will start in secondary_startup(), * defined in "arch/arm/kernel/head.S" */ - boot_func = virt_to_phys(secondary_startup); + boot_func = __pa_symbol(secondary_startup); BUG_ON(boot_func & BOOT_ADDR_CPUID_MASK); BUG_ON(boot_func > (phys_addr_t)U32_MAX); diff --git a/arch/arm/mach-berlin/platsmp.c b/arch/arm/mach-berlin/platsmp.c index 93f90688db18..7586b7aec272 100644 --- a/arch/arm/mach-berlin/platsmp.c +++ b/arch/arm/mach-berlin/platsmp.c @@ -15,6 +15,7 @@ #include <asm/cacheflush.h> #include <asm/cp15.h> +#include <asm/memory.h> #include <asm/smp_plat.h> #include <asm/smp_scu.h> @@ -75,7 +76,7 @@ static void __init berlin_smp_prepare_cpus(unsigned int max_cpus) if (!cpu_ctrl) goto unmap_scu; - vectors_base = ioremap(CONFIG_VECTORS_BASE, SZ_32K); + vectors_base = ioremap(VECTORS_BASE, SZ_32K); if (!vectors_base) goto unmap_scu; @@ -92,7 +93,7 @@ static void __init berlin_smp_prepare_cpus(unsigned int max_cpus) * Write the secondary startup address into the SW reset address * vector. This is used by boot_inst. */ - writel(virt_to_phys(secondary_startup), vectors_base + SW_RESET_ADDR); + writel(__pa_symbol(secondary_startup), vectors_base + SW_RESET_ADDR); iounmap(vectors_base); unmap_scu: diff --git a/arch/arm/mach-exynos/firmware.c b/arch/arm/mach-exynos/firmware.c index fd6da5419b51..e81a78b125d9 100644 --- a/arch/arm/mach-exynos/firmware.c +++ b/arch/arm/mach-exynos/firmware.c @@ -41,7 +41,7 @@ static int exynos_do_idle(unsigned long mode) case FW_DO_IDLE_AFTR: if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9) exynos_save_cp15(); - writel_relaxed(virt_to_phys(exynos_cpu_resume_ns), + writel_relaxed(__pa_symbol(exynos_cpu_resume_ns), sysram_ns_base_addr + 0x24); writel_relaxed(EXYNOS_AFTR_MAGIC, sysram_ns_base_addr + 0x20); if (soc_is_exynos3250()) { @@ -135,7 +135,7 @@ static int exynos_suspend(void) exynos_save_cp15(); writel(EXYNOS_SLEEP_MAGIC, sysram_ns_base_addr + EXYNOS_BOOT_FLAG); - writel(virt_to_phys(exynos_cpu_resume_ns), + writel(__pa_symbol(exynos_cpu_resume_ns), sysram_ns_base_addr + EXYNOS_BOOT_ADDR); return cpu_suspend(0, exynos_cpu_suspend); diff --git a/arch/arm/mach-exynos/mcpm-exynos.c b/arch/arm/mach-exynos/mcpm-exynos.c index 038fd8c993d0..b42622562ea7 100644 --- a/arch/arm/mach-exynos/mcpm-exynos.c +++ b/arch/arm/mach-exynos/mcpm-exynos.c @@ -221,7 +221,7 @@ static void exynos_mcpm_setup_entry_point(void) */ __raw_writel(0xe59f0000, ns_sram_base_addr); /* ldr r0, [pc, #0] */ __raw_writel(0xe12fff10, ns_sram_base_addr + 4); /* bx r0 */ - __raw_writel(virt_to_phys(mcpm_entry_point), ns_sram_base_addr + 8); + __raw_writel(__pa_symbol(mcpm_entry_point), ns_sram_base_addr + 8); } static struct syscore_ops exynos_mcpm_syscore_ops = { diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c index a5d68411a037..5a03bffe7226 100644 --- a/arch/arm/mach-exynos/platsmp.c +++ b/arch/arm/mach-exynos/platsmp.c @@ -353,7 +353,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) smp_rmb(); - boot_addr = virt_to_phys(exynos4_secondary_startup); + boot_addr = __pa_symbol(exynos4_secondary_startup); ret = exynos_set_boot_addr(core_id, boot_addr); if (ret) @@ -413,7 +413,7 @@ static void __init exynos_smp_prepare_cpus(unsigned int max_cpus) mpidr = cpu_logical_map(i); core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - boot_addr = virt_to_phys(exynos4_secondary_startup); + boot_addr = __pa_symbol(exynos4_secondary_startup); ret = exynos_set_boot_addr(core_id, boot_addr); if (ret) diff --git a/arch/arm/mach-exynos/pm.c b/arch/arm/mach-exynos/pm.c index 487295f4a56b..1a7e5b5d08d8 100644 --- a/arch/arm/mach-exynos/pm.c +++ b/arch/arm/mach-exynos/pm.c @@ -132,7 +132,7 @@ static void exynos_set_wakeupmask(long mask) static void exynos_cpu_set_boot_vector(long flags) { - writel_relaxed(virt_to_phys(exynos_cpu_resume), + writel_relaxed(__pa_symbol(exynos_cpu_resume), exynos_boot_vector_addr()); writel_relaxed(flags, exynos_boot_vector_flag()); } @@ -238,7 +238,7 @@ static int exynos_cpu0_enter_aftr(void) abort: if (cpu_online(1)) { - unsigned long boot_addr = virt_to_phys(exynos_cpu_resume); + unsigned long boot_addr = __pa_symbol(exynos_cpu_resume); /* * Set the boot vector to something non-zero @@ -330,7 +330,7 @@ cpu1_aborted: static void exynos_pre_enter_aftr(void) { - unsigned long boot_addr = virt_to_phys(exynos_cpu_resume); + unsigned long boot_addr = __pa_symbol(exynos_cpu_resume); (void)exynos_set_boot_addr(1, boot_addr); } diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c index adf4e8f182bd..748cfb8d5212 100644 --- a/arch/arm/mach-exynos/suspend.c +++ b/arch/arm/mach-exynos/suspend.c @@ -301,7 +301,7 @@ static void exynos_pm_prepare(void) exynos_pm_enter_sleep_mode(); /* ensure at least INFORM0 has the resume address */ - pmu_raw_writel(virt_to_phys(exynos_cpu_resume), S5P_INFORM0); + pmu_raw_writel(__pa_symbol(exynos_cpu_resume), S5P_INFORM0); } static void exynos3250_pm_prepare(void) @@ -318,7 +318,7 @@ static void exynos3250_pm_prepare(void) exynos_pm_enter_sleep_mode(); /* ensure at least INFORM0 has the resume address */ - pmu_raw_writel(virt_to_phys(exynos_cpu_resume), S5P_INFORM0); + pmu_raw_writel(__pa_symbol(exynos_cpu_resume), S5P_INFORM0); } static void exynos5420_pm_prepare(void) @@ -343,7 +343,7 @@ static void exynos5420_pm_prepare(void) /* ensure at least INFORM0 has the resume address */ if (IS_ENABLED(CONFIG_EXYNOS5420_MCPM)) - pmu_raw_writel(virt_to_phys(mcpm_entry_point), S5P_INFORM0); + pmu_raw_writel(__pa_symbol(mcpm_entry_point), S5P_INFORM0); tmp = pmu_raw_readl(EXYNOS_L2_OPTION(0)); tmp &= ~EXYNOS_L2_USE_RETENTION; diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c index 4b653a8cb75c..a6c117622d67 100644 --- a/arch/arm/mach-hisi/platmcpm.c +++ b/arch/arm/mach-hisi/platmcpm.c @@ -327,7 +327,7 @@ static int __init hip04_smp_init(void) */ writel_relaxed(hip04_boot_method[0], relocation); writel_relaxed(0xa5a5a5a5, relocation + 4); /* magic number */ - writel_relaxed(virt_to_phys(secondary_startup), relocation + 8); + writel_relaxed(__pa_symbol(secondary_startup), relocation + 8); writel_relaxed(0, relocation + 12); iounmap(relocation); diff --git a/arch/arm/mach-hisi/platsmp.c b/arch/arm/mach-hisi/platsmp.c index e1d67648d5d0..91bb02dec20f 100644 --- a/arch/arm/mach-hisi/platsmp.c +++ b/arch/arm/mach-hisi/platsmp.c @@ -28,7 +28,7 @@ void hi3xxx_set_cpu_jump(int cpu, void *jump_addr) cpu = cpu_logical_map(cpu); if (!cpu || !ctrl_base) return; - writel_relaxed(virt_to_phys(jump_addr), ctrl_base + ((cpu - 1) << 2)); + writel_relaxed(__pa_symbol(jump_addr), ctrl_base + ((cpu - 1) << 2)); } int hi3xxx_get_cpu_jump(int cpu) @@ -118,7 +118,7 @@ static int hix5hd2_boot_secondary(unsigned int cpu, struct task_struct *idle) { phys_addr_t jumpaddr; - jumpaddr = virt_to_phys(secondary_startup); + jumpaddr = __pa_symbol(secondary_startup); hix5hd2_set_scu_boot_addr(HIX5HD2_BOOT_ADDRESS, jumpaddr); hix5hd2_set_cpu(cpu, true); arch_send_wakeup_ipi_mask(cpumask_of(cpu)); @@ -156,7 +156,7 @@ static int hip01_boot_secondary(unsigned int cpu, struct task_struct *idle) struct device_node *node; - jumpaddr = virt_to_phys(secondary_startup); + jumpaddr = __pa_symbol(secondary_startup); hip01_set_boot_addr(HIP01_BOOT_ADDRESS, jumpaddr); node = of_find_compatible_node(NULL, NULL, "hisilicon,hip01-sysctrl"); diff --git a/arch/arm/mach-imx/platsmp.c b/arch/arm/mach-imx/platsmp.c index 711dbbd5badd..c2d1b329fba1 100644 --- a/arch/arm/mach-imx/platsmp.c +++ b/arch/arm/mach-imx/platsmp.c @@ -117,7 +117,7 @@ static void __init ls1021a_smp_prepare_cpus(unsigned int max_cpus) dcfg_base = of_iomap(np, 0); BUG_ON(!dcfg_base); - paddr = virt_to_phys(secondary_startup); + paddr = __pa_symbol(secondary_startup); writel_relaxed(cpu_to_be32(paddr), dcfg_base + DCFG_CCSR_SCRATCHRW1); iounmap(dcfg_base); diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index 1515e498d348..e61b1d1027e1 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -499,7 +499,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata) memset(suspend_ocram_base, 0, sizeof(*pm_info)); pm_info = suspend_ocram_base; pm_info->pbase = ocram_pbase; - pm_info->resume_addr = virt_to_phys(v7_cpu_resume); + pm_info->resume_addr = __pa_symbol(v7_cpu_resume); pm_info->pm_info_size = sizeof(*pm_info); /* diff --git a/arch/arm/mach-imx/src.c b/arch/arm/mach-imx/src.c index 70b083fe934a..495d85d0fe7e 100644 --- a/arch/arm/mach-imx/src.c +++ b/arch/arm/mach-imx/src.c @@ -99,7 +99,7 @@ void imx_enable_cpu(int cpu, bool enable) void imx_set_cpu_jump(int cpu, void *jump_addr) { cpu = cpu_logical_map(cpu); - writel_relaxed(virt_to_phys(jump_addr), + writel_relaxed(__pa_symbol(jump_addr), src_base + SRC_GPR1 + cpu * 8); } diff --git a/arch/arm/mach-mediatek/platsmp.c b/arch/arm/mach-mediatek/platsmp.c index b821e34474b6..726eb69bb655 100644 --- a/arch/arm/mach-mediatek/platsmp.c +++ b/arch/arm/mach-mediatek/platsmp.c @@ -122,7 +122,7 @@ static void __init __mtk_smp_prepare_cpus(unsigned int max_cpus, int trustzone) * write the address of slave startup address into the system-wide * jump register */ - writel_relaxed(virt_to_phys(secondary_startup_arm), + writel_relaxed(__pa_symbol(secondary_startup_arm), mtk_smp_base + mtk_smp_info->jump_reg); } diff --git a/arch/arm/mach-mvebu/pm.c b/arch/arm/mach-mvebu/pm.c index 2990c5269b18..c487be61d6d8 100644 --- a/arch/arm/mach-mvebu/pm.c +++ b/arch/arm/mach-mvebu/pm.c @@ -110,7 +110,7 @@ static void mvebu_pm_store_armadaxp_bootinfo(u32 *store_addr) { phys_addr_t resume_pc; - resume_pc = virt_to_phys(armada_370_xp_cpu_resume); + resume_pc = __pa_symbol(armada_370_xp_cpu_resume); /* * The bootloader expects the first two words to be a magic diff --git a/arch/arm/mach-mvebu/pmsu.c b/arch/arm/mach-mvebu/pmsu.c index f39bd51bce18..27a78c80e5b1 100644 --- a/arch/arm/mach-mvebu/pmsu.c +++ b/arch/arm/mach-mvebu/pmsu.c @@ -112,7 +112,7 @@ static const struct of_device_id of_pmsu_table[] = { void mvebu_pmsu_set_cpu_boot_addr(int hw_cpu, void *boot_addr) { - writel(virt_to_phys(boot_addr), pmsu_mp_base + + writel(__pa_symbol(boot_addr), pmsu_mp_base + PMSU_BOOT_ADDR_REDIRECT_OFFSET(hw_cpu)); } diff --git a/arch/arm/mach-mvebu/system-controller.c b/arch/arm/mach-mvebu/system-controller.c index 76cbc82a7407..04d9ebe6a90a 100644 --- a/arch/arm/mach-mvebu/system-controller.c +++ b/arch/arm/mach-mvebu/system-controller.c @@ -153,7 +153,7 @@ void mvebu_system_controller_set_cpu_boot_addr(void *boot_addr) if (of_machine_is_compatible("marvell,armada375")) mvebu_armada375_smp_wa_init(); - writel(virt_to_phys(boot_addr), system_controller_base + + writel(__pa_symbol(boot_addr), system_controller_base + mvebu_sc->resume_boot_addr); } #endif diff --git a/arch/arm/mach-omap2/control.c b/arch/arm/mach-omap2/control.c index 1662071bb2cc..bd8089ff929f 100644 --- a/arch/arm/mach-omap2/control.c +++ b/arch/arm/mach-omap2/control.c @@ -315,15 +315,15 @@ void omap3_save_scratchpad_contents(void) scratchpad_contents.boot_config_ptr = 0x0; if (cpu_is_omap3630()) scratchpad_contents.public_restore_ptr = - virt_to_phys(omap3_restore_3630); + __pa_symbol(omap3_restore_3630); else if (omap_rev() != OMAP3430_REV_ES3_0 && omap_rev() != OMAP3430_REV_ES3_1 && omap_rev() != OMAP3430_REV_ES3_1_2) scratchpad_contents.public_restore_ptr = - virt_to_phys(omap3_restore); + __pa_symbol(omap3_restore); else scratchpad_contents.public_restore_ptr = - virt_to_phys(omap3_restore_es3); + __pa_symbol(omap3_restore_es3); if (omap_type() == OMAP2_DEVICE_TYPE_GP) scratchpad_contents.secure_ram_restore_ptr = 0x0; @@ -395,7 +395,7 @@ void omap3_save_scratchpad_contents(void) sdrc_block_contents.flags = 0x0; sdrc_block_contents.block_size = 0x0; - arm_context_addr = virt_to_phys(omap3_arm_context); + arm_context_addr = __pa_symbol(omap3_arm_context); /* Copy all the contents to the scratchpad location */ scratchpad_address = OMAP2_L4_IO_ADDRESS(OMAP343X_SCRATCHPAD); diff --git a/arch/arm/mach-omap2/omap-mpuss-lowpower.c b/arch/arm/mach-omap2/omap-mpuss-lowpower.c index 7d62ad48c7c9..113ab2dd2ee9 100644 --- a/arch/arm/mach-omap2/omap-mpuss-lowpower.c +++ b/arch/arm/mach-omap2/omap-mpuss-lowpower.c @@ -273,7 +273,7 @@ int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state) cpu_clear_prev_logic_pwrst(cpu); pwrdm_set_next_pwrst(pm_info->pwrdm, power_state); pwrdm_set_logic_retst(pm_info->pwrdm, cpu_logic_state); - set_cpu_wakeup_addr(cpu, virt_to_phys(omap_pm_ops.resume)); + set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.resume)); omap_pm_ops.scu_prepare(cpu, power_state); l2x0_pwrst_prepare(cpu, save_state); @@ -325,7 +325,7 @@ int omap4_hotplug_cpu(unsigned int cpu, unsigned int power_state) pwrdm_clear_all_prev_pwrst(pm_info->pwrdm); pwrdm_set_next_pwrst(pm_info->pwrdm, power_state); - set_cpu_wakeup_addr(cpu, virt_to_phys(omap_pm_ops.hotplug_restart)); + set_cpu_wakeup_addr(cpu, __pa_symbol(omap_pm_ops.hotplug_restart)); omap_pm_ops.scu_prepare(cpu, power_state); /* @@ -467,13 +467,13 @@ void __init omap4_mpuss_early_init(void) sar_base = omap4_get_sar_ram_base(); if (cpu_is_omap443x()) - startup_pa = virt_to_phys(omap4_secondary_startup); + startup_pa = __pa_symbol(omap4_secondary_startup); else if (cpu_is_omap446x()) - startup_pa = virt_to_phys(omap4460_secondary_startup); + startup_pa = __pa_symbol(omap4460_secondary_startup); else if ((__boot_cpu_mode & MODE_MASK) == HYP_MODE) - startup_pa = virt_to_phys(omap5_secondary_hyp_startup); + startup_pa = __pa_symbol(omap5_secondary_hyp_startup); else - startup_pa = virt_to_phys(omap5_secondary_startup); + startup_pa = __pa_symbol(omap5_secondary_startup); if (cpu_is_omap44xx()) writel_relaxed(startup_pa, sar_base + diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c index b4de3da6dffa..003353b0b794 100644 --- a/arch/arm/mach-omap2/omap-smp.c +++ b/arch/arm/mach-omap2/omap-smp.c @@ -316,9 +316,9 @@ static void __init omap4_smp_prepare_cpus(unsigned int max_cpus) * A barrier is added to ensure that write buffer is drained */ if (omap_secure_apis_support()) - omap_auxcoreboot_addr(virt_to_phys(cfg.startup_addr)); + omap_auxcoreboot_addr(__pa_symbol(cfg.startup_addr)); else - writel_relaxed(virt_to_phys(cfg.startup_addr), + writel_relaxed(__pa_symbol(cfg.startup_addr), base + OMAP_AUX_CORE_BOOT_1); } diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c index 0875b99add18..75ef5d4be554 100644 --- a/arch/arm/mach-prima2/platsmp.c +++ b/arch/arm/mach-prima2/platsmp.c @@ -65,7 +65,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle) * waiting for. This would wake up the secondary core from WFE */ #define SIRFSOC_CPU1_JUMPADDR_OFFSET 0x2bc - __raw_writel(virt_to_phys(sirfsoc_secondary_startup), + __raw_writel(__pa_symbol(sirfsoc_secondary_startup), clk_base + SIRFSOC_CPU1_JUMPADDR_OFFSET); #define SIRFSOC_CPU1_WAKEMAGIC_OFFSET 0x2b8 diff --git a/arch/arm/mach-prima2/pm.c b/arch/arm/mach-prima2/pm.c index 83e94c95e314..b0bcf1ff02dd 100644 --- a/arch/arm/mach-prima2/pm.c +++ b/arch/arm/mach-prima2/pm.c @@ -54,7 +54,7 @@ static void sirfsoc_set_sleep_mode(u32 mode) static int sirfsoc_pre_suspend_power_off(void) { - u32 wakeup_entry = virt_to_phys(cpu_resume); + u32 wakeup_entry = __pa_symbol(cpu_resume); sirfsoc_rtc_iobrg_writel(wakeup_entry, sirfsoc_pwrc_base + SIRFSOC_PWRC_SCRATCH_PAD1); diff --git a/arch/arm/mach-pxa/palmz72.c b/arch/arm/mach-pxa/palmz72.c index 9c308de158c6..29630061e700 100644 --- a/arch/arm/mach-pxa/palmz72.c +++ b/arch/arm/mach-pxa/palmz72.c @@ -249,7 +249,7 @@ static int palmz72_pm_suspend(void) store_ptr = *PALMZ72_SAVE_DWORD; /* Setting PSPR to a proper value */ - PSPR = virt_to_phys(&palmz72_resume_info); + PSPR = __pa_symbol(&palmz72_resume_info); return 0; } diff --git a/arch/arm/mach-pxa/pxa25x.c b/arch/arm/mach-pxa/pxa25x.c index c725baf119e1..ba431fad5c47 100644 --- a/arch/arm/mach-pxa/pxa25x.c +++ b/arch/arm/mach-pxa/pxa25x.c @@ -85,7 +85,7 @@ static void pxa25x_cpu_pm_enter(suspend_state_t state) static int pxa25x_cpu_pm_prepare(void) { /* set resume return address */ - PSPR = virt_to_phys(cpu_resume); + PSPR = __pa_symbol(cpu_resume); return 0; } diff --git a/arch/arm/mach-pxa/pxa27x.c b/arch/arm/mach-pxa/pxa27x.c index c0185c5c5a08..9b69be4e9fe3 100644 --- a/arch/arm/mach-pxa/pxa27x.c +++ b/arch/arm/mach-pxa/pxa27x.c @@ -168,7 +168,7 @@ static int pxa27x_cpu_pm_valid(suspend_state_t state) static int pxa27x_cpu_pm_prepare(void) { /* set resume return address */ - PSPR = virt_to_phys(cpu_resume); + PSPR = __pa_symbol(cpu_resume); return 0; } diff --git a/arch/arm/mach-pxa/pxa3xx.c b/arch/arm/mach-pxa/pxa3xx.c index 87acc96388c7..0cc9f124c9ac 100644 --- a/arch/arm/mach-pxa/pxa3xx.c +++ b/arch/arm/mach-pxa/pxa3xx.c @@ -123,7 +123,7 @@ static void pxa3xx_cpu_pm_suspend(void) PSPR = 0x5c014000; /* overwrite with the resume address */ - *p = virt_to_phys(cpu_resume); + *p = __pa_symbol(cpu_resume); cpu_suspend(0, pxa3xx_finish_suspend); diff --git a/arch/arm/mach-realview/platsmp-dt.c b/arch/arm/mach-realview/platsmp-dt.c index 70ca99eb52c6..c242423bf8db 100644 --- a/arch/arm/mach-realview/platsmp-dt.c +++ b/arch/arm/mach-realview/platsmp-dt.c @@ -76,7 +76,7 @@ static void __init realview_smp_prepare_cpus(unsigned int max_cpus) } /* Put the boot address in this magic register */ regmap_write(map, REALVIEW_SYS_FLAGSSET_OFFSET, - virt_to_phys(versatile_secondary_startup)); + __pa_symbol(versatile_secondary_startup)); } static const struct smp_operations realview_dt_smp_ops __initconst = { diff --git a/arch/arm/mach-rockchip/platsmp.c b/arch/arm/mach-rockchip/platsmp.c index 4d827a069d49..3abafdbdd7f4 100644 --- a/arch/arm/mach-rockchip/platsmp.c +++ b/arch/arm/mach-rockchip/platsmp.c @@ -156,7 +156,7 @@ static int rockchip_boot_secondary(unsigned int cpu, struct task_struct *idle) */ mdelay(1); /* ensure the cpus other than cpu0 to startup */ - writel(virt_to_phys(secondary_startup), sram_base_addr + 8); + writel(__pa_symbol(secondary_startup), sram_base_addr + 8); writel(0xDEADBEAF, sram_base_addr + 4); dsb_sev(); } @@ -195,7 +195,7 @@ static int __init rockchip_smp_prepare_sram(struct device_node *node) } /* set the boot function for the sram code */ - rockchip_boot_fn = virt_to_phys(secondary_startup); + rockchip_boot_fn = __pa_symbol(secondary_startup); /* copy the trampoline to sram, that runs during startup of the core */ memcpy(sram_base_addr, &rockchip_secondary_trampoline, trampoline_sz); diff --git a/arch/arm/mach-rockchip/pm.c b/arch/arm/mach-rockchip/pm.c index bee8c8051929..0592534e0b88 100644 --- a/arch/arm/mach-rockchip/pm.c +++ b/arch/arm/mach-rockchip/pm.c @@ -62,7 +62,7 @@ static inline u32 rk3288_l2_config(void) static void rk3288_config_bootdata(void) { rkpm_bootdata_cpusp = rk3288_bootram_phy + (SZ_4K - 8); - rkpm_bootdata_cpu_code = virt_to_phys(cpu_resume); + rkpm_bootdata_cpu_code = __pa_symbol(cpu_resume); rkpm_bootdata_l2ctlr_f = 1; rkpm_bootdata_l2ctlr = rk3288_l2_config(); diff --git a/arch/arm/mach-s3c24xx/mach-jive.c b/arch/arm/mach-s3c24xx/mach-jive.c index 895aca225952..f5b5c49b56ac 100644 --- a/arch/arm/mach-s3c24xx/mach-jive.c +++ b/arch/arm/mach-s3c24xx/mach-jive.c @@ -484,7 +484,7 @@ static int jive_pm_suspend(void) * correct address to resume from. */ __raw_writel(0x2BED, S3C2412_INFORM0); - __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2412_INFORM1); + __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2412_INFORM1); return 0; } diff --git a/arch/arm/mach-s3c24xx/pm-s3c2410.c b/arch/arm/mach-s3c24xx/pm-s3c2410.c index 20e481d8a33a..a4588daeddb0 100644 --- a/arch/arm/mach-s3c24xx/pm-s3c2410.c +++ b/arch/arm/mach-s3c24xx/pm-s3c2410.c @@ -45,7 +45,7 @@ static void s3c2410_pm_prepare(void) { /* ensure at least GSTATUS3 has the resume address */ - __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2410_GSTATUS3); + __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2410_GSTATUS3); S3C_PMDBG("GSTATUS3 0x%08x\n", __raw_readl(S3C2410_GSTATUS3)); S3C_PMDBG("GSTATUS4 0x%08x\n", __raw_readl(S3C2410_GSTATUS4)); diff --git a/arch/arm/mach-s3c24xx/pm-s3c2416.c b/arch/arm/mach-s3c24xx/pm-s3c2416.c index c0e328e37bd6..b5bbf0d5985c 100644 --- a/arch/arm/mach-s3c24xx/pm-s3c2416.c +++ b/arch/arm/mach-s3c24xx/pm-s3c2416.c @@ -48,7 +48,7 @@ static void s3c2416_pm_prepare(void) * correct address to resume from. */ __raw_writel(0x2BED, S3C2412_INFORM0); - __raw_writel(virt_to_phys(s3c_cpu_resume), S3C2412_INFORM1); + __raw_writel(__pa_symbol(s3c_cpu_resume), S3C2412_INFORM1); } static int s3c2416_pm_add(struct device *dev, struct subsys_interface *sif) diff --git a/arch/arm/mach-s3c64xx/pm.c b/arch/arm/mach-s3c64xx/pm.c index b0be382ff6bb..2f579be8fe67 100644 --- a/arch/arm/mach-s3c64xx/pm.c +++ b/arch/arm/mach-s3c64xx/pm.c @@ -304,7 +304,7 @@ static void s3c64xx_pm_prepare(void) wake_irqs, ARRAY_SIZE(wake_irqs)); /* store address of resume. */ - __raw_writel(virt_to_phys(s3c_cpu_resume), S3C64XX_INFORM0); + __raw_writel(__pa_symbol(s3c_cpu_resume), S3C64XX_INFORM0); /* ensure previous wakeup state is cleared before sleeping */ __raw_writel(__raw_readl(S3C64XX_WAKEUP_STAT), S3C64XX_WAKEUP_STAT); diff --git a/arch/arm/mach-s5pv210/pm.c b/arch/arm/mach-s5pv210/pm.c index 7d69666de5ba..07cee14a363b 100644 --- a/arch/arm/mach-s5pv210/pm.c +++ b/arch/arm/mach-s5pv210/pm.c @@ -69,7 +69,7 @@ static void s5pv210_pm_prepare(void) __raw_writel(s5pv210_irqwake_intmask, S5P_WAKEUP_MASK); /* ensure at least INFORM0 has the resume address */ - __raw_writel(virt_to_phys(s5pv210_cpu_resume), S5P_INFORM0); + __raw_writel(__pa_symbol(s5pv210_cpu_resume), S5P_INFORM0); tmp = __raw_readl(S5P_SLEEP_CFG); tmp &= ~(S5P_SLEEP_CFG_OSC_EN | S5P_SLEEP_CFG_USBOSC_EN); diff --git a/arch/arm/mach-sa1100/pm.c b/arch/arm/mach-sa1100/pm.c index 34853d5dfda2..9a7079f565bd 100644 --- a/arch/arm/mach-sa1100/pm.c +++ b/arch/arm/mach-sa1100/pm.c @@ -73,7 +73,7 @@ static int sa11x0_pm_enter(suspend_state_t state) RCSR = RCSR_HWR | RCSR_SWR | RCSR_WDR | RCSR_SMR; /* set resume return address */ - PSPR = virt_to_phys(cpu_resume); + PSPR = __pa_symbol(cpu_resume); /* go zzz */ cpu_suspend(0, sa1100_finish_suspend); diff --git a/arch/arm/mach-shmobile/platsmp-apmu.c b/arch/arm/mach-shmobile/platsmp-apmu.c index e19266844e16..3ca2c13346f0 100644 --- a/arch/arm/mach-shmobile/platsmp-apmu.c +++ b/arch/arm/mach-shmobile/platsmp-apmu.c @@ -190,7 +190,7 @@ static void apmu_parse_dt(void (*fn)(struct resource *res, int cpu, int bit)) static void __init shmobile_smp_apmu_setup_boot(void) { /* install boot code shared by all CPUs */ - shmobile_boot_fn = virt_to_phys(shmobile_smp_boot); + shmobile_boot_fn = __pa_symbol(shmobile_smp_boot); } void __init shmobile_smp_apmu_prepare_cpus(unsigned int max_cpus, @@ -204,7 +204,7 @@ void __init shmobile_smp_apmu_prepare_cpus(unsigned int max_cpus, int shmobile_smp_apmu_boot_secondary(unsigned int cpu, struct task_struct *idle) { /* For this particular CPU register boot vector */ - shmobile_smp_hook(cpu, virt_to_phys(secondary_startup), 0); + shmobile_smp_hook(cpu, __pa_symbol(secondary_startup), 0); return apmu_wrap(cpu, apmu_power_on); } @@ -308,7 +308,7 @@ int shmobile_smp_apmu_cpu_kill(unsigned int cpu) #if defined(CONFIG_SUSPEND) static int shmobile_smp_apmu_do_suspend(unsigned long cpu) { - shmobile_smp_hook(cpu, virt_to_phys(cpu_resume), 0); + shmobile_smp_hook(cpu, __pa_symbol(cpu_resume), 0); shmobile_smp_apmu_cpu_shutdown(cpu); cpu_do_idle(); /* WFI selects Core Standby */ return 1; diff --git a/arch/arm/mach-shmobile/platsmp-scu.c b/arch/arm/mach-shmobile/platsmp-scu.c index d1ecaf37d142..f1a1efde4beb 100644 --- a/arch/arm/mach-shmobile/platsmp-scu.c +++ b/arch/arm/mach-shmobile/platsmp-scu.c @@ -24,7 +24,7 @@ static void __iomem *shmobile_scu_base; static int shmobile_scu_cpu_prepare(unsigned int cpu) { /* For this particular CPU register SCU SMP boot vector */ - shmobile_smp_hook(cpu, virt_to_phys(shmobile_boot_scu), + shmobile_smp_hook(cpu, __pa_symbol(shmobile_boot_scu), shmobile_scu_base_phys); return 0; } @@ -33,7 +33,7 @@ void __init shmobile_smp_scu_prepare_cpus(phys_addr_t scu_base_phys, unsigned int max_cpus) { /* install boot code shared by all CPUs */ - shmobile_boot_fn = virt_to_phys(shmobile_smp_boot); + shmobile_boot_fn = __pa_symbol(shmobile_smp_boot); /* enable SCU and cache coherency on booting CPU */ shmobile_scu_base_phys = scu_base_phys; diff --git a/arch/arm/mach-socfpga/platsmp.c b/arch/arm/mach-socfpga/platsmp.c index 07945748b571..0ee76772b507 100644 --- a/arch/arm/mach-socfpga/platsmp.c +++ b/arch/arm/mach-socfpga/platsmp.c @@ -40,7 +40,7 @@ static int socfpga_boot_secondary(unsigned int cpu, struct task_struct *idle) memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size); - writel(virt_to_phys(secondary_startup), + writel(__pa_symbol(secondary_startup), sys_manager_base_addr + (socfpga_cpu1start_addr & 0x000000ff)); flush_cache_all(); @@ -63,7 +63,7 @@ static int socfpga_a10_boot_secondary(unsigned int cpu, struct task_struct *idle SOCFPGA_A10_RSTMGR_MODMPURST); memcpy(phys_to_virt(0), &secondary_trampoline, trampoline_size); - writel(virt_to_phys(secondary_startup), + writel(__pa_symbol(secondary_startup), sys_manager_base_addr + (socfpga_cpu1start_addr & 0x00000fff)); flush_cache_all(); diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c index 8d1e2d551786..39038a03836a 100644 --- a/arch/arm/mach-spear/platsmp.c +++ b/arch/arm/mach-spear/platsmp.c @@ -117,7 +117,7 @@ static void __init spear13xx_smp_prepare_cpus(unsigned int max_cpus) * (presently it is in SRAM). The BootMonitor waits until it receives a * soft interrupt, and then the secondary CPU branches to this address. */ - __raw_writel(virt_to_phys(spear13xx_secondary_startup), SYS_LOCATION); + __raw_writel(__pa_symbol(spear13xx_secondary_startup), SYS_LOCATION); } const struct smp_operations spear13xx_smp_ops __initconst = { diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c index ea5a2277ee46..231f19e17436 100644 --- a/arch/arm/mach-sti/platsmp.c +++ b/arch/arm/mach-sti/platsmp.c @@ -103,7 +103,7 @@ static void __init sti_smp_prepare_cpus(unsigned int max_cpus) u32 __iomem *cpu_strt_ptr; u32 release_phys; int cpu; - unsigned long entry_pa = virt_to_phys(sti_secondary_startup); + unsigned long entry_pa = __pa_symbol(sti_secondary_startup); np = of_find_compatible_node(NULL, NULL, "arm,cortex-a9-scu"); diff --git a/arch/arm/mach-sunxi/platsmp.c b/arch/arm/mach-sunxi/platsmp.c index 6642267812c9..8fb5088464db 100644 --- a/arch/arm/mach-sunxi/platsmp.c +++ b/arch/arm/mach-sunxi/platsmp.c @@ -80,7 +80,7 @@ static int sun6i_smp_boot_secondary(unsigned int cpu, spin_lock(&cpu_lock); /* Set CPU boot address */ - writel(virt_to_phys(secondary_startup), + writel(__pa_symbol(secondary_startup), cpucfg_membase + CPUCFG_PRIVATE0_REG); /* Assert the CPU core in reset */ @@ -162,7 +162,7 @@ static int sun8i_smp_boot_secondary(unsigned int cpu, spin_lock(&cpu_lock); /* Set CPU boot address */ - writel(virt_to_phys(secondary_startup), + writel(__pa_symbol(secondary_startup), cpucfg_membase + CPUCFG_PRIVATE0_REG); /* Assert the CPU core in reset */ diff --git a/arch/arm/mach-tango/platsmp.c b/arch/arm/mach-tango/platsmp.c index 98c62a4a8623..2f0c6c050fed 100644 --- a/arch/arm/mach-tango/platsmp.c +++ b/arch/arm/mach-tango/platsmp.c @@ -5,7 +5,7 @@ static int tango_boot_secondary(unsigned int cpu, struct task_struct *idle) { - tango_set_aux_boot_addr(virt_to_phys(secondary_startup)); + tango_set_aux_boot_addr(__pa_symbol(secondary_startup)); tango_start_aux_core(cpu); return 0; } diff --git a/arch/arm/mach-tango/pm.c b/arch/arm/mach-tango/pm.c index b05c6d6f99d0..406c0814eb6e 100644 --- a/arch/arm/mach-tango/pm.c +++ b/arch/arm/mach-tango/pm.c @@ -5,7 +5,7 @@ static int tango_pm_powerdown(unsigned long arg) { - tango_suspend(virt_to_phys(cpu_resume)); + tango_suspend(__pa_symbol(cpu_resume)); return -EIO; /* tango_suspend has failed */ } diff --git a/arch/arm/mach-tegra/reset.c b/arch/arm/mach-tegra/reset.c index 6fd9db54887e..dc558892753c 100644 --- a/arch/arm/mach-tegra/reset.c +++ b/arch/arm/mach-tegra/reset.c @@ -94,14 +94,14 @@ void __init tegra_cpu_reset_handler_init(void) __tegra_cpu_reset_handler_data[TEGRA_RESET_MASK_PRESENT] = *((u32 *)cpu_possible_mask); __tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_SECONDARY] = - virt_to_phys((void *)secondary_startup); + __pa_symbol((void *)secondary_startup); #endif #ifdef CONFIG_PM_SLEEP __tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_LP1] = TEGRA_IRAM_LPx_RESUME_AREA; __tegra_cpu_reset_handler_data[TEGRA_RESET_STARTUP_LP2] = - virt_to_phys((void *)tegra_resume); + __pa_symbol((void *)tegra_resume); #endif tegra_cpu_reset_handler_enable(); diff --git a/arch/arm/mach-ux500/platsmp.c b/arch/arm/mach-ux500/platsmp.c index e0ee139fdebf..9b124c22035f 100644 --- a/arch/arm/mach-ux500/platsmp.c +++ b/arch/arm/mach-ux500/platsmp.c @@ -79,7 +79,7 @@ static int ux500_boot_secondary(unsigned int cpu, struct task_struct *idle) * backup ram register at offset 0x1FF0, which is what boot rom code * is waiting for. This will wake up the secondary core from WFE. */ - writel(virt_to_phys(secondary_startup), + writel(__pa_symbol(secondary_startup), backupram + UX500_CPU1_JUMPADDR_OFFSET); writel(0xA1FEED01, backupram + UX500_CPU1_WAKEMAGIC_OFFSET); diff --git a/arch/arm/mach-vexpress/dcscb.c b/arch/arm/mach-vexpress/dcscb.c index 5cedcf572104..ee2a0faafaa1 100644 --- a/arch/arm/mach-vexpress/dcscb.c +++ b/arch/arm/mach-vexpress/dcscb.c @@ -166,7 +166,7 @@ static int __init dcscb_init(void) * Future entries into the kernel can now go * through the cluster entry vectors. */ - vexpress_flags_set(virt_to_phys(mcpm_entry_point)); + vexpress_flags_set(__pa_symbol(mcpm_entry_point)); return 0; } diff --git a/arch/arm/mach-vexpress/platsmp.c b/arch/arm/mach-vexpress/platsmp.c index 98e29dee91e8..742499bac6d0 100644 --- a/arch/arm/mach-vexpress/platsmp.c +++ b/arch/arm/mach-vexpress/platsmp.c @@ -79,7 +79,7 @@ static void __init vexpress_smp_dt_prepare_cpus(unsigned int max_cpus) * until it receives a soft interrupt, and then the * secondary CPU branches to this address. */ - vexpress_flags_set(virt_to_phys(versatile_secondary_startup)); + vexpress_flags_set(__pa_symbol(versatile_secondary_startup)); } const struct smp_operations vexpress_smp_dt_ops __initconst = { diff --git a/arch/arm/mach-vexpress/tc2_pm.c b/arch/arm/mach-vexpress/tc2_pm.c index 1aa4ccece69f..9b5f3c427086 100644 --- a/arch/arm/mach-vexpress/tc2_pm.c +++ b/arch/arm/mach-vexpress/tc2_pm.c @@ -54,7 +54,7 @@ static int tc2_pm_cpu_powerup(unsigned int cpu, unsigned int cluster) if (cluster >= TC2_CLUSTERS || cpu >= tc2_nr_cpus[cluster]) return -EINVAL; ve_spc_set_resume_addr(cluster, cpu, - virt_to_phys(mcpm_entry_point)); + __pa_symbol(mcpm_entry_point)); ve_spc_cpu_wakeup_irq(cluster, cpu, true); return 0; } @@ -159,7 +159,7 @@ static int tc2_pm_wait_for_powerdown(unsigned int cpu, unsigned int cluster) static void tc2_pm_cpu_suspend_prepare(unsigned int cpu, unsigned int cluster) { - ve_spc_set_resume_addr(cluster, cpu, virt_to_phys(mcpm_entry_point)); + ve_spc_set_resume_addr(cluster, cpu, __pa_symbol(mcpm_entry_point)); } static void tc2_pm_cpu_is_up(unsigned int cpu, unsigned int cluster) diff --git a/arch/arm/mach-zx/platsmp.c b/arch/arm/mach-zx/platsmp.c index 0297f92084e0..afb9a82dedc3 100644 --- a/arch/arm/mach-zx/platsmp.c +++ b/arch/arm/mach-zx/platsmp.c @@ -76,7 +76,7 @@ void __init zx_smp_prepare_cpus(unsigned int max_cpus) * until it receives a soft interrupt, and then the * secondary CPU branches to this address. */ - __raw_writel(virt_to_phys(zx_secondary_startup), + __raw_writel(__pa_symbol(zx_secondary_startup), aonsysctrl_base + AON_SYS_CTRL_RESERVED1); iounmap(aonsysctrl_base); @@ -94,7 +94,7 @@ void __init zx_smp_prepare_cpus(unsigned int max_cpus) /* Map the first 4 KB IRAM for suspend usage */ sys_iram = __arm_ioremap_exec(ZX_IRAM_BASE, PAGE_SIZE, false); - zx_secondary_startup_pa = virt_to_phys(zx_secondary_startup); + zx_secondary_startup_pa = __pa_symbol(zx_secondary_startup); fncpy(sys_iram, &zx_resume_jump, zx_suspend_iram_sz); } diff --git a/arch/arm/mach-zynq/platsmp.c b/arch/arm/mach-zynq/platsmp.c index 7cd9865bdeb7..caa6d5fe9078 100644 --- a/arch/arm/mach-zynq/platsmp.c +++ b/arch/arm/mach-zynq/platsmp.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL(zynq_cpun_start); static int zynq_boot_secondary(unsigned int cpu, struct task_struct *idle) { - return zynq_cpun_start(virt_to_phys(secondary_startup), cpu); + return zynq_cpun_start(__pa_symbol(secondary_startup), cpu); } /* diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 35e3a56e5d86..c6c4c9c8824b 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -29,6 +29,7 @@ config CPU_ARM720T select CPU_COPY_V4WT if MMU select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WT if MMU help A 32-bit RISC processor with 8kByte Cache, Write Buffer and @@ -46,6 +47,7 @@ config CPU_ARM740T select CPU_CACHE_V4 select CPU_CP15_MPU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE help A 32-bit RISC processor with 8KB cache or 4KB variants, write buffer and MPU(Protection Unit) built around @@ -79,6 +81,7 @@ config CPU_ARM920T select CPU_COPY_V4WB if MMU select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU help The ARM920T is licensed to be produced by numerous vendors, @@ -97,6 +100,7 @@ config CPU_ARM922T select CPU_COPY_V4WB if MMU select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU help The ARM922T is a version of the ARM920T, but with smaller @@ -116,6 +120,7 @@ config CPU_ARM925T select CPU_COPY_V4WB if MMU select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU help The ARM925T is a mix between the ARM920T and ARM926T, but with @@ -134,6 +139,7 @@ config CPU_ARM926T select CPU_COPY_V4WB if MMU select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU help This is a variant of the ARM920. It has slightly different @@ -170,6 +176,7 @@ config CPU_ARM940T select CPU_CACHE_VIVT select CPU_CP15_MPU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE help ARM940T is a member of the ARM9TDMI family of general- purpose microprocessors with MPU and separate 4KB @@ -188,6 +195,7 @@ config CPU_ARM946E select CPU_CACHE_VIVT select CPU_CP15_MPU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE help ARM946E-S is a member of the ARM9E-S family of high- performance, 32-bit system-on-chip processor solutions. @@ -206,6 +214,7 @@ config CPU_ARM1020 select CPU_COPY_V4WB if MMU select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU help The ARM1020 is the 32K cached version of the ARM10 processor, @@ -225,6 +234,7 @@ config CPU_ARM1020E select CPU_COPY_V4WB if MMU select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU # ARM1022E @@ -236,6 +246,7 @@ config CPU_ARM1022 select CPU_COPY_V4WB if MMU # can probably do better select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU help The ARM1022E is an implementation of the ARMv5TE architecture @@ -254,6 +265,7 @@ config CPU_ARM1026 select CPU_COPY_V4WB if MMU # can probably do better select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU help The ARM1026EJ-S is an implementation of the ARMv5TEJ architecture @@ -302,6 +314,7 @@ config CPU_XSCALE select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU # XScale Core Version 3 @@ -312,6 +325,7 @@ config CPU_XSC3 select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU select IO_36 @@ -324,6 +338,7 @@ config CPU_MOHAWK select CPU_COPY_V4WB if MMU select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_V4WBI if MMU # Feroceon @@ -335,6 +350,7 @@ config CPU_FEROCEON select CPU_COPY_FEROCEON if MMU select CPU_CP15_MMU select CPU_PABRT_LEGACY + select CPU_THUMB_CAPABLE select CPU_TLB_FEROCEON if MMU config CPU_FEROCEON_OLD_ID @@ -367,6 +383,7 @@ config CPU_V6 select CPU_CP15_MMU select CPU_HAS_ASID if MMU select CPU_PABRT_V6 + select CPU_THUMB_CAPABLE select CPU_TLB_V6 if MMU # ARMv6k @@ -381,6 +398,7 @@ config CPU_V6K select CPU_CP15_MMU select CPU_HAS_ASID if MMU select CPU_PABRT_V6 + select CPU_THUMB_CAPABLE select CPU_TLB_V6 if MMU # ARMv7 @@ -396,6 +414,7 @@ config CPU_V7 select CPU_CP15_MPU if !MMU select CPU_HAS_ASID if MMU select CPU_PABRT_V7 + select CPU_THUMB_CAPABLE select CPU_TLB_V7 if MMU # ARMv7M @@ -410,11 +429,17 @@ config CPU_V7M config CPU_THUMBONLY bool + select CPU_THUMB_CAPABLE # There are no CPUs available with MMU that don't implement an ARM ISA: depends on !MMU help Select this if your CPU doesn't support the 32 bit ARM instructions. +config CPU_THUMB_CAPABLE + bool + help + Select this if your CPU can support Thumb mode. + # Figure out what processor architecture version we should be using. # This defines the compiler instruction set which depends on the machine type. config CPU_32v3 @@ -655,11 +680,7 @@ config ARCH_DMA_ADDR_T_64BIT config ARM_THUMB bool "Support Thumb user binaries" if !CPU_THUMBONLY - depends on CPU_ARM720T || CPU_ARM740T || CPU_ARM920T || CPU_ARM922T || \ - CPU_ARM925T || CPU_ARM926T || CPU_ARM940T || CPU_ARM946E || \ - CPU_ARM1020 || CPU_ARM1020E || CPU_ARM1022 || CPU_ARM1026 || \ - CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_V6 || CPU_V6K || \ - CPU_V7 || CPU_FEROCEON || CPU_V7M + depends on CPU_THUMB_CAPABLE default y help Say Y if you want to include kernel support for running user space diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index e8698241ece9..b3dea80715b4 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -14,6 +14,7 @@ endif obj-$(CONFIG_ARM_PTDUMP) += dump.o obj-$(CONFIG_MODULES) += proc-syms.o +obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/arm/mm/cache-uniphier.c b/arch/arm/mm/cache-uniphier.c index dfe97b409916..f57b080b6fd4 100644 --- a/arch/arm/mm/cache-uniphier.c +++ b/arch/arm/mm/cache-uniphier.c @@ -15,6 +15,7 @@ #define pr_fmt(fmt) "uniphier: " fmt +#include <linux/bitops.h> #include <linux/init.h> #include <linux/io.h> #include <linux/log2.h> @@ -71,8 +72,7 @@ * @ctrl_base: virtual base address of control registers * @rev_base: virtual base address of revision registers * @op_base: virtual base address of operation registers - * @way_present_mask: each bit specifies if the way is present - * @way_locked_mask: each bit specifies if the way is locked + * @way_mask: each bit specifies if the way is present * @nsets: number of associativity sets * @line_size: line size in bytes * @range_op_max_size: max size that can be handled by a single range operation @@ -83,8 +83,7 @@ struct uniphier_cache_data { void __iomem *rev_base; void __iomem *op_base; void __iomem *way_ctrl_base; - u32 way_present_mask; - u32 way_locked_mask; + u32 way_mask; u32 nsets; u32 line_size; u32 range_op_max_size; @@ -234,17 +233,13 @@ static void __uniphier_cache_enable(struct uniphier_cache_data *data, bool on) writel_relaxed(val, data->ctrl_base + UNIPHIER_SSCC); } -static void __init __uniphier_cache_set_locked_ways( - struct uniphier_cache_data *data, - u32 way_mask) +static void __init __uniphier_cache_set_active_ways( + struct uniphier_cache_data *data) { unsigned int cpu; - data->way_locked_mask = way_mask & data->way_present_mask; - for_each_possible_cpu(cpu) - writel_relaxed(~data->way_locked_mask & data->way_present_mask, - data->way_ctrl_base + 4 * cpu); + writel_relaxed(data->way_mask, data->way_ctrl_base + 4 * cpu); } static void uniphier_cache_maint_range(unsigned long start, unsigned long end, @@ -307,7 +302,7 @@ static void __init uniphier_cache_enable(void) list_for_each_entry(data, &uniphier_cache_list, list) { __uniphier_cache_enable(data, true); - __uniphier_cache_set_locked_ways(data, 0); + __uniphier_cache_set_active_ways(data); } } @@ -382,8 +377,8 @@ static int __init __uniphier_cache_init(struct device_node *np, goto err; } - data->way_present_mask = - ((u32)1 << cache_size / data->nsets / data->line_size) - 1; + data->way_mask = GENMASK(cache_size / data->nsets / data->line_size - 1, + 0); data->ctrl_base = of_iomap(np, 0); if (!data->ctrl_base) { diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index e309a5e2c935..63eabb06f9f1 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -870,6 +870,9 @@ static int __arm_dma_mmap(struct device *dev, struct vm_area_struct *vma, vma->vm_end - vma->vm_start, vma->vm_page_prot); } +#else + ret = vm_iomap_memory(vma, vma->vm_start, + (vma->vm_end - vma->vm_start)); #endif /* CONFIG_MMU */ return ret; diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index 9fe8e241335c..21192d6eda40 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -18,6 +18,7 @@ #include <linux/seq_file.h> #include <asm/fixmap.h> +#include <asm/memory.h> #include <asm/pgtable.h> struct addr_marker { @@ -31,8 +32,8 @@ static struct addr_marker address_markers[] = { { 0, "vmalloc() Area" }, { VMALLOC_END, "vmalloc() End" }, { FIXADDR_START, "Fixmap Area" }, - { CONFIG_VECTORS_BASE, "Vectors" }, - { CONFIG_VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" }, + { VECTORS_BASE, "Vectors" }, + { VECTORS_BASE + PAGE_SIZE * 2, "Vectors End" }, { -1, NULL }, }; diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 3cced8455727..f1e6190aa7ea 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -327,6 +327,12 @@ void flush_dcache_page(struct page *page) if (page == ZERO_PAGE(0)) return; + if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) { + if (test_bit(PG_dcache_clean, &page->flags)) + clear_bit(PG_dcache_clean, &page->flags); + return; + } + mapping = page_mapping(page); if (!cache_ops_need_broadcast() && diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 4be0bee4c357..bf4d3bc41a7a 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -27,6 +27,7 @@ #include <asm/cp15.h> #include <asm/mach-types.h> #include <asm/memblock.h> +#include <asm/memory.h> #include <asm/prom.h> #include <asm/sections.h> #include <asm/setup.h> @@ -227,41 +228,59 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align) return phys; } -void __init arm_memblock_init(const struct machine_desc *mdesc) +static void __init arm_initrd_init(void) { - /* Register the kernel text, kernel data and initrd with memblock. */ -#ifdef CONFIG_XIP_KERNEL - memblock_reserve(__pa(_sdata), _end - _sdata); -#else - memblock_reserve(__pa(_stext), _end - _stext); -#endif #ifdef CONFIG_BLK_DEV_INITRD + phys_addr_t start; + unsigned long size; + /* FDT scan will populate initrd_start */ if (initrd_start && !phys_initrd_size) { phys_initrd_start = __virt_to_phys(initrd_start); phys_initrd_size = initrd_end - initrd_start; } + initrd_start = initrd_end = 0; - if (phys_initrd_size && - !memblock_is_region_memory(phys_initrd_start, phys_initrd_size)) { + + if (!phys_initrd_size) + return; + + /* + * Round the memory region to page boundaries as per free_initrd_mem() + * This allows us to detect whether the pages overlapping the initrd + * are in use, but more importantly, reserves the entire set of pages + * as we don't want these pages allocated for other purposes. + */ + start = round_down(phys_initrd_start, PAGE_SIZE); + size = phys_initrd_size + (phys_initrd_start - start); + size = round_up(size, PAGE_SIZE); + + if (!memblock_is_region_memory(start, size)) { pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region - disabling initrd\n", - (u64)phys_initrd_start, phys_initrd_size); - phys_initrd_start = phys_initrd_size = 0; + (u64)start, size); + return; } - if (phys_initrd_size && - memblock_is_region_reserved(phys_initrd_start, phys_initrd_size)) { + + if (memblock_is_region_reserved(start, size)) { pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region - disabling initrd\n", - (u64)phys_initrd_start, phys_initrd_size); - phys_initrd_start = phys_initrd_size = 0; + (u64)start, size); + return; } - if (phys_initrd_size) { - memblock_reserve(phys_initrd_start, phys_initrd_size); - /* Now convert initrd to virtual addresses */ - initrd_start = __phys_to_virt(phys_initrd_start); - initrd_end = initrd_start + phys_initrd_size; - } + memblock_reserve(start, size); + + /* Now convert initrd to virtual addresses */ + initrd_start = __phys_to_virt(phys_initrd_start); + initrd_end = initrd_start + phys_initrd_size; #endif +} + +void __init arm_memblock_init(const struct machine_desc *mdesc) +{ + /* Register the kernel text, kernel data and initrd with memblock. */ + memblock_reserve(__pa(KERNEL_START), KERNEL_END - KERNEL_START); + + arm_initrd_init(); arm_mm_memblock_reserve(); @@ -521,8 +540,7 @@ void __init mem_init(void) " .data : 0x%p" " - 0x%p" " (%4td kB)\n" " .bss : 0x%p" " - 0x%p" " (%4td kB)\n", - MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) + - (PAGE_SIZE)), + MLK(VECTORS_BASE, VECTORS_BASE + PAGE_SIZE), #ifdef CONFIG_HAVE_TCM MLK(DTCM_OFFSET, (unsigned long) dtcm_end), MLK(ITCM_OFFSET, (unsigned long) itcm_end), diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 4001dd15818d..4e016d7f37b3 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1152,13 +1152,12 @@ early_param("vmalloc", early_vmalloc); phys_addr_t arm_lowmem_limit __initdata = 0; -void __init sanity_check_meminfo(void) +void __init adjust_lowmem_bounds(void) { phys_addr_t memblock_limit = 0; - int highmem = 0; u64 vmalloc_limit; struct memblock_region *reg; - bool should_use_highmem = false; + phys_addr_t lowmem_limit = 0; /* * Let's use our own (unoptimized) equivalent of __pa() that is @@ -1172,43 +1171,18 @@ void __init sanity_check_meminfo(void) for_each_memblock(memory, reg) { phys_addr_t block_start = reg->base; phys_addr_t block_end = reg->base + reg->size; - phys_addr_t size_limit = reg->size; - if (reg->base >= vmalloc_limit) - highmem = 1; - else - size_limit = vmalloc_limit - reg->base; - - - if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) { - - if (highmem) { - pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n", - &block_start, &block_end); - memblock_remove(reg->base, reg->size); - should_use_highmem = true; - continue; - } - - if (reg->size > size_limit) { - phys_addr_t overlap_size = reg->size - size_limit; - - pr_notice("Truncating RAM at %pa-%pa", - &block_start, &block_end); - block_end = vmalloc_limit; - pr_cont(" to -%pa", &block_end); - memblock_remove(vmalloc_limit, overlap_size); - should_use_highmem = true; - } - } - - if (!highmem) { - if (block_end > arm_lowmem_limit) { - if (reg->size > size_limit) - arm_lowmem_limit = vmalloc_limit; - else - arm_lowmem_limit = block_end; - } + if (reg->base < vmalloc_limit) { + if (block_end > lowmem_limit) + /* + * Compare as u64 to ensure vmalloc_limit does + * not get truncated. block_end should always + * fit in phys_addr_t so there should be no + * issue with assignment. + */ + lowmem_limit = min_t(u64, + vmalloc_limit, + block_end); /* * Find the first non-pmd-aligned page, and point @@ -1227,14 +1201,13 @@ void __init sanity_check_meminfo(void) if (!IS_ALIGNED(block_start, PMD_SIZE)) memblock_limit = block_start; else if (!IS_ALIGNED(block_end, PMD_SIZE)) - memblock_limit = arm_lowmem_limit; + memblock_limit = lowmem_limit; } } } - if (should_use_highmem) - pr_notice("Consider using a HIGHMEM enabled kernel.\n"); + arm_lowmem_limit = lowmem_limit; high_memory = __va(arm_lowmem_limit - 1) + 1; @@ -1248,6 +1221,18 @@ void __init sanity_check_meminfo(void) if (!memblock_limit) memblock_limit = arm_lowmem_limit; + if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) { + if (memblock_end_of_DRAM() > arm_lowmem_limit) { + phys_addr_t end = memblock_end_of_DRAM(); + + pr_notice("Ignoring RAM at %pa-%pa\n", + &memblock_limit, &end); + pr_notice("Consider using a HIGHMEM enabled kernel.\n"); + + memblock_remove(memblock_limit, end - memblock_limit); + } + } + memblock_set_current_limit(memblock_limit); } @@ -1437,11 +1422,7 @@ static void __init kmap_init(void) static void __init map_lowmem(void) { struct memblock_region *reg; -#ifdef CONFIG_XIP_KERNEL - phys_addr_t kernel_x_start = round_down(__pa(_sdata), SECTION_SIZE); -#else - phys_addr_t kernel_x_start = round_down(__pa(_stext), SECTION_SIZE); -#endif + phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE); phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); /* Map all the lowmem memory banks. */ diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 2740967727e2..3b5c7aaf9c76 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <asm/cacheflush.h> +#include <asm/cp15.h> #include <asm/sections.h> #include <asm/page.h> #include <asm/setup.h> @@ -22,6 +23,8 @@ #include "mm.h" +unsigned long vectors_base; + #ifdef CONFIG_ARM_MPU struct mpu_rgn_info mpu_rgn_info; @@ -85,7 +88,7 @@ static unsigned long irbar_read(void) } /* MPU initialisation functions */ -void __init sanity_check_meminfo_mpu(void) +void __init adjust_lowmem_bounds_mpu(void) { phys_addr_t phys_offset = PHYS_OFFSET; phys_addr_t aligned_region_size, specified_mem_size, rounded_mem_size; @@ -274,19 +277,64 @@ void __init mpu_setup(void) } } #else -static void sanity_check_meminfo_mpu(void) {} +static void adjust_lowmem_bounds_mpu(void) {} static void __init mpu_setup(void) {} #endif /* CONFIG_ARM_MPU */ +#ifdef CONFIG_CPU_CP15 +#ifdef CONFIG_CPU_HIGH_VECTOR +static unsigned long __init setup_vectors_base(void) +{ + unsigned long reg = get_cr(); + + set_cr(reg | CR_V); + return 0xffff0000; +} +#else /* CONFIG_CPU_HIGH_VECTOR */ +/* Write exception base address to VBAR */ +static inline void set_vbar(unsigned long val) +{ + asm("mcr p15, 0, %0, c12, c0, 0" : : "r" (val) : "cc"); +} + +/* + * Security extensions, bits[7:4], permitted values, + * 0b0000 - not implemented, 0b0001/0b0010 - implemented + */ +static inline bool security_extensions_enabled(void) +{ + return !!cpuid_feature_extract(CPUID_EXT_PFR1, 4); +} + +static unsigned long __init setup_vectors_base(void) +{ + unsigned long base = 0, reg = get_cr(); + + set_cr(reg & ~CR_V); + if (security_extensions_enabled()) { + if (IS_ENABLED(CONFIG_REMAP_VECTORS_TO_RAM)) + base = CONFIG_DRAM_BASE; + set_vbar(base); + } else if (IS_ENABLED(CONFIG_REMAP_VECTORS_TO_RAM)) { + if (CONFIG_DRAM_BASE != 0) + pr_err("Security extensions not enabled, vectors cannot be remapped to RAM, vectors base will be 0x00000000\n"); + } + + return base; +} +#endif /* CONFIG_CPU_HIGH_VECTOR */ +#endif /* CONFIG_CPU_CP15 */ + void __init arm_mm_memblock_reserve(void) { #ifndef CONFIG_CPU_V7M + vectors_base = IS_ENABLED(CONFIG_CPU_CP15) ? setup_vectors_base() : 0; /* * Register the exception vector page. * some architectures which the DRAM is the exception vector to trap, * alloc_page breaks with error, although it is not NULL, but "0." */ - memblock_reserve(CONFIG_VECTORS_BASE, 2 * PAGE_SIZE); + memblock_reserve(vectors_base, 2 * PAGE_SIZE); #else /* ifndef CONFIG_CPU_V7M */ /* * There is no dedicated vector page on V7-M. So nothing needs to be @@ -295,10 +343,10 @@ void __init arm_mm_memblock_reserve(void) #endif } -void __init sanity_check_meminfo(void) +void __init adjust_lowmem_bounds(void) { phys_addr_t end; - sanity_check_meminfo_mpu(); + adjust_lowmem_bounds_mpu(); end = memblock_end_of_DRAM(); high_memory = __va(end - 1) + 1; memblock_set_current_limit(end); @@ -310,7 +358,7 @@ void __init sanity_check_meminfo(void) */ void __init paging_init(const struct machine_desc *mdesc) { - early_trap_init((void *)CONFIG_VECTORS_BASE); + early_trap_init((void *)vectors_base); mpu_setup(); bootmem_init(); } diff --git a/arch/arm/mm/physaddr.c b/arch/arm/mm/physaddr.c new file mode 100644 index 000000000000..02e60f495608 --- /dev/null +++ b/arch/arm/mm/physaddr.c @@ -0,0 +1,57 @@ +#include <linux/bug.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/mmdebug.h> +#include <linux/mm.h> + +#include <asm/sections.h> +#include <asm/memory.h> +#include <asm/fixmap.h> +#include <asm/dma.h> + +#include "mm.h" + +static inline bool __virt_addr_valid(unsigned long x) +{ + /* + * high_memory does not get immediately defined, and there + * are early callers of __pa() against PAGE_OFFSET + */ + if (!high_memory && x >= PAGE_OFFSET) + return true; + + if (high_memory && x >= PAGE_OFFSET && x < (unsigned long)high_memory) + return true; + + /* + * MAX_DMA_ADDRESS is a virtual address that may not correspond to an + * actual physical address. Enough code relies on __pa(MAX_DMA_ADDRESS) + * that we just need to work around it and always return true. + */ + if (x == MAX_DMA_ADDRESS) + return true; + + return false; +} + +phys_addr_t __virt_to_phys(unsigned long x) +{ + WARN(!__virt_addr_valid(x), + "virt_to_phys used for non-linear address: %pK (%pS)\n", + (void *)x, (void *)x); + + return __virt_to_phys_nodebug(x); +} +EXPORT_SYMBOL(__virt_to_phys); + +phys_addr_t __phys_addr_symbol(unsigned long x) +{ + /* This is bounds checking against the kernel image only. + * __pa_symbol should only be used on kernel symbol addresses. + */ + VIRTUAL_BUG_ON(x < (unsigned long)KERNEL_START || + x > (unsigned long)KERNEL_END); + + return __pa_symbol_nodebug(x); +} +EXPORT_SYMBOL(__phys_addr_symbol); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8567c851172c..4261b3282ad9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1865,14 +1865,14 @@ static void __smp_spurious_interrupt(u8 vector) "should never happen.\n", vector, smp_processor_id()); } -__visible void smp_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); __smp_spurious_interrupt(~regs->orig_ax); exiting_irq(); } -__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) { u8 vector = ~regs->orig_ax; @@ -1923,14 +1923,14 @@ static void __smp_error_interrupt(struct pt_regs *regs) } -__visible void smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { entering_irq(); __smp_error_interrupt(regs); exiting_irq(); } -__visible void smp_trace_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 5d30c5e42bb1..f3557a1eb562 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -559,7 +559,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) __send_cleanup_vector(data); } -asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9e5427df3243..524cc5780a77 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -816,14 +816,14 @@ static inline void __smp_deferred_error_interrupt(void) deferred_error_int_vector(); } -asmlinkage __visible void smp_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); __smp_deferred_error_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) { entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 85469f84c921..d7cc190ae457 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -396,14 +396,16 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry +smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 9beb092d68a5..bb0e75eed10a 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -23,14 +23,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage __visible void smp_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage __visible void smp_trace_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7c6e9ffe4424..4d8183b5f113 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -264,7 +264,7 @@ void __smp_x86_platform_ipi(void) x86_platform_ipi_callback(); } -__visible void smp_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -315,7 +315,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) } #endif -__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) +__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 3512ba607361..275487872be2 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,6 +9,7 @@ #include <linux/hardirq.h> #include <asm/apic.h> #include <asm/trace/irq_vectors.h> +#include <linux/interrupt.h> static inline void __smp_irq_work_interrupt(void) { @@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void) irq_work_run(); } -__visible void smp_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } -__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 69780edf0dde..4bf0c8926a1c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -575,7 +575,9 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { /* - * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX + * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, + * as old kexec-tools loads bzImage below that, unless + * "crashkernel=size[KMG],high" is specified. */ crash_base = memblock_find_in_range(CRASH_ALIGN, high ? CRASH_ADDR_HIGH_MAX diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 68f8cc222f25..d3c66a15bbde 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -259,7 +259,7 @@ static inline void __smp_reschedule_interrupt(void) scheduler_ipi(); } -__visible void smp_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __smp_reschedule_interrupt(); @@ -268,7 +268,7 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* * Need to call irq_enter() before calling the trace point. @@ -292,14 +292,15 @@ static inline void __smp_call_function_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); @@ -314,14 +315,16 @@ static inline void __smp_call_function_single_interrupt(void) inc_irq_stat(irq_call_count); } -__visible void smp_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } -__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry +smp_trace_call_function_single_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e79f15f108a8..ad0118fbce90 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -346,6 +346,7 @@ SECTIONS /DISCARD/ : { *(.eh_frame) *(__func_stack_frame_non_standard) + *(__unreachable) } } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 362cecc77130..4d6807723798 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -123,9 +123,11 @@ static int atomic_dec_return_safe(atomic_t *v) #define RBD_FEATURE_LAYERING (1<<0) #define RBD_FEATURE_STRIPINGV2 (1<<1) #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2) +#define RBD_FEATURE_DATA_POOL (1<<7) #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ RBD_FEATURE_STRIPINGV2 | \ - RBD_FEATURE_EXCLUSIVE_LOCK) + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_DATA_POOL) /* Features supported by this (client software) implementation. */ @@ -144,10 +146,9 @@ struct rbd_image_header { /* These six fields never change for a given rbd image */ char *object_prefix; __u8 obj_order; - __u8 crypt_type; - __u8 comp_type; u64 stripe_unit; u64 stripe_count; + s64 data_pool_id; u64 features; /* Might be changeable someday? */ /* The remaining fields need to be updated occasionally */ @@ -230,7 +231,7 @@ enum obj_req_flags { }; struct rbd_obj_request { - const char *object_name; + u64 object_no; u64 offset; /* object start byte */ u64 length; /* bytes from offset */ unsigned long flags; @@ -438,7 +439,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; -static struct kmem_cache *rbd_segment_name_cache; static int rbd_major; static DEFINE_IDA(rbd_dev_id_ida); @@ -973,6 +973,30 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) } /* + * returns the size of an object in the image + */ +static u32 rbd_obj_bytes(struct rbd_image_header *header) +{ + return 1U << header->obj_order; +} + +static void rbd_init_layout(struct rbd_device *rbd_dev) +{ + if (rbd_dev->header.stripe_unit == 0 || + rbd_dev->header.stripe_count == 0) { + rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); + rbd_dev->header.stripe_count = 1; + } + + rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; + rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; + rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); + rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? + rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; + RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); +} + +/* * Fill an rbd image header with information from the given format 1 * on-disk header. */ @@ -992,15 +1016,11 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, /* Allocate this now to avoid having to handle failure below */ if (first_time) { - size_t len; - - len = strnlen(ondisk->object_prefix, - sizeof (ondisk->object_prefix)); - object_prefix = kmalloc(len + 1, GFP_KERNEL); + object_prefix = kstrndup(ondisk->object_prefix, + sizeof(ondisk->object_prefix), + GFP_KERNEL); if (!object_prefix) return -ENOMEM; - memcpy(object_prefix, ondisk->object_prefix, len); - object_prefix[len] = '\0'; } /* Allocate the snapshot context and fill it in */ @@ -1051,12 +1071,7 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, if (first_time) { header->object_prefix = object_prefix; header->obj_order = ondisk->options.order; - header->crypt_type = ondisk->options.crypt_type; - header->comp_type = ondisk->options.comp_type; - /* The rest aren't used for format 1 images */ - header->stripe_unit = 0; - header->stripe_count = 0; - header->features = 0; + rbd_init_layout(rbd_dev); } else { ceph_put_snap_context(header->snapc); kfree(header->snap_names); @@ -1232,42 +1247,9 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.features = 0; } -static void rbd_segment_name_free(const char *name) -{ - /* The explicit cast here is needed to drop the const qualifier */ - - kmem_cache_free(rbd_segment_name_cache, (void *)name); -} - -static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) -{ - char *name; - u64 segment; - int ret; - char *name_format; - - name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); - if (!name) - return NULL; - segment = offset >> rbd_dev->header.obj_order; - name_format = "%s.%012llx"; - if (rbd_dev->image_format == 2) - name_format = "%s.%016llx"; - ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, - rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { - pr_err("error formatting segment name for #%llu (%d)\n", - segment, ret); - rbd_segment_name_free(name); - name = NULL; - } - - return name; -} - static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) { - u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; + u64 segment_size = rbd_obj_bytes(&rbd_dev->header); return offset & (segment_size - 1); } @@ -1275,7 +1257,7 @@ static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) static u64 rbd_segment_length(struct rbd_device *rbd_dev, u64 offset, u64 length) { - u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; + u64 segment_size = rbd_obj_bytes(&rbd_dev->header); offset &= segment_size - 1; @@ -1287,14 +1269,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, } /* - * returns the size of an object in the image - */ -static u64 rbd_obj_bytes(struct rbd_image_header *header) -{ - return 1 << header->obj_order; -} - -/* * bio helpers */ @@ -1623,7 +1597,9 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; - dout("%s %p osd_req %p\n", __func__, obj_request, osd_req); + dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, + obj_request, obj_request->object_no, obj_request->offset, + obj_request->length, osd_req); if (obj_request_img_data_test(obj_request)) { WARN_ON(obj_request->callback != rbd_img_obj_callback); rbd_img_request_get(obj_request->img_request); @@ -1631,44 +1607,6 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); } -static void rbd_obj_request_end(struct rbd_obj_request *obj_request) -{ - dout("%s %p\n", __func__, obj_request); - ceph_osdc_cancel_request(obj_request->osd_req); -} - -/* - * Wait for an object request to complete. If interrupted, cancel the - * underlying osd request. - * - * @timeout: in jiffies, 0 means "wait forever" - */ -static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, - unsigned long timeout) -{ - long ret; - - dout("%s %p\n", __func__, obj_request); - ret = wait_for_completion_interruptible_timeout( - &obj_request->completion, - ceph_timeout_jiffies(timeout)); - if (ret <= 0) { - if (ret == 0) - ret = -ETIMEDOUT; - rbd_obj_request_end(obj_request); - } else { - ret = 0; - } - - dout("%s %p ret %d\n", __func__, obj_request, (int)ret); - return ret; -} - -static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) -{ - return __rbd_obj_request_wait(obj_request, 0); -} - static void rbd_img_request_complete(struct rbd_img_request *img_request) { @@ -1955,8 +1893,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) rbd_osd_call_callback(obj_request); break; default: - rbd_warn(NULL, "%s: unsupported op %hu", - obj_request->object_name, (unsigned short) opcode); + rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d", + obj_request->object_no, opcode); break; } @@ -1980,6 +1918,40 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) osd_req->r_data_offset = obj_request->offset; } +static struct ceph_osd_request * +__rbd_osd_req_create(struct rbd_device *rbd_dev, + struct ceph_snap_context *snapc, + int num_ops, unsigned int flags, + struct rbd_obj_request *obj_request) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_osd_request *req; + const char *name_format = rbd_dev->image_format == 1 ? + RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; + + req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); + if (!req) + return NULL; + + req->r_flags = flags; + req->r_callback = rbd_osd_req_callback; + req->r_priv = obj_request; + + req->r_base_oloc.pool = rbd_dev->layout.pool_id; + if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, + rbd_dev->header.object_prefix, obj_request->object_no)) + goto err_req; + + if (ceph_osdc_alloc_messages(req, GFP_NOIO)) + goto err_req; + + return req; + +err_req: + ceph_osdc_put_request(req); + return NULL; +} + /* * Create an osd request. A read request has one osd op (read). * A write request has either one (watch) or two (hint+write) osd ops. @@ -1993,8 +1965,6 @@ static struct ceph_osd_request *rbd_osd_req_create( struct rbd_obj_request *obj_request) { struct ceph_snap_context *snapc = NULL; - struct ceph_osd_client *osdc; - struct ceph_osd_request *osd_req; if (obj_request_img_data_test(obj_request) && (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { @@ -2009,35 +1979,9 @@ static struct ceph_osd_request *rbd_osd_req_create( rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); - /* Allocate and initialize the request, for the num_ops ops */ - - osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, - GFP_NOIO); - if (!osd_req) - goto fail; - - if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) - osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; - else - osd_req->r_flags = CEPH_OSD_FLAG_READ; - - osd_req->r_callback = rbd_osd_req_callback; - osd_req->r_priv = obj_request; - - osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; - if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", - obj_request->object_name)) - goto fail; - - if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) - goto fail; - - return osd_req; - -fail: - ceph_osdc_put_request(osd_req); - return NULL; + return __rbd_osd_req_create(rbd_dev, snapc, num_ops, + (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ? + CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request); } /* @@ -2050,10 +1994,6 @@ static struct ceph_osd_request * rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; - struct ceph_snap_context *snapc; - struct rbd_device *rbd_dev; - struct ceph_osd_client *osdc; - struct ceph_osd_request *osd_req; int num_osd_ops = 3; rbd_assert(obj_request_img_data_test(obj_request)); @@ -2065,77 +2005,34 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) if (img_request_discard_test(img_request)) num_osd_ops = 2; - /* Allocate and initialize the request, for all the ops */ - - snapc = img_request->snapc; - rbd_dev = img_request->rbd_dev; - osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, - false, GFP_NOIO); - if (!osd_req) - goto fail; - - osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; - osd_req->r_callback = rbd_osd_req_callback; - osd_req->r_priv = obj_request; - - osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; - if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", - obj_request->object_name)) - goto fail; - - if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) - goto fail; - - return osd_req; - -fail: - ceph_osdc_put_request(osd_req); - return NULL; + return __rbd_osd_req_create(img_request->rbd_dev, + img_request->snapc, num_osd_ops, + CEPH_OSD_FLAG_WRITE, obj_request); } - static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) { ceph_osdc_put_request(osd_req); } -/* object_name is assumed to be a non-null pointer and NUL-terminated */ - -static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, - u64 offset, u64 length, - enum obj_request_type type) +static struct rbd_obj_request * +rbd_obj_request_create(enum obj_request_type type) { struct rbd_obj_request *obj_request; - size_t size; - char *name; rbd_assert(obj_request_type_valid(type)); - size = strlen(object_name) + 1; - name = kmalloc(size, GFP_NOIO); - if (!name) - return NULL; - obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); - if (!obj_request) { - kfree(name); + if (!obj_request) return NULL; - } - obj_request->object_name = memcpy(name, object_name, size); - obj_request->offset = offset; - obj_request->length = length; - obj_request->flags = 0; obj_request->which = BAD_WHICH; obj_request->type = type; INIT_LIST_HEAD(&obj_request->links); init_completion(&obj_request->completion); kref_init(&obj_request->kref); - dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, - offset, length, (int)type, obj_request); - + dout("%s %p\n", __func__, obj_request); return obj_request; } @@ -2170,8 +2067,6 @@ static void rbd_obj_request_destroy(struct kref *kref) break; } - kfree(obj_request->object_name); - obj_request->object_name = NULL; kmem_cache_free(rbd_obj_request_cache, obj_request); } @@ -2546,22 +2441,18 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, while (resid) { struct ceph_osd_request *osd_req; - const char *object_name; - u64 offset; - u64 length; + u64 object_no = img_offset >> rbd_dev->header.obj_order; + u64 offset = rbd_segment_offset(rbd_dev, img_offset); + u64 length = rbd_segment_length(rbd_dev, img_offset, resid); - object_name = rbd_segment_name(rbd_dev, img_offset); - if (!object_name) - goto out_unwind; - offset = rbd_segment_offset(rbd_dev, img_offset); - length = rbd_segment_length(rbd_dev, img_offset, resid); - obj_request = rbd_obj_request_create(object_name, - offset, length, type); - /* object request has its own copy of the object name */ - rbd_segment_name_free(object_name); + obj_request = rbd_obj_request_create(type); if (!obj_request) goto out_unwind; + obj_request->object_no = object_no; + obj_request->offset = offset; + obj_request->length = length; + /* * set obj_request->img_request before creating the * osd_request so that it gets the right snapc @@ -2771,7 +2662,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) * child image to which the original request was to be sent. */ img_offset = obj_request->img_offset - obj_request->offset; - length = (u64)1 << rbd_dev->header.obj_order; + length = rbd_obj_bytes(&rbd_dev->header); /* * There is no defined parent data beyond the parent @@ -2900,11 +2791,12 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) size_t size; int ret; - stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, - OBJ_REQUEST_PAGES); + stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES); if (!stat_request) return -ENOMEM; + stat_request->object_no = obj_request->object_no; + stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, stat_request); if (!stat_request->osd_req) { @@ -3983,17 +3875,17 @@ out: * returned in the outbound buffer, or a negative error code. */ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, - const char *object_name, - const char *class_name, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, const char *method_name, const void *outbound, size_t outbound_size, void *inbound, size_t inbound_size) { - struct rbd_obj_request *obj_request; - struct page **pages; - u32 page_count; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct page *req_page = NULL; + struct page *reply_page; int ret; /* @@ -4003,61 +3895,35 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, * method. Currently if this is present it will be a * snapshot id. */ - page_count = (u32)calc_pages_for(0, inbound_size); - pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = -ENOMEM; - obj_request = rbd_obj_request_create(object_name, 0, inbound_size, - OBJ_REQUEST_PAGES); - if (!obj_request) - goto out; + if (outbound) { + if (outbound_size > PAGE_SIZE) + return -E2BIG; - obj_request->pages = pages; - obj_request->page_count = page_count; - - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, - obj_request); - if (!obj_request->osd_req) - goto out; - - osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, - class_name, method_name); - if (outbound_size) { - struct ceph_pagelist *pagelist; - - pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); - if (!pagelist) - goto out; + req_page = alloc_page(GFP_KERNEL); + if (!req_page) + return -ENOMEM; - ceph_pagelist_init(pagelist); - ceph_pagelist_append(pagelist, outbound, outbound_size); - osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, - pagelist); + memcpy(page_address(req_page), outbound, outbound_size); } - osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, - obj_request->pages, inbound_size, - 0, false, false); - - rbd_obj_request_submit(obj_request); - ret = rbd_obj_request_wait(obj_request); - if (ret) - goto out; - ret = obj_request->result; - if (ret < 0) - goto out; + reply_page = alloc_page(GFP_KERNEL); + if (!reply_page) { + if (req_page) + __free_page(req_page); + return -ENOMEM; + } - rbd_assert(obj_request->xferred < (u64)INT_MAX); - ret = (int)obj_request->xferred; - ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); -out: - if (obj_request) - rbd_obj_request_put(obj_request); - else - ceph_release_page_vector(pages, page_count); + ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, + CEPH_OSD_FLAG_READ, req_page, outbound_size, + reply_page, &inbound_size); + if (!ret) { + memcpy(inbound, page_address(reply_page), inbound_size); + ret = inbound_size; + } + if (req_page) + __free_page(req_page); + __free_page(reply_page); return ret; } @@ -4256,63 +4122,46 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) } static int rbd_obj_read_sync(struct rbd_device *rbd_dev, - const char *object_name, - u64 offset, u64 length, void *buf) + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *buf, int buf_len) { - struct rbd_obj_request *obj_request; - struct page **pages = NULL; - u32 page_count; - size_t size; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_osd_request *req; + struct page **pages; + int num_pages = calc_pages_for(0, buf_len); int ret; - page_count = (u32) calc_pages_for(offset, length); - pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = -ENOMEM; - obj_request = rbd_obj_request_create(object_name, offset, length, - OBJ_REQUEST_PAGES); - if (!obj_request) - goto out; - - obj_request->pages = pages; - obj_request->page_count = page_count; - - obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, - obj_request); - if (!obj_request->osd_req) - goto out; + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); + if (!req) + return -ENOMEM; - osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, - offset, length, 0, 0); - osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, - obj_request->pages, - obj_request->length, - obj_request->offset & ~PAGE_MASK, - false, false); + ceph_oid_copy(&req->r_base_oid, oid); + ceph_oloc_copy(&req->r_base_oloc, oloc); + req->r_flags = CEPH_OSD_FLAG_READ; - rbd_obj_request_submit(obj_request); - ret = rbd_obj_request_wait(obj_request); + ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); if (ret) - goto out; + goto out_req; - ret = obj_request->result; - if (ret < 0) - goto out; + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out_req; + } - rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); - size = (size_t) obj_request->xferred; - ceph_copy_from_page_vector(pages, buf, 0, size); - rbd_assert(size <= (size_t)INT_MAX); - ret = (int)size; -out: - if (obj_request) - rbd_obj_request_put(obj_request); - else - ceph_release_page_vector(pages, page_count); + osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); + osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, + true); + + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + if (ret >= 0) + ceph_copy_from_page_vector(pages, buf, 0, ret); +out_req: + ceph_osdc_put_request(req); return ret; } @@ -4348,8 +4197,8 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) if (!ondisk) return -ENOMEM; - ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, - 0, size, ondisk); + ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, ondisk, size); if (ret < 0) goto out; if ((size_t)ret < size) { @@ -4781,7 +4630,7 @@ static const struct attribute_group *rbd_attr_groups[] = { static void rbd_dev_release(struct device *dev); -static struct device_type rbd_device_type = { +static const struct device_type rbd_device_type = { .name = "rbd", .groups = rbd_attr_groups, .release = rbd_dev_release, @@ -4876,8 +4725,9 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, INIT_LIST_HEAD(&rbd_dev->node); init_rwsem(&rbd_dev->header_rwsem); + rbd_dev->header.data_pool_id = CEPH_NOPOOL; ceph_oid_init(&rbd_dev->header_oid); - ceph_oloc_init(&rbd_dev->header_oloc); + rbd_dev->header_oloc.pool = spec->pool_id; mutex_init(&rbd_dev->watch_mutex); rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; @@ -4899,12 +4749,6 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, rbd_dev->rbd_client = rbdc; rbd_dev->spec = spec; - rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER; - rbd_dev->layout.stripe_count = 1; - rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER; - rbd_dev->layout.pool_id = spec->pool_id; - RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); - return rbd_dev; } @@ -4970,10 +4814,10 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, __le64 size; } __attribute__ ((packed)) size_buf = { 0 }; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_size", - &snapid, sizeof (snapid), - &size_buf, sizeof (size_buf)); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_size", + &snapid, sizeof(snapid), + &size_buf, sizeof(size_buf)); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -5010,9 +4854,9 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_object_prefix", NULL, 0, - reply_buf, RBD_OBJ_PREFIX_LEN_MAX); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_object_prefix", + NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -5045,10 +4889,10 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 unsup; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_features", - &snapid, sizeof (snapid), - &features_buf, sizeof (features_buf)); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_features", + &snapid, sizeof(snapid), + &features_buf, sizeof(features_buf)); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -5107,10 +4951,9 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } snapid = cpu_to_le64(rbd_dev->spec->snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_parent", - &snapid, sizeof (snapid), - reply_buf, size); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_parent", + &snapid, sizeof(snapid), reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out_err; @@ -5210,9 +5053,9 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) u64 stripe_count; int ret; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_stripe_unit_count", NULL, 0, - (char *)&striping_info_buf, size); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_stripe_unit_count", + NULL, 0, &striping_info_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -5226,7 +5069,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) * out, and only fail if the image has non-default values. */ ret = -EINVAL; - obj_size = (u64)1 << rbd_dev->header.obj_order; + obj_size = rbd_obj_bytes(&rbd_dev->header); p = &striping_info_buf; stripe_unit = ceph_decode_64(&p); if (stripe_unit != obj_size) { @@ -5247,8 +5090,27 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) return 0; } +static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) +{ + __le64 data_pool_id; + int ret; + + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_data_pool", + NULL, 0, &data_pool_id, sizeof(data_pool_id)); + if (ret < 0) + return ret; + if (ret < sizeof(data_pool_id)) + return -EBADMSG; + + rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); + WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); + return 0; +} + static char *rbd_dev_image_name(struct rbd_device *rbd_dev) { + CEPH_DEFINE_OID_ONSTACK(oid); size_t image_id_size; char *image_id; void *p; @@ -5276,10 +5138,10 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) if (!reply_buf) goto out; - ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, - "rbd", "dir_get_name", - image_id, image_id_size, - reply_buf, size); + ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); + ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, + "dir_get_name", image_id, image_id_size, + reply_buf, size); if (ret < 0) goto out; p = reply_buf; @@ -5458,9 +5320,9 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_snapcontext", NULL, 0, - reply_buf, size); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_snapcontext", + NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -5523,10 +5385,9 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, return ERR_PTR(-ENOMEM); snapid = cpu_to_le64(snap_id); - ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, - "rbd", "get_snapshot_name", - &snapid, sizeof (snapid), - reply_buf, size); + ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, + &rbd_dev->header_oloc, "get_snapshot_name", + &snapid, sizeof(snapid), reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) { snap_name = ERR_PTR(ret); @@ -5833,7 +5694,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) { int ret; size_t size; - char *object_name; + CEPH_DEFINE_OID_ONSTACK(oid); void *response; char *image_id; @@ -5853,12 +5714,12 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) * First, see if the format 2 image id file exists, and if * so, get the image's persistent id from it. */ - size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); - object_name = kmalloc(size, GFP_NOIO); - if (!object_name) - return -ENOMEM; - sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); - dout("rbd id object name is %s\n", object_name); + ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, + rbd_dev->spec->image_name); + if (ret) + return ret; + + dout("rbd id object name is %s\n", oid.name); /* Response will be an encoded string, which includes a length */ @@ -5871,9 +5732,9 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) /* If it doesn't exist we'll assume it's a format 1 image */ - ret = rbd_obj_method_sync(rbd_dev, object_name, - "rbd", "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX); + ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, + "get_id", NULL, 0, + response, RBD_IMAGE_ID_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret == -ENOENT) { image_id = kstrdup("", GFP_KERNEL); @@ -5896,8 +5757,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) } out: kfree(response); - kfree(object_name); - + ceph_oid_destroy(&oid); return ret; } @@ -5944,14 +5804,20 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; } - /* No support for crypto and compression type format 2 images */ + if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { + ret = rbd_dev_v2_data_pool(rbd_dev); + if (ret) + goto out_err; + } + + rbd_init_layout(rbd_dev); return 0; + out_err: rbd_dev->header.features = 0; kfree(rbd_dev->header.object_prefix); rbd_dev->header.object_prefix = NULL; - return ret; } @@ -6077,8 +5943,6 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) /* Record the header object name for this rbd image. */ rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); - - rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id; if (rbd_dev->image_format == 1) ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", spec->image_name, RBD_SUFFIX); @@ -6471,27 +6335,16 @@ static int rbd_slab_init(void) if (!rbd_obj_request_cache) goto out_err; - rbd_assert(!rbd_segment_name_cache); - rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", - CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); - if (rbd_segment_name_cache) - return 0; -out_err: - kmem_cache_destroy(rbd_obj_request_cache); - rbd_obj_request_cache = NULL; + return 0; +out_err: kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; - return -ENOMEM; } static void rbd_slab_exit(void) { - rbd_assert(rbd_segment_name_cache); - kmem_cache_destroy(rbd_segment_name_cache); - rbd_segment_name_cache = NULL; - rbd_assert(rbd_obj_request_cache); kmem_cache_destroy(rbd_obj_request_cache); rbd_obj_request_cache = NULL; diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index 94f367db27b0..62ff50d3e7a6 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -25,8 +25,8 @@ */ #define RBD_HEADER_PREFIX "rbd_header." -#define RBD_DATA_PREFIX "rbd_data." #define RBD_ID_PREFIX "rbd_id." +#define RBD_V2_DATA_FORMAT "%s.%016llx" #define RBD_LOCK_NAME "rbd_lock" #define RBD_LOCK_TAG "internal" @@ -42,13 +42,14 @@ enum rbd_notify_op { /* * For format version 1, rbd image 'foo' consists of objects * foo.rbd - image metadata - * rb.<idhi>.<idlo>.00000000 - * rb.<idhi>.<idlo>.00000001 + * rb.<idhi>.<idlo>.<extra>.000000000000 + * rb.<idhi>.<idlo>.<extra>.000000000001 * ... - data * There is no notion of a persistent image id in rbd format 1. */ #define RBD_SUFFIX ".rbd" +#define RBD_V1_DATA_FORMAT "%s.%012llx" #define RBD_DIRECTORY "rbd_directory" #define RBD_INFO "rbd_info" @@ -57,9 +58,6 @@ enum rbd_notify_op { #define RBD_MIN_OBJ_ORDER 16 #define RBD_MAX_OBJ_ORDER 30 -#define RBD_COMP_NONE 0 -#define RBD_CRYPT_NONE 0 - #define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" #define RBD_HEADER_SIGNATURE "RBD" #define RBD_HEADER_VERSION "001.005" diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index 46427ea01753..157f2d1fb7e1 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -300,7 +300,7 @@ static const struct ide_port_ops palm_bk3710_ports_ops = { .cable_detect = palm_bk3710_cable_detect, }; -static struct ide_port_info palm_bk3710_port_info = { +static struct ide_port_info palm_bk3710_port_info __initdata = { .init_dma = palm_bk3710_init_dma, .port_ops = &palm_bk3710_ports_ops, .dma_ops = &sff_dma_ops, diff --git a/drivers/mtd/devices/lart.c b/drivers/mtd/devices/lart.c index 82bd00af5cc3..268aae45b514 100644 --- a/drivers/mtd/devices/lart.c +++ b/drivers/mtd/devices/lart.c @@ -75,18 +75,18 @@ static char module_name[] = "lart"; /* blob */ #define NUM_BLOB_BLOCKS FLASH_NUMBLOCKS_16m_PARAM -#define BLOB_START 0x00000000 -#define BLOB_LEN (NUM_BLOB_BLOCKS * FLASH_BLOCKSIZE_PARAM) +#define PART_BLOB_START 0x00000000 +#define PART_BLOB_LEN (NUM_BLOB_BLOCKS * FLASH_BLOCKSIZE_PARAM) /* kernel */ #define NUM_KERNEL_BLOCKS 7 -#define KERNEL_START (BLOB_START + BLOB_LEN) -#define KERNEL_LEN (NUM_KERNEL_BLOCKS * FLASH_BLOCKSIZE_MAIN) +#define PART_KERNEL_START (PART_BLOB_START + PART_BLOB_LEN) +#define PART_KERNEL_LEN (NUM_KERNEL_BLOCKS * FLASH_BLOCKSIZE_MAIN) /* initial ramdisk */ #define NUM_INITRD_BLOCKS 24 -#define INITRD_START (KERNEL_START + KERNEL_LEN) -#define INITRD_LEN (NUM_INITRD_BLOCKS * FLASH_BLOCKSIZE_MAIN) +#define PART_INITRD_START (PART_KERNEL_START + PART_KERNEL_LEN) +#define PART_INITRD_LEN (NUM_INITRD_BLOCKS * FLASH_BLOCKSIZE_MAIN) /* * See section 4.0 in "3 Volt Fast Boot Block Flash Memory" Intel Datasheet @@ -587,20 +587,20 @@ static struct mtd_partition lart_partitions[] = { /* blob */ { .name = "blob", - .offset = BLOB_START, - .size = BLOB_LEN, + .offset = PART_BLOB_START, + .size = PART_BLOB_LEN, }, /* kernel */ { .name = "kernel", - .offset = KERNEL_START, /* MTDPART_OFS_APPEND */ - .size = KERNEL_LEN, + .offset = PART_KERNEL_START, /* MTDPART_OFS_APPEND */ + .size = PART_KERNEL_LEN, }, /* initial ramdisk / file system */ { .name = "file system", - .offset = INITRD_START, /* MTDPART_OFS_APPEND */ - .size = INITRD_LEN, /* MTDPART_SIZ_FULL */ + .offset = PART_INITRD_START, /* MTDPART_OFS_APPEND */ + .size = PART_INITRD_LEN, /* MTDPART_SIZ_FULL */ } }; #define NUM_PARTITIONS ARRAY_SIZE(lart_partitions) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index d0d0d12b531f..e536301acfde 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -293,36 +293,29 @@ static int xgene_enet_tx_completion(struct xgene_enet_desc_ring *cp_ring, static int xgene_enet_setup_mss(struct net_device *ndev, u32 mss) { struct xgene_enet_pdata *pdata = netdev_priv(ndev); - bool mss_index_found = false; - int mss_index; + int mss_index = -EBUSY; int i; spin_lock(&pdata->mss_lock); /* Reuse the slot if MSS matches */ - for (i = 0; !mss_index_found && i < NUM_MSS_REG; i++) { + for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++) { if (pdata->mss[i] == mss) { pdata->mss_refcnt[i]++; mss_index = i; - mss_index_found = true; } } /* Overwrite the slot with ref_count = 0 */ - for (i = 0; !mss_index_found && i < NUM_MSS_REG; i++) { + for (i = 0; mss_index < 0 && i < NUM_MSS_REG; i++) { if (!pdata->mss_refcnt[i]) { pdata->mss_refcnt[i]++; pdata->mac_ops->set_mss(pdata, mss, i); pdata->mss[i] = mss; mss_index = i; - mss_index_found = true; } } - /* No slots with ref_count = 0 available, return busy */ - if (!mss_index_found) - mss_index = -EBUSY; - spin_unlock(&pdata->mss_lock); return mss_index; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c index e7b81a305469..024788549c25 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -89,10 +89,17 @@ void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev) } } +#define MLX4_EN_WRAP_AROUND_SEC 10UL +/* By scheduling the overflow check every 5 seconds, we have a reasonably + * good chance we wont miss a wrap around. + * TOTO: Use a timer instead of a work queue to increase the guarantee. + */ +#define MLX4_EN_OVERFLOW_PERIOD (MLX4_EN_WRAP_AROUND_SEC * HZ / 2) + void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev) { bool timeout = time_is_before_jiffies(mdev->last_overflow_check + - mdev->overflow_period); + MLX4_EN_OVERFLOW_PERIOD); unsigned long flags; if (timeout) { @@ -237,7 +244,6 @@ static const struct ptp_clock_info mlx4_en_ptp_clock_info = { .enable = mlx4_en_phc_enable, }; -#define MLX4_EN_WRAP_AROUND_SEC 10ULL /* This function calculates the max shift that enables the user range * of MLX4_EN_WRAP_AROUND_SEC values in the cycles register. @@ -258,7 +264,6 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) { struct mlx4_dev *dev = mdev->dev; unsigned long flags; - u64 ns, zero = 0; /* mlx4_en_init_timestamp is called for each netdev. * mdev->ptp_clock is common for all ports, skip initialization if @@ -282,13 +287,6 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) ktime_to_ns(ktime_get_real())); write_sequnlock_irqrestore(&mdev->clock_lock, flags); - /* Calculate period in seconds to call the overflow watchdog - to make - * sure counter is checked at least once every wrap around. - */ - ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask, zero, &zero); - do_div(ns, NSEC_PER_SEC / 2 / HZ); - mdev->overflow_period = ns; - /* Configure the PHC */ mdev->ptp_clock_info = mlx4_en_ptp_clock_info; snprintf(mdev->ptp_clock_info.name, 16, "mlx4 ptp"); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 4941b692e947..3629ce11a68b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -430,7 +430,6 @@ struct mlx4_en_dev { seqlock_t clock_lock; struct timecounter clock; unsigned long last_overflow_check; - unsigned long overflow_period; struct ptp_clock *ptp_clock; struct ptp_clock_info ptp_clock_info; struct notifier_block nb; diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index c5c1d0e0c16f..118723ea681a 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -5397,7 +5397,7 @@ static void s2io_ethtool_gdrvinfo(struct net_device *dev, * s2io_nic structure. * @regs : pointer to the structure with parameters given by ethtool for * dumping the registers. - * @reg_space: The input argumnet into which all the registers are dumped. + * @reg_space: The input argument into which all the registers are dumped. * Description: * Dumps the entire register space of xFrame NIC into the user given * buffer area. diff --git a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c index db55e6d89cf4..0452848d1316 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c @@ -119,7 +119,7 @@ static void vxge_ethtool_gdrvinfo(struct net_device *dev, * @dev: device pointer. * @regs: pointer to the structure with parameters given by ethtool for * dumping the registers. - * @reg_space: The input argumnet into which all the registers are dumped. + * @reg_space: The input argument into which all the registers are dumped. * * Dumps the vpath register space of Titan NIC into the user given * buffer area. diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index 61a9cd5be497..00c17fa6545b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -688,7 +688,9 @@ static inline u8 qed_concrete_to_sw_fid(struct qed_dev *cdev, #define OOO_LB_TC 9 int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate); -void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate); +void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, + struct qed_ptt *p_ptt, + u32 min_pf_rate); void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); #define QED_LEADING_HWFN(dev) (&dev->hwfns[0]) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index d6c5a8165b5f..e2a081ceaf52 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -3198,7 +3198,8 @@ int qed_configure_vport_wfq(struct qed_dev *cdev, u16 vp_id, u32 rate) } /* API to configure WFQ from mcp link change */ -void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate) +void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, + struct qed_ptt *p_ptt, u32 min_pf_rate) { int i; @@ -3212,8 +3213,7 @@ void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, u32 min_pf_rate) for_each_hwfn(cdev, i) { struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; - __qed_configure_vp_wfq_on_link_change(p_hwfn, - p_hwfn->p_dpc_ptt, + __qed_configure_vp_wfq_on_link_change(p_hwfn, p_ptt, min_pf_rate); } } diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 314022df3469..87fde205149f 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -679,7 +679,8 @@ static void qed_mcp_handle_link_change(struct qed_hwfn *p_hwfn, /* Min bandwidth configuration */ __qed_configure_pf_min_bandwidth(p_hwfn, p_ptt, p_link, min_bw); - qed_configure_vp_wfq_on_link_change(p_hwfn->cdev, p_link->min_pf_rate); + qed_configure_vp_wfq_on_link_change(p_hwfn->cdev, p_ptt, + p_link->min_pf_rate); p_link->an = !!(status & LINK_STATUS_AUTO_NEGOTIATE_ENABLED); p_link->an_complete = !!(status & diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index 29ed785f1dc2..253c2bbe1e4e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -3014,8 +3014,7 @@ cleanup: ack_vfs[vfid / 32] |= BIT((vfid % 32)); p_hwfn->pf_iov_info->pending_flr[rel_vf_id / 64] &= ~(1ULL << (rel_vf_id % 64)); - p_hwfn->pf_iov_info->pending_events[rel_vf_id / 64] &= - ~(1ULL << (rel_vf_id % 64)); + p_vf->vf_mbx.b_pending_msg = false; } return rc; @@ -3128,11 +3127,20 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn, mbx = &p_vf->vf_mbx; /* qed_iov_process_mbx_request */ - DP_VERBOSE(p_hwfn, QED_MSG_IOV, - "VF[%02x]: Processing mailbox message\n", p_vf->abs_vf_id); + if (!mbx->b_pending_msg) { + DP_NOTICE(p_hwfn, + "VF[%02x]: Trying to process mailbox message when none is pending\n", + p_vf->abs_vf_id); + return; + } + mbx->b_pending_msg = false; mbx->first_tlv = mbx->req_virt->first_tlv; + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%02x]: Processing mailbox message [type %04x]\n", + p_vf->abs_vf_id, mbx->first_tlv.tl.type); + /* check if tlv type is known */ if (qed_iov_tlv_supported(mbx->first_tlv.tl.type) && !p_vf->b_malicious) { @@ -3219,20 +3227,19 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn, } } -static void qed_iov_pf_add_pending_events(struct qed_hwfn *p_hwfn, u8 vfid) +void qed_iov_pf_get_pending_events(struct qed_hwfn *p_hwfn, u64 *events) { - u64 add_bit = 1ULL << (vfid % 64); + int i; - p_hwfn->pf_iov_info->pending_events[vfid / 64] |= add_bit; -} + memset(events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH); -static void qed_iov_pf_get_and_clear_pending_events(struct qed_hwfn *p_hwfn, - u64 *events) -{ - u64 *p_pending_events = p_hwfn->pf_iov_info->pending_events; + qed_for_each_vf(p_hwfn, i) { + struct qed_vf_info *p_vf; - memcpy(events, p_pending_events, sizeof(u64) * QED_VF_ARRAY_LENGTH); - memset(p_pending_events, 0, sizeof(u64) * QED_VF_ARRAY_LENGTH); + p_vf = &p_hwfn->pf_iov_info->vfs_array[i]; + if (p_vf->vf_mbx.b_pending_msg) + events[i / 64] |= 1ULL << (i % 64); + } } static struct qed_vf_info *qed_sriov_get_vf_from_absid(struct qed_hwfn *p_hwfn, @@ -3266,7 +3273,7 @@ static int qed_sriov_vfpf_msg(struct qed_hwfn *p_hwfn, p_vf->vf_mbx.pending_req = (((u64)vf_msg->hi) << 32) | vf_msg->lo; /* Mark the event and schedule the workqueue */ - qed_iov_pf_add_pending_events(p_hwfn, p_vf->relative_vf_id); + p_vf->vf_mbx.b_pending_msg = true; qed_schedule_iov(p_hwfn, QED_IOV_WQ_MSG_FLAG); return 0; @@ -4030,7 +4037,7 @@ static void qed_handle_vf_msg(struct qed_hwfn *hwfn) return; } - qed_iov_pf_get_and_clear_pending_events(hwfn, events); + qed_iov_pf_get_pending_events(hwfn, events); DP_VERBOSE(hwfn, QED_MSG_IOV, "Event mask of VF events: 0x%llx 0x%llx 0x%llx\n", diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h index fc08cc2da6a7..a89605821522 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h @@ -140,6 +140,9 @@ struct qed_iov_vf_mbx { /* Address in VF where a pending message is located */ dma_addr_t pending_req; + /* Message from VF awaits handling */ + bool b_pending_msg; + u8 *offset; /* saved VF request header */ @@ -232,7 +235,6 @@ struct qed_vf_info { */ struct qed_pf_iov { struct qed_vf_info vfs_array[MAX_NUM_VFS]; - u64 pending_events[QED_VF_ARRAY_LENGTH]; u64 pending_flr[QED_VF_ARRAY_LENGTH]; /* Allocate message address continuosuly and split to each VF */ diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 144fe84e8a53..04d9245b7149 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -416,7 +416,7 @@ struct stmmac_dma_ops { /* Configure the AXI Bus Mode Register */ void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi); /* Dump DMA registers */ - void (*dump_regs) (void __iomem *ioaddr); + void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space); /* Set tx/rx threshold in the csr6 register * An invalid value enables the store-and-forward mode */ void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode, @@ -456,7 +456,7 @@ struct stmmac_ops { /* Enable RX Queues */ void (*rx_queue_enable)(struct mac_device_info *hw, u32 queue); /* Dump MAC registers */ - void (*dump_regs)(struct mac_device_info *hw); + void (*dump_regs)(struct mac_device_info *hw, u32 *reg_space); /* Handle extra events on specific interrupts hw dependent */ int (*host_irq_status)(struct mac_device_info *hw, struct stmmac_extra_stats *x); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 91c8926b7479..19b9b3087099 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -92,17 +92,13 @@ static int dwmac1000_rx_ipc_enable(struct mac_device_info *hw) return !!(value & GMAC_CONTROL_IPC); } -static void dwmac1000_dump_regs(struct mac_device_info *hw) +static void dwmac1000_dump_regs(struct mac_device_info *hw, u32 *reg_space) { void __iomem *ioaddr = hw->pcsr; int i; - pr_info("\tDWMAC1000 regs (base addr = 0x%p)\n", ioaddr); - for (i = 0; i < 55; i++) { - int offset = i * 4; - pr_info("\tReg No. %d (offset 0x%x): 0x%08x\n", i, - offset, readl(ioaddr + offset)); - } + for (i = 0; i < 55; i++) + reg_space[i] = readl(ioaddr + i * 4); } static void dwmac1000_set_umac_addr(struct mac_device_info *hw, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c index fbaec0ffd9ef..d3654a447046 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c @@ -201,18 +201,14 @@ static void dwmac1000_dma_operation_mode(void __iomem *ioaddr, int txmode, writel(csr6, ioaddr + DMA_CONTROL); } -static void dwmac1000_dump_dma_regs(void __iomem *ioaddr) +static void dwmac1000_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space) { int i; - pr_info(" DMA registers\n"); - for (i = 0; i < 22; i++) { - if ((i < 9) || (i > 17)) { - int offset = i * 4; - pr_err("\t Reg No. %d (offset 0x%x): 0x%08x\n", i, - (DMA_BUS_MODE + offset), - readl(ioaddr + DMA_BUS_MODE + offset)); - } - } + + for (i = 0; i < 22; i++) + if ((i < 9) || (i > 17)) + reg_space[DMA_BUS_MODE / 4 + i] = + readl(ioaddr + DMA_BUS_MODE + i * 4); } static void dwmac1000_get_hw_feature(void __iomem *ioaddr, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c index 8ab518997b1b..e370ccec6176 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c @@ -40,28 +40,18 @@ static void dwmac100_core_init(struct mac_device_info *hw, int mtu) #endif } -static void dwmac100_dump_mac_regs(struct mac_device_info *hw) +static void dwmac100_dump_mac_regs(struct mac_device_info *hw, u32 *reg_space) { void __iomem *ioaddr = hw->pcsr; - pr_info("\t----------------------------------------------\n" - "\t DWMAC 100 CSR (base addr = 0x%p)\n" - "\t----------------------------------------------\n", ioaddr); - pr_info("\tcontrol reg (offset 0x%x): 0x%08x\n", MAC_CONTROL, - readl(ioaddr + MAC_CONTROL)); - pr_info("\taddr HI (offset 0x%x): 0x%08x\n ", MAC_ADDR_HIGH, - readl(ioaddr + MAC_ADDR_HIGH)); - pr_info("\taddr LO (offset 0x%x): 0x%08x\n", MAC_ADDR_LOW, - readl(ioaddr + MAC_ADDR_LOW)); - pr_info("\tmulticast hash HI (offset 0x%x): 0x%08x\n", - MAC_HASH_HIGH, readl(ioaddr + MAC_HASH_HIGH)); - pr_info("\tmulticast hash LO (offset 0x%x): 0x%08x\n", - MAC_HASH_LOW, readl(ioaddr + MAC_HASH_LOW)); - pr_info("\tflow control (offset 0x%x): 0x%08x\n", - MAC_FLOW_CTRL, readl(ioaddr + MAC_FLOW_CTRL)); - pr_info("\tVLAN1 tag (offset 0x%x): 0x%08x\n", MAC_VLAN1, - readl(ioaddr + MAC_VLAN1)); - pr_info("\tVLAN2 tag (offset 0x%x): 0x%08x\n", MAC_VLAN2, - readl(ioaddr + MAC_VLAN2)); + + reg_space[MAC_CONTROL / 4] = readl(ioaddr + MAC_CONTROL); + reg_space[MAC_ADDR_HIGH / 4] = readl(ioaddr + MAC_ADDR_HIGH); + reg_space[MAC_ADDR_LOW / 4] = readl(ioaddr + MAC_ADDR_LOW); + reg_space[MAC_HASH_HIGH / 4] = readl(ioaddr + MAC_HASH_HIGH); + reg_space[MAC_HASH_LOW / 4] = readl(ioaddr + MAC_HASH_LOW); + reg_space[MAC_FLOW_CTRL / 4] = readl(ioaddr + MAC_FLOW_CTRL); + reg_space[MAC_VLAN1 / 4] = readl(ioaddr + MAC_VLAN1); + reg_space[MAC_VLAN2 / 4] = readl(ioaddr + MAC_VLAN2); } static int dwmac100_rx_ipc_enable(struct mac_device_info *hw) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c index d40e91e8fc7b..eef2f222ce9a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c @@ -66,19 +66,18 @@ static void dwmac100_dma_operation_mode(void __iomem *ioaddr, int txmode, writel(csr6, ioaddr + DMA_CONTROL); } -static void dwmac100_dump_dma_regs(void __iomem *ioaddr) +static void dwmac100_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space) { int i; - pr_debug("DWMAC 100 DMA CSR\n"); for (i = 0; i < 9; i++) - pr_debug("\t CSR%d (offset 0x%x): 0x%08x\n", i, - (DMA_BUS_MODE + i * 4), - readl(ioaddr + DMA_BUS_MODE + i * 4)); + reg_space[DMA_BUS_MODE / 4 + i] = + readl(ioaddr + DMA_BUS_MODE + i * 4); - pr_debug("\tCSR20 (0x%x): 0x%08x, CSR21 (0x%x): 0x%08x\n", - DMA_CUR_TX_BUF_ADDR, readl(ioaddr + DMA_CUR_TX_BUF_ADDR), - DMA_CUR_RX_BUF_ADDR, readl(ioaddr + DMA_CUR_RX_BUF_ADDR)); + reg_space[DMA_CUR_TX_BUF_ADDR / 4] = + readl(ioaddr + DMA_CUR_TX_BUF_ADDR); + reg_space[DMA_CUR_RX_BUF_ADDR / 4] = + readl(ioaddr + DMA_CUR_RX_BUF_ADDR); } /* DMA controller has two counters to track the number of the missed frames. */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 202216cd6789..1e79e6529c4a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -70,19 +70,13 @@ static void dwmac4_rx_queue_enable(struct mac_device_info *hw, u32 queue) writel(value, ioaddr + GMAC_RXQ_CTRL0); } -static void dwmac4_dump_regs(struct mac_device_info *hw) +static void dwmac4_dump_regs(struct mac_device_info *hw, u32 *reg_space) { void __iomem *ioaddr = hw->pcsr; int i; - pr_debug("\tDWMAC4 regs (base addr = 0x%p)\n", ioaddr); - - for (i = 0; i < GMAC_REG_NUM; i++) { - int offset = i * 4; - - pr_debug("\tReg No. %d (offset 0x%x): 0x%08x\n", i, - offset, readl(ioaddr + offset)); - } + for (i = 0; i < GMAC_REG_NUM; i++) + reg_space[i] = readl(ioaddr + i * 4); } static int dwmac4_rx_ipc_enable(struct mac_device_info *hw) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index 377d1b44d4f2..f97b0d5d9987 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -127,53 +127,51 @@ static void dwmac4_dma_init(void __iomem *ioaddr, dwmac4_dma_init_channel(ioaddr, dma_cfg, dma_tx, dma_rx, i); } -static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel) +static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel, + u32 *reg_space) { - pr_debug(" Channel %d\n", channel); - pr_debug("\tDMA_CHAN_CONTROL, offset: 0x%x, val: 0x%x\n", 0, - readl(ioaddr + DMA_CHAN_CONTROL(channel))); - pr_debug("\tDMA_CHAN_TX_CONTROL, offset: 0x%x, val: 0x%x\n", 0x4, - readl(ioaddr + DMA_CHAN_TX_CONTROL(channel))); - pr_debug("\tDMA_CHAN_RX_CONTROL, offset: 0x%x, val: 0x%x\n", 0x8, - readl(ioaddr + DMA_CHAN_RX_CONTROL(channel))); - pr_debug("\tDMA_CHAN_TX_BASE_ADDR, offset: 0x%x, val: 0x%x\n", 0x14, - readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel))); - pr_debug("\tDMA_CHAN_RX_BASE_ADDR, offset: 0x%x, val: 0x%x\n", 0x1c, - readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel))); - pr_debug("\tDMA_CHAN_TX_END_ADDR, offset: 0x%x, val: 0x%x\n", 0x20, - readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel))); - pr_debug("\tDMA_CHAN_RX_END_ADDR, offset: 0x%x, val: 0x%x\n", 0x28, - readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel))); - pr_debug("\tDMA_CHAN_TX_RING_LEN, offset: 0x%x, val: 0x%x\n", 0x2c, - readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel))); - pr_debug("\tDMA_CHAN_RX_RING_LEN, offset: 0x%x, val: 0x%x\n", 0x30, - readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel))); - pr_debug("\tDMA_CHAN_INTR_ENA, offset: 0x%x, val: 0x%x\n", 0x34, - readl(ioaddr + DMA_CHAN_INTR_ENA(channel))); - pr_debug("\tDMA_CHAN_RX_WATCHDOG, offset: 0x%x, val: 0x%x\n", 0x38, - readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel))); - pr_debug("\tDMA_CHAN_SLOT_CTRL_STATUS, offset: 0x%x, val: 0x%x\n", 0x3c, - readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel))); - pr_debug("\tDMA_CHAN_CUR_TX_DESC, offset: 0x%x, val: 0x%x\n", 0x44, - readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel))); - pr_debug("\tDMA_CHAN_CUR_RX_DESC, offset: 0x%x, val: 0x%x\n", 0x4c, - readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel))); - pr_debug("\tDMA_CHAN_CUR_TX_BUF_ADDR, offset: 0x%x, val: 0x%x\n", 0x54, - readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel))); - pr_debug("\tDMA_CHAN_CUR_RX_BUF_ADDR, offset: 0x%x, val: 0x%x\n", 0x5c, - readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel))); - pr_debug("\tDMA_CHAN_STATUS, offset: 0x%x, val: 0x%x\n", 0x60, - readl(ioaddr + DMA_CHAN_STATUS(channel))); + reg_space[DMA_CHAN_CONTROL(channel) / 4] = + readl(ioaddr + DMA_CHAN_CONTROL(channel)); + reg_space[DMA_CHAN_TX_CONTROL(channel) / 4] = + readl(ioaddr + DMA_CHAN_TX_CONTROL(channel)); + reg_space[DMA_CHAN_RX_CONTROL(channel) / 4] = + readl(ioaddr + DMA_CHAN_RX_CONTROL(channel)); + reg_space[DMA_CHAN_TX_BASE_ADDR(channel) / 4] = + readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel)); + reg_space[DMA_CHAN_RX_BASE_ADDR(channel) / 4] = + readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel)); + reg_space[DMA_CHAN_TX_END_ADDR(channel) / 4] = + readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel)); + reg_space[DMA_CHAN_RX_END_ADDR(channel) / 4] = + readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel)); + reg_space[DMA_CHAN_TX_RING_LEN(channel) / 4] = + readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel)); + reg_space[DMA_CHAN_RX_RING_LEN(channel) / 4] = + readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel)); + reg_space[DMA_CHAN_INTR_ENA(channel) / 4] = + readl(ioaddr + DMA_CHAN_INTR_ENA(channel)); + reg_space[DMA_CHAN_RX_WATCHDOG(channel) / 4] = + readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel)); + reg_space[DMA_CHAN_SLOT_CTRL_STATUS(channel) / 4] = + readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel)); + reg_space[DMA_CHAN_CUR_TX_DESC(channel) / 4] = + readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel)); + reg_space[DMA_CHAN_CUR_RX_DESC(channel) / 4] = + readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel)); + reg_space[DMA_CHAN_CUR_TX_BUF_ADDR(channel) / 4] = + readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel)); + reg_space[DMA_CHAN_CUR_RX_BUF_ADDR(channel) / 4] = + readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel)); + reg_space[DMA_CHAN_STATUS(channel) / 4] = + readl(ioaddr + DMA_CHAN_STATUS(channel)); } -static void dwmac4_dump_dma_regs(void __iomem *ioaddr) +static void dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space) { int i; - pr_debug(" GMAC4 DMA registers\n"); - for (i = 0; i < DMA_CHANNEL_NB_MAX; i++) - _dwmac4_dump_dma_regs(ioaddr, i); + _dwmac4_dump_dma_regs(ioaddr, i, reg_space); } static void dwmac4_rx_watchdog(void __iomem *ioaddr, u32 riwt) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 5ff6bc4eb8f1..85d64114e159 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -435,32 +435,14 @@ static int stmmac_ethtool_get_regs_len(struct net_device *dev) static void stmmac_ethtool_gregs(struct net_device *dev, struct ethtool_regs *regs, void *space) { - int i; u32 *reg_space = (u32 *) space; struct stmmac_priv *priv = netdev_priv(dev); memset(reg_space, 0x0, REG_SPACE_SIZE); - if (priv->plat->has_gmac || priv->plat->has_gmac4) { - /* MAC registers */ - for (i = 0; i < 55; i++) - reg_space[i] = readl(priv->ioaddr + (i * 4)); - /* DMA registers */ - for (i = 0; i < 22; i++) - reg_space[i + 55] = - readl(priv->ioaddr + (DMA_BUS_MODE + (i * 4))); - } else { - /* MAC registers */ - for (i = 0; i < 12; i++) - reg_space[i] = readl(priv->ioaddr + (i * 4)); - /* DMA registers */ - for (i = 0; i < 9; i++) - reg_space[i + 12] = - readl(priv->ioaddr + (DMA_BUS_MODE + (i * 4))); - reg_space[22] = readl(priv->ioaddr + DMA_CUR_TX_BUF_ADDR); - reg_space[23] = readl(priv->ioaddr + DMA_CUR_RX_BUF_ADDR); - } + priv->hw->mac->dump_regs(priv->hw, reg_space); + priv->hw->dma->dump_regs(priv->ioaddr, reg_space); } static void diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 3cbe09682afe..4498a3861aa3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1729,11 +1729,6 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) priv->hw->dma->start_tx(priv->ioaddr); priv->hw->dma->start_rx(priv->ioaddr); - /* Dump DMA/MAC registers */ - if (netif_msg_hw(priv)) { - priv->hw->mac->dump_regs(priv->hw); - priv->hw->dma->dump_regs(priv->ioaddr); - } priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS; if ((priv->use_riwt) && (priv->hw->dma->rx_watchdog)) { diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index d6f7838455dd..1be69d8bc909 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -146,7 +146,7 @@ static int phy_config_interrupt(struct phy_device *phydev, u32 interrupts) */ int phy_aneg_done(struct phy_device *phydev) { - if (phydev->drv->aneg_done) + if (phydev->drv && phydev->drv->aneg_done) return phydev->drv->aneg_done(phydev); return genphy_aneg_done(phydev); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 556953f53437..b7911994112a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2035,7 +2035,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, const struct iphdr *old_iph = ip_hdr(skb); union vxlan_addr *dst; union vxlan_addr remote_ip, local_ip; - union vxlan_addr *src; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 src_port = 0, dst_port; @@ -2062,7 +2061,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = (rdst->remote_vni) ? : default_vni; - src = &vxlan->cfg.saddr; + local_ip = vxlan->cfg.saddr; dst_cache = &rdst->dst_cache; md->gbp = skb->mark; ttl = vxlan->cfg.ttl; @@ -2095,7 +2094,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst = &remote_ip; dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = tunnel_id_to_key32(info->key.tun_id); - src = &local_ip; dst_cache = &info->dst_cache; if (info->options_len) md = ip_tunnel_info_opts(info); @@ -2115,7 +2113,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, rt = vxlan_get_route(vxlan, dev, sock4, skb, rdst ? rdst->remote_ifindex : 0, tos, dst->sin.sin_addr.s_addr, - &src->sin.sin_addr.s_addr, + &local_ip.sin.sin_addr.s_addr, dst_port, src_port, dst_cache, info); if (IS_ERR(rt)) { @@ -2142,7 +2140,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (err < 0) goto tx_error; - udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, src->sin.sin_addr.s_addr, + udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr, dst->sin.sin_addr.s_addr, tos, ttl, df, src_port, dst_port, xnet, !udp_sum); #if IS_ENABLED(CONFIG_IPV6) @@ -2152,7 +2150,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ndst = vxlan6_get_route(vxlan, dev, sock6, skb, rdst ? rdst->remote_ifindex : 0, tos, label, &dst->sin6.sin6_addr, - &src->sin6.sin6_addr, + &local_ip.sin6.sin6_addr, dst_port, src_port, dst_cache, info); if (IS_ERR(ndst)) { @@ -2180,7 +2178,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto tx_error; udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, - &src->sin6.sin6_addr, + &local_ip.sin6.sin6_addr, &dst->sin6.sin6_addr, tos, ttl, label, src_port, dst_port, !udp_sum); #endif @@ -2675,7 +2673,7 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) if (data[IFLA_VXLAN_ID]) { __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); - if (id >= VXLAN_VID_MASK) + if (id >= VXLAN_N_VID) return -ERANGE; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7ce35aec8c76..f297a9e18642 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -391,6 +391,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) nr_pages = i; if (nr_pages > 0) { len = nr_pages << PAGE_SHIFT; + osd_req_op_extent_update(req, 0, len); break; } goto out_pages; @@ -771,7 +772,7 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited( "writepage_start %p %lld forced umount\n", @@ -1017,8 +1018,7 @@ new_request: &ci->i_layout, vino, offset, &len, 0, num_ops, CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, + CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, false); if (IS_ERR(req)) { @@ -1028,8 +1028,7 @@ new_request: min(num_ops, CEPH_OSD_SLAB_OPS), CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, + CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, true); BUG_ON(IS_ERR(req)); @@ -1194,7 +1193,7 @@ static int ceph_update_writeable_page(struct file *file, int r; struct ceph_snap_context *snapc, *oldest; - if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout(" page %p forced umount\n", page); unlock_page(page); return -EIO; @@ -1681,8 +1680,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, - CEPH_OSD_OP_CREATE, - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1699,8 +1697,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -1873,7 +1870,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, goto out_unlock; } - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; + wr_req->r_flags = CEPH_OSD_FLAG_WRITE; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 5bc5d37b1217..4e7421caf380 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -234,7 +234,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, inode); if (fscache_cookie_enabled(ci->fscache)) { - dout("fscache_file_set_cookie %p %p enabing cache\n", + dout("fscache_file_set_cookie %p %p enabling cache\n", inode, filp); } } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 94fd76d04683..cd966f276a8d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -867,7 +867,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) /* * Return caps we have registered with the MDS(s) as 'wanted'. */ -int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) { struct ceph_cap *cap; struct rb_node *p; @@ -875,7 +875,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (!__cap_is_valid(cap)) + if (check && !__cap_is_valid(cap)) continue; if (cap == ci->i_auth_cap) mds_wanted |= cap->mds_wanted; @@ -1184,6 +1184,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, delayed = 1; } ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); + if (want & ~cap->mds_wanted) { + /* user space may open/close single file frequently. + * This avoids droping mds_wanted immediately after + * requesting new mds_wanted. + */ + __cap_set_timeouts(mdsc, ci); + } cap->issued &= retain; /* drop bits we don't want */ if (cap->implemented & ~cap->issued) { @@ -2084,8 +2091,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); - ceph_sync_write_wait(inode); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) goto out; @@ -2477,23 +2482,22 @@ again: if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { int mds_wanted; - if (ACCESS_ONCE(mdsc->fsc->mount_state) == + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout("get_cap_refs %p forced umount\n", inode); *err = -EIO; ret = 1; goto out_unlock; } - mds_wanted = __ceph_caps_mds_wanted(ci); - if ((mds_wanted & need) != need) { + mds_wanted = __ceph_caps_mds_wanted(ci, false); + if (need & ~(mds_wanted & need)) { dout("get_cap_refs %p caps were dropped" " (session killed?)\n", inode); *err = -ESTALE; ret = 1; goto out_unlock; } - if ((mds_wanted & file_wanted) == - (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + if (!(file_wanted & ~mds_wanted)) ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } @@ -3404,6 +3408,7 @@ retry: tcap->implemented |= issued; if (cap == ci->i_auth_cap) ci->i_auth_cap = tcap; + if (!list_empty(&ci->i_cap_flush_list) && ci->i_auth_cap == tcap) { spin_lock(&mdsc->cap_dirty_lock); @@ -3417,9 +3422,18 @@ retry: } else if (tsession) { /* add placeholder for the export tagert */ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + if (!list_empty(&ci->i_cap_flush_list) && + ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + } + __ceph_remove_cap(cap, false); goto out_unlock; } @@ -3924,9 +3938,10 @@ int ceph_encode_inode_release(void **p, struct inode *inode, } int ceph_encode_dentry_release(void **p, struct dentry *dentry, + struct inode *dir, int mds, int drop, int unless) { - struct inode *dir = d_inode(dentry->d_parent); + struct dentry *parent = NULL; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); int force = 0; @@ -3941,9 +3956,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; + if (!dir) { + parent = dget(dentry->d_parent); + dir = d_inode(parent); + } spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); + dput(parent); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 39ff678e567f..f2ae393e2c31 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -70,7 +70,7 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) seq_puts(s, "\t(unsafe)"); else seq_puts(s, "\t"); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8ab1fdf0bd49..3e9ad501addf 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -371,7 +371,7 @@ more: /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); - req->r_direct_is_hash = true; + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); if (fi->last_name) { req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { @@ -417,7 +417,7 @@ more: fi->frag = frag; fi->last_readdir = req; - if (req->r_did_prepopulate) { + if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { fi->readdir_cache_idx = req->r_readdir_cache_idx; if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ @@ -752,7 +752,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, mask |= CEPH_CAP_XATTR_SHARED; req->r_args.getattr.mask = cpu_to_le32(mask); - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_handle_snapdir(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err); @@ -813,7 +814,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.rdev = cpu_to_le32(rdev); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; @@ -864,7 +866,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, ceph_mdsc_put_request(req); goto out; } - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; @@ -913,7 +916,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mkdir.mode = cpu_to_le32(mode); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -957,7 +961,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_SHARED on source inode (mds will lock it) */ @@ -1023,7 +1028,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_inode_drop = drop_caps_for_unlink(inode); @@ -1066,7 +1072,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); req->r_old_dentry_dir = old_dir; - req->r_locked_dir = new_dir; + req->r_parent = new_dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; @@ -1194,7 +1201,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct inode *dir; if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); + parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; @@ -1237,11 +1244,12 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; op = ceph_snap(dir) == CEPH_SNAPDIR ? - CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR; + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); if (!IS_ERR(req)) { req->r_dentry = dget(dentry); - req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2; + req->r_num_caps = 2; + req->r_parent = dir; mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 180bbef760f2..e8f11fa565c5 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -207,7 +207,8 @@ static int ceph_get_name(struct dentry *parent, char *name, req->r_inode = d_inode(child); ihold(d_inode(child)); req->r_ino2 = ceph_vino(d_inode(parent)); - req->r_locked_dir = d_inode(parent); + req->r_parent = d_inode(parent); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 045d30d26624..26cc95421cca 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -283,7 +283,7 @@ int ceph_open(struct inode *inode, struct file *file) spin_lock(&ci->i_ceph_lock); if (__ceph_is_any_real_caps(ci) && (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { - int mds_wanted = __ceph_caps_mds_wanted(ci); + int mds_wanted = __ceph_caps_mds_wanted(ci, true); int issued = __ceph_caps_issued(ci, NULL); dout("open %p fmode %d want %s issued %s using existing\n", @@ -379,7 +379,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, mask |= CEPH_CAP_XATTR_SHARED; req->r_args.open.mask = cpu_to_le32(mask); - req->r_locked_dir = dir; /* caller holds dir->i_mutex */ + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); @@ -758,9 +759,7 @@ static void ceph_aio_retry_work(struct work_struct *work) goto out; } - req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); @@ -794,89 +793,6 @@ out: kfree(aio_work); } -/* - * Write commit request unsafe callback, called to tell us when a - * request is unsafe (that is, in flight--has been handed to the - * messenger to send to its target osd). It is called again when - * we've received a response message indicating the request is - * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request - * is completed early (and unsuccessfully) due to a timeout or - * interrupt. - * - * This is used if we requested both an ACK and ONDISK commit reply - * from the OSD. - */ -static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) -{ - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - - dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, - unsafe ? "un" : ""); - if (unsafe) { - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_item, - &ci->i_unsafe_writes); - spin_unlock(&ci->i_unsafe_lock); - - complete_all(&req->r_completion); - } else { - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); - } -} - -/* - * Wait on any unsafe replies for the given inode. First wait on the - * newest request, and make that the upper bound. Then, if there are - * more requests, keep waiting on the oldest as long as it is still older - * than the original request. - */ -void ceph_sync_write_wait(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_writes; - struct ceph_osd_request *req; - u64 last_tid; - - if (!S_ISREG(inode->i_mode)) - return; - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - /* set upper bound as _last_ entry in chain */ - - req = list_last_entry(head, struct ceph_osd_request, - r_unsafe_item); - last_tid = req->r_tid; - - do { - ceph_osdc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - - dout("sync_write_wait on tid %llu (until %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_done_completion); - ceph_osdc_put_request(req); - - spin_lock(&ci->i_unsafe_lock); - /* - * from here on look at first entry in chain, since we - * only want to wait for anything older than last_tid - */ - if (list_empty(head)) - break; - req = list_first_entry(head, struct ceph_osd_request, - r_unsafe_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); -} - static ssize_t ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_snap_context *snapc, @@ -915,9 +831,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; } else { flags = CEPH_OSD_FLAG_READ; } @@ -1116,10 +1030,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ACK; + flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; while ((len = iov_iter_count(from)) > 0) { size_t left; @@ -1165,8 +1076,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, goto out; } - /* get a second commit callback */ - req->r_unsafe_callback = ceph_sync_write_unsafe; req->r_inode = inode; osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, @@ -1616,8 +1525,7 @@ static int ceph_zero_partial_object(struct inode *inode, ceph_vino(inode), offset, length, 0, 1, op, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, + CEPH_OSD_FLAG_WRITE, NULL, 0, 0, false); if (IS_ERR(req)) { ret = PTR_ERR(req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5e659d054b40..fd8f771f99b7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -499,7 +499,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; - INIT_LIST_HEAD(&ci->i_unsafe_writes); INIT_LIST_HEAD(&ci->i_unsafe_dirops); INIT_LIST_HEAD(&ci->i_unsafe_iops); spin_lock_init(&ci->i_unsafe_lock); @@ -583,14 +582,6 @@ int ceph_drop_inode(struct inode *inode) return 1; } -void ceph_evict_inode(struct inode *inode) -{ - /* wait unsafe sync writes */ - ceph_sync_write_wait(inode); - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); -} - static inline blkcnt_t calc_inode_blocks(u64 size) { return (size + (1<<9) - 1) >> 9; @@ -1016,7 +1007,9 @@ out: static void update_dentry_lease(struct dentry *dentry, struct ceph_mds_reply_lease *lease, struct ceph_mds_session *session, - unsigned long from_time) + unsigned long from_time, + struct ceph_vino *tgt_vino, + struct ceph_vino *dir_vino) { struct ceph_dentry_info *di = ceph_dentry(dentry); long unsigned duration = le32_to_cpu(lease->duration_ms); @@ -1024,13 +1017,27 @@ static void update_dentry_lease(struct dentry *dentry, long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; struct inode *dir; + /* + * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that + * we expect a negative dentry. + */ + if (!tgt_vino && d_really_is_positive(dentry)) + return; + + if (tgt_vino && (d_really_is_negative(dentry) || + !ceph_ino_compare(d_inode(dentry), tgt_vino))) + return; + spin_lock(&dentry->d_lock); dout("update_dentry_lease %p duration %lu ms ttl %lu\n", dentry, duration, ttl); - /* make lease_rdcache_gen match directory */ dir = d_inode(dentry->d_parent); + /* make sure parent matches dir_vino */ + if (!ceph_ino_compare(dir, dir_vino)) + goto out_unlock; + /* only track leases on regular dentries */ if (ceph_snap(dir) != CEPH_NOSNAP) goto out_unlock; @@ -1108,61 +1115,27 @@ out: * * Called with snap_rwsem (read). */ -int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, - struct ceph_mds_session *session) +int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) { + struct ceph_mds_session *session = req->r_session; struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_vino vino; + struct ceph_vino tvino, dvino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; dout("fill_trace %p is_dentry %d is_target %d\n", req, rinfo->head->is_dentry, rinfo->head->is_target); -#if 0 - /* - * Debugging hook: - * - * If we resend completed ops to a recovering mds, we get no - * trace. Since that is very rare, pretend this is the case - * to ensure the 'no trace' handlers in the callers behave. - * - * Fill in inodes unconditionally to avoid breaking cap - * invariants. - */ - if (rinfo->head->op & CEPH_MDS_OP_WRITE) { - pr_info("fill_trace faking empty trace on %lld %s\n", - req->r_tid, ceph_mds_op_name(rinfo->head->op)); - if (rinfo->head->is_dentry) { - rinfo->head->is_dentry = 0; - err = fill_inode(req->r_locked_dir, - &rinfo->diri, rinfo->dirfrag, - session, req->r_request_started, -1); - } - if (rinfo->head->is_target) { - rinfo->head->is_target = 0; - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - err = fill_inode(in, &rinfo->targeti, NULL, - session, req->r_request_started, - req->r_fmode); - iput(in); - } - } -#endif - if (!rinfo->head->is_target && !rinfo->head->is_dentry) { dout("fill_trace reply is empty!\n"); - if (rinfo->head->result == 0 && req->r_locked_dir) + if (rinfo->head->result == 0 && req->r_parent) ceph_invalidate_dir_request(req); return 0; } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; if (dir) { err = fill_inode(dir, NULL, @@ -1188,8 +1161,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dname.name = rinfo->dname; dname.len = rinfo->dname_len; dname.hash = full_name_hash(parent, dname.name, dname.len); - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); retry_lookup: dn = d_lookup(parent, &dname); dout("d_lookup on parent=%p name=%.*s got %p\n", @@ -1206,8 +1179,8 @@ retry_lookup: } err = 0; } else if (d_really_is_positive(dn) && - (ceph_ino(d_inode(dn)) != vino.ino || - ceph_snap(d_inode(dn)) != vino.snap)) { + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); d_delete(dn); @@ -1221,10 +1194,10 @@ retry_lookup: } if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - in = ceph_get_inode(sb, vino); + in = ceph_get_inode(sb, tvino); if (IS_ERR(in)) { err = PTR_ERR(in); goto done; @@ -1233,8 +1206,8 @@ retry_lookup: err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, req->r_request_started, - (!req->r_aborted && rinfo->head->result == 0) ? - req->r_fmode : -1, + (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { pr_err("fill_inode badness %p %llx.%llx\n", @@ -1247,8 +1220,9 @@ retry_lookup: * ignore null lease/binding on snapdir ENOENT, or else we * will have trouble splicing in the virtual snapdir later */ - if (rinfo->head->is_dentry && !req->r_aborted && - req->r_locked_dir && + if (rinfo->head->is_dentry && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, fsc->mount_options->snapdir_name, req->r_dentry->d_name.len))) { @@ -1257,17 +1231,19 @@ retry_lookup: * mknod symlink mkdir : null -> new inode * unlink : linked -> null */ - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; struct dentry *dn = req->r_dentry; bool have_dir_cap, have_lease; BUG_ON(!dn); BUG_ON(!dir); BUG_ON(d_inode(dn->d_parent) != dir); - BUG_ON(ceph_ino(dir) != - le64_to_cpu(rinfo->diri.in->ino)); - BUG_ON(ceph_snap(dir) != - le64_to_cpu(rinfo->diri.in->snapid)); + + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + BUG_ON(ceph_ino(dir) != dvino.ino); + BUG_ON(ceph_snap(dir) != dvino.snap); /* do we have a lease on the whole dir? */ have_dir_cap = @@ -1319,12 +1295,13 @@ retry_lookup: ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); - } else { - if (have_lease && d_unhashed(dn)) + } else if (have_lease) { + if (d_unhashed(dn)) d_add(dn, NULL); update_dentry_lease(dn, rinfo->dlease, session, - req->r_request_started); + req->r_request_started, + NULL, &dvino); } goto done; } @@ -1347,15 +1324,19 @@ retry_lookup: have_lease = false; } - if (have_lease) + if (have_lease) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); update_dentry_lease(dn, rinfo->dlease, session, - req->r_request_started); + req->r_request_started, + &tvino, &dvino); + } dout(" final dn %p\n", dn); - } else if (!req->r_aborted && - (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP)) { + } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { struct dentry *dn = req->r_dentry; - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); @@ -1370,6 +1351,26 @@ retry_lookup: goto done; } req->r_dentry = dn; /* may have spliced */ + } else if (rinfo->head->is_dentry) { + struct ceph_vino *ptvino = NULL; + + if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || + le32_to_cpu(rinfo->dlease->duration_ms)) { + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (rinfo->head->is_target) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + ptvino = &tvino; + } + + update_dentry_lease(req->r_dentry, rinfo->dlease, + session, req->r_request_started, ptvino, + &dvino); + } else { + dout("%s: no dentry lease or dir cap\n", __func__); + } } done: dout("fill_trace done err=%d\n", err); @@ -1478,7 +1479,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; - if (req->r_aborted) + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) return readdir_prepopulate_inodes_only(req, session); if (rinfo->hash_order && req->r_path2) { @@ -1523,14 +1524,14 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - struct ceph_vino vino; + struct ceph_vino tvino, dvino; dname.name = rde->name; dname.len = rde->name_len; dname.hash = full_name_hash(parent, dname.name, dname.len); - vino.ino = le64_to_cpu(rde->inode.in->ino); - vino.snap = le64_to_cpu(rde->inode.in->snapid); + tvino.ino = le64_to_cpu(rde->inode.in->ino); + tvino.snap = le64_to_cpu(rde->inode.in->snapid); if (rinfo->hash_order) { u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, @@ -1559,8 +1560,8 @@ retry_lookup: goto out; } } else if (d_really_is_positive(dn) && - (ceph_ino(d_inode(dn)) != vino.ino || - ceph_snap(d_inode(dn)) != vino.snap)) { + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); d_delete(dn); @@ -1572,7 +1573,7 @@ retry_lookup: if (d_really_is_positive(dn)) { in = d_inode(dn); } else { - in = ceph_get_inode(parent->d_sb, vino); + in = ceph_get_inode(parent->d_sb, tvino); if (IS_ERR(in)) { dout("new_inode badness\n"); d_drop(dn); @@ -1617,8 +1618,9 @@ retry_lookup: ceph_dentry(dn)->offset = rde->offset; + dvino = ceph_vino(d_inode(parent)); update_dentry_lease(dn, rde->lease, req->r_session, - req->r_request_started); + req->r_request_started, &tvino, &dvino); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { ret = fill_readdir_cache(d_inode(parent), dn, @@ -1632,7 +1634,7 @@ next_item: } out: if (err == 0 && skipped == 0) { - req->r_did_prepopulate = true; + set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags); req->r_readdir_cache_idx = cache_ctl.index; } ceph_readdir_cache_release(&cache_ctl); @@ -1720,7 +1722,7 @@ static void ceph_invalidate_work(struct work_struct *work) mutex_lock(&ci->i_truncate_mutex); - if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", inode, ceph_ino(inode)); mapping_set_error(inode->i_mapping, -EIO); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 7d752d53353a..4c9c72f26eb9 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) l.stripe_count = ci->i_layout.stripe_count; l.object_size = ci->i_layout.object_size; l.data_pool = ci->i_layout.pool_id; - l.preferred_osd = (s32)-1; + l.preferred_osd = -1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; } @@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) nl.data_pool = ci->i_layout.pool_id; /* this is obsolete, and always -1 */ - nl.preferred_osd = le64_to_cpu(-1); + nl.preferred_osd = -1; err = __validate_layout(mdsc, &nl); if (err) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c9d2e553a6c4..c681762d76e6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -547,8 +547,8 @@ void ceph_mdsc_release_request(struct kref *kref) ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } - if (req->r_locked_dir) - ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_parent) + ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); @@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + /* Never leave an unregistered request on an unsafe list! */ + list_del_init(&req->r_unsafe_item); + if (req->r_tid == mdsc->oldest_tid) { struct rb_node *p = rb_next(&req->r_node); mdsc->oldest_tid = 0; @@ -644,13 +647,15 @@ static void __unregister_request(struct ceph_mds_client *mdsc, erase_request(&mdsc->request_tree, req); - if (req->r_unsafe_dir && req->r_got_unsafe) { + if (req->r_unsafe_dir && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); } - if (req->r_target_inode && req->r_got_unsafe) { + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_target_item); @@ -668,6 +673,28 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } /* + * Walk back up the dentry tree until we hit a dentry representing a + * non-snapshot inode. We do this using the rcu_read_lock (which must be held + * when calling this) to ensure that the objects won't disappear while we're + * working with them. Once we hit a candidate dentry, we attempt to take a + * reference to it, and return that as the result. + */ +static struct inode *get_nonsnap_parent(struct dentry *dentry) +{ + struct inode *inode = NULL; + + while (dentry && !IS_ROOT(dentry)) { + inode = d_inode_rcu(dentry); + if (!inode || ceph_snap(inode) == CEPH_NOSNAP) + break; + dentry = dentry->d_parent; + } + if (inode) + inode = igrab(inode); + return inode; +} + +/* * Choose mds to send request to next. If there is a hint set in the * request (e.g., due to a prior forward hint from the mds), use that. * Otherwise, consult frag tree and/or caps to identify the @@ -675,19 +702,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ -static struct dentry *get_nonsnap_parent(struct dentry *dentry) -{ - /* - * we don't need to worry about protecting the d_parent access - * here because we never renaming inside the snapped namespace - * except to resplice to another snapdir, and either the old or new - * result is a valid result. - */ - while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) - dentry = dentry->d_parent; - return dentry; -} - static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { @@ -697,7 +711,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, int mode = req->r_direct_mode; int mds = -1; u32 hash = req->r_direct_hash; - bool is_hash = req->r_direct_is_hash; + bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); /* * is there a specific mds we should try? ignore hint if we have @@ -717,30 +731,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = NULL; if (req->r_inode) { inode = req->r_inode; + ihold(inode); } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ - struct dentry *parent = req->r_dentry->d_parent; - struct inode *dir = d_inode(parent); + struct dentry *parent; + struct inode *dir; + + rcu_read_lock(); + parent = req->r_dentry->d_parent; + dir = req->r_parent ? : d_inode_rcu(parent); - if (dir->i_sb != mdsc->fsc->sb) { - /* not this fs! */ + if (!dir || dir->i_sb != mdsc->fsc->sb) { + /* not this fs or parent went negative */ inode = d_inode(req->r_dentry); + if (inode) + ihold(inode); } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ - struct dentry *dn = get_nonsnap_parent(parent); - inode = d_inode(dn); + inode = get_nonsnap_parent(parent); dout("__choose_mds using nonsnap parent %p\n", inode); } else { /* dentry target */ inode = d_inode(req->r_dentry); if (!inode || mode == USE_AUTH_MDS) { /* dir + name */ - inode = dir; + inode = igrab(dir); hash = ceph_dentry_hash(dir, req->r_dentry); is_hash = true; + } else { + ihold(inode); } } + rcu_read_unlock(); } dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, @@ -769,7 +792,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) - return mds; + goto out; } /* since this file/dir wasn't known to be @@ -784,7 +807,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode, ceph_vinop(inode), frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) - return mds; + goto out; } } } @@ -797,6 +820,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { spin_unlock(&ci->i_ceph_lock); + iput(inode); goto random; } mds = cap->session->s_mds; @@ -804,6 +828,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode, ceph_vinop(inode), mds, cap == ci->i_auth_cap ? "auth " : "", cap); spin_unlock(&ci->i_ceph_lock); +out: + iput(inode); return mds; random: @@ -1036,7 +1062,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, while (!list_empty(&session->s_unsafe)) { req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); - list_del_init(&req->r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); __unregister_request(mdsc, req); @@ -1146,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; if (ci->i_wrbuffer_ref > 0 && - ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) invalidate = true; while (!list_empty(&ci->i_cap_flush_list)) { @@ -1775,18 +1800,23 @@ retry: return path; } -static int build_dentry_path(struct dentry *dentry, +static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, int *pfreepath) { char *path; - if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { - *pino = ceph_ino(d_inode(dentry->d_parent)); + rcu_read_lock(); + if (!dir) + dir = d_inode_rcu(dentry->d_parent); + if (dir && ceph_snap(dir) == CEPH_NOSNAP) { + *pino = ceph_ino(dir); + rcu_read_unlock(); *ppath = dentry->d_name.name; *ppathlen = dentry->d_name.len; return 0; } + rcu_read_unlock(); path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); if (IS_ERR(path)) return PTR_ERR(path); @@ -1822,8 +1852,8 @@ static int build_inode_path(struct inode *inode, * an explicit ino+path. */ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, - const char *rpath, u64 rino, - const char **ppath, int *pathlen, + struct inode *rdiri, const char *rpath, + u64 rino, const char **ppath, int *pathlen, u64 *ino, int *freepath) { int r = 0; @@ -1833,7 +1863,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); + r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, + freepath); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { @@ -1866,7 +1897,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, int ret; ret = set_request_path_attr(req->r_inode, req->r_dentry, - req->r_path1, req->r_ino1.ino, + req->r_parent, req->r_path1, req->r_ino1.ino, &path1, &pathlen1, &ino1, &freepath1); if (ret < 0) { msg = ERR_PTR(ret); @@ -1874,6 +1905,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, } ret = set_request_path_attr(NULL, req->r_old_dentry, + req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, &path2, &pathlen2, &ino2, &freepath2); if (ret < 0) { @@ -1927,10 +1959,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_inode_drop, req->r_inode_unless, 0); if (req->r_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_dentry, - mds, req->r_dentry_drop, req->r_dentry_unless); + req->r_parent, mds, req->r_dentry_drop, + req->r_dentry_unless); if (req->r_old_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_old_dentry, - mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + req->r_old_dentry_dir, mds, + req->r_old_dentry_drop, + req->r_old_dentry_unless); if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, d_inode(req->r_old_dentry), @@ -2012,7 +2047,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); - if (req->r_got_unsafe) { + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; /* * Replay. Do not regenerate message (and rebuild @@ -2061,16 +2096,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead = msg->front.iov_base; rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; - if (req->r_locked_dir) + if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; rhead->ino = 0; - dout(" r_locked_dir = %p\n", req->r_locked_dir); + dout(" r_parent = %p\n", req->r_parent); return 0; } @@ -2084,8 +2119,8 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = 0; - if (req->r_err || req->r_got_result) { - if (req->r_aborted) + if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) __unregister_request(mdsc, req); goto out; } @@ -2096,12 +2131,12 @@ static int __do_request(struct ceph_mds_client *mdsc, err = -EIO; goto finish; } - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout("do_request forced umount\n"); err = -EIO; goto finish; } - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { if (mdsc->mdsmap_err) { err = mdsc->mdsmap_err; dout("do_request mdsmap err %d\n", err); @@ -2215,7 +2250,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts > 0) continue; /* only new requests */ @@ -2250,11 +2285,11 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, dout("do_request on %p\n", req); - /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ + /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - if (req->r_locked_dir) - ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); + if (req->r_parent) + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); @@ -2289,7 +2324,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); /* only abort if we didn't race with a real reply */ - if (req->r_got_result) { + if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { err = le32_to_cpu(req->r_reply_info.head->result); } else if (err < 0) { dout("aborted request %lld with %d\n", req->r_tid, err); @@ -2301,10 +2336,10 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, */ mutex_lock(&req->r_fill_mutex); req->r_err = err; - req->r_aborted = true; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); - if (req->r_locked_dir && + if (req->r_parent && (req->r_op & CEPH_MDS_OP_WRITE)) ceph_invalidate_dir_request(req); } else { @@ -2323,7 +2358,7 @@ out: */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { - struct inode *inode = req->r_locked_dir; + struct inode *inode = req->r_parent; dout("invalidate_dir_request %p (complete, lease(s))\n", inode); @@ -2379,14 +2414,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* dup? */ - if ((req->r_got_unsafe && !head->safe) || - (req->r_got_safe && head->safe)) { + if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) || + (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) { pr_warn("got a dup %s reply on %llu from mds%d\n", head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } - if (req->r_got_safe) { + if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) { pr_warn("got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); @@ -2425,10 +2460,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { - req->r_got_safe = true; + set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); - if (req->r_got_unsafe) { + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { /* * We already handled the unsafe response, now do the * cleanup. No need to examine the response; the MDS @@ -2437,7 +2472,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) * useful we could do with a revised return value. */ dout("got safe reply %llu, mds%d\n", tid, mds); - list_del_init(&req->r_unsafe_item); /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) @@ -2446,7 +2480,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) goto out; } } else { - req->r_got_unsafe = true; + set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); if (req->r_unsafe_dir) { struct ceph_inode_info *ci = @@ -2486,7 +2520,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); current->journal_info = req; - err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); + err = ceph_fill_trace(mdsc->fsc->sb, req); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) @@ -2500,7 +2534,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (realm) ceph_put_snap_realm(mdsc, realm); - if (err == 0 && req->r_got_unsafe && req->r_target_inode) { + if (err == 0 && req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); @@ -2508,12 +2543,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } out_err: mutex_lock(&mdsc->mutex); - if (!req->r_aborted) { + if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { if (err) { req->r_err = err; } else { req->r_reply = ceph_msg_get(msg); - req->r_got_result = true; + set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags); } } else { dout("reply arrived after request %lld was aborted\n", tid); @@ -2557,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, goto out; /* dup reply? */ } - if (req->r_aborted) { + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { dout("forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd) { @@ -2567,7 +2602,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, /* resend. forward race not possible; mds would drop */ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); - BUG_ON(req->r_got_result); + BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)); req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; @@ -2732,7 +2767,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); - if (req->r_got_unsafe) + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts == 0) continue; /* only old requests */ @@ -3556,7 +3591,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -3587,7 +3622,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) */ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) { - if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return true; return atomic_read(&mdsc->num_sessions) <= skipped; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3c6f77b7bb02..ac0475a2daa7 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -202,9 +202,18 @@ struct ceph_mds_request { char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; - struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ + struct inode *r_parent; /* parent dir inode */ struct inode *r_target_inode; /* resulting inode */ +#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ +#define CEPH_MDS_R_ABORTED (2) /* call was aborted */ +#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ +#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */ +#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ +#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ +#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ + unsigned long r_req_flags; + struct mutex r_fill_mutex; union ceph_mds_request_args r_args; @@ -216,7 +225,6 @@ struct ceph_mds_request { /* for choosing which mds to send this request to */ int r_direct_mode; u32 r_direct_hash; /* choose dir frag based on this dentry hash */ - bool r_direct_is_hash; /* true if r_direct_hash is valid */ /* data payload is used for xattr ops */ struct ceph_pagelist *r_pagelist; @@ -234,7 +242,6 @@ struct ceph_mds_request { struct ceph_mds_reply_info_parsed r_reply_info; struct page *r_locked_page; int r_err; - bool r_aborted; unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ @@ -262,9 +269,7 @@ struct ceph_mds_request { ceph_mds_request_callback_t r_callback; ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ - bool r_got_unsafe, r_got_safe, r_got_result; - bool r_did_prepopulate; long long r_dir_release_cnt; long long r_dir_ordered_cnt; int r_readdir_cache_idx; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6bd20d707bfd..0ec8d0114e57 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -757,7 +757,6 @@ static const struct super_operations ceph_super_ops = { .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, .drop_inode = ceph_drop_inode, - .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, @@ -952,6 +951,14 @@ static int ceph_register_bdi(struct super_block *sb, fsc->backing_dev_info.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + if (fsc->mount_options->rsize > fsc->mount_options->rasize && + fsc->mount_options->rsize >= PAGE_SIZE) + fsc->backing_dev_info.io_pages = + (fsc->mount_options->rsize + PAGE_SIZE - 1) + >> PAGE_SHIFT; + else if (fsc->mount_options->rsize == 0) + fsc->backing_dev_info.io_pages = ULONG_MAX; + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); if (!err) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3373b61faefd..e9410bcf4113 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -45,8 +45,8 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT 0 /* max read size */ -#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ +#define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" @@ -343,7 +343,6 @@ struct ceph_inode_info { u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ - struct list_head i_unsafe_writes; /* uncommitted sync writes */ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; @@ -602,7 +601,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) } /* what the mds thinks we want */ -extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); +extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); @@ -753,7 +752,6 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); extern int ceph_drop_inode(struct inode *inode); -extern void ceph_evict_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -764,8 +762,7 @@ extern void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec *ctime, struct timespec *mtime, struct timespec *atime); extern int ceph_fill_trace(struct super_block *sb, - struct ceph_mds_request *req, - struct ceph_mds_session *session); + struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session); @@ -904,6 +901,7 @@ extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force); extern int ceph_encode_dentry_release(void **p, struct dentry *dn, + struct inode *dir, int mds, int drop, int unless); extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, @@ -933,7 +931,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); -extern void ceph_sync_write_wait(struct inode *inode); + /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1c13dd80744f..7e4ea3b9f472 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -322,6 +322,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; + if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin6); } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index eb094c6011d8..fd0284c1dc32 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -1083,7 +1083,8 @@ struct svc_version nfs4_callback_version1 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, - .vs_hidden = 1, + .vs_hidden = true, + .vs_need_cong_ctrl = true, }; struct svc_version nfs4_callback_version4 = { @@ -1092,5 +1093,6 @@ struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, - .vs_hidden = 1, + .vs_hidden = true, + .vs_need_cong_ctrl = true, }; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 43e109cc0ccc..e71f11b1a180 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1102,6 +1102,7 @@ static struct flags { { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, + { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index d08cd88155c7..838f90f3f890 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -376,5 +376,4 @@ struct svc_version nfsd_acl_version2 = { .vs_proc = nfsd_acl_procedures2, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 0c890347cde3..dcb5f79076c0 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -266,6 +266,5 @@ struct svc_version nfsd_acl_version3 = { .vs_proc = nfsd_acl_procedures3, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index d818e4ffd79f..045c9081eabe 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -193,11 +193,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nfserr = nfsd_write(rqstp, &resp->fh, NULL, - argp->offset, - rqstp->rq_vec, argp->vlen, - &cnt, - &resp->committed); + nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, argp->vlen, + &cnt, resp->committed); resp->count = cnt; RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index eb78109d666c..0274db6e65d0 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -303,6 +303,7 @@ static int decode_cb_compound4res(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, length + 4); if (unlikely(p == NULL)) goto out_overflow; + p += XDR_QUADLEN(length); hdr->nops = be32_to_cpup(p); return 0; out_overflow: @@ -396,13 +397,10 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, struct nfsd4_callback *cb) { struct nfsd4_session *session = cb->cb_clp->cl_cb_session; - struct nfs4_sessionid id; - int status; + int status = -ESERVERFAULT; __be32 *p; u32 dummy; - status = -ESERVERFAULT; - /* * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. @@ -410,9 +408,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); if (unlikely(p == NULL)) goto out_overflow; - memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); - if (memcmp(id.data, session->se_sessionid.data, - NFS4_MAX_SESSIONID_LEN) != 0) { + + if (memcmp(p, session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s Invalid session id\n", __func__); goto out; } @@ -753,6 +750,14 @@ int set_callback_cred(void) return 0; } +void cleanup_callback_cred(void) +{ + if (callback_cred) { + put_rpccred(callback_cred); + callback_cred = NULL; + } +} + static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) { if (clp->cl_minorversion == 0) { diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5b20577dcdd2..6b9b6cca469f 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -628,6 +628,10 @@ nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, { __be32 status; u32 id = -1; + + if (name == NULL || namelen == 0) + return nfserr_inval; + status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); *uid = make_kuid(&init_user_ns, id); if (!uid_valid(*uid)) @@ -641,6 +645,10 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, { __be32 status; u32 id = -1; + + if (name == NULL || namelen == 0) + return nfserr_inval; + status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); *gid = make_kgid(&init_user_ns, id); if (!gid_valid(*gid)) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 74a6e573e061..cbeeda1e94a2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -95,11 +95,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u32 *bmval, u32 *writable) { struct dentry *dentry = cstate->current_fh.fh_dentry; + struct svc_export *exp = cstate->current_fh.fh_export; if (!nfsd_attrs_supported(cstate->minorversion, bmval)) return nfserr_attrnotsupp; if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry))) return nfserr_attrnotsupp; + if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) && + !(exp->ex_flags & NFSEXP_SECURITY_LABEL)) + return nfserr_attrnotsupp; if (writable && !bmval_is_subset(bmval, writable)) return nfserr_inval; if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) && @@ -983,7 +987,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, write->wr_offset, rqstp->rq_vec, nvecs, &cnt, - &write->wr_how_written); + write->wr_how_written); fput(filp); write->wr_bytes_written = cnt; @@ -1838,6 +1842,12 @@ static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); } +static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + /* ac_supported, ac_resp_access */ + return (op_encode_hdr_size + 2)* sizeof(__be32); +} + static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); @@ -1892,6 +1902,11 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, return ret; } +static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE; +} + static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) @@ -1933,6 +1948,11 @@ static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o XDR_QUADLEN(rlen)) * sizeof(__be32); } +static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE; +} + static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) @@ -1952,11 +1972,23 @@ static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); } +static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids) + * sizeof(__be32); +} + static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR * + (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32); +} + static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * @@ -2011,6 +2043,19 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) } #ifdef CONFIG_NFSD_PNFS +static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + u32 maxcount = 0, rlen = 0; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount); + + return (op_encode_hdr_size + + 1 /* gd_layout_type*/ + + XDR_QUADLEN(rlen) + + 2 /* gd_notify_types */) * sizeof(__be32); +} + /* * At this stage we don't really know what layout driver will handle the request, * so we need to define an arbitrary upper bound here. @@ -2040,10 +2085,17 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_ } #endif /* CONFIG_NFSD_PNFS */ + +static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + 3) * sizeof(__be32); +} + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, .op_name = "OP_ACCESS", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_access_rsize, }, [OP_CLOSE] = { .op_func = (nfsd4op_func)nfsd4_close, @@ -2081,6 +2133,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_GETFH] = { .op_func = (nfsd4op_func)nfsd4_getfh, .op_name = "OP_GETFH", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_getfh_rsize, }, [OP_LINK] = { .op_func = (nfsd4op_func)nfsd4_link, @@ -2099,6 +2152,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_LOCKT] = { .op_func = (nfsd4op_func)nfsd4_lockt, .op_name = "OP_LOCKT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, }, [OP_LOCKU] = { .op_func = (nfsd4op_func)nfsd4_locku, @@ -2111,15 +2165,18 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_lookup, .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, .op_name = "OP_LOOKUP", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_LOOKUPP] = { .op_func = (nfsd4op_func)nfsd4_lookupp, .op_flags = OP_HANDLES_WRONGSEC | OP_CLEAR_STATEID, .op_name = "OP_LOOKUPP", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_NVERIFY] = { .op_func = (nfsd4op_func)nfsd4_nverify, .op_name = "OP_NVERIFY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_OPEN] = { .op_func = (nfsd4op_func)nfsd4_open, @@ -2177,6 +2234,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_READLINK] = { .op_func = (nfsd4op_func)nfsd4_readlink, .op_name = "OP_READLINK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_readlink_rsize, }, [OP_REMOVE] = { .op_func = (nfsd4op_func)nfsd4_remove, @@ -2215,6 +2273,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_secinfo, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize, }, [OP_SETATTR] = { .op_func = (nfsd4op_func)nfsd4_setattr, @@ -2240,6 +2299,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_VERIFY] = { .op_func = (nfsd4op_func)nfsd4_verify, .op_name = "OP_VERIFY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_WRITE] = { .op_func = (nfsd4op_func)nfsd4_write, @@ -2314,11 +2374,13 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_secinfo_rsize, }, [OP_TEST_STATEID] = { .op_func = (nfsd4op_func)nfsd4_test_stateid, .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_TEST_STATEID", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_test_stateid_rsize, }, [OP_FREE_STATEID] = { .op_func = (nfsd4op_func)nfsd4_free_stateid, @@ -2332,6 +2394,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_getdeviceinfo, .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_GETDEVICEINFO", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_getdeviceinfo_rsize, }, [OP_LAYOUTGET] = { .op_func = (nfsd4op_func)nfsd4_layoutget, @@ -2381,6 +2444,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_seek_rsize, }, }; @@ -2425,14 +2489,11 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) { - struct nfsd4_operation *opdesc; - nfsd4op_rsize estimator; - if (op->opnum == OP_ILLEGAL) return op_encode_hdr_size * sizeof(__be32); - opdesc = OPDESC(op); - estimator = opdesc->op_rsize_bop; - return estimator ? estimator(rqstp, op) : PAGE_SIZE; + + BUG_ON(OPDESC(op)->op_rsize_bop == NULL); + return OPDESC(op)->op_rsize_bop(rqstp, op); } void warn_on_nonidempotent_op(struct nfsd4_op *op) @@ -2476,12 +2537,13 @@ static struct svc_procedure nfsd_procedures4[2] = { }; struct svc_version nfsd_version4 = { - .vs_vers = 4, - .vs_nproc = 2, - .vs_proc = nfsd_procedures4, - .vs_dispatch = nfsd_dispatch, - .vs_xdrsize = NFS4_SVC_XDRSIZE, - .vs_rpcb_optnl = 1, + .vs_vers = 4, + .vs_nproc = 2, + .vs_proc = nfsd_procedures4, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = true, + .vs_need_cong_ctrl = true, }; /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a0dee8ae9f97..e9ef50addddb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2281,7 +2281,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r out_err: conn->cb_addr.ss_family = AF_UNSPEC; conn->cb_addrlen = 0; - dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " + dprintk("NFSD: this client (clientid %08x/%08x) " "will not receive delegations\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -7012,23 +7012,24 @@ nfs4_state_start(void) ret = set_callback_cred(); if (ret) - return -ENOMEM; + return ret; + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); if (laundry_wq == NULL) { ret = -ENOMEM; - goto out_recovery; + goto out_cleanup_cred; } ret = nfsd4_create_callback_queue(); if (ret) goto out_free_laundry; set_max_delegations(); - return 0; out_free_laundry: destroy_workqueue(laundry_wq); -out_recovery: +out_cleanup_cred: + cleanup_callback_cred(); return ret; } @@ -7086,6 +7087,7 @@ nfs4_state_shutdown(void) { destroy_workqueue(laundry_wq); nfsd4_destroy_callback_queue(); + cleanup_callback_cred(); } static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8fae53ce21d1..382c1fd05b4c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -58,7 +58,7 @@ #define NFSDDBG_FACILITY NFSDDBG_XDR -u32 nfsd_suppattrs[3][3] = { +const u32 nfsd_suppattrs[3][3] = { {NFSD4_SUPPORTED_ATTRS_WORD0, NFSD4_SUPPORTED_ATTRS_WORD1, NFSD4_SUPPORTED_ATTRS_WORD2}, @@ -1250,7 +1250,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) READ_BUF(16); p = xdr_decode_hyper(p, &write->wr_offset); write->wr_stable_how = be32_to_cpup(p++); - if (write->wr_stable_how > 2) + if (write->wr_stable_how > NFS_FILE_SYNC) goto xdr_error; write->wr_buflen = be32_to_cpup(p++); @@ -1941,12 +1941,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } else max_reply += nfsd4_max_reply(argp->rqstp, op); /* - * OP_LOCK may return a conflicting lock. (Special case - * because it will just skip encoding this if it runs - * out of xdr buffer space, and it is the only operation - * that behaves this way.) + * OP_LOCK and OP_LOCKT may return a conflicting lock. + * (Special case because it will just skip encoding this + * if it runs out of xdr buffer space, and it is the only + * operation that behaves this way.) */ - if (op->opnum == OP_LOCK) + if (op->opnum == OP_LOCK || op->opnum == OP_LOCKT) max_reply += NFS4_OPAQUE_LIMIT; if (op->status) { @@ -1966,9 +1966,13 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) DECODE_TAIL; } -static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode) +static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, + struct svc_export *exp) { - if (IS_I_VERSION(inode)) { + if (exp->ex_flags & NFSEXP_V4ROOT) { + *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); + *p++ = 0; + } else if (IS_I_VERSION(inode)) { p = xdr_encode_hyper(p, inode->i_version); } else { *p++ = cpu_to_be32(stat->ctime.tv_sec); @@ -2417,8 +2421,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - err = security_inode_getsecctx(d_inode(dentry), + if (exp->ex_flags & NFSEXP_SECURITY_LABEL) + err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); + else + err = -EOPNOTSUPP; contextsupport = (err == 0); if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) @@ -2490,7 +2497,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; - p = encode_change(p, &stat, d_inode(dentry)); + p = encode_change(p, &stat, d_inode(dentry), exp); } if (bmval0 & FATTR4_WORD0_SIZE) { p = xdr_reserve_space(xdr, 8); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f3b2f34b10a3..73e75ac90525 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -536,6 +536,19 @@ out_free: return rv; } +static ssize_t +nfsd_print_version_support(char *buf, int remaining, const char *sep, + unsigned vers, unsigned minor) +{ + const char *format = (minor == 0) ? "%s%c%u" : "%s%c%u.%u"; + bool supported = !!nfsd_vers(vers, NFSD_TEST); + + if (vers == 4 && !nfsd_minorversion(minor, NFSD_TEST)) + supported = false; + return snprintf(buf, remaining, format, sep, + supported ? '+' : '-', vers, minor); +} + static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; @@ -561,6 +574,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) len = qword_get(&mesg, vers, size); if (len <= 0) return -EINVAL; do { + enum vers_op cmd; sign = *vers; if (sign == '+' || sign == '-') num = simple_strtol((vers+1), &minorp, 0); @@ -569,24 +583,22 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) if (*minorp == '.') { if (num != 4) return -EINVAL; - minor = simple_strtoul(minorp+1, NULL, 0); - if (minor == 0) - return -EINVAL; - if (nfsd_minorversion(minor, sign == '-' ? - NFSD_CLEAR : NFSD_SET) < 0) + if (kstrtouint(minorp+1, 0, &minor) < 0) return -EINVAL; - goto next; - } + } else + minor = 0; + cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET; switch(num) { case 2: case 3: - case 4: - nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET); + nfsd_vers(num, cmd); break; + case 4: + if (nfsd_minorversion(minor, cmd) >= 0) + break; default: return -EINVAL; } - next: vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); /* If all get turned off, turn them back on, as @@ -599,35 +611,23 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) len = 0; sep = ""; remaining = SIMPLE_TRANSACTION_LIMIT; - for (num=2 ; num <= 4 ; num++) - if (nfsd_vers(num, NFSD_AVAIL)) { - len = snprintf(buf, remaining, "%s%c%d", sep, - nfsd_vers(num, NFSD_TEST)?'+':'-', - num); - sep = " "; - - if (len >= remaining) - break; - remaining -= len; - buf += len; - tlen += len; - } - if (nfsd_vers(4, NFSD_AVAIL)) - for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; - minor++) { - len = snprintf(buf, remaining, " %c4.%u", - (nfsd_vers(4, NFSD_TEST) && - nfsd_minorversion(minor, NFSD_TEST)) ? - '+' : '-', - minor); - + for (num=2 ; num <= 4 ; num++) { + if (!nfsd_vers(num, NFSD_AVAIL)) + continue; + minor = 0; + do { + len = nfsd_print_version_support(buf, remaining, + sep, num, minor); if (len >= remaining) - break; + goto out; remaining -= len; buf += len; tlen += len; - } - + minor++; + sep = " "; + } while (num == 4 && minor <= NFSD_SUPPORTED_MINOR_VERSION); + } +out: len = snprintf(buf, remaining, "\n"); if (len >= remaining) return -EINVAL; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d74c8c44dc35..d96606801d47 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -362,16 +362,16 @@ void nfsd_lockd_shutdown(void); FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS) -extern u32 nfsd_suppattrs[3][3]; +extern const u32 nfsd_suppattrs[3][3]; -static inline bool bmval_is_subset(u32 *bm1, u32 *bm2) +static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2) { return !((bm1[0] & ~bm2[0]) || (bm1[1] & ~bm2[1]) || (bm1[2] & ~bm2[2])); } -static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval) +static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) { return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]); } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 010aff5c5a79..fa82b7707e85 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -204,18 +204,14 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, struct nfsd_attrstat *resp) { __be32 nfserr; - int stable = 1; unsigned long cnt = argp->len; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, - argp->offset, - rqstp->rq_vec, argp->vlen, - &cnt, - &stable); + nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, + rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC); return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e6bfd96734c0..efd66da99201 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -153,6 +153,18 @@ int nfsd_vers(int vers, enum vers_op change) return 0; } +static void +nfsd_adjust_nfsd_versions4(void) +{ + unsigned i; + + for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { + if (nfsd_supported_minorversions[i]) + return; + } + nfsd_vers(4, NFSD_CLEAR); +} + int nfsd_minorversion(u32 minorversion, enum vers_op change) { if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) @@ -160,9 +172,11 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change) switch(change) { case NFSD_SET: nfsd_supported_minorversions[minorversion] = true; + nfsd_vers(4, NFSD_SET); break; case NFSD_CLEAR: nfsd_supported_minorversions[minorversion] = false; + nfsd_adjust_nfsd_versions4(); break; case NFSD_TEST: return nfsd_supported_minorversions[minorversion]; @@ -354,6 +368,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; + if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4516e8b7d776..005c911b34ac 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -615,6 +615,7 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, extern __be32 nfs4_check_open_reclaim(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn); extern int set_callback_cred(void); +extern void cleanup_callback_cred(void); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 26c6fdb4bf67..19d50f600e8d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -377,7 +377,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, __be32 err; int host_err; bool get_write_count; - int size_change = 0; + bool size_change = (iap->ia_valid & ATTR_SIZE); if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; @@ -390,11 +390,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* Get inode */ err = fh_verify(rqstp, fhp, ftype, accmode); if (err) - goto out; + return err; if (get_write_count) { host_err = fh_want_write(fhp); if (host_err) - return nfserrno(host_err); + goto out; } dentry = fhp->fh_dentry; @@ -405,20 +405,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~ATTR_MODE; if (!iap->ia_valid) - goto out; + return 0; nfsd_sanitize_attrs(inode, iap); + if (check_guard && guardtime != inode->i_ctime.tv_sec) + return nfserr_notsync; + /* * The size case is special, it changes the file in addition to the - * attributes. + * attributes, and file systems don't expect it to be mixed with + * "random" attribute changes. We thus split out the size change + * into a separate call to ->setattr, and do the rest as a separate + * setattr call. */ - if (iap->ia_valid & ATTR_SIZE) { + if (size_change) { err = nfsd_get_write_access(rqstp, fhp, iap); if (err) - goto out; - size_change = 1; + return err; + } + fh_lock(fhp); + if (size_change) { /* * RFC5661, Section 18.30.4: * Changing the size of a file with SETATTR indirectly @@ -426,29 +434,36 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * * (and similar for the older RFCs) */ - if (iap->ia_size != i_size_read(inode)) - iap->ia_valid |= ATTR_MTIME; - } + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; - iap->ia_valid |= ATTR_CTIME; + host_err = notify_change(dentry, &size_attr, NULL); + if (host_err) + goto out_unlock; + iap->ia_valid &= ~ATTR_SIZE; - if (check_guard && guardtime != inode->i_ctime.tv_sec) { - err = nfserr_notsync; - goto out_put_write_access; + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + goto out_unlock; } - fh_lock(fhp); + iap->ia_valid |= ATTR_CTIME; host_err = notify_change(dentry, iap, NULL); - fh_unlock(fhp); - err = nfserrno(host_err); -out_put_write_access: +out_unlock: + fh_unlock(fhp); if (size_change) put_write_access(inode); - if (!err) - err = nfserrno(commit_metadata(fhp)); out: - return err; + if (!host_err) + host_err = commit_metadata(fhp); + return nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) @@ -940,14 +955,12 @@ static int wait_for_concurrent_writes(struct file *file) __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *cnt, int *stablep) + unsigned long *cnt, int stable) { struct svc_export *exp; - struct inode *inode; mm_segment_t oldfs; __be32 err = 0; int host_err; - int stable = *stablep; int use_wgather; loff_t pos = offset; unsigned int pflags = current->flags; @@ -962,13 +975,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, */ current->flags |= PF_LESS_THROTTLE; - inode = file_inode(file); - exp = fhp->fh_export; - + exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); if (!EX_ISSYNC(exp)) - stable = 0; + stable = NFS_UNSTABLE; if (stable && !use_wgather) flags |= RWF_SYNC; @@ -1035,35 +1046,22 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, * N.B. After this call fhp needs an fh_put */ __be32 -nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, - int *stablep) +nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, + struct kvec *vec, int vlen, unsigned long *cnt, int stable) { - __be32 err = 0; + struct file *file = NULL; + __be32 err = 0; trace_write_start(rqstp, fhp, offset, vlen); - if (file) { - err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - trace_write_opened(rqstp, fhp, offset, vlen); - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, - stablep); - trace_write_io_done(rqstp, fhp, offset, vlen); - } else { - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); - if (err) - goto out; + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + if (err) + goto out; - trace_write_opened(rqstp, fhp, offset, vlen); - if (cnt) - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, - cnt, stablep); - trace_write_io_done(rqstp, fhp, offset, vlen); - fput(file); - } + trace_write_opened(rqstp, fhp, offset, vlen); + err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); + trace_write_io_done(rqstp, fhp, offset, vlen); + fput(file); out: trace_write_done(rqstp, fhp, offset, vlen); return err; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 0bf9e7bf5800..db98c48c735a 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -83,12 +83,12 @@ __be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); -__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, - loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, + struct kvec *, int, unsigned long *, int); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, - int *stablep); + int stable); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 03a6653d329a..2ea0c282f3dc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -22,7 +22,6 @@ struct ceph_osd_client; * completion callback for async writepages */ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); -typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); #define CEPH_HOMELESS_OSD -1 @@ -170,15 +169,12 @@ struct ceph_osd_request { unsigned int r_num_ops; int r_result; - bool r_got_reply; struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; - struct completion r_completion; - struct completion r_done_completion; /* fsync waiter */ + struct completion r_completion; /* private to osd_client.c */ ceph_osdc_callback_t r_callback; - ceph_osdc_unsafe_callback_t r_unsafe_callback; struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 9a9041784dcf..938656f70807 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) case CEPH_POOL_TYPE_EC: return false; default: - BUG_ON(1); + BUG(); } } @@ -82,13 +82,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest, void ceph_oloc_destroy(struct ceph_object_locator *oloc); /* - * Maximum supported by kernel client object name length - * - * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100) - */ -#define CEPH_MAX_OID_NAME_LEN 100 - -/* * 51-char inline_name is long enough for all cephfs and all but one * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all @@ -173,8 +166,8 @@ struct ceph_osdmap { * the list of osds that store+replicate them. */ struct crush_map *crush; - struct mutex crush_scratch_mutex; - int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; + struct mutex crush_workspace_mutex; + void *crush_workspace; }; static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 5c0da61cb763..5d0018782d50 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -50,7 +50,7 @@ struct ceph_timespec { #define CEPH_PG_LAYOUT_LINEAR 2 #define CEPH_PG_LAYOUT_HYBRID 3 -#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ +#define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */ /* * placement group. diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 811f7a915658..76e28c229805 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -197,6 +197,17 @@ #endif #endif +#ifdef CONFIG_STACK_VALIDATION +#define annotate_unreachable() ({ \ + asm("%c0:\t\n" \ + ".pushsection __unreachable, \"a\"\t\n" \ + ".long %c0b\t\n" \ + ".popsection\t\n" : : "i" (__LINE__)); \ +}) +#else +#define annotate_unreachable() +#endif + /* * Mark a position in code as unreachable. This can be used to * suppress control flow warnings after asm blocks that transfer @@ -206,7 +217,8 @@ * this in the preprocessor, but we can live with this because they're * unreleased. Really, we need to have autoconf for the kernel. */ -#define unreachable() __builtin_unreachable() +#define unreachable() \ + do { annotate_unreachable(); __builtin_unreachable(); } while (0) /* Mark a function definition as prohibited from being cloned. */ #define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index be8f12b8f195..fbecbd089d75 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -135,13 +135,6 @@ struct crush_bucket { __u32 size; /* num items */ __s32 *items; - /* - * cached random permutation: used for uniform bucket and for - * the linear search fallback for the other bucket types. - */ - __u32 perm_x; /* @x for which *perm is defined */ - __u32 perm_n; /* num elements of *perm that are permuted/defined */ - __u32 *perm; }; struct crush_bucket_uniform { @@ -211,6 +204,21 @@ struct crush_map { * device fails. */ __u8 chooseleaf_stable; + /* + * This value is calculated after decode or construction by + * the builder. It is exposed here (rather than having a + * 'build CRUSH working space' function) so that callers can + * reserve a static buffer, allocate space on the stack, or + * otherwise avoid calling into the heap allocator if they + * want to. The size of the working space depends on the map, + * while the size of the scratch vector passed to the mapper + * depends on the size of the desired result set. + * + * Nothing stops the caller from allocating both in one swell + * foop and passing in two points, though. + */ + size_t working_size; + #ifndef __KERNEL__ /* * version 0 (original) of straw_calc has various flaws. version 1 @@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i) return ((i+1) << 1)-1; } +/* + * These data structures are private to the CRUSH implementation. They + * are exposed in this header file because builder needs their + * definitions to calculate the total working size. + * + * Moving this out of the crush map allow us to treat the CRUSH map as + * immutable within the mapper and removes the requirement for a CRUSH + * map lock. + */ +struct crush_work_bucket { + __u32 perm_x; /* @x for which *perm is defined */ + __u32 perm_n; /* num elements of *perm that are permuted/defined */ + __u32 *perm; /* Permutation of the bucket's items */ +}; + +struct crush_work { + struct crush_work_bucket **work; /* Per-bucket working store */ +}; + #endif diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index 5dfd5b1125d2..c95e19e1ff11 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -15,6 +15,20 @@ extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weights, int weight_max, - int *scratch); + void *cwin); + +/* + * Returns the exact amount of workspace that will need to be used + * for a given combination of crush_map and result_max. The caller can + * then allocate this much on its own, either on the stack, in a + * per-thread long-lived buffer, or however it likes. + */ +static inline size_t crush_work_size(const struct crush_map *map, + int result_max) +{ + return map->working_size + result_max * 3 * sizeof(__u32); +} + +void crush_init_workspace(const struct crush_map *map, void *v); #endif diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 600aadf9cca4..0023fee4bbbc 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -1,54 +1,10 @@ #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H -/* - * Variant of atomic_t specialized for reference counts. - * - * The interface matches the atomic_t interface (to aid in porting) but only - * provides the few functions one should use for reference counting. - * - * It differs in that the counter saturates at UINT_MAX and will not move once - * there. This avoids wrapping the counter and causing 'spurious' - * use-after-free issues. - * - * Memory ordering rules are slightly relaxed wrt regular atomic_t functions - * and provide only what is strictly required for refcounts. - * - * The increments are fully relaxed; these will not provide ordering. The - * rationale is that whatever is used to obtain the object we're increasing the - * reference count on will provide the ordering. For locked data structures, - * its the lock acquire, for RCU/lockless data structures its the dependent - * load. - * - * Do note that inc_not_zero() provides a control dependency which will order - * future stores against the inc, this ensures we'll never modify the object - * if we did not in fact acquire a reference. - * - * The decrements will provide release order, such that all the prior loads and - * stores will be issued before, it also provides a control dependency, which - * will order us against the subsequent free(). - * - * The control dependency is against the load of the cmpxchg (ll/sc) that - * succeeded. This means the stores aren't fully ordered, but this is fine - * because the 1->0 transition indicates no concurrency. - * - * Note that the allocator is responsible for ordering things between free() - * and alloc(). - * - */ - #include <linux/atomic.h> -#include <linux/bug.h> #include <linux/mutex.h> #include <linux/spinlock.h> - -#ifdef CONFIG_DEBUG_REFCOUNT -#define REFCOUNT_WARN(cond, str) WARN_ON(cond) -#define __refcount_check __must_check -#else -#define REFCOUNT_WARN(cond, str) (void)(cond) -#define __refcount_check -#endif +#include <linux/kernel.h> typedef struct refcount_struct { atomic_t refs; @@ -66,229 +22,21 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -static inline __refcount_check -bool refcount_add_not_zero(unsigned int i, refcount_t *r) -{ - unsigned int old, new, val = atomic_read(&r->refs); - - for (;;) { - if (!val) - return false; - - if (unlikely(val == UINT_MAX)) - return true; - - new = val + i; - if (new < val) - new = UINT_MAX; - old = atomic_cmpxchg_relaxed(&r->refs, val, new); - if (old == val) - break; - - val = old; - } - - REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); - - return true; -} - -static inline void refcount_add(unsigned int i, refcount_t *r) -{ - REFCOUNT_WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n"); -} - -/* - * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - */ -static inline __refcount_check -bool refcount_inc_not_zero(refcount_t *r) -{ - unsigned int old, new, val = atomic_read(&r->refs); - - for (;;) { - new = val + 1; - - if (!val) - return false; - - if (unlikely(!new)) - return true; - - old = atomic_cmpxchg_relaxed(&r->refs, val, new); - if (old == val) - break; - - val = old; - } - - REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); - - return true; -} - -/* - * Similar to atomic_inc(), will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller already has a - * reference on the object, will WARN when this is not so. - */ -static inline void refcount_inc(refcount_t *r) -{ - REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n"); -} - -/* - * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to - * decrement when saturated at UINT_MAX. - * - * Provides release memory ordering, such that prior loads and stores are done - * before, and provides a control dependency such that free() must come after. - * See the comment on top. - */ -static inline __refcount_check -bool refcount_sub_and_test(unsigned int i, refcount_t *r) -{ - unsigned int old, new, val = atomic_read(&r->refs); - - for (;;) { - if (unlikely(val == UINT_MAX)) - return false; - - new = val - i; - if (new > val) { - REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n"); - return false; - } - - old = atomic_cmpxchg_release(&r->refs, val, new); - if (old == val) - break; - - val = old; - } - - return !new; -} - -static inline __refcount_check -bool refcount_dec_and_test(refcount_t *r) -{ - return refcount_sub_and_test(1, r); -} +extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r); +extern void refcount_add(unsigned int i, refcount_t *r); -/* - * Similar to atomic_dec(), it will WARN on underflow and fail to decrement - * when saturated at UINT_MAX. - * - * Provides release memory ordering, such that prior loads and stores are done - * before. - */ -static inline -void refcount_dec(refcount_t *r) -{ - REFCOUNT_WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); -} - -/* - * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the - * success thereof. - * - * Like all decrement operations, it provides release memory order and provides - * a control dependency. - * - * It can be used like a try-delete operator; this explicit case is provided - * and not cmpxchg in generic, because that would allow implementing unsafe - * operations. - */ -static inline __refcount_check -bool refcount_dec_if_one(refcount_t *r) -{ - return atomic_cmpxchg_release(&r->refs, 1, 0) == 1; -} - -/* - * No atomic_t counterpart, it decrements unless the value is 1, in which case - * it will return false. - * - * Was often done like: atomic_add_unless(&var, -1, 1) - */ -static inline __refcount_check -bool refcount_dec_not_one(refcount_t *r) -{ - unsigned int old, new, val = atomic_read(&r->refs); +extern __must_check bool refcount_inc_not_zero(refcount_t *r); +extern void refcount_inc(refcount_t *r); - for (;;) { - if (unlikely(val == UINT_MAX)) - return true; +extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r); +extern void refcount_sub(unsigned int i, refcount_t *r); - if (val == 1) - return false; +extern __must_check bool refcount_dec_and_test(refcount_t *r); +extern void refcount_dec(refcount_t *r); - new = val - 1; - if (new > val) { - REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n"); - return true; - } - - old = atomic_cmpxchg_release(&r->refs, val, new); - if (old == val) - break; - - val = old; - } - - return true; -} - -/* - * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail - * to decrement when saturated at UINT_MAX. - * - * Provides release memory ordering, such that prior loads and stores are done - * before, and provides a control dependency such that free() must come after. - * See the comment on top. - */ -static inline __refcount_check -bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) -{ - if (refcount_dec_not_one(r)) - return false; - - mutex_lock(lock); - if (!refcount_dec_and_test(r)) { - mutex_unlock(lock); - return false; - } - - return true; -} - -/* - * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to - * decrement when saturated at UINT_MAX. - * - * Provides release memory ordering, such that prior loads and stores are done - * before, and provides a control dependency such that free() must come after. - * See the comment on top. - */ -static inline __refcount_check -bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) -{ - if (refcount_dec_not_one(r)) - return false; - - spin_lock(lock); - if (!refcount_dec_and_test(r)) { - spin_unlock(lock); - return false; - } - - return true; -} +extern __must_check bool refcount_dec_if_one(refcount_t *r); +extern __must_check bool refcount_dec_not_one(refcount_t *r); +extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock); +extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock); #endif /* _LINUX_REFCOUNT_H */ diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 8a511c0985aa..20d157a518a7 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -204,8 +204,11 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd) kref_put(&h->ref, cd->cache_put); } -static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h) +static inline bool cache_is_expired(struct cache_detail *detail, struct cache_head *h) { + if (!test_bit(CACHE_VALID, &h->flags)) + return false; + return (h->expiry_time < seconds_since_boot()) || (detail->flush_time >= h->last_refresh); } @@ -227,6 +230,7 @@ extern void sunrpc_destroy_cache_detail(struct cache_detail *cd); extern int sunrpc_cache_register_pipefs(struct dentry *parent, const char *, umode_t, struct cache_detail *); extern void sunrpc_cache_unregister_pipefs(struct cache_detail *); +extern void sunrpc_cache_unhash(struct cache_detail *, struct cache_head *); /* Must store cache_detail in seq_file->private if using next three functions */ extern void *cache_seq_start(struct seq_file *file, loff_t *pos); diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index cfda6adcf33c..245fc59b7324 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -110,6 +110,15 @@ struct rpcrdma_msg { }; /* + * XDR sizes, in quads + */ +enum { + rpcrdma_fixed_maxsz = 4, + rpcrdma_segment_maxsz = 4, + rpcrdma_readchunk_maxsz = 2 + rpcrdma_segment_maxsz, +}; + +/* * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks */ #define RPCRDMA_HDRLEN_MIN (sizeof(__be32) * 7) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 7321ae933867..e770abeed32d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -400,10 +400,14 @@ struct svc_version { struct svc_procedure * vs_proc; /* per-procedure info */ u32 vs_xdrsize; /* xdrsize needed for this version */ - unsigned int vs_hidden : 1, /* Don't register with portmapper. - * Only used for nfsacl so far. */ - vs_rpcb_optnl:1;/* Don't care the result of register. - * Only used for nfsv4. */ + /* Don't register with rpcbind */ + bool vs_hidden; + + /* Don't care if the rpcbind registration fails */ + bool vs_rpcb_optnl; + + /* Need xprt with congestion control */ + bool vs_need_cong_ctrl; /* Override dispatch function (e.g. when caching replies). * A return value of 0 means drop the request. diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 757fb963696c..b105f73e3ca2 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -70,7 +70,7 @@ extern atomic_t rdma_stat_sq_prod; * completes. */ struct svc_rdma_op_ctxt { - struct list_head free; + struct list_head list; struct svc_rdma_op_ctxt *read_hdr; struct svc_rdma_fastreg_mr *frmr; int hdr_count; @@ -78,7 +78,6 @@ struct svc_rdma_op_ctxt { struct ib_cqe cqe; struct ib_cqe reg_cqe; struct ib_cqe inv_cqe; - struct list_head dto_q; u32 byte_len; u32 position; struct svcxprt_rdma *xprt; @@ -141,7 +140,8 @@ struct svcxprt_rdma { atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ unsigned int sc_rq_depth; /* Depth of RQ */ - u32 sc_max_requests; /* Forward credits */ + __be32 sc_fc_credits; /* Forward credits */ + u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ @@ -171,7 +171,6 @@ struct svcxprt_rdma { wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ unsigned long sc_flags; - struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */ struct list_head sc_read_complete_q; struct work_struct sc_work; }; @@ -214,11 +213,7 @@ extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, __be32, __be64, u32); -extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *, - struct rpcrdma_msg *, - struct rpcrdma_msg *, - enum rpcrdma_proc); -extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *); +extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp); /* svc_rdma_recvfrom.c */ extern int svc_rdma_recvfrom(struct svc_rqst *); diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 7440290f64ac..ddb7f94a9d06 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -67,6 +67,7 @@ struct svc_xprt { #define XPT_CACHE_AUTH 11 /* cache auth info */ #define XPT_LOCAL 12 /* connection from loopback interface */ #define XPT_KILL_TEMP 13 /* call xpo_kill_temp_xprt before closing */ +#define XPT_CONG_CTRL 14 /* has congestion control */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 7550e9176a54..c111a91adcc0 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -3,7 +3,6 @@ #include <linux/types.h> #include <linux/compiler.h> -#include <linux/sysctl.h> #include <linux/in.h> #include <linux/in6.h> diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h index 3efc0ca18345..79da349f1060 100644 --- a/include/uapi/linux/netfilter/xt_hashlimit.h +++ b/include/uapi/linux/netfilter/xt_hashlimit.h @@ -2,6 +2,7 @@ #define _UAPI_XT_HASHLIMIT_H #include <linux/types.h> +#include <linux/limits.h> #include <linux/if.h> /* timings are in milliseconds. */ diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 0df7bd5d2fb1..c3be256107c6 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -32,7 +32,8 @@ #define NFSEXP_ASYNC 0x0010 #define NFSEXP_GATHERED_WRITES 0x0020 #define NFSEXP_NOREADDIRPLUS 0x0040 -/* 80 100 currently unused */ +#define NFSEXP_SECURITY_LABEL 0x0080 +/* 0x100 currently unused */ #define NFSEXP_NOHIDE 0x0200 #define NFSEXP_NOSUBTREECHECK 0x0400 #define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ @@ -53,7 +54,7 @@ #define NFSEXP_PNFS 0x20000 /* All flags that we claim to support. (Note we don't support NOACL.) */ -#define NFSEXP_ALLFLAGS 0x3FE7F +#define NFSEXP_ALLFLAGS 0x3FEFF /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b4e0b98f4eb..1031bdf9f012 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -455,7 +455,7 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -3522,6 +3522,8 @@ static void perf_event_enable_on_exec(int ctxn) if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); + } else { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); } perf_ctx_unlock(cpuctx, ctx); @@ -9955,6 +9957,7 @@ SYSCALL_DEFINE5(perf_event_open, * of swizzling perf_event::ctx. */ perf_remove_from_context(group_leader, 0); + put_ctx(gctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { @@ -9993,13 +9996,6 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); - - /* - * Now that all events are installed in @ctx, nothing - * references @gctx anymore, so drop the last reference we have - * on it. - */ - put_ctx(gctx); } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6ea1925ac5c0..bbfb917a9b49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1090,6 +1090,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, int ret = 0; rq = task_rq_lock(p, &rf); + update_rq_clock(rq); if (p->flags & PF_KTHREAD) { /* @@ -5560,7 +5561,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct rq_flags rf, old_rf; + struct rq_flags rf; int dest_cpu; /* @@ -5579,7 +5580,9 @@ static void migrate_tasks(struct rq *dead_rq) * class method both need to have an up-to-date * value of rq->clock[_task] */ + rq_pin_lock(rq, &rf); update_rq_clock(rq); + rq_unpin_lock(rq, &rf); for (;;) { /* @@ -5592,7 +5595,7 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task() assumes pinned rq->lock: */ - rq_pin_lock(rq, &rf); + rq_repin_lock(rq, &rf); next = pick_next_task(rq, &fake_task, &rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5621,13 +5624,6 @@ static void migrate_tasks(struct rq *dead_rq) continue; } - /* - * __migrate_task() may return with a different - * rq->lock held and a new cookie in 'rf', but we need - * to preserve rf::clock_update_flags for 'dead_rq'. - */ - old_rf = rf; - /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); @@ -5636,7 +5632,6 @@ static void migrate_tasks(struct rq *dead_rq) raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); - rf = old_rf; } raw_spin_unlock(&next->pi_lock); } @@ -6819,11 +6814,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - sched_online_group(tg, parent); - return &tg->css; } +/* Expose task group only after completing cgroup initialization */ +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); + + if (parent) + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -7229,6 +7233,7 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .fork = cpu_cgroup_fork, diff --git a/lib/Kconfig b/lib/Kconfig index 8f69579dfac3..0c8b78a9ae2e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -559,7 +559,7 @@ config SBITMAP bool config PARMAN - tristate + tristate "parman" if COMPILE_TEST config PRIME_NUMBERS tristate diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 55735c9bdb75..97d62c2da6c2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -729,19 +729,6 @@ source "lib/Kconfig.kmemcheck" source "lib/Kconfig.kasan" -config DEBUG_REFCOUNT - bool "Verbose refcount checks" - help - Say Y here if you want reference counters (refcount_t and kref) to - generate WARNs on dubious usage. Without this refcount_t will still - be a saturating counter and avoid Use-After-Free by turning it into - a resource leak Denial-Of-Service. - - Use of this option will increase kernel text size but will alert the - admin of potential abuse. - - If in doubt, say "N". - endmenu # "Memory Debugging" config ARCH_HAS_KCOV diff --git a/lib/Makefile b/lib/Makefile index c9023efbd4ca..469b2392893a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -38,7 +38,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \ - once.o + once.o refcount.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += hexdump.o diff --git a/lib/refcount.c b/lib/refcount.c new file mode 100644 index 000000000000..1d33366189d1 --- /dev/null +++ b/lib/refcount.c @@ -0,0 +1,267 @@ +/* + * Variant of atomic_t specialized for reference counts. + * + * The interface matches the atomic_t interface (to aid in porting) but only + * provides the few functions one should use for reference counting. + * + * It differs in that the counter saturates at UINT_MAX and will not move once + * there. This avoids wrapping the counter and causing 'spurious' + * use-after-free issues. + * + * Memory ordering rules are slightly relaxed wrt regular atomic_t functions + * and provide only what is strictly required for refcounts. + * + * The increments are fully relaxed; these will not provide ordering. The + * rationale is that whatever is used to obtain the object we're increasing the + * reference count on will provide the ordering. For locked data structures, + * its the lock acquire, for RCU/lockless data structures its the dependent + * load. + * + * Do note that inc_not_zero() provides a control dependency which will order + * future stores against the inc, this ensures we'll never modify the object + * if we did not in fact acquire a reference. + * + * The decrements will provide release order, such that all the prior loads and + * stores will be issued before, it also provides a control dependency, which + * will order us against the subsequent free(). + * + * The control dependency is against the load of the cmpxchg (ll/sc) that + * succeeded. This means the stores aren't fully ordered, but this is fine + * because the 1->0 transition indicates no concurrency. + * + * Note that the allocator is responsible for ordering things between free() + * and alloc(). + * + */ + +#include <linux/refcount.h> +#include <linux/bug.h> + +bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + unsigned int old, new, val = atomic_read(&r->refs); + + for (;;) { + if (!val) + return false; + + if (unlikely(val == UINT_MAX)) + return true; + + new = val + i; + if (new < val) + new = UINT_MAX; + old = atomic_cmpxchg_relaxed(&r->refs, val, new); + if (old == val) + break; + + val = old; + } + + WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); + + return true; +} +EXPORT_SYMBOL_GPL(refcount_add_not_zero); + +void refcount_add(unsigned int i, refcount_t *r) +{ + WARN(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n"); +} +EXPORT_SYMBOL_GPL(refcount_add); + +/* + * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + */ +bool refcount_inc_not_zero(refcount_t *r) +{ + unsigned int old, new, val = atomic_read(&r->refs); + + for (;;) { + new = val + 1; + + if (!val) + return false; + + if (unlikely(!new)) + return true; + + old = atomic_cmpxchg_relaxed(&r->refs, val, new); + if (old == val) + break; + + val = old; + } + + WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); + + return true; +} +EXPORT_SYMBOL_GPL(refcount_inc_not_zero); + +/* + * Similar to atomic_inc(), will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller already has a + * reference on the object, will WARN when this is not so. + */ +void refcount_inc(refcount_t *r) +{ + WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n"); +} +EXPORT_SYMBOL_GPL(refcount_inc); + +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + unsigned int old, new, val = atomic_read(&r->refs); + + for (;;) { + if (unlikely(val == UINT_MAX)) + return false; + + new = val - i; + if (new > val) { + WARN(new > val, "refcount_t: underflow; use-after-free.\n"); + return false; + } + + old = atomic_cmpxchg_release(&r->refs, val, new); + if (old == val) + break; + + val = old; + } + + return !new; +} +EXPORT_SYMBOL_GPL(refcount_sub_and_test); + +/* + * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to + * decrement when saturated at UINT_MAX. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides a control dependency such that free() must come after. + * See the comment on top. + */ +bool refcount_dec_and_test(refcount_t *r) +{ + return refcount_sub_and_test(1, r); +} +EXPORT_SYMBOL_GPL(refcount_dec_and_test); + +/* + * Similar to atomic_dec(), it will WARN on underflow and fail to decrement + * when saturated at UINT_MAX. + * + * Provides release memory ordering, such that prior loads and stores are done + * before. + */ + +void refcount_dec(refcount_t *r) +{ + WARN(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); +} +EXPORT_SYMBOL_GPL(refcount_dec); + +/* + * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the + * success thereof. + * + * Like all decrement operations, it provides release memory order and provides + * a control dependency. + * + * It can be used like a try-delete operator; this explicit case is provided + * and not cmpxchg in generic, because that would allow implementing unsafe + * operations. + */ +bool refcount_dec_if_one(refcount_t *r) +{ + return atomic_cmpxchg_release(&r->refs, 1, 0) == 1; +} +EXPORT_SYMBOL_GPL(refcount_dec_if_one); + +/* + * No atomic_t counterpart, it decrements unless the value is 1, in which case + * it will return false. + * + * Was often done like: atomic_add_unless(&var, -1, 1) + */ +bool refcount_dec_not_one(refcount_t *r) +{ + unsigned int old, new, val = atomic_read(&r->refs); + + for (;;) { + if (unlikely(val == UINT_MAX)) + return true; + + if (val == 1) + return false; + + new = val - 1; + if (new > val) { + WARN(new > val, "refcount_t: underflow; use-after-free.\n"); + return true; + } + + old = atomic_cmpxchg_release(&r->refs, val, new); + if (old == val) + break; + + val = old; + } + + return true; +} +EXPORT_SYMBOL_GPL(refcount_dec_not_one); + +/* + * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail + * to decrement when saturated at UINT_MAX. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides a control dependency such that free() must come after. + * See the comment on top. + */ +bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) +{ + if (refcount_dec_not_one(r)) + return false; + + mutex_lock(lock); + if (!refcount_dec_and_test(r)) { + mutex_unlock(lock); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock); + +/* + * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to + * decrement when saturated at UINT_MAX. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides a control dependency such that free() must come after. + * See the comment on top. + */ +bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) +{ + if (refcount_dec_not_one(r)) + return false; + + spin_lock(lock); + if (!refcount_dec_and_test(r)) { + spin_unlock(lock); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(refcount_dec_and_lock); + diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 172454e6b979..c5b9b9351cec 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -146,9 +146,7 @@ static void bucket_table_free(const struct bucket_table *tbl) if (tbl->nest) nested_bucket_table_free(tbl); - if (tbl) - kvfree(tbl->locks); - + kvfree(tbl->locks); kvfree(tbl); } @@ -1123,12 +1121,13 @@ struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, union nested_table *ntbl; ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); - ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); subhash >>= tbl->nest; while (ntbl && size > (1 << shift)) { index = subhash & ((1 << shift) - 1); - ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, + tbl, hash); size >>= shift; subhash >>= shift; } diff --git a/lib/test_parman.c b/lib/test_parman.c index fe9f3a785804..35e32243693c 100644 --- a/lib/test_parman.c +++ b/lib/test_parman.c @@ -334,7 +334,7 @@ static int test_parman_check_array(struct test_parman *test_parman, last_priority = item->prio->priority; if (item->parman_item.index != i) { - pr_err("Item has different index in compare to where it actualy is (%lu != %d)\n", + pr_err("Item has different index in compare to where it actually is (%lu != %d)\n", item->parman_item.index, i); return -EINVAL; } diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index 50f040fdb2a9..b9233b990399 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc, dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n", __func__, lock_name, type, cookie, tag, desc, flags); ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock", - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - lock_op_page, lock_op_buf_size, NULL, NULL); + CEPH_OSD_FLAG_WRITE, lock_op_page, + lock_op_buf_size, NULL, NULL); dout("%s: status %d\n", __func__, ret); __free_page(lock_op_page); @@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc, dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie); ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock", - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - unlock_op_page, unlock_op_buf_size, NULL, NULL); + CEPH_OSD_FLAG_WRITE, unlock_op_page, + unlock_op_buf_size, NULL, NULL); dout("%s: status %d\n", __func__, ret); __free_page(unlock_op_page); @@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name, cookie, ENTITY_NAME(*locker)); ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock", - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - break_op_page, break_op_buf_size, NULL, NULL); + CEPH_OSD_FLAG_WRITE, break_op_page, + break_op_buf_size, NULL, NULL); dout("%s: status %d\n", __func__, ret); __free_page(break_op_page); @@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc, int get_info_op_buf_size; int name_len = strlen(lock_name); struct page *get_info_op_page, *reply_page; - size_t reply_len; + size_t reply_len = PAGE_SIZE; void *p, *end; int ret; diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 80d7c3a97cb8..5bf94c04f645 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) { kfree(b->item_weights); kfree(b->sum_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { - kfree(b->h.perm); kfree(b->h.items); kfree(b->node_weights); kfree(b); @@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) { kfree(b->straws); kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } @@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) { kfree(b->item_weights); - kfree(b->h.perm); kfree(b->h.items); kfree(b); } diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 130ab407c5ec..b5cd8c21bfdf 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size return -1; } - /* * bucket choose methods * @@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size * Since this is expensive, we optimize for the r=0 case, which * captures the vast majority of calls. */ -static int bucket_perm_choose(struct crush_bucket *bucket, +static int bucket_perm_choose(const struct crush_bucket *bucket, + struct crush_work_bucket *work, int x, int r) { unsigned int pr = r % bucket->size; unsigned int i, s; /* start a new permutation if @x has changed */ - if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { + if (work->perm_x != (__u32)x || work->perm_n == 0) { dprintk("bucket %d new x=%d\n", bucket->id, x); - bucket->perm_x = x; + work->perm_x = x; /* optimize common r=0 case */ if (pr == 0) { s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % bucket->size; - bucket->perm[0] = s; - bucket->perm_n = 0xffff; /* magic value, see below */ + work->perm[0] = s; + work->perm_n = 0xffff; /* magic value, see below */ goto out; } for (i = 0; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm_n = 0; - } else if (bucket->perm_n == 0xffff) { + work->perm[i] = i; + work->perm_n = 0; + } else if (work->perm_n == 0xffff) { /* clean up after the r=0 case above */ for (i = 1; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm[bucket->perm[0]] = 0; - bucket->perm_n = 1; + work->perm[i] = i; + work->perm[work->perm[0]] = 0; + work->perm_n = 1; } /* calculate permutation up to pr */ - for (i = 0; i < bucket->perm_n; i++) - dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); - while (bucket->perm_n <= pr) { - unsigned int p = bucket->perm_n; + for (i = 0; i < work->perm_n; i++) + dprintk(" perm_choose have %d: %d\n", i, work->perm[i]); + while (work->perm_n <= pr) { + unsigned int p = work->perm_n; /* no point in swapping the final entry */ if (p < bucket->size - 1) { i = crush_hash32_3(bucket->hash, x, bucket->id, p) % (bucket->size - p); if (i) { - unsigned int t = bucket->perm[p + i]; - bucket->perm[p + i] = bucket->perm[p]; - bucket->perm[p] = t; + unsigned int t = work->perm[p + i]; + work->perm[p + i] = work->perm[p]; + work->perm[p] = t; } dprintk(" perm_choose swap %d with %d\n", p, p+i); } - bucket->perm_n++; + work->perm_n++; } for (i = 0; i < bucket->size; i++) - dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); + dprintk(" perm_choose %d: %d\n", i, work->perm[i]); - s = bucket->perm[pr]; + s = work->perm[pr]; out: dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, bucket->size, x, r, pr, s); @@ -132,14 +132,14 @@ out: } /* uniform */ -static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, - int x, int r) +static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket, + struct crush_work_bucket *work, int x, int r) { - return bucket_perm_choose(&bucket->h, x, r); + return bucket_perm_choose(&bucket->h, work, x, r); } /* list */ -static int bucket_list_choose(struct crush_bucket_list *bucket, +static int bucket_list_choose(const struct crush_bucket_list *bucket, int x, int r) { int i; @@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, w *= bucket->sum_weights[i]; w = w >> 16; /*dprintk(" scaled %llx\n", w);*/ - if (w < bucket->item_weights[i]) + if (w < bucket->item_weights[i]) { return bucket->h.items[i]; + } } dprintk("bad list sums for bucket %d\n", bucket->h.id); @@ -192,7 +193,7 @@ static int terminal(int x) return x & 1; } -static int bucket_tree_choose(struct crush_bucket_tree *bucket, +static int bucket_tree_choose(const struct crush_bucket_tree *bucket, int x, int r) { int n; @@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, /* straw */ -static int bucket_straw_choose(struct crush_bucket_straw *bucket, +static int bucket_straw_choose(const struct crush_bucket_straw *bucket, int x, int r) { __u32 i; @@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin) * */ -static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, +static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, int x, int r) { unsigned int i, high = 0; @@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, high_draw = draw; } } + return bucket->h.items[high]; } -static int crush_bucket_choose(struct crush_bucket *in, int x, int r) +static int crush_bucket_choose(const struct crush_bucket *in, + struct crush_work_bucket *work, + int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); BUG_ON(in->size == 0); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: - return bucket_uniform_choose((struct crush_bucket_uniform *)in, - x, r); + return bucket_uniform_choose( + (const struct crush_bucket_uniform *)in, + work, x, r); case CRUSH_BUCKET_LIST: - return bucket_list_choose((struct crush_bucket_list *)in, + return bucket_list_choose((const struct crush_bucket_list *)in, x, r); case CRUSH_BUCKET_TREE: - return bucket_tree_choose((struct crush_bucket_tree *)in, + return bucket_tree_choose((const struct crush_bucket_tree *)in, x, r); case CRUSH_BUCKET_STRAW: - return bucket_straw_choose((struct crush_bucket_straw *)in, - x, r); + return bucket_straw_choose( + (const struct crush_bucket_straw *)in, + x, r); case CRUSH_BUCKET_STRAW2: - return bucket_straw2_choose((struct crush_bucket_straw2 *)in, - x, r); + return bucket_straw2_choose( + (const struct crush_bucket_straw2 *)in, + x, r); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } - /* * true if device is marked "out" (failed, fully offloaded) * of the cluster @@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map, * @parent_r: r value passed from the parent */ static int crush_choose_firstn(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, @@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map, int rep; unsigned int ftotal, flocal; int retry_descent, retry_bucket, skip_rep; - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int r; int i; int item = 0; @@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map, if (local_fallback_retries > 0 && flocal >= (in->size>>1) && flocal > local_fallback_retries) - item = bucket_perm_choose(in, x, r); + item = bucket_perm_choose( + in, work->work[-1-in->id], + x, r); else - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); skip_rep = 1; @@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map, sub_r = r >> (vary_r-1); else sub_r = 0; - if (crush_choose_firstn(map, - map->buckets[-1-item], - weight, weight_max, - x, stable ? 1 : outpos+1, 0, - out2, outpos, count, - recurse_tries, 0, - local_retries, - local_fallback_retries, - 0, - vary_r, - stable, - NULL, - sub_r) <= outpos) + if (crush_choose_firstn( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, stable ? 1 : outpos+1, 0, + out2, outpos, count, + recurse_tries, 0, + local_retries, + local_fallback_retries, + 0, + vary_r, + stable, + NULL, + sub_r) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map, } } - if (!reject) { + if (!reject && !collide) { /* out? */ if (itemtype == 0) reject = is_out(map, weight, weight_max, item, x); - else - reject = 0; } reject: @@ -600,7 +611,8 @@ reject: * */ static void crush_choose_indep(const struct crush_map *map, - struct crush_bucket *bucket, + struct crush_work *work, + const struct crush_bucket *bucket, const __u32 *weight, int weight_max, int x, int left, int numrep, int type, int *out, int outpos, @@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map, int *out2, int parent_r) { - struct crush_bucket *in = bucket; + const struct crush_bucket *in = bucket; int endpos = outpos + left; int rep; unsigned int ftotal; @@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map, break; } - item = crush_bucket_choose(in, x, r); + item = crush_bucket_choose( + in, work->work[-1-in->id], + x, r); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); out[rep] = CRUSH_ITEM_NONE; @@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map, if (recurse_to_leaf) { if (item < 0) { - crush_choose_indep(map, - map->buckets[-1-item], - weight, weight_max, - x, 1, numrep, 0, - out2, rep, - recurse_tries, 0, - 0, NULL, r); + crush_choose_indep( + map, + work, + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_tries, 0, + 0, NULL, r); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map, #endif } + +/* + * This takes a chunk of memory and sets it up to be a shiny new + * working area for a CRUSH placement computation. It must be called + * on any newly allocated memory before passing it in to + * crush_do_rule. It may be used repeatedly after that, so long as the + * map has not changed. If the map /has/ changed, you must make sure + * the working size is no smaller than what was allocated and re-run + * crush_init_workspace. + * + * If you do retain the working space between calls to crush, make it + * thread-local. + */ +void crush_init_workspace(const struct crush_map *map, void *v) +{ + struct crush_work *w = v; + __s32 b; + + /* + * We work by moving through the available space and setting + * values and pointers as we go. + * + * It's a bit like Forth's use of the 'allot' word since we + * set the pointer first and then reserve the space for it to + * point to by incrementing the point. + */ + v += sizeof(struct crush_work *); + w->work = v; + v += map->max_buckets * sizeof(struct crush_work_bucket *); + for (b = 0; b < map->max_buckets; ++b) { + if (!map->buckets[b]) + continue; + + w->work[b] = v; + switch (map->buckets[b]->alg) { + default: + v += sizeof(struct crush_work_bucket); + break; + } + w->work[b]->perm_x = 0; + w->work[b]->perm_n = 0; + w->work[b]->perm = v; + v += map->buckets[b]->size * sizeof(__u32); + } + BUG_ON(v - (void *)w != map->working_size); +} + /** * crush_do_rule - calculate a mapping with the given input and rule * @map: the crush_map @@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map, * @result_max: maximum result size * @weight: weight vector (for map leaves) * @weight_max: size of weight vector - * @scratch: scratch vector for private use; must be >= 3 * result_max + * @cwin: pointer to at least crush_work_size() bytes of memory */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - int *scratch) + void *cwin) { int result_len; - int *a = scratch; - int *b = scratch + result_max; - int *c = scratch + result_max*2; + struct crush_work *cw = cwin; + int *a = cwin + map->working_size; + int *b = a + result_max; + int *c = b + result_max; + int *w = a; + int *o = b; int recurse_to_leaf; - int *w; int wsize = 0; - int *o; int osize; int *tmp; - struct crush_rule *rule; + const struct crush_rule *rule; __u32 step; int i, j; int numrep; @@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map, rule = map->rules[ruleno]; result_len = 0; - w = a; - o = b; for (step = 0; step < rule->len; step++) { int firstn = 0; - struct crush_rule_step *curstep = &rule->steps[step]; + const struct crush_rule_step *curstep = &rule->steps[step]; switch (curstep->op) { case CRUSH_RULE_TAKE: @@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, + cw, map->buckets[bno], weight, weight_max, x, numrep, @@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, + cw, map->buckets[bno], weight, weight_max, x, out_size, numrep, @@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map, break; } } + return result_len; } diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 292e33bd916e..85747b7f91a9 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -3,6 +3,7 @@ #include <linux/err.h> #include <linux/scatterlist.h> +#include <linux/sched.h> #include <linux/slab.h> #include <crypto/aes.h> #include <crypto/skcipher.h> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f3378ba1a828..b65bbf9f45eb 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req) kref_init(&req->r_kref); init_completion(&req->r_completion); - init_completion(&req->r_done_completion); RB_CLEAR_NODE(&req->r_node); RB_CLEAR_NODE(&req->r_mc_node); INIT_LIST_HEAD(&req->r_unsafe_item); @@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, BUG_ON(length > previous); op->extent.length = length; - op->indata_len -= previous - length; + if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) + op->indata_len -= previous - length; } EXPORT_SYMBOL(osd_req_op_extent_update); @@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) bool need_send = false; bool promoted = false; - WARN_ON(req->r_tid || req->r_got_reply); + WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: @@ -1704,17 +1704,10 @@ promote: static void account_request(struct ceph_osd_request *req) { - unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); + WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); - if (req->r_flags & CEPH_OSD_FLAG_READ) { - WARN_ON(req->r_flags & mask); - req->r_flags |= CEPH_OSD_FLAG_ACK; - } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) - WARN_ON(!(req->r_flags & mask)); - else - WARN_ON(1); - - WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); + req->r_flags |= CEPH_OSD_FLAG_ONDISK; atomic_inc(&req->r_osdc->num_requests); } @@ -1749,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req) { - if (req->r_callback) + if (req->r_callback) { + dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, + req->r_tid, req->r_callback, req->r_result); req->r_callback(req); - else - complete_all(&req->r_completion); + } } /* - * Note that this is open-coded in handle_reply(), which has to deal - * with ack vs commit, dup acks, etc. + * This is open-coded in handle_reply(). */ static void complete_request(struct ceph_osd_request *req, int err) { @@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err) req->r_result = err; finish_request(req); __complete_request(req); - complete_all(&req->r_done_completion); + complete_all(&req->r_completion); ceph_osdc_put_request(req); } @@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req) cancel_map_check(req); finish_request(req); - complete_all(&req->r_done_completion); + complete_all(&req->r_completion); ceph_osdc_put_request(req); } @@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req) mutex_lock(&lreq->lock); dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, lreq->linger_id, req->r_result); - WARN_ON(!__linger_registered(lreq)); linger_reg_commit_complete(lreq, req->r_result); lreq->committed = true; @@ -2785,31 +2777,8 @@ e_inval: } /* - * We are done with @req if - * - @m is a safe reply, or - * - @m is an unsafe reply and we didn't want a safe one - */ -static bool done_request(const struct ceph_osd_request *req, - const struct MOSDOpReply *m) -{ - return (m->result < 0 || - (m->flags & CEPH_OSD_FLAG_ONDISK) || - !(req->r_flags & CEPH_OSD_FLAG_ONDISK)); -} - -/* - * handle osd op reply. either call the callback if it is specified, - * or do the completion to wake up the waiting thread. - * - * ->r_unsafe_callback is set? yes no - * - * first reply is OK (needed r_cb/r_completion, r_cb/r_completion, - * any or needed/got safe) r_done_completion r_done_completion - * - * first reply is unsafe r_unsafe_cb(true) (nothing) - * - * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, - * r_done_completion r_done_completion + * Handle MOSDOpReply. Set ->r_result and call the callback if it is + * specified. */ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) { @@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) struct MOSDOpReply m; u64 tid = le64_to_cpu(msg->hdr.tid); u32 data_len = 0; - bool already_acked; int ret; int i; @@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) le32_to_cpu(msg->hdr.data_len), req->r_tid); goto fail_request; } - dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, - req, req->r_tid, req->r_got_reply, m.result, data_len); - - already_acked = req->r_got_reply; - if (!already_acked) { - req->r_result = m.result ?: data_len; - req->r_replay_version = m.replay_version; /* struct */ - req->r_got_reply = true; - } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { - dout("req %p tid %llu dup ack\n", req, req->r_tid); - goto out_unlock_session; - } - - if (done_request(req, &m)) { - finish_request(req); - if (req->r_linger) { - WARN_ON(req->r_unsafe_callback); - dout("req %p tid %llu cb (locked)\n", req, req->r_tid); - __complete_request(req); - } - } + dout("%s req %p tid %llu result %d data_len %u\n", __func__, + req, req->r_tid, m.result, data_len); + /* + * Since we only ever request ONDISK, we should only ever get + * one (type of) reply back. + */ + WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); + req->r_result = m.result ?: data_len; + finish_request(req); mutex_unlock(&osd->lock); up_read(&osdc->lock); - if (done_request(req, &m)) { - if (already_acked && req->r_unsafe_callback) { - dout("req %p tid %llu safe-cb\n", req, req->r_tid); - req->r_unsafe_callback(req, false); - } else if (!req->r_linger) { - dout("req %p tid %llu cb\n", req, req->r_tid); - __complete_request(req); - } - complete_all(&req->r_done_completion); - ceph_osdc_put_request(req); - } else { - if (req->r_unsafe_callback) { - dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); - req->r_unsafe_callback(req, true); - } else { - WARN_ON(1); - } - } - + __complete_request(req); + complete_all(&req->r_completion); + ceph_osdc_put_request(req); return; fail_request: @@ -3540,7 +3480,7 @@ again: up_read(&osdc->lock); dout("%s waiting on req %p tid %llu last_tid %llu\n", __func__, req, req->r_tid, last_tid); - wait_for_completion(&req->r_done_completion); + wait_for_completion(&req->r_completion); ceph_osdc_put_request(req); goto again; } @@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); - lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + lreq->t.flags = CEPH_OSD_FLAG_WRITE; lreq->mtime = CURRENT_TIME; lreq->reg_req = alloc_linger_request(lreq); @@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); - req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + req->r_flags = CEPH_OSD_FLAG_WRITE; req->r_mtime = CURRENT_TIME; osd_req_op_watch_init(req, 0, lreq->linger_id, CEPH_OSD_WATCH_OP_UNWATCH); @@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map); * Execute an OSD class method on an object. * * @flags: CEPH_OSD_FLAG_* - * @resp_len: out param for reply length + * @resp_len: in/out param for reply length */ int ceph_osdc_call(struct ceph_osd_client *osdc, struct ceph_object_id *oid, @@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, struct ceph_osd_request *req; int ret; + if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE)) + return -E2BIG; + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); if (!req) return -ENOMEM; @@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, 0, false, false); if (resp_page) osd_req_op_cls_response_data_pages(req, 0, &resp_page, - PAGE_SIZE, 0, false, false); + *resp_len, 0, false, false); ceph_osdc_start_request(osdc, req, false); ret = ceph_osdc_wait_request(osdc, req); @@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, int page_align = off & ~PAGE_MASK; req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, true); if (IS_ERR(req)) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d2436880b305..6824c0ec8373 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -153,6 +153,32 @@ bad: return -EINVAL; } +static void crush_finalize(struct crush_map *c) +{ + __s32 b; + + /* Space for the array of pointers to per-bucket workspace */ + c->working_size = sizeof(struct crush_work) + + c->max_buckets * sizeof(struct crush_work_bucket *); + + for (b = 0; b < c->max_buckets; b++) { + if (!c->buckets[b]) + continue; + + switch (c->buckets[b]->alg) { + default: + /* + * The base case, permutation variables and + * the pointer to the permutation array. + */ + c->working_size += sizeof(struct crush_work_bucket); + break; + } + /* Every bucket has a permutation array. */ + c->working_size += c->buckets[b]->size * sizeof(__u32); + } +} + static struct crush_map *crush_decode(void *pbyval, void *end) { struct crush_map *c; @@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); if (b->items == NULL) goto badmem; - b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); - if (b->perm == NULL) - goto badmem; - b->perm_n = 0; ceph_decode_need(p, end, b->size*sizeof(u32), bad); for (j = 0; j < b->size; j++) @@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable chooseleaf_stable = %d\n", c->chooseleaf_stable); + crush_finalize(c); + done: dout("crush_decode success\n"); return c; @@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) map->pool_max = -1; map->pg_temp = RB_ROOT; map->primary_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); + mutex_init(&map->crush_workspace_mutex); return map; } @@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kfree(map->osd_weight); kfree(map->osd_addr); kfree(map->osd_primary_affinity); + kfree(map->crush_workspace); kfree(map); } @@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } +static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) +{ + void *workspace; + size_t work_size; + + if (IS_ERR(crush)) + return PTR_ERR(crush); + + work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); + dout("%s work_size %zu bytes\n", __func__, work_size); + workspace = kmalloc(work_size, GFP_NOIO); + if (!workspace) { + crush_destroy(crush); + return -ENOMEM; + } + crush_init_workspace(crush, workspace); + + if (map->crush) + crush_destroy(map->crush); + kfree(map->crush_workspace); + map->crush = crush; + map->crush_workspace = workspace; + return 0; +} + #define OSDMAP_WRAPPER_COMPAT_VER 7 #define OSDMAP_CLIENT_DATA_COMPAT_VER 1 @@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) /* crush */ ceph_decode_32_safe(p, end, len, e_inval); - map->crush = crush_decode(*p, min(*p + len, end)); - if (IS_ERR(map->crush)) { - err = PTR_ERR(map->crush); - map->crush = NULL; + err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); + if (err) goto bad; - } - *p += len; /* ignore the rest */ *p = end; @@ -1375,7 +1421,6 @@ e_inval: struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map) { - struct crush_map *newcrush = NULL; struct ceph_fsid fsid; u32 epoch = 0; struct ceph_timespec modified; @@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* new crush? */ ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { - newcrush = crush_decode(*p, min(*p+len, end)); - if (IS_ERR(newcrush)) { - err = PTR_ERR(newcrush); - newcrush = NULL; + err = osdmap_set_crush(map, + crush_decode(*p, min(*p + len, end))); + if (err) goto bad; - } *p += len; } @@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, map->epoch++; map->modified = modified; - if (newcrush) { - if (map->crush) - crush_destroy(map->crush); - map->crush = newcrush; - newcrush = NULL; - } /* new_pools */ err = decode_new_pools(p, end, map); @@ -1505,8 +1542,6 @@ bad: print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); - if (newcrush) - crush_destroy(newcrush); return ERR_PTR(err); } @@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, BUG_ON(result_max > CEPH_PG_MAX_SIZE); - mutex_lock(&map->crush_scratch_mutex); + mutex_lock(&map->crush_workspace_mutex); r = crush_do_rule(map->crush, ruleno, x, result, result_max, - weight, weight_max, map->crush_scratch_ary); - mutex_unlock(&map->crush_scratch_mutex); + weight, weight_max, map->crush_workspace); + mutex_unlock(&map->crush_workspace_mutex); return r; } @@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, return; } - len = do_crush(osdmap, ruleno, pps, raw->osds, - min_t(int, pi->size, ARRAY_SIZE(raw->osds)), + if (pi->size > ARRAY_SIZE(raw->osds)) { + pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n", + pi->id, pi->crush_ruleset, pi->type, pi->size, + ARRAY_SIZE(raw->osds)); + return; + } + + len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, osdmap->osd_weight, osdmap->max_osd); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c index 154683f5f14c..705414e78ae0 100644 --- a/net/ceph/snapshot.c +++ b/net/ceph/snapshot.c @@ -18,8 +18,6 @@ * 02110-1301, USA. */ -#include <stddef.h> - #include <linux/types.h> #include <linux/export.h> #include <linux/ceph/libceph.h> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7db2ad2e82d3..b39a791f6756 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, int ret, no_addr; struct fib_result res; struct flowi4 fl4; - struct net *net; + struct net *net = dev_net(dev); bool dev_match; fl4.flowi4_oif = 0; @@ -332,6 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); no_addr = idev->ifa_list == NULL; @@ -339,13 +340,12 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, trace_fib_validate_source(dev, &fl4); - net = dev_net(dev); if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(dev_net(dev)) && + if (!rpf && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) goto last_resort; fib_combine_itag(itag, &res); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cb494a5050f7..8471dd116771 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1876,6 +1876,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; + fl4.flowi4_uid = sock_net_uid(net, NULL); err = fib_lookup(net, &fl4, &res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) @@ -2008,6 +2009,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, { int res; + tos &= IPTOS_RT_MASK; rcu_read_lock(); /* Multicast recognition logic is moved from route cache to here. diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index c795fee372c4..644ba59fbd9d 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -693,6 +693,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; + if (u->i_key) + u->i_flags |= GRE_KEY; + if (u->o_key) + u->o_flags |= GRE_KEY; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index babaf3ec2742..6ba6c900ebcf 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1666,6 +1666,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns struct net *net = sock_net(sk); struct mr6_table *mrt; + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; @@ -1677,9 +1681,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns switch (optname) { case MRT6_INIT: - if (sk->sk_type != SOCK_RAW || - inet_sk(sk)->inet_num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; if (optlen < sizeof(int)) return -EINVAL; @@ -1815,6 +1816,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, struct net *net = sock_net(sk); struct mr6_table *mrt; + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index c59712057dc8..d25038cfd64e 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -388,7 +388,7 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); - return -1; + return 0; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index e19a69787d99..4b2e1fb28bb4 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -410,7 +410,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) struct net *net = nf_ct_exp_net(expect); struct hlist_node *next; unsigned int h; - int ret = 1; + int ret = 0; if (!master_help) { ret = -ESHUTDOWN; @@ -460,14 +460,14 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect); - if (ret <= 0) + if (ret < 0) goto out; nf_ct_expect_insert(expect); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); - return ret; + return 0; out: spin_unlock_bh(&nf_conntrack_expect_lock); return ret; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index c6b8022c0e47..bf548a7a71ec 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -528,6 +528,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, if (!nft_ct_tmpl_alloc_pcpu()) return -ENOMEM; nft_ct_pcpu_template_refcnt++; + len = sizeof(u16); break; #endif default: diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 97f9649bcc7e..152d226552c1 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -258,7 +258,7 @@ static int nft_bitmap_init(const struct nft_set *set, { struct nft_bitmap *priv = nft_set_priv(set); - priv->bitmap_size = nft_bitmap_total_size(set->klen); + priv->bitmap_size = nft_bitmap_size(set->klen); return 0; } diff --git a/net/rds/ib.c b/net/rds/ib.c index 8d70884d7bb6..91fe46f1e4cc 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -111,8 +111,7 @@ static void rds_ib_dev_free(struct work_struct *work) kfree(i_ipaddr); } - if (rds_ibdev->vector_load) - kfree(rds_ibdev->vector_load); + kfree(rds_ibdev->vector_load); kfree(rds_ibdev); } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 57bb52361e0f..5438f6725092 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -641,12 +641,12 @@ static int rds_tcp_init(void) ret = register_netdevice_notifier(&rds_tcp_dev_notifier); if (ret) { pr_warn("could not register rds_tcp_dev_notifier\n"); - goto out; + goto out_slab; } ret = register_pernet_subsys(&rds_tcp_net_ops); if (ret) - goto out_slab; + goto out_notifier; ret = rds_tcp_recv_init(); if (ret) @@ -664,9 +664,10 @@ out_recv: rds_tcp_recv_exit(); out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); -out_slab: +out_notifier: if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) pr_warn("could not unregister rds_tcp_dev_notifier\n"); +out_slab: kmem_cache_destroy(rds_tcp_conn_slab); out: return ret; diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 18c737a61d80..0a4e28477ad9 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -1065,7 +1065,7 @@ static long rxrpc_read(const struct key *key, switch (token->security_index) { case RXRPC_SECURITY_RXKAD: - toksize += 8 * 4; /* viceid, kvno, key*2, begin, + toksize += 9 * 4; /* viceid, kvno, key*2 + len, begin, * end, primary, tktlen */ toksize += RND(token->kad->ticket_len); break; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index c29362d50a92..f3a688e10843 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -320,8 +320,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, /* Barriers against rxrpc_input_data(). */ hard_ack = call->rx_hard_ack; - top = smp_load_acquire(&call->rx_top); - for (seq = hard_ack + 1; before_eq(seq, top); seq++) { + seq = hard_ack + 1; + while (top = smp_load_acquire(&call->rx_top), + before_eq(seq, top) + ) { ix = seq & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; if (!skb) { @@ -394,6 +396,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, ret = 1; goto out; } + + seq++; } out: diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f219ff325ed4..b70aa57319ea 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -613,8 +613,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, goto err_mod; } - err = nla_memdup_cookie(a, tb); - if (err < 0) { + if (nla_memdup_cookie(a, tb) < 0) { + err = -ENOMEM; tcf_hash_release(a, bind); goto err_mod; } @@ -859,10 +859,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, goto out_module_put; err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); - if (err < 0) + if (err <= 0) goto out_module_put; - if (err == 0) - goto noflush_out; nla_nest_end(skb, nest); @@ -879,7 +877,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, out_module_put: module_put(ops->owner); err_out: -noflush_out: kfree_skb(skb); return err; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 8227bbbd077a..1b6d4574d2b0 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -199,6 +199,7 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, sctp_scope_t scope, gfp_t gfp, int copy_flags) { struct sctp_sockaddr_entry *addr; + union sctp_addr laddr; int error = 0; rcu_read_lock(); @@ -220,7 +221,10 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, !(copy_flags & SCTP_ADDR6_PEERSUPP))) continue; - if (sctp_bind_addr_state(bp, &addr->a) != -1) + laddr = addr->a; + /* also works for setting ipv6 address port */ + laddr.v4.sin_port = htons(bp->port); + if (sctp_bind_addr_state(bp, &laddr) != -1) continue; error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a), diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b5321486fbed..465a9c8464f9 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4862,6 +4862,12 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) if (!asoc) return -EINVAL; + /* If there is a thread waiting on more sndbuf space for + * sending on this asoc, it cannot be peeled. + */ + if (waitqueue_active(&asoc->wait)) + return -EBUSY; + /* An association cannot be branched off from an already peeled-off * socket, nor is this supported for tcp style sockets. */ @@ -7599,8 +7605,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, */ release_sock(sk); current_timeo = schedule_timeout(current_timeo); - if (sk != asoc->base.sk) - goto do_error; lock_sock(sk); *timeo_p = current_timeo; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 153082598522..a54a7a3d28f5 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) case RPC_GSS_PROC_DESTROY: if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; - rsci->h.expiry_time = seconds_since_boot(); - set_bit(CACHE_NEGATIVE, &rsci->h.flags); + /* Delete the entry from the cache_list and call cache_put */ + sunrpc_cache_unhash(sn->rsc_cache, &rsci->h); if (resv->iov_len + 4 > PAGE_SIZE) goto drop; svc_putnl(resv, RPC_SUCCESS); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index f39e3e11f9aa..d8639da06d9c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) cache_purge(cd); spin_lock(&cache_list_lock); write_lock(&cd->hash_lock); - if (cd->entries) { - write_unlock(&cd->hash_lock); - spin_unlock(&cache_list_lock); - goto out; - } if (current_detail == cd) current_detail = NULL; list_del_init(&cd->others); @@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) /* module must be being unloaded so its safe to kill the worker */ cancel_delayed_work_sync(&cache_cleaner); } - return; -out: - printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name); } EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); @@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush); void cache_purge(struct cache_detail *detail) { - time_t now = seconds_since_boot(); - if (detail->flush_time >= now) - now = detail->flush_time + 1; - /* 'now' is the maximum value any 'last_refresh' can have */ - detail->flush_time = now; - detail->nextcheck = seconds_since_boot(); - cache_flush(); + struct cache_head *ch = NULL; + struct hlist_head *head = NULL; + struct hlist_node *tmp = NULL; + int i = 0; + + write_lock(&detail->hash_lock); + if (!detail->entries) { + write_unlock(&detail->hash_lock); + return; + } + + dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name); + for (i = 0; i < detail->hash_size; i++) { + head = &detail->hash_table[i]; + hlist_for_each_entry_safe(ch, tmp, head, cache_list) { + hlist_del_init(&ch->cache_list); + detail->entries--; + + set_bit(CACHE_CLEANED, &ch->flags); + write_unlock(&detail->hash_lock); + cache_fresh_unlocked(ch, detail); + cache_put(ch, detail); + write_lock(&detail->hash_lock); + } + } + write_unlock(&detail->hash_lock); } EXPORT_SYMBOL_GPL(cache_purge); @@ -1855,3 +1866,15 @@ void sunrpc_cache_unregister_pipefs(struct cache_detail *cd) } EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); +void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h) +{ + write_lock(&cd->hash_lock); + if (!hlist_unhashed(&h->cache_list)){ + hlist_del_init(&h->cache_list); + cd->entries--; + write_unlock(&cd->hash_lock); + cache_put(h, cd); + } else + write_unlock(&cd->hash_lock); +} +EXPORT_SYMBOL_GPL(sunrpc_cache_unhash); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 2e22889a8837..b94efd93d3e4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv) for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; - if (progp->pg_vers[i]->vs_hidden == 0) + if (!progp->pg_vers[i]->vs_hidden) return 1; } } @@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net, if (vers->vs_hidden) continue; + /* + * Don't register a UDP port if we need congestion + * control. + */ + if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP) + continue; + error = __svc_register(net, progp->pg_name, progp->pg_prog, i, family, proto, port); @@ -1169,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) !(versp = progp->pg_vers[vers])) goto err_bad_vers; + /* + * Some protocol versions (namely NFSv4) require some form of + * congestion control. (See RFC 7530 section 3.1 paragraph 2) + * In other words, UDP is not allowed. We mark those when setting + * up the svc_xprt, and verify that here. + * + * The spec is not very clear about what error should be returned + * when someone tries to access a server that is listening on UDP + * for lower versions. RPC_PROG_MISMATCH seems to be the closest + * fit. + */ + if (versp->vs_need_cong_ctrl && + !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags)) + goto err_bad_vers; + procp = versp->vs_proc + proc; if (proc >= versp->vs_nproc || !procp->pc_func) goto err_bad_proc; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d227d97f7ad4..8931e33b6541 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class, &svsk->sk_xprt, serv); set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index cb1e48e54eb1..ff1df40f0d26 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -201,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + __be32 *p; int rc; /* Space in the send buffer for an RPC/RDMA header is reserved * via xprt->tsh_size. */ - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); - headerp->rm_type = rdma_msg; - headerp->rm_body.rm_chunks[0] = xdr_zero; - headerp->rm_body.rm_chunks[1] = xdr_zero; - headerp->rm_body.rm_chunks[2] = xdr_zero; + p = rqst->rq_buffer; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 0ba9887f3e22..1c4aabf0f657 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016 Oracle. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -47,102 +48,43 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* - * Decodes a read chunk list. The expected format is as follows: - * descrim : xdr_one - * position : __be32 offset into XDR stream - * handle : __be32 RKEY - * . . . - * end-of-list: xdr_zero - */ -static __be32 *decode_read_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_read_list(__be32 *p, __be32 *end) { - struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; + __be32 *next; - while (ch->rc_discrim != xdr_zero) { - if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > - (unsigned long)vaend) { - dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); + while (*p++ != xdr_zero) { + next = p + rpcrdma_readchunk_maxsz - 1; + if (next > end) return NULL; - } - ch++; + p = next; } - return &ch->rc_position; + return p; } -/* - * Decodes a write chunk list. The expected format is as follows: - * descrim : xdr_one - * nchunks : <count> - * handle : __be32 RKEY ---+ - * length : __be32 <len of segment> | - * offset : remove va + <count> - * . . . | - * ---+ - */ -static __be32 *decode_write_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_write_list(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; + __be32 *next; - /* Check for not write-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + while (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - /* - * rs_length is the 2nd 4B field in wc_target and taking its - * address skips the list terminator - */ - return &ary->wc_array[nchunks].wc_target.rs_length; + return p; } -static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; - - /* Check for no reply-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + __be32 *next; + + if (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - return (__be32 *)&ary->wc_array[nchunks]; + return p; } /** @@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) */ int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { - struct rpcrdma_msg *rmsgp; - __be32 *va, *vaend; - unsigned int len; - u32 hdr_len; + __be32 *p, *end, *rdma_argp; + unsigned int hdr_len; /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) { - dprintk("svcrdma: header too short = %d\n", - rq_arg->len); - return -EINVAL; - } + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) + goto out_short; - rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base; - if (rmsgp->rm_vers != rpcrdma_version) { - dprintk("%s: bad version %u\n", __func__, - be32_to_cpu(rmsgp->rm_vers)); - return -EPROTONOSUPPORT; - } + rdma_argp = rq_arg->head[0].iov_base; + if (*(rdma_argp + 1) != rpcrdma_version) + goto out_version; - switch (be32_to_cpu(rmsgp->rm_type)) { - case RDMA_MSG: - case RDMA_NOMSG: + switch (*(rdma_argp + 3)) { + case rdma_msg: + case rdma_nomsg: break; - case RDMA_DONE: - /* Just drop it */ - dprintk("svcrdma: dropping RDMA_DONE message\n"); - return 0; - - case RDMA_ERROR: - /* Possible if this is a backchannel reply. - * XXX: We should cancel this XID, though. - */ - dprintk("svcrdma: dropping RDMA_ERROR message\n"); - return 0; - - case RDMA_MSGP: - /* Pull in the extra for the padded case, bump our pointer */ - rmsgp->rm_body.rm_padded.rm_align = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align); - rmsgp->rm_body.rm_padded.rm_thresh = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh); - - va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; - rq_arg->head[0].iov_base = va; - len = (u32)((unsigned long)va - (unsigned long)rmsgp); - rq_arg->head[0].iov_len -= len; - if (len > rq_arg->len) - return -EINVAL; - return len; - default: - dprintk("svcrdma: bad rdma procedure (%u)\n", - be32_to_cpu(rmsgp->rm_type)); - return -EINVAL; - } + case rdma_done: + goto out_drop; - /* The chunk list may contain either a read chunk list or a write - * chunk list and a reply chunk list. - */ - va = &rmsgp->rm_body.rm_chunks[0]; - vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len); - va = decode_read_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode read list\n"); - return -EINVAL; - } - va = decode_write_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode write list\n"); - return -EINVAL; - } - va = decode_reply_array(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode reply chunk\n"); - return -EINVAL; + case rdma_error: + goto out_drop; + + default: + goto out_proc; } - rq_arg->head[0].iov_base = va; - hdr_len = (unsigned long)va - (unsigned long)rmsgp; + end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); + p = xdr_check_read_list(rdma_argp + 4, end); + if (!p) + goto out_inval; + p = xdr_check_write_list(p, end); + if (!p) + goto out_inval; + p = xdr_check_reply_chunk(p, end); + if (!p) + goto out_inval; + if (p > end) + goto out_inval; + + rq_arg->head[0].iov_base = p; + hdr_len = (unsigned long)p - (unsigned long)rdma_argp; rq_arg->head[0].iov_len -= hdr_len; return hdr_len; + +out_short: + dprintk("svcrdma: header too short = %d\n", rq_arg->len); + return -EINVAL; + +out_version: + dprintk("svcrdma: bad xprt version: %u\n", + be32_to_cpup(rdma_argp + 1)); + return -EPROTONOSUPPORT; + +out_drop: + dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); + return 0; + +out_proc: + dprintk("svcrdma: bad rdma procedure (%u)\n", + be32_to_cpup(rdma_argp + 3)); + return -EINVAL; + +out_inval: + dprintk("svcrdma: failed to parse transport header\n"); + return -EINVAL; } int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, @@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, *va++ = rmsgp->rm_xid; *va++ = rmsgp->rm_vers; - *va++ = cpu_to_be32(xprt->sc_max_requests); + *va++ = xprt->sc_fc_credits; *va++ = rdma_error; *va++ = cpu_to_be32(err); if (err == ERR_VERS) { @@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, return (int)((unsigned long)va - (unsigned long)startp); } -int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) +/** + * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header + * @rdma_resp: buffer containing Reply transport header + * + * Returns length of transport header, in bytes. + */ +unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp) { - struct rpcrdma_write_array *wr_ary; + unsigned int nsegs; + __be32 *p; - /* There is no read-list in a reply */ + p = rdma_resp; - /* skip write list */ - wr_ary = (struct rpcrdma_write_array *) - &rmsgp->rm_body.rm_chunks[1]; - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]. - wc_target.rs_length; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - /* skip reply array */ - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - return (unsigned long) wr_ary - (unsigned long) rmsgp; + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; + + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + return (unsigned long)p - (unsigned long)rdma_resp; } void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) @@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, seg->rs_offset = rs_offset; seg->rs_length = cpu_to_be32(write_len); } - -void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rdma_argp, - struct rpcrdma_msg *rdma_resp, - enum rpcrdma_proc rdma_type) -{ - rdma_resp->rm_xid = rdma_argp->rm_xid; - rdma_resp->rm_vers = rdma_argp->rm_vers; - rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests); - rdma_resp->rm_type = cpu_to_be32(rdma_type); - - /* Encode <nul> chunks lists */ - rdma_resp->rm_body.rm_chunks[0] = xdr_zero; - rdma_resp->rm_body.rm_chunks[1] = xdr_zero; - rdma_resp->rm_body.rm_chunks[2] = xdr_zero; -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 172b537f8cfc..f7b2daf72a86 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -606,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) dprintk("svcrdma: rqstp=%p\n", rqstp); - spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_lock(&rdma_xprt->sc_rq_dto_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { - ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); rdma_read_complete(rqstp, ctxt); goto complete; } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { - ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); } else { atomic_inc(&rdma_stat_rq_starve); clear_bit(XPT_DATA, &xprt->xpt_flags); ctxt = NULL; } - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); if (!ctxt) { /* This is the EAGAIN path. The svc_recv routine will * return -EAGAIN, the nfsd thread will go to call into diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index ad4d286a83c5..515221b16d09 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); + ctxt->sge[0].length = + svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, ctxt->sge[0].length, DMA_TO_DEVICE); @@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct rpcrdma_msg *rdma_argp; struct rpcrdma_msg *rdma_resp; struct rpcrdma_write_array *wr_ary, *rp_ary; - enum rpcrdma_proc reply_type; int ret; int inline_bytes; struct page *res_page; struct svc_rdma_req_map *vec; u32 inv_rkey; + __be32 *p; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!res_page) goto err0; rdma_resp = page_address(res_page); - if (rp_ary) - reply_type = RDMA_NOMSG; - else - reply_type = RDMA_MSG; - svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, - rdma_resp, reply_type); + + p = &rdma_resp->rm_xid; + *p++ = rdma_argp->rm_xid; + *p++ = rdma_argp->rm_vers; + *p++ = rdma->sc_fc_credits; + *p++ = rp_ary ? rdma_nomsg : rdma_msg; + + /* Start with empty chunks */ + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; /* Send any write-chunk data and build resp write-list */ if (wr_ary) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 39652d390a9c..c13a5c35ce14 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, ctxt = kmalloc(sizeof(*ctxt), flags); if (ctxt) { ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->free); - INIT_LIST_HEAD(&ctxt->dto_q); + INIT_LIST_HEAD(&ctxt->list); } return ctxt; } @@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) dprintk("svcrdma: No memory for RDMA ctxt\n"); return false; } - list_add(&ctxt->free, &xprt->sc_ctxts); + list_add(&ctxt->list, &xprt->sc_ctxts); } return true; } @@ -189,15 +188,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt = NULL; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used++; if (list_empty(&xprt->sc_ctxts)) goto out_empty; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del_init(&ctxt->free); - spin_unlock_bh(&xprt->sc_ctxt_lock); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&xprt->sc_ctxt_lock); out: ctxt->count = 0; @@ -209,15 +208,15 @@ out_empty: /* Either pre-allocation missed the mark, or send * queue accounting is broken. */ - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); ctxt = alloc_ctxt(xprt, GFP_NOIO); if (ctxt) goto out; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); return NULL; } @@ -254,10 +253,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - list_add(&ctxt->free, &xprt->sc_ctxts); - spin_unlock_bh(&xprt->sc_ctxt_lock); + list_add(&ctxt->list, &xprt->sc_ctxts); + spin_unlock(&xprt->sc_ctxt_lock); } static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) @@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) struct svc_rdma_op_ctxt *ctxt; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del(&ctxt->free); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); kfree(ctxt); } } @@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* All wc fields are now known to be valid */ ctxt->byte_len = wc->byte_len; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q); spin_unlock(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); @@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc) read_hdr = ctxt->read_hdr; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&read_hdr->dto_q, + list_add_tail(&read_hdr->list, &xprt->sc_read_complete_q); spin_unlock(&xprt->sc_rq_dto_lock); @@ -557,7 +556,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return NULL; svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); - INIT_LIST_HEAD(&cma_xprt->sc_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); @@ -571,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_map_lock); + /* + * Note that this implies that the underlying transport support + * has some form of congestion control (see RFC 7530 section 3.1 + * paragraph 2). For now, we assume that all supported RDMA + * transports are suitable here. + */ + set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags); + if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -923,14 +929,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) { struct svc_rdma_fastreg_mr *frmr = NULL; - spin_lock_bh(&rdma->sc_frmr_q_lock); + spin_lock(&rdma->sc_frmr_q_lock); if (!list_empty(&rdma->sc_frmr_q)) { frmr = list_entry(rdma->sc_frmr_q.next, struct svc_rdma_fastreg_mr, frmr_list); list_del_init(&frmr->frmr_list); frmr->sg_nents = 0; } - spin_unlock_bh(&rdma->sc_frmr_q_lock); + spin_unlock(&rdma->sc_frmr_q_lock); if (frmr) return frmr; @@ -943,10 +949,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, if (frmr) { ib_dma_unmap_sg(rdma->sc_cm_id->device, frmr->sg, frmr->sg_nents, frmr->direction); - spin_lock_bh(&rdma->sc_frmr_q_lock); + spin_lock(&rdma->sc_frmr_q_lock); WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); list_add(&frmr->frmr_list, &rdma->sc_frmr_q); - spin_unlock_bh(&rdma->sc_frmr_q_lock); + spin_unlock(&rdma->sc_frmr_q_lock); } } @@ -1002,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_requests); + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_bc_requests); newxprt->sc_rq_depth = newxprt->sc_max_requests + @@ -1027,13 +1034,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_sq_cq)) { dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); goto errout; @@ -1213,20 +1220,18 @@ static void __svc_rdma_free(struct work_struct *work) */ while (!list_empty(&rdma->sc_read_complete_q)) { struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(rdma->sc_read_complete_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); } /* Destroy queued, but not processed recv completions */ while (!list_empty(&rdma->sc_rq_dto_q)) { struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(rdma->sc_rq_dto_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); } diff --git a/net/tipc/node.c b/net/tipc/node.c index e9295fa3a554..4512e83652b1 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1505,19 +1505,21 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; struct tipc_node *n; - struct tipc_msg *hdr = buf_msg(skb); - int usr = msg_user(hdr); + struct tipc_msg *hdr; int bearer_id = b->identity; struct tipc_link_entry *le; - u16 bc_ack = msg_bcast_ack(hdr); u32 self = tipc_own_addr(net); - int rc = 0; + int usr, rc = 0; + u16 bc_ack; __skb_queue_head_init(&xmitq); - /* Ensure message is well-formed */ + /* Ensure message is well-formed before touching the header */ if (unlikely(!tipc_msg_validate(skb))) goto discard; + hdr = buf_msg(skb); + usr = msg_user(hdr); + bc_ack = msg_bcast_ack(hdr); /* Handle arrival of discovery or broadcast packet */ if (unlikely(msg_non_seq(hdr))) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5f3e87866438..0806dccdf507 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2836,14 +2836,8 @@ static unsigned int xfrm_mtu(const struct dst_entry *dst) return mtu ? : dst_mtu(dst->path); } -static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) -{ - return dst->path->ops->neigh_lookup(dst, skb, daddr); -} - -static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) +static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, + const void *daddr) { const struct dst_entry *path = dst->path; @@ -2857,6 +2851,25 @@ static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; } + return daddr; +} + +static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + const struct dst_entry *path = dst->path; + + if (!skb) + daddr = xfrm_get_dst_nexthop(dst, daddr); + return path->ops->neigh_lookup(path, skb, daddr); +} + +static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) +{ + const struct dst_entry *path = dst->path; + + daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); } diff --git a/tools/build/Makefile b/tools/build/Makefile index aaf7ed329a45..477f00eda591 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -35,8 +35,8 @@ all: $(OUTPUT)fixdep clean: $(call QUIET_CLEAN, fixdep) - $(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete - $(Q)rm -f fixdep + $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete + $(Q)rm -f $(OUTPUT)fixdep $(OUTPUT)fixdep-in.o: FORCE $(Q)$(MAKE) $(build)=fixdep diff --git a/tools/build/Makefile.include b/tools/build/Makefile.include index ad22e4e7bc59..d360f39a445b 100644 --- a/tools/build/Makefile.include +++ b/tools/build/Makefile.include @@ -3,4 +3,7 @@ build := -f $(srctree)/tools/build/Makefile.build dir=. obj fixdep: $(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep +fixdep-clean: + $(Q)$(MAKE) -C $(srctree)/tools/build clean + .PHONY: fixdep diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index f2ea78021450..7ce724fc0544 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -5225,13 +5225,13 @@ int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec) } /** - * pevent_data_prempt_count - parse the preempt count from the record + * pevent_data_preempt_count - parse the preempt count from the record * @pevent: a handle to the pevent * @rec: the record to parse * * This returns the preempt count from a record. */ -int pevent_data_prempt_count(struct pevent *pevent, struct pevent_record *rec) +int pevent_data_preempt_count(struct pevent *pevent, struct pevent_record *rec) { return parse_common_pc(pevent, rec->data); } diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index 74cecba87daa..66342804161c 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -710,7 +710,7 @@ void pevent_data_lat_fmt(struct pevent *pevent, int pevent_data_type(struct pevent *pevent, struct pevent_record *rec); struct event_format *pevent_data_event_from_type(struct pevent *pevent, int type); int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec); -int pevent_data_prempt_count(struct pevent *pevent, struct pevent_record *rec); +int pevent_data_preempt_count(struct pevent *pevent, struct pevent_record *rec); int pevent_data_flags(struct pevent *pevent, struct pevent_record *rec); const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid); struct cmdline; diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h index f7350fcedc70..a59e061c0b4a 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/arch.h @@ -31,9 +31,8 @@ #define INSN_CALL_DYNAMIC 8 #define INSN_RETURN 9 #define INSN_CONTEXT_SWITCH 10 -#define INSN_BUG 11 -#define INSN_NOP 12 -#define INSN_OTHER 13 +#define INSN_NOP 11 +#define INSN_OTHER 12 #define INSN_LAST INSN_OTHER int arch_decode_instruction(struct elf *elf, struct section *sec, diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 039636ffb6c8..6ac99e3266eb 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -118,9 +118,6 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, op2 == 0x35) /* sysenter, sysret */ *type = INSN_CONTEXT_SWITCH; - else if (op2 == 0x0b || op2 == 0xb9) - /* ud2 */ - *type = INSN_BUG; else if (op2 == 0x0d || op2 == 0x1f) /* nopl/nopw */ *type = INSN_NOP; diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index e8a1f699058a..5fc52ee3264c 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -51,7 +51,7 @@ struct instruction { unsigned int len, state; unsigned char type; unsigned long immediate; - bool alt_group, visited; + bool alt_group, visited, dead_end; struct symbol *call_dest; struct instruction *jump_dest; struct list_head alts; @@ -330,6 +330,54 @@ static int decode_instructions(struct objtool_file *file) } /* + * Find all uses of the unreachable() macro, which are code path dead ends. + */ +static int add_dead_ends(struct objtool_file *file) +{ + struct section *sec; + struct rela *rela; + struct instruction *insn; + bool found; + + sec = find_section_by_name(file->elf, ".rela__unreachable"); + if (!sec) + return 0; + + list_for_each_entry(rela, &sec->rela_list, list) { + if (rela->sym->type != STT_SECTION) { + WARN("unexpected relocation symbol type in .rela__unreachable"); + return -1; + } + insn = find_insn(file, rela->sym->sec, rela->addend); + if (insn) + insn = list_prev_entry(insn, list); + else if (rela->addend == rela->sym->sec->len) { + found = false; + list_for_each_entry_reverse(insn, &file->insn_list, list) { + if (insn->sec == rela->sym->sec) { + found = true; + break; + } + } + + if (!found) { + WARN("can't find unreachable insn at %s+0x%x", + rela->sym->sec->name, rela->addend); + return -1; + } + } else { + WARN("can't find unreachable insn at %s+0x%x", + rela->sym->sec->name, rela->addend); + return -1; + } + + insn->dead_end = true; + } + + return 0; +} + +/* * Warnings shouldn't be reported for ignored functions. */ static void add_ignores(struct objtool_file *file) @@ -843,6 +891,10 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; + ret = add_dead_ends(file); + if (ret) + return ret; + add_ignores(file); ret = add_jump_destinations(file); @@ -1037,13 +1089,13 @@ static int validate_branch(struct objtool_file *file, return 0; - case INSN_BUG: - return 0; - default: break; } + if (insn->dead_end) + return 0; + insn = next_insn_same_sec(file, insn); if (!insn) { WARN("%s: unexpected end of section", sec->name); diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index 8ffbd272952d..a89273d8e744 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -39,6 +39,10 @@ OPTIONS --verbose:: Be more verbose. (Show symbol address, etc) +-q:: +--quiet:: + Do not show any message. (Suppress -v) + -D:: --dump-raw-trace:: Dump raw trace in ASCII. diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt index 66dbe3dee74b..a79c84ae61aa 100644 --- a/tools/perf/Documentation/perf-diff.txt +++ b/tools/perf/Documentation/perf-diff.txt @@ -73,6 +73,10 @@ OPTIONS Be verbose, for instance, show the raw counts in addition to the diff. +-q:: +--quiet:: + Do not show any message. (Suppress -v) + -f:: --force:: Don't do ownership validation. diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 27256bc68eda..b16003ec14a7 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -157,7 +157,7 @@ OPTIONS -a:: --all-cpus:: - System-wide collection from all CPUs. + System-wide collection from all CPUs (default if no target is specified). -p:: --pid=:: diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index f2914f03ae7b..c04cc0647c16 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -25,6 +25,10 @@ OPTIONS --verbose:: Be more verbose. (show symbol address, etc) +-q:: +--quiet:: + Do not show any message. (Suppress -v) + -n:: --show-nr-samples:: Show the number of samples for each symbol diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index d96ccd4844df..aecf2a87e7d6 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -63,7 +63,7 @@ report:: -a:: --all-cpus:: - system-wide collection from all CPUs + system-wide collection from all CPUs (default if no target is specified) -c:: --scale:: diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 2b941efadb04..27c9fbca7bd9 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -175,6 +175,10 @@ PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG)) PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null) PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null) +ifeq ($(CC), clang) + PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS)) +endif + FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS) FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS) FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS) @@ -601,6 +605,9 @@ else PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS)) PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null) + ifeq ($(CC), clang) + PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS)) + endif FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS) ifneq ($(feature-libpython), 1) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 4da19b6ba94a..79fe31f20a17 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -726,13 +726,13 @@ config-clean: $(call QUIET_CLEAN, config) $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null -clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean +clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean fixdep-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \ - $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \ + $(OUTPUT)util/intel-pt-decoder/inat-tables.c \ $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \ $(OUTPUT)pmu-events/pmu-events.c $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index ebb628332a6e..4f52d85f5ebc 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -410,6 +410,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused) OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), + OPT_BOOLEAN('q', "quiet", &quiet, "do now show any message"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_BOOLEAN(0, "gtk", &annotate.use_gtk, "Use the GTK interface"), @@ -463,6 +464,9 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused) annotate.sym_hist_filter = argv[0]; } + if (quiet) + perf_quiet_option(); + file.path = input_name; annotate.session = perf_session__new(&file, false, &annotate.tool); diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 70a289347591..1b96a3122228 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -691,7 +691,7 @@ static void hists__process(struct hists *hists) hists__precompute(hists); hists__output_resort(hists, NULL); - hists__fprintf(hists, true, 0, 0, 0, stdout, + hists__fprintf(hists, !quiet, 0, 0, 0, stdout, symbol_conf.use_callchain); } @@ -739,12 +739,14 @@ static void data_process(void) hists__link(hists_base, hists); } - fprintf(stdout, "%s# Event '%s'\n#\n", first ? "" : "\n", - perf_evsel__name(evsel_base)); + if (!quiet) { + fprintf(stdout, "%s# Event '%s'\n#\n", first ? "" : "\n", + perf_evsel__name(evsel_base)); + } first = false; - if (verbose || data__files_cnt > 2) + if (verbose > 0 || ((data__files_cnt > 2) && !quiet)) data__fprintf(); /* Don't sort callchain for perf diff */ @@ -807,6 +809,7 @@ static const char * const diff_usage[] = { static const struct option options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), + OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"), OPT_BOOLEAN('b', "baseline-only", &show_baseline_only, "Show only items with match in baseline"), OPT_CALLBACK('c', "compute", &compute, @@ -1328,6 +1331,9 @@ int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused) argc = parse_options(argc, argv, options, diff_usage, 0); + if (quiet) + perf_quiet_option(); + if (symbol__init(NULL) < 0) return -1; diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index cd7bc4d104e2..6114e07ca613 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -42,8 +42,8 @@ static int parse_record_events(const struct option *opt, fprintf(stderr, "%-13s%-*s%s\n", e->tag, - verbose ? 25 : 0, - verbose ? perf_mem_events__name(j) : "", + verbose > 0 ? 25 : 0, + verbose > 0 ? perf_mem_events__name(j) : "", e->supported ? ": available" : ""); } exit(0); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6cd6776052e7..bc84a375295d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -432,7 +432,7 @@ static int record__open(struct record *rec) try_again: if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) { if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) { - if (verbose) + if (verbose > 0) ui__warning("%s\n", msg); goto try_again; } @@ -1677,8 +1677,12 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) argc = parse_options(argc, argv, record_options, record_usage, PARSE_OPT_STOP_AT_NON_OPTION); + if (quiet) + perf_quiet_option(); + + /* Make system wide (-a) the default target. */ if (!argc && target__none(&rec->opts.target)) - usage_with_options(record_usage, record_options); + rec->opts.target.system_wide = true; if (nr_cgroups && !rec->opts.target.system_wide) { usage_with_options_msg(record_usage, record_options, diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index dbd7fa028861..0a88670e56f3 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -320,6 +320,9 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report size_t size = sizeof(buf); int socked_id = hists->socket_filter; + if (quiet) + return 0; + if (symbol_conf.filter_relative) { nr_samples = hists->stats.nr_non_filtered_samples; nr_events = hists->stats.total_non_filtered_period; @@ -372,7 +375,11 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, { struct perf_evsel *pos; - fprintf(stdout, "#\n# Total Lost Samples: %" PRIu64 "\n#\n", evlist->stats.total_lost_samples); + if (!quiet) { + fprintf(stdout, "#\n# Total Lost Samples: %" PRIu64 "\n#\n", + evlist->stats.total_lost_samples); + } + evlist__for_each_entry(evlist, pos) { struct hists *hists = evsel__hists(pos); const char *evname = perf_evsel__name(pos); @@ -382,7 +389,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, continue; hists__fprintf_nr_sample_events(hists, rep, evname, stdout); - hists__fprintf(hists, true, 0, 0, rep->min_percent, stdout, + hists__fprintf(hists, !quiet, 0, 0, rep->min_percent, stdout, symbol_conf.use_callchain); fprintf(stdout, "\n\n"); } @@ -716,6 +723,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) "input file name"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), + OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, @@ -863,6 +871,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) report.symbol_filter_str = argv[0]; } + if (quiet) + perf_quiet_option(); + if (symbol_conf.vmlinux_name && access(symbol_conf.vmlinux_name, R_OK)) { pr_err("Invalid file: %s\n", symbol_conf.vmlinux_name); @@ -983,14 +994,14 @@ repeat: goto error; } - if (report.header || report.header_only) { + if ((report.header || report.header_only) && !quiet) { perf_session__fprintf_info(session, stdout, report.show_full_info); if (report.header_only) { ret = 0; goto error; } - } else if (use_browser == 0) { + } else if (use_browser == 0 && !quiet) { fputs("# To display the perf.data header info, please use --header/--header-only options.\n#\n", stdout); } @@ -1009,7 +1020,7 @@ repeat: * providing it only in verbose mode not to bloat too * much struct symbol. */ - if (verbose) { + if (verbose > 0) { /* * XXX: Need to provide a less kludgy way to ask for * more space per symbol, the u32 is for the index on diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 270eb2d8ca6b..b94cf0de715a 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -460,7 +460,7 @@ static struct task_desc *register_pid(struct perf_sched *sched, BUG_ON(!sched->tasks); sched->tasks[task->nr] = task; - if (verbose) + if (verbose > 0) printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm); return task; @@ -794,7 +794,7 @@ replay_wakeup_event(struct perf_sched *sched, const u32 pid = perf_evsel__intval(evsel, sample, "pid"); struct task_desc *waker, *wakee; - if (verbose) { + if (verbose > 0) { printf("sched_wakeup event %p\n", evsel); printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid); @@ -822,7 +822,7 @@ static int replay_switch_event(struct perf_sched *sched, int cpu = sample->cpu; s64 delta; - if (verbose) + if (verbose > 0) printf("sched_switch event %p\n", evsel); if (cpu >= MAX_CPUS || cpu < 0) @@ -870,7 +870,7 @@ static int replay_fork_event(struct perf_sched *sched, goto out_put; } - if (verbose) { + if (verbose > 0) { printf("fork event\n"); printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid); printf("... child: %s/%d\n", thread__comm_str(child), child->tid); @@ -1573,7 +1573,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp)); color_fprintf(stdout, color, " %12s secs ", stimestamp); - if (new_shortname || (verbose && sched_in->tid)) { + if (new_shortname || (verbose > 0 && sched_in->tid)) { const char *pid_color = color; if (thread__has_color(sched_in)) @@ -2050,7 +2050,7 @@ static void save_task_callchain(struct perf_sched *sched, if (thread__resolve_callchain(thread, cursor, evsel, sample, NULL, NULL, sched->max_stack + 2) != 0) { - if (verbose) + if (verbose > 0) error("Failed to resolve callchain. Skipping\n"); return; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f28719178b51..13b54999ad79 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -573,7 +573,7 @@ try_again: if (errno == EINVAL || errno == ENOSYS || errno == ENOENT || errno == EOPNOTSUPP || errno == ENXIO) { - if (verbose) + if (verbose > 0) ui__warning("%s event is not supported by the kernel.\n", perf_evsel__name(counter)); counter->supported = false; @@ -582,7 +582,7 @@ try_again: !(counter->leader->nr_members > 1)) continue; } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { - if (verbose) + if (verbose > 0) ui__warning("%s\n", msg); goto try_again; } @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; @@ -2445,8 +2445,9 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) } else if (big_num_opt == 0) /* User passed --no-big-num */ big_num = false; + /* Make system wide (-a) the default target. */ if (!argc && target__none(&target)) - usage_with_options(stat_usage, stat_options); + target.system_wide = true; if (run_count < 0) { pr_err("Run count must be a positive number\n"); @@ -2538,7 +2539,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) status = 0; for (run_idx = 0; forever || run_idx < run_count; run_idx++) { - if (run_count != 1 && verbose) + if (run_count != 1 && verbose > 0) fprintf(output, "[ perf stat: executing run #%d ... ]\n", run_idx + 1); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 5a7fd7af3a6d..ab9077915763 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -871,7 +871,7 @@ try_again: if (perf_evsel__open(counter, top->evlist->cpus, top->evlist->threads) < 0) { if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { - if (verbose) + if (verbose > 0) ui__warning("%s\n", msg); goto try_again; } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 40ef9b293d1b..256f1fac6f7e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1399,7 +1399,7 @@ static struct syscall *trace__syscall_info(struct trace *trace, return &trace->syscalls.table[id]; out_cant_read: - if (verbose) { + if (verbose > 0) { fprintf(trace->output, "Problems reading syscall %d", id); if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL) fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); @@ -1801,10 +1801,10 @@ static void print_location(FILE *f, struct perf_sample *sample, bool print_dso, bool print_sym) { - if ((verbose || print_dso) && al->map) + if ((verbose > 0 || print_dso) && al->map) fprintf(f, "%s@", al->map->dso->long_name); - if ((verbose || print_sym) && al->sym) + if ((verbose > 0 || print_sym) && al->sym) fprintf(f, "%s+0x%" PRIx64, al->sym->name, al->addr - al->sym->start); else if (al->map) diff --git a/tools/perf/pmu-events/json.c b/tools/perf/pmu-events/json.c index f67bbb0aa36e..0544398d6e2d 100644 --- a/tools/perf/pmu-events/json.c +++ b/tools/perf/pmu-events/json.c @@ -49,7 +49,7 @@ static char *mapfile(const char *fn, size_t *size) int err; int fd = open(fn, O_RDONLY); - if (fd < 0 && verbose && fn) { + if (fd < 0 && verbose > 0 && fn) { pr_err("Error opening events file '%s': %s\n", fn, strerror(errno)); } diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index 28d1605b0338..88dc51f4c27b 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -144,7 +144,7 @@ static int run_dir(const char *d, const char *perf) int vcnt = min(verbose, (int) sizeof(v) - 1); char cmd[3*PATH_MAX]; - if (verbose) + if (verbose > 0) vcnt++; snprintf(cmd, 3*PATH_MAX, PYTHON " %s/attr.py -d %s/attr/ -p %s %.*s", diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 37e326bfd2dc..83c4669cbc5b 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -299,7 +299,7 @@ static int run_test(struct test *test, int subtest) if (!dont_fork) { pr_debug("test child forked, pid %d\n", getpid()); - if (!verbose) { + if (verbose <= 0) { int nullfd = open("/dev/null", O_WRONLY); if (nullfd >= 0) { diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index ff5bc6363a79..d1f693041324 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -599,7 +599,7 @@ static int do_test_code_reading(bool try_kcore) continue; } - if (verbose) { + if (verbose > 0) { char errbuf[512]; perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf)); pr_debug("perf_evlist__open() failed!\n%s\n", errbuf); diff --git a/tools/perf/tests/fdarray.c b/tools/perf/tests/fdarray.c index a2b5ff9bf83d..bc5982f42dc3 100644 --- a/tools/perf/tests/fdarray.c +++ b/tools/perf/tests/fdarray.c @@ -19,7 +19,7 @@ static int fdarray__fprintf_prefix(struct fdarray *fda, const char *prefix, FILE { int printed = 0; - if (!verbose) + if (verbose <= 0) return 0; printed += fprintf(fp, "\n%s: ", prefix); diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index d357dab72e68..482b5365e68d 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -76,7 +76,7 @@ test_llvm__fetch_bpf_obj(void **p_obj_buf, * Skip this test if user's .perfconfig doesn't set [llvm] section * and clang is not found in $PATH, and this is not perf test -v */ - if (!force && (verbose == 0 && + if (!force && (verbose <= 0 && !llvm_param.user_set_param && llvm__search_clang())) { pr_debug("No clang and no verbosive, skip this test\n"); diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index aa9276bfe3e9..1dc838014422 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1808,7 +1808,7 @@ static void debug_warn(const char *warn, va_list params) { char msg[1024]; - if (!verbose) + if (verbose <= 0) return; vsnprintf(msg, sizeof(msg), warn, params); diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 541da7a68f91..87893f3ba5f1 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -172,13 +172,13 @@ int test__PERF_RECORD(int subtest __maybe_unused) err = perf_evlist__parse_sample(evlist, event, &sample); if (err < 0) { - if (verbose) + if (verbose > 0) perf_event__fprintf(event, stderr); pr_debug("Couldn't parse sample\n"); goto out_delete_evlist; } - if (verbose) { + if (verbose > 0) { pr_info("%" PRIu64" %d ", sample.time, sample.cpu); perf_event__fprintf(event, stderr); } diff --git a/tools/perf/tests/python-use.c b/tools/perf/tests/python-use.c index 7a52834ee0d0..fa79509da535 100644 --- a/tools/perf/tests/python-use.c +++ b/tools/perf/tests/python-use.c @@ -15,7 +15,7 @@ int test__python_use(int subtest __maybe_unused) int ret; if (asprintf(&cmd, "echo \"import sys ; sys.path.append('%s'); import perf\" | %s %s", - PYTHONPATH, PYTHON, verbose ? "" : "2> /dev/null") < 0) + PYTHONPATH, PYTHON, verbose > 0 ? "" : "2> /dev/null") < 0) return -1; ret = system(cmd) ? -1 : 0; diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index a4a4b4625ac3..f2d2e542d0ee 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -109,7 +109,7 @@ int test__thread_map_remove(int subtest __maybe_unused) TEST_ASSERT_VAL("failed to allocate thread_map", threads); - if (verbose) + if (verbose > 0) thread_map__fprintf(threads, stderr); TEST_ASSERT_VAL("failed to remove thread", @@ -117,7 +117,7 @@ int test__thread_map_remove(int subtest __maybe_unused) TEST_ASSERT_VAL("thread_map count != 1", threads->nr == 1); - if (verbose) + if (verbose > 0) thread_map__fprintf(threads, stderr); TEST_ASSERT_VAL("failed to remove thread", @@ -125,7 +125,7 @@ int test__thread_map_remove(int subtest __maybe_unused) TEST_ASSERT_VAL("thread_map count != 0", threads->nr == 0); - if (verbose) + if (verbose > 0) thread_map__fprintf(threads, stderr); TEST_ASSERT_VAL("failed to not remove thread", diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(&file, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index a5082331f246..862b043e5924 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -168,7 +168,7 @@ next_pair: err = -1; } - if (!verbose) + if (verbose <= 0) goto out; header_printed = false; diff --git a/tools/perf/ui/browsers/map.c b/tools/perf/ui/browsers/map.c index 98a34664bb7e..9ce142de536d 100644 --- a/tools/perf/ui/browsers/map.c +++ b/tools/perf/ui/browsers/map.c @@ -73,7 +73,7 @@ static int map_browser__run(struct map_browser *browser) if (ui_browser__show(&browser->b, browser->map->dso->long_name, "Press ESC to exit, %s / to search", - verbose ? "" : "restart with -v to use") < 0) + verbose > 0 ? "" : "restart with -v to use") < 0) return -1; while (1) { @@ -81,7 +81,7 @@ static int map_browser__run(struct map_browser *browser) switch (key) { case '/': - if (verbose) + if (verbose > 0) map_browser__search(browser); default: break; @@ -117,7 +117,7 @@ int map__browse(struct map *map) if (maxaddr < pos->end) maxaddr = pos->end; - if (verbose) { + if (verbose > 0) { u32 *idx = symbol__browser_index(pos); *idx = mb.b.nr_entries; } diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 18cfcdc90356..5d632dca672a 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -648,7 +648,7 @@ unsigned int hists__sort_list_width(struct hists *hists) ret += fmt->width(fmt, &dummy_hpp, hists); } - if (verbose && hists__has(hists, sym)) /* Addr + origin */ + if (verbose > 0 && hists__has(hists, sym)) /* Addr + origin */ ret += 3 + BITS_PER_LONG / 4; return ret; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 06cc04e5806a..273f21fa32b5 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1768,7 +1768,7 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map, printf("%-*.*s----\n", graph_dotted_len, graph_dotted_len, graph_dotted_line); - if (verbose) + if (verbose > 0) symbol__annotate_hits(sym, evsel); list_for_each_entry(pos, ¬es->src->source, node) { diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, &max_cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, &max_present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ int cpu__setup_cpunode_map(void); int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index c1838b643108..03eb81f30d0d 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -203,11 +203,28 @@ int perf_debug_option(const char *str) v = (v < 0) || (v > 10) ? 0 : v; } + if (quiet) + v = -1; + *var->ptr = v; free(s); return 0; } +int perf_quiet_option(void) +{ + struct debug_variable *var = &debug_variables[0]; + + /* disable all debug messages */ + while (var->name) { + *var->ptr = -1; + var++; + } + + quiet = true; + return 0; +} + #define DEBUG_WRAPPER(__n, __l) \ static int pr_ ## __n ## _wrapper(const char *fmt, ...) \ { \ diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index d242adc3d5a2..98832f5531d3 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -54,5 +54,6 @@ int veprintf(int level, int var, const char *fmt, va_list args); int perf_debug_option(const char *str); void perf_debug_setup(void); +int perf_quiet_option(void); #endif /* __PERF_DEBUG_H */ diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 3abe3373ce90..d38b62a700ca 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1058,7 +1058,7 @@ int dso__name_len(const struct dso *dso) { if (!dso) return strlen("[unknown]"); - if (verbose) + if (verbose > 0) return dso->long_name_len; return dso->short_name_len; diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3d12c16e5103..05714d548584 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -295,11 +295,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -505,24 +501,29 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) + ncpus = cpu__max_present_cpu(); + + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); return NULL; + } nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -532,10 +533,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL; @@ -1126,7 +1133,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int cpu_nr = ph->env.nr_cpus_online; + int cpu_nr = ph->env.nr_cpus_avail; nr = ph->env.nr_sibling_cores; str = ph->env.sibling_cores; @@ -1781,7 +1788,7 @@ static int process_cpu_topology(struct perf_file_section *section, u32 nr, i; char *str; struct strbuf sb; - int cpu_nr = ph->env.nr_cpus_online; + int cpu_nr = ph->env.nr_cpus_avail; u64 size = 0; ph->env.cpu = calloc(cpu_nr, sizeof(*ph->env.cpu)); @@ -1862,7 +1869,7 @@ static int process_cpu_topology(struct perf_file_section *section, if (ph->needs_swap) nr = bswap_32(nr); - if (nr > (u32)cpu_nr) { + if (nr != (u32)-1 && nr > (u32)cpu_nr) { pr_debug("socket_id number is too big." "You may need to upgrade the perf tool.\n"); goto free_cpu; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 32c6a939e4cc..eaf72a938fb4 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -69,7 +69,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) */ if (h->ms.sym) { symlen = h->ms.sym->namelen + 4; - if (verbose) + if (verbose > 0) symlen += BITS_PER_LONG / 4 + 2 + 3; hists__new_col_len(hists, HISTC_SYMBOL, symlen); } else { @@ -93,7 +93,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) if (h->branch_info) { if (h->branch_info->from.sym) { symlen = (int)h->branch_info->from.sym->namelen + 4; - if (verbose) + if (verbose > 0) symlen += BITS_PER_LONG / 4 + 2 + 3; hists__new_col_len(hists, HISTC_SYMBOL_FROM, symlen); @@ -107,7 +107,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) if (h->branch_info->to.sym) { symlen = (int)h->branch_info->to.sym->namelen + 4; - if (verbose) + if (verbose > 0) symlen += BITS_PER_LONG / 4 + 2 + 3; hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 281e44af31e2..67a8aebc67ab 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2318,24 +2318,20 @@ int parse_events__is_hardcoded_term(struct parse_events_term *term) return term->type_term != PARSE_EVENTS__TERM_TYPE_USER; } -static int new_term(struct parse_events_term **_term, int type_val, - int type_term, char *config, - char *str, u64 num, int err_term, int err_val) +static int new_term(struct parse_events_term **_term, + struct parse_events_term *temp, + char *str, u64 num) { struct parse_events_term *term; - term = zalloc(sizeof(*term)); + term = malloc(sizeof(*term)); if (!term) return -ENOMEM; + *term = *temp; INIT_LIST_HEAD(&term->list); - term->type_val = type_val; - term->type_term = type_term; - term->config = config; - term->err_term = err_term; - term->err_val = err_val; - switch (type_val) { + switch (term->type_val) { case PARSE_EVENTS__TERM_TYPE_NUM: term->val.num = num; break; @@ -2353,15 +2349,22 @@ static int new_term(struct parse_events_term **_term, int type_val, int parse_events_term__num(struct parse_events_term **term, int type_term, char *config, u64 num, + bool no_value, void *loc_term_, void *loc_val_) { YYLTYPE *loc_term = loc_term_; YYLTYPE *loc_val = loc_val_; - return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term, - config, NULL, num, - loc_term ? loc_term->first_column : 0, - loc_val ? loc_val->first_column : 0); + struct parse_events_term temp = { + .type_val = PARSE_EVENTS__TERM_TYPE_NUM, + .type_term = type_term, + .config = config, + .no_value = no_value, + .err_term = loc_term ? loc_term->first_column : 0, + .err_val = loc_val ? loc_val->first_column : 0, + }; + + return new_term(term, &temp, NULL, num); } int parse_events_term__str(struct parse_events_term **term, @@ -2371,37 +2374,45 @@ int parse_events_term__str(struct parse_events_term **term, YYLTYPE *loc_term = loc_term_; YYLTYPE *loc_val = loc_val_; - return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term, - config, str, 0, - loc_term ? loc_term->first_column : 0, - loc_val ? loc_val->first_column : 0); + struct parse_events_term temp = { + .type_val = PARSE_EVENTS__TERM_TYPE_STR, + .type_term = type_term, + .config = config, + .err_term = loc_term ? loc_term->first_column : 0, + .err_val = loc_val ? loc_val->first_column : 0, + }; + + return new_term(term, &temp, str, 0); } int parse_events_term__sym_hw(struct parse_events_term **term, char *config, unsigned idx) { struct event_symbol *sym; + struct parse_events_term temp = { + .type_val = PARSE_EVENTS__TERM_TYPE_STR, + .type_term = PARSE_EVENTS__TERM_TYPE_USER, + .config = config ?: (char *) "event", + }; BUG_ON(idx >= PERF_COUNT_HW_MAX); sym = &event_symbols_hw[idx]; - if (config) - return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, - PARSE_EVENTS__TERM_TYPE_USER, config, - (char *) sym->symbol, 0, 0, 0); - else - return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, - PARSE_EVENTS__TERM_TYPE_USER, - (char *) "event", (char *) sym->symbol, - 0, 0, 0); + return new_term(term, &temp, (char *) sym->symbol, 0); } int parse_events_term__clone(struct parse_events_term **new, struct parse_events_term *term) { - return new_term(new, term->type_val, term->type_term, term->config, - term->val.str, term->val.num, - term->err_term, term->err_val); + struct parse_events_term temp = { + .type_val = term->type_val, + .type_term = term->type_term, + .config = term->config, + .err_term = term->err_term, + .err_val = term->err_val, + }; + + return new_term(new, &temp, term->val.str, term->val.num); } void parse_events_terms__purge(struct list_head *terms) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index da246a3ddb69..1af6a267c21b 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -94,6 +94,7 @@ struct parse_events_term { int type_term; struct list_head list; bool used; + bool no_value; /* error string indexes for within parsed string */ int err_term; @@ -122,6 +123,7 @@ void parse_events__shrink_config_terms(void); int parse_events__is_hardcoded_term(struct parse_events_term *term); int parse_events_term__num(struct parse_events_term **term, int type_term, char *config, u64 num, + bool novalue, void *loc_term, void *loc_val); int parse_events_term__str(struct parse_events_term **term, int type_term, char *config, char *str, diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index a14b47ab3879..30f018ea1370 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -252,7 +252,7 @@ PE_KERNEL_PMU_EVENT sep_dc if (!strcasecmp(alias->name, $1)) { ALLOC_LIST(head); ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, - $1, 1, &@1, NULL)); + $1, 1, false, &@1, NULL)); list_add_tail(&term->list, head); if (!parse_events_add_pmu(data, list, @@ -282,7 +282,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc ALLOC_LIST(head); ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, - &pmu_name, 1, &@1, NULL)); + &pmu_name, 1, false, &@1, NULL)); list_add_tail(&term->list, head); ALLOC_LIST(list); @@ -548,7 +548,7 @@ PE_NAME '=' PE_VALUE struct parse_events_term *term; ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, - $1, $3, &@1, &@3)); + $1, $3, false, &@1, &@3)); $$ = term; } | @@ -566,7 +566,7 @@ PE_NAME struct parse_events_term *term; ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, - $1, 1, &@1, NULL)); + $1, 1, true, &@1, NULL)); $$ = term; } | @@ -591,7 +591,7 @@ PE_TERM '=' PE_VALUE { struct parse_events_term *term; - ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, &@1, &@3)); + ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, false, &@1, &@3)); $$ = term; } | @@ -599,7 +599,7 @@ PE_TERM { struct parse_events_term *term; - ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL)); + ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, true, &@1, NULL)); $$ = term; } | @@ -620,7 +620,7 @@ PE_NAME array '=' PE_VALUE struct parse_events_term *term; ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, - $1, $4, &@1, &@4)); + $1, $4, false, &@1, &@4)); term->array = $2; $$ = term; } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 49bfee0e3d9e..12f84dd2ac5d 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -745,7 +745,7 @@ static int pmu_resolve_param_term(struct parse_events_term *term, } } - if (verbose) + if (verbose > 0) printf("Required parameter '%s' not specified\n", term->config); return -1; @@ -803,7 +803,7 @@ static int pmu_config_term(struct list_head *formats, format = pmu_find_format(formats, term->config); if (!format) { - if (verbose) + if (verbose > 0) printf("Invalid event/parameter '%s'\n", term->config); if (err) { char *pmu_term = pmu_formats_string(formats); @@ -834,11 +834,20 @@ static int pmu_config_term(struct list_head *formats, * Either directly use a numeric term, or try to translate string terms * using event parameters. */ - if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) + if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) { + if (term->no_value && + bitmap_weight(format->bits, PERF_PMU_FORMAT_BITS) > 1) { + if (err) { + err->idx = term->err_val; + err->str = strdup("no value assigned for term"); + } + return -EINVAL; + } + val = term->val.num; - else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) { + } else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) { if (strcmp(term->val.str, "?")) { - if (verbose) { + if (verbose > 0) { pr_info("Invalid sysfs entry %s=%s\n", term->config, term->val.str); } @@ -1223,7 +1232,7 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag, printf("%*s", 8, "["); wordwrap(aliases[j].desc, 8, columns, 0); printf("]\n"); - if (verbose) + if (verbose > 0) printf("%*s%s/%s/\n", 8, "", aliases[j].pmu, aliases[j].str); } else printf(" %-50s [Kernel PMU event]\n", aliases[j].name); diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 35f5b7b7715c..28fb62c32678 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -594,7 +594,7 @@ static int find_perf_probe_point_from_dwarf(struct probe_trace_point *tp, pr_debug("try to find information at %" PRIx64 " in %s\n", addr, tp->module ? : "kernel"); - dinfo = debuginfo_cache__open(tp->module, verbose == 0); + dinfo = debuginfo_cache__open(tp->module, verbose <= 0); if (dinfo) ret = debuginfo__find_probe_point(dinfo, (unsigned long)addr, pp); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 581e0efd6356..783326cfbaa6 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -369,10 +369,10 @@ static PyObject *python_process_callchain(struct perf_sample *sample, if (node->map) { struct map *map = node->map; const char *dsoname = "[unknown]"; - if (map && map->dso && (map->dso->name || map->dso->long_name)) { + if (map && map->dso) { if (symbol_conf.show_kernel_path && map->dso->long_name) dsoname = map->dso->long_name; - else if (map->dso->name) + else dsoname = map->dso->name; } pydict_set_item_string_decref(pyelem, "dso", diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 4cdbc8f5f14d..1dd617d116b5 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -932,7 +932,7 @@ static void branch_stack__printf(struct perf_sample *sample) printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n", i, e->from, e->to, - e->flags.cycles, + (unsigned short)e->flags.cycles, e->flags.mispred ? "M" : " ", e->flags.predicted ? "P" : " ", e->flags.abort ? "A" : " ", diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index c8680984d2d6..af415febbc46 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -1,8 +1,15 @@ #!/usr/bin/python2 -from distutils.core import setup, Extension from os import getenv +cc = getenv("CC") +if cc == "clang": + from _sysconfigdata import build_time_vars + from re import sub + build_time_vars["CFLAGS"] = sub("-specs=[^ ]+", "", build_time_vars["CFLAGS"]) + +from distutils.core import setup, Extension + from distutils.command.build_ext import build_ext as _build_ext from distutils.command.install_lib import install_lib as _install_lib diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index df622f4e301e..0ff622288d24 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -151,7 +151,7 @@ static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r) if (!dso_l || !dso_r) return cmp_null(dso_r, dso_l); - if (verbose) { + if (verbose > 0) { dso_name_l = dso_l->long_name; dso_name_r = dso_r->long_name; } else { @@ -172,8 +172,8 @@ static int _hist_entry__dso_snprintf(struct map *map, char *bf, size_t size, unsigned int width) { if (map && map->dso) { - const char *dso_name = !verbose ? map->dso->short_name : - map->dso->long_name; + const char *dso_name = verbose > 0 ? map->dso->long_name : + map->dso->short_name; return repsep_snprintf(bf, size, "%-*.*s", width, width, dso_name); } @@ -261,7 +261,7 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym, { size_t ret = 0; - if (verbose) { + if (verbose > 0) { char o = map ? dso__symtab_origin(map->dso) : '!'; ret += repsep_snprintf(bf, size, "%-#*llx %c ", BITS_PER_LONG / 4 + 2, ip, o); diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 39345c2ddfc2..0d51334a9b46 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -344,7 +344,7 @@ int perf_stat_process_counter(struct perf_stat_config *config, for (i = 0; i < 3; i++) update_stats(&ps->res_stats[i], count[i]); - if (verbose) { + if (verbose > 0) { fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", perf_evsel__name(counter), count[0], count[1], count[2]); } diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index adbc6c02c3aa..4e59ddeb4eda 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -213,7 +213,7 @@ static bool want_demangle(bool is_kernel_sym) static char *demangle_sym(struct dso *dso, int kmodule, const char *elf_name) { - int demangle_flags = verbose ? (DMGL_PARAMS | DMGL_ANSI) : DMGL_NO_OPTS; + int demangle_flags = verbose > 0 ? (DMGL_PARAMS | DMGL_ANSI) : DMGL_NO_OPTS; char *demangled = NULL; /* |