diff options
Diffstat (limited to 'arch/x86/kernel')
71 files changed, 624 insertions, 5922 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2c833d8c4141..1e994754d323 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,6 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o @@ -46,6 +45,7 @@ obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o +obj-y += resource.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o @@ -58,7 +58,6 @@ obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ -obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o @@ -82,7 +81,6 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o @@ -104,14 +102,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o -obj-$(CONFIG_SCx200) += scx200.o -scx200-y += scx200_32.o - -obj-$(CONFIG_OLPC) += olpc.o -obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o -obj-$(CONFIG_X86_MRST) += mrst.o - microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o @@ -124,7 +114,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f19d6679600f..7235e5fbdb6d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -518,35 +518,62 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) return 0; } -/* - * success: return IRQ number (>=0) - * failure: return < 0 - */ -int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +static int acpi_register_gsi_pic(struct device *dev, u32 gsi, + int trigger, int polarity) { - unsigned int irq; - unsigned int plat_gsi = gsi; - #ifdef CONFIG_PCI /* * Make sure all (legacy) PCI IRQs are set as level-triggered. */ - if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); - } + if (trigger == ACPI_LEVEL_SENSITIVE) + eisa_set_level_irq(gsi); #endif + return gsi; +} + +static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ #ifdef CONFIG_X86_IO_APIC - if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); - } + gsi = mp_register_gsi(dev, gsi, trigger, polarity); #endif + + return gsi; +} + +int (*__acpi_register_gsi)(struct device *dev, u32 gsi, + int trigger, int polarity) = acpi_register_gsi_pic; + +/* + * success: return IRQ number (>=0) + * failure: return < 0 + */ +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + unsigned int irq; + unsigned int plat_gsi = gsi; + + plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity); irq = gsi_to_irq(plat_gsi); return irq; } +void __init acpi_set_irq_model_pic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_PIC; + __acpi_register_gsi = acpi_register_gsi_pic; + acpi_ioapic = 0; +} + +void __init acpi_set_irq_model_ioapic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; + __acpi_register_gsi = acpi_register_gsi_ioapic; + acpi_ioapic = 1; +} + /* * ACPI based hotplug support for CPU */ @@ -1264,8 +1291,7 @@ static void __init acpi_process_madt(void) */ error = acpi_parse_madt_ioapic_entries(); if (!error) { - acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; - acpi_ioapic = 1; + acpi_set_irq_model_ioapic(); smp_found_config = 1; } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index e1252074ea40..69fd72aa5594 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -13,6 +13,10 @@ #include <asm/segment.h> #include <asm/desc.h> +#ifdef CONFIG_X86_32 +#include <asm/pgtable.h> +#endif + #include "realmode/wakeup.h" #include "sleep.h" @@ -91,7 +95,7 @@ int acpi_save_state_mem(void) #ifndef CONFIG_64BIT header->pmode_entry = (u32)&wakeup_pmode_return; - header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); + header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a36bb90aef53..5079f24c955a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -638,71 +638,32 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) atomic_set(&stop_machine_first, 1); wrote_text = 0; /* Use __stop_machine() because the caller already got online_cpus. */ - __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); return addr; } #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) -unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; +#ifdef CONFIG_X86_64 +unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; +#else +unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; +#endif void __init arch_init_ideal_nop5(void) { - extern const unsigned char ftrace_test_p6nop[]; - extern const unsigned char ftrace_test_nop5[]; - extern const unsigned char ftrace_test_jmp[]; - int faulted = 0; - /* - * There is no good nop for all x86 archs. - * We will default to using the P6_NOP5, but first we - * will test to make sure that the nop will actually - * work on this CPU. If it faults, we will then - * go to a lesser efficient 5 byte nop. If that fails - * we then just use a jmp as our nop. This isn't the most - * efficient nop, but we can not use a multi part nop - * since we would then risk being preempted in the middle - * of that nop, and if we enabled tracing then, it might - * cause a system crash. + * There is no good nop for all x86 archs. This selection + * algorithm should be unified with the one in find_nop_table(), + * but this should be good enough for now. * - * TODO: check the cpuid to determine the best nop. + * For cases other than the ones below, use the safe (as in + * always functional) defaults above. */ - asm volatile ( - "ftrace_test_jmp:" - "jmp ftrace_test_p6nop\n" - "nop\n" - "nop\n" - "nop\n" /* 2 byte jmp + 3 bytes */ - "ftrace_test_p6nop:" - P6_NOP5 - "jmp 1f\n" - "ftrace_test_nop5:" - ".byte 0x66,0x66,0x66,0x66,0x90\n" - "1:" - ".section .fixup, \"ax\"\n" - "2: movl $1, %0\n" - " jmp ftrace_test_nop5\n" - "3: movl $2, %0\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(ftrace_test_p6nop, 2b) - _ASM_EXTABLE(ftrace_test_nop5, 3b) - : "=r"(faulted) : "0" (faulted)); - - switch (faulted) { - case 0: - pr_info("converting mcount calls to 0f 1f 44 00 00\n"); - memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5); - break; - case 1: - pr_info("converting mcount calls to 66 66 66 66 90\n"); - memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5); - break; - case 2: - pr_info("converting mcount calls to jmp . + 5\n"); - memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5); - break; - } - +#ifdef CONFIG_X86_64 + /* Don't use these on 32 bits due to broken virtualizers */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + memcpy(ideal_nop5, p6_nops[5], 5); +#endif } #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0d5d07f2253e..30fdce8c5c17 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -52,7 +52,6 @@ #include <asm/mce.h> #include <asm/kvm_para.h> #include <asm/tsc.h> -#include <asm/atomic.h> unsigned int num_processors; @@ -1390,6 +1389,14 @@ void __cpuinit end_local_APIC_setup(void) setup_apic_nmi_watchdog(NULL); apic_pm_activate(); + + /* + * Now that local APIC setup is completed for BP, configure the fault + * handling for interrupt remapping. + */ + if (!smp_processor_id() && intr_remapping_enabled) + enable_drhd_fault_handling(); + } #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index cefd6942f0e9..62f6e1e55b90 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -17,15 +17,16 @@ #include <linux/nmi.h> #include <linux/module.h> -/* For reliability, we're prepared to waste bits here. */ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; - u64 hw_nmi_get_sample_period(void) { return (u64)(cpu_khz) * 1000 * 60; } #ifdef ARCH_HAS_NMI_WATCHDOG + +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + void arch_trigger_all_cpu_backtrace(void) { int i; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4f026a632c95..943d814ef8e4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2434,13 +2434,12 @@ static void ack_apic_level(struct irq_data *data) { struct irq_cfg *cfg = data->chip_data; int i, do_unmask_irq = 0, irq = data->irq; - struct irq_desc *desc = irq_to_desc(irq); unsigned long v; irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; mask_ioapic(cfg); } @@ -3113,7 +3112,7 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - if (intr_remapping_enabled) + if (irq_remapped(cfg)) free_irte(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); @@ -3335,7 +3334,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) return 0; } -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { int node, ret, sub_handle, index = 0; unsigned int irq, irq_want; @@ -3393,7 +3392,7 @@ error: return ret; } -void arch_teardown_msi_irq(unsigned int irq) +void native_teardown_msi_irq(unsigned int irq) { destroy_irq(irq); } @@ -3417,6 +3416,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); dmar_msi_write(irq, &msg); @@ -3654,6 +3654,11 @@ static void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +int get_nr_irqs_gsi(void) +{ + return nr_irqs_gsi; +} + #ifdef CONFIG_SPARSE_IRQ int __init arch_probe_nr_irqs(void) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index f9e4e6a54073..d8c4a6feb286 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -79,13 +79,6 @@ void __init default_setup_apic_routing(void) /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; } - - /* - * Now that apic routing model is selected, configure the - * fault handling for intr remapping. - */ - if (intr_remapping_enabled) - enable_drhd_fault_handling(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f744f54cb248..c1c52c341f40 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -41,8 +41,11 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +static union uvh_apicid uvh_apicid; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +unsigned int uv_apicid_hibits; +EXPORT_SYMBOL_GPL(uv_apicid_hibits); static DEFINE_SPINLOCK(uv_nmi_lock); static inline bool is_GRU_range(u64 start, u64 end) @@ -70,12 +73,44 @@ static int early_get_nodeid(void) return node_id.s.node_id; } +static void __init early_get_apic_pnode_shift(void) +{ + unsigned long *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr)); + uvh_apicid.v = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + if (!uvh_apicid.v) + /* + * Old bios, use default value + */ + uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; +} + +/* + * Add an extra bit as dictated by bios to the destination apicid of + * interrupts potentially passing through the UV HUB. This prevents + * a deadlock between interrupts and IO port operations. + */ +static void __init uv_set_apicid_hibit(void) +{ + union uvh_lb_target_physical_apic_id_mask_u apicid_mask; + unsigned long *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | + UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr)); + apicid_mask.v = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK; +} + static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { int nodeid; if (!strcmp(oem_id, "SGI")) { nodeid = early_get_nodeid(); + early_get_apic_pnode_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) @@ -84,8 +119,9 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - nodeid << (UV_APIC_PNODE_SHIFT - 1); + nodeid << (uvh_apicid.s.pnode_shift - 1); uv_system_type = UV_NON_UNIQUE_APIC; + uv_set_apicid_hibit(); return 1; } } @@ -139,6 +175,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri int pnode; pnode = uv_apicid_to_pnode(phys_apicid); + phys_apicid |= uv_apicid_hibits; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | @@ -220,7 +257,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) int cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); + return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; else return BAD_APICID; } @@ -239,7 +276,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - return per_cpu(x86_cpu_to_apicid, cpu); + return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -363,14 +400,14 @@ struct redir_addr { #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT static __initdata struct redir_addr redir_addrs[] = { - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, }; static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { - union uvh_si_alias0_overlay_config_u alias; + union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; int i; @@ -644,7 +681,7 @@ void uv_nmi_init(void) void __init uv_system_init(void) { - union uvh_si_addr_map_config_u m_n_config; + union uvh_rh_gam_config_mmr_u m_n_config; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; @@ -654,7 +691,7 @@ void __init uv_system_init(void) map_low_mmrs(); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; mmr_base = @@ -716,6 +753,10 @@ void __init uv_system_init(void) int apicid = per_cpu(x86_cpu_to_apicid, cpu); nid = cpu_to_node(cpu); + /* + * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); + */ + uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 4c9c67bf09b7..0e4f24c2a746 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -189,8 +189,8 @@ * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. * * [This document is available free from Intel by calling 800.628.8686 (fax - * 916.356.6100) or 800.548.4725; or via anonymous ftp from - * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also + * 916.356.6100) or 800.548.4725; or from + * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also * available from Microsoft by calling 206.882.8080.] * * APM 1.2 Reference: @@ -1926,6 +1926,7 @@ static const struct file_operations apm_bios_fops = { .unlocked_ioctl = do_ioctl, .open = do_open, .release = do_release, + .llseek = noop_llseek, }; static struct miscdevice apm_device = { diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index dfdbf6403895..1a4088dda37a 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -99,9 +99,7 @@ void foo(void) DEFINE(PAGE_SIZE_asm, PAGE_SIZE); DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); - DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); - DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); - DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); + DEFINE(THREAD_SIZE_asm, THREAD_SIZE); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c deleted file mode 100644 index 8bc57baaa9ad..000000000000 --- a/arch/x86/kernel/bios_uv.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * BIOS run time interface routines. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson <rja@sgi.com> - */ - -#include <linux/efi.h> -#include <asm/efi.h> -#include <linux/io.h> -#include <asm/uv/bios.h> -#include <asm/uv/uv_hub.h> - -static struct uv_systab uv_systab; - -s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) -{ - struct uv_systab *tab = &uv_systab; - s64 ret; - - if (!tab->function) - /* - * BIOS does not support UV systab - */ - return BIOS_STATUS_UNIMPLEMENTED; - - ret = efi_call6((void *)__va(tab->function), (u64)which, - a1, a2, a3, a4, a5); - return ret; -} -EXPORT_SYMBOL_GPL(uv_bios_call); - -s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) -{ - unsigned long bios_flags; - s64 ret; - - local_irq_save(bios_flags); - ret = uv_bios_call(which, a1, a2, a3, a4, a5); - local_irq_restore(bios_flags); - - return ret; -} - -s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) -{ - s64 ret; - - preempt_disable(); - ret = uv_bios_call(which, a1, a2, a3, a4, a5); - preempt_enable(); - - return ret; -} - - -long sn_partition_id; -EXPORT_SYMBOL_GPL(sn_partition_id); -long sn_coherency_id; -EXPORT_SYMBOL_GPL(sn_coherency_id); -long sn_region_size; -EXPORT_SYMBOL_GPL(sn_region_size); -long system_serial_number; -EXPORT_SYMBOL_GPL(system_serial_number); -int uv_type; -EXPORT_SYMBOL_GPL(uv_type); - - -s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, - long *region, long *ssn) -{ - s64 ret; - u64 v0, v1; - union partition_info_u part; - - ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc, - (u64)(&v0), (u64)(&v1), 0, 0); - if (ret != BIOS_STATUS_SUCCESS) - return ret; - - part.val = v0; - if (uvtype) - *uvtype = part.hub_version; - if (partid) - *partid = part.partition_id; - if (coher) - *coher = part.coherence_id; - if (region) - *region = part.region_size; - if (ssn) - *ssn = v1; - return ret; -} -EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); - -int -uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, - unsigned long *intr_mmr_offset) -{ - u64 watchlist; - s64 ret; - - /* - * bios returns watchlist number or negative error number. - */ - ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, - mq_size, (u64)intr_mmr_offset, - (u64)&watchlist, 0); - if (ret < BIOS_STATUS_SUCCESS) - return ret; - - return watchlist; -} -EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc); - -int -uv_bios_mq_watchlist_free(int blade, int watchlist_num) -{ - return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE, - blade, watchlist_num, 0, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); - -s64 -uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) -{ - return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len, - perms, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); - -s64 -uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) -{ - s64 ret; - - ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, - (u64)addr, buf, (u64)len, 0); - return ret; -} -EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); - -s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) -{ - return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, - (u64)ticks_per_second, 0, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_freq_base); - -/* - * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target - * @decode: true to enable target, false to disable target - * @domain: PCI domain number - * @bus: PCI bus number - * - * Returns: - * 0: Success - * -EINVAL: Invalid domain or bus number - * -ENOSYS: Capability not available - * -EBUSY: Legacy VGA I/O cannot be retargeted at this time - */ -int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) -{ - return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, - (u64)decode, (u64)domain, (u64)bus, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); - - -#ifdef CONFIG_EFI -void uv_bios_init(void) -{ - struct uv_systab *tab; - - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || - (efi.uv_systab == (unsigned long)NULL)) { - printk(KERN_CRIT "No EFI UV System Table.\n"); - uv_systab.function = (unsigned long)NULL; - return; - } - - tab = (struct uv_systab *)ioremap(efi.uv_systab, - sizeof(struct uv_systab)); - if (strncmp(tab->signature, "UVST", 4) != 0) - printk(KERN_ERR "bad signature in UV system table!"); - - /* - * Copy table to permanent spot for later use. - */ - memcpy(&uv_systab, tab, sizeof(struct uv_systab)); - iounmap(tab); - - printk(KERN_INFO "EFI UV System Table Revision %d\n", - uv_systab.revision); -} -#else /* !CONFIG_EFI */ - -void uv_bios_init(void) { } -#endif diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cd8da247dda1..a2baafb2fe6d 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -701,6 +701,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) per_cpu(acfreq_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); + kfree(data->freq_table); kfree(data); } diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index 733093d60436..141abebc4516 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -393,7 +393,7 @@ static struct cpufreq_driver nforce2_driver = { * Detects nForce2 A2 and C1 stepping * */ -static unsigned int nforce2_detect_chipset(void) +static int nforce2_detect_chipset(void) { nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index fc09f142d94d..d9f51367666b 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -35,7 +35,7 @@ static unsigned int longrun_low_freq, longrun_high_freq; * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS * and MSR_TMTA_LONGRUN_CTRL */ -static void __init longrun_get_policy(struct cpufreq_policy *policy) +static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) { u32 msr_lo, msr_hi; @@ -165,7 +165,7 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, +static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, unsigned int *high_freq) { u32 msr_lo, msr_hi; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 695f17731e23..d16c2c53d6bf 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -284,9 +284,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) /* Don't do the funky fallback heuristics the AMD version employs for now. */ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) - node = first_node(node_online_map); - else if (!node_online(node)) { + if (node == NUMA_NO_NODE || !node_online(node)) { /* reuse the value from init_cpu_to_node() */ node = cpu_to_node(cpu); } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 12cd823c8d03..17ad03366211 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,6 +327,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 8a85dd1b1aa1..1e8d66c1336a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = { .release = seq_release, .read = seq_read, .write = severities_coverage_write, + .llseek = seq_lseek, }; static int __init severities_debugfs_init(void) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ed41562909fe..7a35b72d7c03 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1665,6 +1665,7 @@ struct file_operations mce_chrdev_ops = { .read = mce_read, .poll = mce_poll, .unlocked_ioctl = mce_ioctl, + .llseek = no_llseek, }; EXPORT_SYMBOL_GPL(mce_chrdev_ops); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fe73c1844a9a..6d75b9145b13 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -49,7 +49,6 @@ static unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; unsigned long size, len = 0; struct page *page; void *map; @@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) offset = addr & (PAGE_SIZE - 1); size = min(PAGE_SIZE - offset, n - len); - map = kmap_atomic(page, type); + map = kmap_atomic(page); memcpy(to, map+offset, size); - kunmap_atomic(map, type); + kunmap_atomic(map); put_page(page); len += size; @@ -238,6 +237,7 @@ struct x86_pmu { * Intel DebugStore bits */ int bts, pebs; + int bts_active, pebs_active; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; @@ -381,7 +381,21 @@ static void release_pmc_hardware(void) {} #endif -static int reserve_ds_buffers(void); +static bool check_hw_exists(void) +{ + u64 val, val_new = 0; + int ret = 0; + + val = 0xabcdUL; + ret |= checking_wrmsrl(x86_pmu.perfctr, val); + ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); + if (ret || val != val_new) + return false; + + return true; +} + +static void reserve_ds_buffers(void); static void release_ds_buffers(void); static void hw_perf_event_destroy(struct perf_event *event) @@ -478,7 +492,7 @@ static int x86_setup_perfctr(struct perf_event *event) if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && (hwc->sample_period == 1)) { /* BTS is not supported by this architecture. */ - if (!x86_pmu.bts) + if (!x86_pmu.bts_active) return -EOPNOTSUPP; /* BTS is currently only allowed for user-mode. */ @@ -497,12 +511,13 @@ static int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs) + if (x86_pmu.pebs_active) { precise++; - /* Support for IP fixup */ - if (x86_pmu.lbr_nr) - precise++; + /* Support for IP fixup */ + if (x86_pmu.lbr_nr) + precise++; + } if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -544,11 +559,8 @@ static int __x86_pmu_event_init(struct perf_event *event) if (atomic_read(&active_events) == 0) { if (!reserve_pmc_hardware()) err = -EBUSY; - else { - err = reserve_ds_buffers(); - if (err) - release_pmc_hardware(); - } + else + reserve_ds_buffers(); } if (!err) atomic_inc(&active_events); @@ -1374,6 +1386,12 @@ void __init init_hw_perf_events(void) pmu_check_apic(); + /* sanity check that the hardware exists or is emulated */ + if (!check_hw_exists()) { + pr_cont("Broken PMU hardware detected, software events only.\n"); + return; + } + pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.quirks) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 46d58448c3af..e421b8cd6944 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -280,11 +280,11 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) struct amd_nb *nb; int i; - nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); if (!nb) return NULL; - memset(nb, 0, sizeof(*nb)); nb->nb_id = nb_id; /* diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 4977f9c400e5..b7dcd9f2b8a0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static int alloc_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh = 1; /* always use a single PEBS record */ + void *buffer; + + if (!x86_pmu.pebs) + return 0; + + buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!buffer)) + return -ENOMEM; + + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + + ds->pebs_buffer_base = (u64)(unsigned long)buffer; + ds->pebs_index = ds->pebs_buffer_base; + ds->pebs_absolute_maximum = ds->pebs_buffer_base + + max * x86_pmu.pebs_record_size; + + ds->pebs_interrupt_threshold = ds->pebs_buffer_base + + thresh * x86_pmu.pebs_record_size; + + return 0; +} + +static void release_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.pebs) + return; + + kfree((void *)(unsigned long)ds->pebs_buffer_base); + ds->pebs_buffer_base = 0; +} + +static int alloc_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh; + void *buffer; + + if (!x86_pmu.bts) + return 0; + + buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!buffer)) + return -ENOMEM; + + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + thresh = max / 16; + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + thresh * BTS_RECORD_SIZE; + + return 0; +} + +static void release_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.bts) + return; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + ds->bts_buffer_base = 0; +} + +static int alloc_ds_buffer(int cpu) +{ + int node = cpu_to_node(cpu); + struct debug_store *ds; + + ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!ds)) + return -ENOMEM; + + per_cpu(cpu_hw_events, cpu).ds = ds; + + return 0; +} + +static void release_ds_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + kfree(ds); +} + static void release_ds_buffers(void) { int cpu; @@ -82,93 +183,77 @@ static void release_ds_buffers(void) return; get_online_cpus(); - for_each_online_cpu(cpu) fini_debug_store_on_cpu(cpu); for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - continue; - - per_cpu(cpu_hw_events, cpu).ds = NULL; - - kfree((void *)(unsigned long)ds->pebs_buffer_base); - kfree((void *)(unsigned long)ds->bts_buffer_base); - kfree(ds); + release_pebs_buffer(cpu); + release_bts_buffer(cpu); + release_ds_buffer(cpu); } - put_online_cpus(); } -static int reserve_ds_buffers(void) +static void reserve_ds_buffers(void) { - int cpu, err = 0; + int bts_err = 0, pebs_err = 0; + int cpu; + + x86_pmu.bts_active = 0; + x86_pmu.pebs_active = 0; if (!x86_pmu.bts && !x86_pmu.pebs) - return 0; + return; + + if (!x86_pmu.bts) + bts_err = 1; + + if (!x86_pmu.pebs) + pebs_err = 1; get_online_cpus(); for_each_possible_cpu(cpu) { - struct debug_store *ds; - void *buffer; - int max, thresh; + if (alloc_ds_buffer(cpu)) { + bts_err = 1; + pebs_err = 1; + } + + if (!bts_err && alloc_bts_buffer(cpu)) + bts_err = 1; - err = -ENOMEM; - ds = kzalloc(sizeof(*ds), GFP_KERNEL); - if (unlikely(!ds)) + if (!pebs_err && alloc_pebs_buffer(cpu)) + pebs_err = 1; + + if (bts_err && pebs_err) break; - per_cpu(cpu_hw_events, cpu).ds = ds; - - if (x86_pmu.bts) { - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - } + } - if (x86_pmu.pebs) { - buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; - ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - /* - * Always use single record PEBS - */ - ds->pebs_interrupt_threshold = ds->pebs_buffer_base + - x86_pmu.pebs_record_size; - } + if (bts_err) { + for_each_possible_cpu(cpu) + release_bts_buffer(cpu); + } - err = 0; + if (pebs_err) { + for_each_possible_cpu(cpu) + release_pebs_buffer(cpu); } - if (err) - release_ds_buffers(); - else { + if (bts_err && pebs_err) { + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + } else { + if (x86_pmu.bts && !bts_err) + x86_pmu.bts_active = 1; + + if (x86_pmu.pebs && !pebs_err) + x86_pmu.pebs_active = 1; + for_each_online_cpu(cpu) init_debug_store_on_cpu(cpu); } put_online_cpus(); - - return err; } /* @@ -233,7 +318,7 @@ static int intel_pmu_drain_bts_buffer(void) if (!event) return 0; - if (!ds) + if (!x86_pmu.bts_active) return 0; at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; @@ -503,7 +588,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) struct pebs_record_core *at, *top; int n; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs_active) return; at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; @@ -545,7 +630,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) u64 status = 0; int bit, n; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs_active) return; at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; @@ -630,9 +715,8 @@ static void intel_ds_init(void) #else /* CONFIG_CPU_SUP_INTEL */ -static int reserve_ds_buffers(void) +static void reserve_ds_buffers(void) { - return 0; } static void release_ds_buffers(void) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 1b7b31ab7d86..212a6a42527c 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -33,7 +33,6 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/device.h> diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 67414550c3cc..d5cd13945d5a 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + vaddr = kmap_atomic_pfn(pfn); if (!userbuf) { memcpy(buf, (vaddr + offset), csize); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 0f6376ffa2d9..1bc7f75a5bda 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -82,11 +82,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %08lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " %08lx", *stack++); touch_nmi_watchdog(); } - printk("\n"); + printk(KERN_CONT "\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 57a21f11c791..6a340485249a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -265,20 +265,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); - printk(" <EOI> "); + printk(KERN_CONT " <EOI> "); } } else { if (((long) stack & (THREAD_SIZE-1)) == 0) break; } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %016lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); - printk("\n"); + printk(KERN_CONT "\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c deleted file mode 100644 index 0fe27d7c6258..000000000000 --- a/arch/x86/kernel/efi.c +++ /dev/null @@ -1,613 +0,0 @@ -/* - * Common EFI (Extensible Firmware Interface) support functions - * Based on Extensible Firmware Interface Specification version 1.0 - * - * Copyright (C) 1999 VA Linux Systems - * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> - * Copyright (C) 1999-2002 Hewlett-Packard Co. - * David Mosberger-Tang <davidm@hpl.hp.com> - * Stephane Eranian <eranian@hpl.hp.com> - * Copyright (C) 2005-2008 Intel Co. - * Fenghua Yu <fenghua.yu@intel.com> - * Bibo Mao <bibo.mao@intel.com> - * Chandramouli Narayanan <mouli@linux.intel.com> - * Huang Ying <ying.huang@intel.com> - * - * Copied from efi_32.c to eliminate the duplicated code between EFI - * 32/64 support code. --ying 2007-10-26 - * - * All EFI Runtime Services are not implemented yet as EFI only - * supports physical mode addressing on SoftSDV. This is to be fixed - * in a future version. --drummond 1999-07-20 - * - * Implemented EFI runtime services and virtual mode calls. --davidm - * - * Goutham Rao: <goutham.rao@intel.com> - * Skip non-WB memory and ignore empty memory ranges. - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/efi.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/spinlock.h> -#include <linux/uaccess.h> -#include <linux/time.h> -#include <linux/io.h> -#include <linux/reboot.h> -#include <linux/bcd.h> - -#include <asm/setup.h> -#include <asm/efi.h> -#include <asm/time.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/x86_init.h> - -#define EFI_DEBUG 1 -#define PFX "EFI: " - -int efi_enabled; -EXPORT_SYMBOL(efi_enabled); - -struct efi efi; -EXPORT_SYMBOL(efi); - -struct efi_memory_map memmap; - -static struct efi efi_phys __initdata; -static efi_system_table_t efi_systab __initdata; - -static int __init setup_noefi(char *arg) -{ - efi_enabled = 0; - return 0; -} -early_param("noefi", setup_noefi); - -int add_efi_memmap; -EXPORT_SYMBOL(add_efi_memmap); - -static int __init setup_add_efi_memmap(char *arg) -{ - add_efi_memmap = 1; - return 0; -} -early_param("add_efi_memmap", setup_add_efi_memmap); - - -static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - return efi_call_virt2(get_time, tm, tc); -} - -static efi_status_t virt_efi_set_time(efi_time_t *tm) -{ - return efi_call_virt1(set_time, tm); -} - -static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, - efi_bool_t *pending, - efi_time_t *tm) -{ - return efi_call_virt3(get_wakeup_time, - enabled, pending, tm); -} - -static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) -{ - return efi_call_virt2(set_wakeup_time, - enabled, tm); -} - -static efi_status_t virt_efi_get_variable(efi_char16_t *name, - efi_guid_t *vendor, - u32 *attr, - unsigned long *data_size, - void *data) -{ - return efi_call_virt5(get_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) -{ - return efi_call_virt3(get_next_variable, - name_size, name, vendor); -} - -static efi_status_t virt_efi_set_variable(efi_char16_t *name, - efi_guid_t *vendor, - unsigned long attr, - unsigned long data_size, - void *data) -{ - return efi_call_virt5(set_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) -{ - return efi_call_virt1(get_next_high_mono_count, count); -} - -static void virt_efi_reset_system(int reset_type, - efi_status_t status, - unsigned long data_size, - efi_char16_t *data) -{ - efi_call_virt4(reset_system, reset_type, status, - data_size, data); -} - -static efi_status_t virt_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - return efi_call_virt4(set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); -} - -static efi_status_t __init phys_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys4(efi_phys.set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); - efi_call_phys_epilog(); - return status; -} - -static efi_status_t __init phys_efi_get_time(efi_time_t *tm, - efi_time_cap_t *tc) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys2(efi_phys.get_time, tm, tc); - efi_call_phys_epilog(); - return status; -} - -int efi_set_rtc_mmss(unsigned long nowtime) -{ - int real_seconds, real_minutes; - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - status = efi.get_time(&eft, &cap); - if (status != EFI_SUCCESS) { - printk(KERN_ERR "Oops: efitime: can't read time!\n"); - return -1; - } - - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - eft.minute) + 15)/30) & 1) - real_minutes += 30; - real_minutes %= 60; - eft.minute = real_minutes; - eft.second = real_seconds; - - status = efi.set_time(&eft); - if (status != EFI_SUCCESS) { - printk(KERN_ERR "Oops: efitime: can't write time!\n"); - return -1; - } - return 0; -} - -unsigned long efi_get_time(void) -{ - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - status = efi.get_time(&eft, &cap); - if (status != EFI_SUCCESS) - printk(KERN_ERR "Oops: efitime: can't read time!\n"); - - return mktime(eft.year, eft.month, eft.day, eft.hour, - eft.minute, eft.second); -} - -/* - * Tell the kernel about the EFI memory map. This might include - * more than the max 128 entries that can fit in the e820 legacy - * (zeropage) memory map. - */ - -static void __init do_add_efi_memmap(void) -{ - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; - int e820_type; - - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - if (md->attribute & EFI_MEMORY_WB) - e820_type = E820_RAM; - else - e820_type = E820_RESERVED; - break; - case EFI_ACPI_RECLAIM_MEMORY: - e820_type = E820_ACPI; - break; - case EFI_ACPI_MEMORY_NVS: - e820_type = E820_NVS; - break; - case EFI_UNUSABLE_MEMORY: - e820_type = E820_UNUSABLE; - break; - default: - /* - * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE - * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO - * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE - */ - e820_type = E820_RESERVED; - break; - } - e820_add_region(start, size, e820_type); - } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); -} - -void __init efi_memblock_x86_reserve_range(void) -{ - unsigned long pmap; - -#ifdef CONFIG_X86_32 - pmap = boot_params.efi_info.efi_memmap; -#else - pmap = (boot_params.efi_info.efi_memmap | - ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); -#endif - memmap.phys_map = (void *)pmap; - memmap.nr_map = boot_params.efi_info.efi_memmap_size / - boot_params.efi_info.efi_memdesc_size; - memmap.desc_version = boot_params.efi_info.efi_memdesc_version; - memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, - "EFI memmap"); -} - -#if EFI_DEBUG -static void __init print_efi_memmap(void) -{ - efi_memory_desc_t *md; - void *p; - int i; - - for (p = memmap.map, i = 0; - p < memmap.map_end; - p += memmap.desc_size, i++) { - md = p; - printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, " - "range=[0x%016llx-0x%016llx) (%lluMB)\n", - i, md->type, md->attribute, md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), - (md->num_pages >> (20 - EFI_PAGE_SHIFT))); - } -} -#endif /* EFI_DEBUG */ - -void __init efi_init(void) -{ - efi_config_table_t *config_tables; - efi_runtime_services_t *runtime; - efi_char16_t *c16; - char vendor[100] = "unknown"; - int i = 0; - void *tmp; - -#ifdef CONFIG_X86_32 - efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; -#else - efi_phys.systab = (efi_system_table_t *) - (boot_params.efi_info.efi_systab | - ((__u64)boot_params.efi_info.efi_systab_hi<<32)); -#endif - - efi.systab = early_ioremap((unsigned long)efi_phys.systab, - sizeof(efi_system_table_t)); - if (efi.systab == NULL) - printk(KERN_ERR "Couldn't map the EFI system table!\n"); - memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); - early_iounmap(efi.systab, sizeof(efi_system_table_t)); - efi.systab = &efi_systab; - - /* - * Verify the EFI Table - */ - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - printk(KERN_ERR "EFI system table signature incorrect!\n"); - if ((efi.systab->hdr.revision >> 16) == 0) - printk(KERN_ERR "Warning: EFI system table version " - "%d.%02d, expected 1.00 or greater!\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* - * Show what we know for posterity - */ - c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); - if (c16) { - for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) - vendor[i] = *c16++; - vendor[i] = '\0'; - } else - printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); - early_iounmap(tmp, 2); - - printk(KERN_INFO "EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - /* - * Let's see what config tables the firmware passed to us. - */ - config_tables = early_ioremap( - efi.systab->tables, - efi.systab->nr_tables * sizeof(efi_config_table_t)); - if (config_tables == NULL) - printk(KERN_ERR "Could not map EFI Configuration Table!\n"); - - printk(KERN_INFO); - for (i = 0; i < efi.systab->nr_tables; i++) { - if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) { - efi.mps = config_tables[i].table; - printk(" MPS=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - ACPI_20_TABLE_GUID)) { - efi.acpi20 = config_tables[i].table; - printk(" ACPI 2.0=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - ACPI_TABLE_GUID)) { - efi.acpi = config_tables[i].table; - printk(" ACPI=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - SMBIOS_TABLE_GUID)) { - efi.smbios = config_tables[i].table; - printk(" SMBIOS=0x%lx ", config_tables[i].table); -#ifdef CONFIG_X86_UV - } else if (!efi_guidcmp(config_tables[i].guid, - UV_SYSTEM_TABLE_GUID)) { - efi.uv_systab = config_tables[i].table; - printk(" UVsystab=0x%lx ", config_tables[i].table); -#endif - } else if (!efi_guidcmp(config_tables[i].guid, - HCDP_TABLE_GUID)) { - efi.hcdp = config_tables[i].table; - printk(" HCDP=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - UGA_IO_PROTOCOL_GUID)) { - efi.uga = config_tables[i].table; - printk(" UGA=0x%lx ", config_tables[i].table); - } - } - printk("\n"); - early_iounmap(config_tables, - efi.systab->nr_tables * sizeof(efi_config_table_t)); - - /* - * Check out the runtime services table. We need to map - * the runtime services table so that we can grab the physical - * address of several of the EFI runtime functions, needed to - * set the firmware into virtual mode. - */ - runtime = early_ioremap((unsigned long)efi.systab->runtime, - sizeof(efi_runtime_services_t)); - if (runtime != NULL) { - /* - * We will only need *early* access to the following - * two EFI runtime services before set_virtual_address_map - * is invoked. - */ - efi_phys.get_time = (efi_get_time_t *)runtime->get_time; - efi_phys.set_virtual_address_map = - (efi_set_virtual_address_map_t *) - runtime->set_virtual_address_map; - /* - * Make efi_get_time can be called before entering - * virtual mode. - */ - efi.get_time = phys_efi_get_time; - } else - printk(KERN_ERR "Could not map the EFI runtime service " - "table!\n"); - early_iounmap(runtime, sizeof(efi_runtime_services_t)); - - /* Map the EFI memory map */ - memmap.map = early_ioremap((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); - if (memmap.map == NULL) - printk(KERN_ERR "Could not map the EFI memory map!\n"); - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); - - if (memmap.desc_size != sizeof(efi_memory_desc_t)) - printk(KERN_WARNING - "Kernel-defined memdesc doesn't match the one from EFI!\n"); - - if (add_efi_memmap) - do_add_efi_memmap(); - -#ifdef CONFIG_X86_32 - x86_platform.get_wallclock = efi_get_time; - x86_platform.set_wallclock = efi_set_rtc_mmss; -#endif - - /* Setup for EFI runtime service */ - reboot_type = BOOT_EFI; - -#if EFI_DEBUG - print_efi_memmap(); -#endif -} - -static void __init runtime_code_page_mkexec(void) -{ - efi_memory_desc_t *md; - void *p; - u64 addr, npages; - - /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if (md->type != EFI_RUNTIME_SERVICES_CODE) - continue; - - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_x(addr, npages); - } -} - -/* - * This function will switch the EFI runtime services to virtual mode. - * Essentially, look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor and update - * that memory descriptor with the virtual address obtained from ioremap(). - * This enables the runtime services to be called without having to - * thunk back into physical mode for every invocation. - */ -void __init efi_enter_virtual_mode(void) -{ - efi_memory_desc_t *md; - efi_status_t status; - unsigned long size; - u64 end, systab, addr, npages, end_pfn; - void *p, *va; - - efi.systab = NULL; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - - size = md->num_pages << EFI_PAGE_SHIFT; - end = md->phys_addr + size; - - end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped - || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) - va = __va(md->phys_addr); - else - va = efi_ioremap(md->phys_addr, size, md->type); - - md->virt_addr = (u64) (unsigned long) va; - - if (!va) { - printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", - (unsigned long long)md->phys_addr); - continue; - } - - if (!(md->attribute & EFI_MEMORY_WB)) { - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_uc(addr, npages); - } - - systab = (u64) (unsigned long) efi_phys.systab; - if (md->phys_addr <= systab && systab < end) { - systab += md->virt_addr - md->phys_addr; - efi.systab = (efi_system_table_t *) (unsigned long) systab; - } - } - - BUG_ON(!efi.systab); - - status = phys_efi_set_virtual_address_map( - memmap.desc_size * memmap.nr_map, - memmap.desc_size, - memmap.desc_version, - memmap.phys_map); - - if (status != EFI_SUCCESS) { - printk(KERN_ALERT "Unable to switch EFI into virtual mode " - "(status=%lx)!\n", status); - panic("EFI call to SetVirtualAddressMap() failed!"); - } - - /* - * Now that EFI is in virtual mode, update the function - * pointers in the runtime service table to the new virtual addresses. - * - * Call EFI services through wrapper functions. - */ - efi.get_time = virt_efi_get_time; - efi.set_time = virt_efi_set_time; - efi.get_wakeup_time = virt_efi_get_wakeup_time; - efi.set_wakeup_time = virt_efi_set_wakeup_time; - efi.get_variable = virt_efi_get_variable; - efi.get_next_variable = virt_efi_get_next_variable; - efi.set_variable = virt_efi_set_variable; - efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; - efi.reset_system = virt_efi_reset_system; - efi.set_virtual_address_map = virt_efi_set_virtual_address_map; - if (__supported_pte_mask & _PAGE_NX) - runtime_code_page_mkexec(); - early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; -} - -/* - * Convenience functions to obtain memory types and attributes - */ -u32 efi_mem_type(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->type; - } - return 0; -} - -u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->attribute; - } - return 0; -} diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c deleted file mode 100644 index 5cab48ee61a4..000000000000 --- a/arch/x86/kernel/efi_32.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Extensible Firmware Interface - * - * Based on Extensible Firmware Interface Specification version 1.0 - * - * Copyright (C) 1999 VA Linux Systems - * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> - * Copyright (C) 1999-2002 Hewlett-Packard Co. - * David Mosberger-Tang <davidm@hpl.hp.com> - * Stephane Eranian <eranian@hpl.hp.com> - * - * All EFI Runtime Services are not implemented yet as EFI only - * supports physical mode addressing on SoftSDV. This is to be fixed - * in a future version. --drummond 1999-07-20 - * - * Implemented EFI runtime services and virtual mode calls. --davidm - * - * Goutham Rao: <goutham.rao@intel.com> - * Skip non-WB memory and ignore empty memory ranges. - */ - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/efi.h> - -#include <asm/io.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/efi.h> - -/* - * To make EFI call EFI runtime service in physical addressing mode we need - * prelog/epilog before/after the invocation to disable interrupt, to - * claim EFI runtime service handler exclusively and to duplicate a memory in - * low memory space say 0 - 3G. - */ - -static unsigned long efi_rt_eflags; -static pgd_t efi_bak_pg_dir_pointer[2]; - -void efi_call_phys_prelog(void) -{ - unsigned long cr4; - unsigned long temp; - struct desc_ptr gdt_descr; - - local_irq_save(efi_rt_eflags); - - /* - * If I don't have PAE, I should just duplicate two entries in page - * directory. If I have PAE, I just need to duplicate one entry in - * page directory. - */ - cr4 = read_cr4_safe(); - - if (cr4 & X86_CR4_PAE) { - efi_bak_pg_dir_pointer[0].pgd = - swapper_pg_dir[pgd_index(0)].pgd; - swapper_pg_dir[0].pgd = - swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; - } else { - efi_bak_pg_dir_pointer[0].pgd = - swapper_pg_dir[pgd_index(0)].pgd; - efi_bak_pg_dir_pointer[1].pgd = - swapper_pg_dir[pgd_index(0x400000)].pgd; - swapper_pg_dir[pgd_index(0)].pgd = - swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; - temp = PAGE_OFFSET + 0x400000; - swapper_pg_dir[pgd_index(0x400000)].pgd = - swapper_pg_dir[pgd_index(temp)].pgd; - } - - /* - * After the lock is released, the original page table is restored. - */ - __flush_tlb_all(); - - gdt_descr.address = __pa(get_cpu_gdt_table(0)); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); -} - -void efi_call_phys_epilog(void) -{ - unsigned long cr4; - struct desc_ptr gdt_descr; - - gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); - - cr4 = read_cr4_safe(); - - if (cr4 & X86_CR4_PAE) { - swapper_pg_dir[pgd_index(0)].pgd = - efi_bak_pg_dir_pointer[0].pgd; - } else { - swapper_pg_dir[pgd_index(0)].pgd = - efi_bak_pg_dir_pointer[0].pgd; - swapper_pg_dir[pgd_index(0x400000)].pgd = - efi_bak_pg_dir_pointer[1].pgd; - } - - /* - * After the lock is released, the original page table is restored. - */ - __flush_tlb_all(); - - local_irq_restore(efi_rt_eflags); -} diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c deleted file mode 100644 index ac0621a7ac3d..000000000000 --- a/arch/x86/kernel/efi_64.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * x86_64 specific EFI support functions - * Based on Extensible Firmware Interface Specification version 1.0 - * - * Copyright (C) 2005-2008 Intel Co. - * Fenghua Yu <fenghua.yu@intel.com> - * Bibo Mao <bibo.mao@intel.com> - * Chandramouli Narayanan <mouli@linux.intel.com> - * Huang Ying <ying.huang@intel.com> - * - * Code to convert EFI to E820 map has been implemented in elilo bootloader - * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table - * is setup appropriately for EFI runtime code. - * - mouli 06/14/2007. - * - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/bootmem.h> -#include <linux/ioport.h> -#include <linux/module.h> -#include <linux/efi.h> -#include <linux/uaccess.h> -#include <linux/io.h> -#include <linux/reboot.h> - -#include <asm/setup.h> -#include <asm/page.h> -#include <asm/e820.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm/efi.h> -#include <asm/cacheflush.h> -#include <asm/fixmap.h> - -static pgd_t save_pgd __initdata; -static unsigned long efi_flags __initdata; - -static void __init early_mapping_set_exec(unsigned long start, - unsigned long end, - int executable) -{ - unsigned long num_pages; - - start &= PMD_MASK; - end = (end + PMD_SIZE - 1) & PMD_MASK; - num_pages = (end - start) >> PAGE_SHIFT; - if (executable) - set_memory_x((unsigned long)__va(start), num_pages); - else - set_memory_nx((unsigned long)__va(start), num_pages); -} - -static void __init early_runtime_code_mapping_set_exec(int executable) -{ - efi_memory_desc_t *md; - void *p; - - if (!(__supported_pte_mask & _PAGE_NX)) - return; - - /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (md->type == EFI_RUNTIME_SERVICES_CODE) { - unsigned long end; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - early_mapping_set_exec(md->phys_addr, end, executable); - } - } -} - -void __init efi_call_phys_prelog(void) -{ - unsigned long vaddress; - - early_runtime_code_mapping_set_exec(1); - local_irq_save(efi_flags); - vaddress = (unsigned long)__va(0x0UL); - save_pgd = *pgd_offset_k(0x0UL); - set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); - __flush_tlb_all(); -} - -void __init efi_call_phys_epilog(void) -{ - /* - * After the lock is released, the original page table is restored. - */ - set_pgd(pgd_offset_k(0x0UL), save_pgd); - __flush_tlb_all(); - local_irq_restore(efi_flags); - early_runtime_code_mapping_set_exec(0); -} - -void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, - u32 type) -{ - unsigned long last_map_pfn; - - if (type == EFI_MEMORY_MAPPED_IO) - return ioremap(phys_addr, size); - - last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) - return NULL; - - return (void __iomem *)__va(phys_addr); -} diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S deleted file mode 100644 index fbe66e626c09..000000000000 --- a/arch/x86/kernel/efi_stub_32.S +++ /dev/null @@ -1,123 +0,0 @@ -/* - * EFI call stub for IA32. - * - * This stub allows us to make EFI calls in physical mode with interrupts - * turned off. - */ - -#include <linux/linkage.h> -#include <asm/page_types.h> - -/* - * efi_call_phys(void *, ...) is a function with variable parameters. - * All the callers of this function assure that all the parameters are 4-bytes. - */ - -/* - * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. - * So we'd better save all of them at the beginning of this function and restore - * at the end no matter how many we use, because we can not assure EFI runtime - * service functions will comply with gcc calling convention, too. - */ - -.text -ENTRY(efi_call_phys) - /* - * 0. The function can only be called in Linux kernel. So CS has been - * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found - * the values of these registers are the same. And, the corresponding - * GDT entries are identical. So I will do nothing about segment reg - * and GDT, but change GDT base register in prelog and epilog. - */ - - /* - * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET. - * But to make it smoothly switch from virtual mode to flat mode. - * The mapping of lower virtual memory has been created in prelog and - * epilog. - */ - movl $1f, %edx - subl $__PAGE_OFFSET, %edx - jmp *%edx -1: - - /* - * 2. Now on the top of stack is the return - * address in the caller of efi_call_phys(), then parameter 1, - * parameter 2, ..., param n. To make things easy, we save the return - * address of efi_call_phys in a global variable. - */ - popl %edx - movl %edx, saved_return_addr - /* get the function pointer into ECX*/ - popl %ecx - movl %ecx, efi_rt_function_ptr - movl $2f, %edx - subl $__PAGE_OFFSET, %edx - pushl %edx - - /* - * 3. Clear PG bit in %CR0. - */ - movl %cr0, %edx - andl $0x7fffffff, %edx - movl %edx, %cr0 - jmp 1f -1: - - /* - * 4. Adjust stack pointer. - */ - subl $__PAGE_OFFSET, %esp - - /* - * 5. Call the physical function. - */ - jmp *%ecx - -2: - /* - * 6. After EFI runtime service returns, control will return to - * following instruction. We'd better readjust stack pointer first. - */ - addl $__PAGE_OFFSET, %esp - - /* - * 7. Restore PG bit - */ - movl %cr0, %edx - orl $0x80000000, %edx - movl %edx, %cr0 - jmp 1f -1: - /* - * 8. Now restore the virtual mode from flat mode by - * adding EIP with PAGE_OFFSET. - */ - movl $1f, %edx - jmp *%edx -1: - - /* - * 9. Balance the stack. And because EAX contain the return value, - * we'd better not clobber it. - */ - leal efi_rt_function_ptr, %edx - movl (%edx), %ecx - pushl %ecx - - /* - * 10. Push the saved return address onto the stack and return. - */ - leal saved_return_addr, %edx - movl (%edx), %ecx - pushl %ecx - ret -ENDPROC(efi_call_phys) -.previous - -.data -saved_return_addr: - .long 0 -efi_rt_function_ptr: - .long 0 diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S deleted file mode 100644 index 4c07ccab8146..000000000000 --- a/arch/x86/kernel/efi_stub_64.S +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Function calling ABI conversion from Linux to EFI for x86_64 - * - * Copyright (C) 2007 Intel Corp - * Bibo Mao <bibo.mao@intel.com> - * Huang Ying <ying.huang@intel.com> - */ - -#include <linux/linkage.h> - -#define SAVE_XMM \ - mov %rsp, %rax; \ - subq $0x70, %rsp; \ - and $~0xf, %rsp; \ - mov %rax, (%rsp); \ - mov %cr0, %rax; \ - clts; \ - mov %rax, 0x8(%rsp); \ - movaps %xmm0, 0x60(%rsp); \ - movaps %xmm1, 0x50(%rsp); \ - movaps %xmm2, 0x40(%rsp); \ - movaps %xmm3, 0x30(%rsp); \ - movaps %xmm4, 0x20(%rsp); \ - movaps %xmm5, 0x10(%rsp) - -#define RESTORE_XMM \ - movaps 0x60(%rsp), %xmm0; \ - movaps 0x50(%rsp), %xmm1; \ - movaps 0x40(%rsp), %xmm2; \ - movaps 0x30(%rsp), %xmm3; \ - movaps 0x20(%rsp), %xmm4; \ - movaps 0x10(%rsp), %xmm5; \ - mov 0x8(%rsp), %rsi; \ - mov %rsi, %cr0; \ - mov (%rsp), %rsp - -ENTRY(efi_call0) - SAVE_XMM - subq $32, %rsp - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call0) - -ENTRY(efi_call1) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call1) - -ENTRY(efi_call2) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call2) - -ENTRY(efi_call3) - SAVE_XMM - subq $32, %rsp - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call3) - -ENTRY(efi_call4) - SAVE_XMM - subq $32, %rsp - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call4) - -ENTRY(efi_call5) - SAVE_XMM - subq $48, %rsp - mov %r9, 32(%rsp) - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $48, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call5) - -ENTRY(efi_call6) - SAVE_XMM - mov (%rsp), %rax - mov 8(%rax), %rax - subq $48, %rsp - mov %r9, 32(%rsp) - mov %rax, 40(%rsp) - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $48, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call6) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9fb188d7bc76..591e60104278 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -382,20 +382,20 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl_cfi $(__USER_DS) + pushl_cfi $__USER_DS /*CFI_REL_OFFSET ss, 0*/ pushl_cfi %ebp CFI_REL_OFFSET esp, 0 pushfl_cfi orl $X86_EFLAGS_IF, (%esp) - pushl_cfi $(__USER_CS) + pushl_cfi $__USER_CS /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a7ae7fd1010f..e3ba417e8697 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -295,6 +295,7 @@ ENDPROC(native_usergs_sysret64) .endm /* save partial stack frame */ + .pushsection .kprobes.text, "ax" ENTRY(save_args) XCPT_FRAME cld @@ -334,6 +335,7 @@ ENTRY(save_args) ret CFI_ENDPROC END(save_args) + .popsection ENTRY(save_rest) PARTIAL_FRAME 1 REST_SKIP+8 @@ -963,22 +965,10 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ - invalidate_interrupt0 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ - invalidate_interrupt1 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ - invalidate_interrupt2 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ - invalidate_interrupt3 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ - invalidate_interrupt4 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ - invalidate_interrupt5 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ - invalidate_interrupt6 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ - invalidate_interrupt7 smp_invalidate_interrupt +.irpc idx, "01234567" +apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ + invalidate_interrupt\idx smp_invalidate_interrupt +.endr #endif apicinterrupt THRESHOLD_APIC_VECTOR \ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 9a6ca2392170..763310165fa0 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -18,6 +18,7 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/bios_ebda.h> +#include <asm/tlbflush.h> static void __init i386_default_early_setup(void) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fa8c1b8e09fb..c0dbd9ac24f0 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -60,16 +60,18 @@ #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif +/* Number of possible pages in the lowmem region */ +LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) + /* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = \ - PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT +MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT /* * Worst-case size of the kernel mapping we need to make: - * the worst-case size of the kernel itself, plus the extra we need - * to map for the linear map. + * a relocatable kernel can live anywhere in lowmem, so we need to be able + * to map all of lowmem. */ -KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT +KERNEL_PAGES = LOWMEM_PAGES INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm RESERVE_BRK(pagetables, INIT_MAP_SIZE) @@ -183,13 +185,12 @@ default_entry: #ifdef CONFIG_X86_PAE /* - * In PAE mode swapper_pg_dir is statically defined to contain enough - * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD - * entries to the first kernel PMD. + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. * - * Note the upper half of each PMD or PTE are always zero at - * this stage. + * Note the upper half of each PMD or PTE are always zero at this stage. */ #define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ @@ -197,7 +198,7 @@ default_entry: xorl %ebx,%ebx /* %ebx is kept at zero */ movl $pa(__brk_base), %edi - movl $pa(swapper_pg_pmd), %edx + movl $pa(initial_pg_pmd), %edx movl $PTE_IDENT_ATTR, %eax 10: leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ @@ -226,14 +227,14 @@ default_entry: movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) #else /* Not PAE */ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(__brk_base), %edi - movl $pa(swapper_pg_dir), %edx + movl $pa(initial_page_table), %edx movl $PTE_IDENT_ATTR, %eax 10: leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ @@ -257,8 +258,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(swapper_pg_dir+0xffc) + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_page_table+0xffc) #endif jmp 3f /* @@ -334,7 +335,7 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl pa(initial_page_table), %eax + movl $pa(initial_page_table), %eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax orl $X86_CR0_PG,%eax @@ -614,8 +615,6 @@ ignore_int: .align 4 ENTRY(initial_code) .long i386_start_kernel -ENTRY(initial_page_table) - .long pa(swapper_pg_dir) /* * BSS section @@ -623,20 +622,18 @@ ENTRY(initial_page_table) __PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -swapper_pg_pmd: +ENTRY(initial_pg_pmd) .fill 1024*KPMDS,4,0 #else -ENTRY(swapper_pg_dir) +ENTRY(initial_page_table) .fill 1024,4,0 #endif -swapper_pg_fixmap: +ENTRY(initial_pg_fixmap) .fill 1024,4,0 -#ifdef CONFIG_X86_TRAMPOLINE -ENTRY(trampoline_pg_dir) - .fill 1024,4,0 -#endif ENTRY(empty_zero_page) .fill 4096,1,0 +ENTRY(swapper_pg_dir) + .fill 1024,4,0 /* * This starts the data section. @@ -645,20 +642,20 @@ ENTRY(empty_zero_page) __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm -ENTRY(swapper_pg_dir) - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ +ENTRY(initial_page_table) + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0 # elif KPMDS == 2 .long 0,0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 # elif KPMDS == 1 .long 0,0 .long 0,0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 # else # error "Kernel PMDs should be 1, 2 or 3" # endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index efaf906daf93..4ff5968f12d2 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -27,6 +27,9 @@ #define HPET_DEV_FSB_CAP 0x1000 #define HPET_DEV_PERI_CAP 0x2000 +#define HPET_MIN_CYCLES 128 +#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) + #define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) /* @@ -299,8 +302,9 @@ static void hpet_legacy_clockevent_register(void) /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); - /* 5 usec minimum reprogramming delta. */ - hpet_clockevent.min_delta_ns = 5000; + /* Setup minimum reprogramming delta. */ + hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA, + &hpet_clockevent); /* * Start hpet with the boot cpu mask and make it @@ -380,44 +384,37 @@ static int hpet_next_event(unsigned long delta, struct clock_event_device *evt, int timer) { u32 cnt; + s32 res; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; hpet_writel(cnt, HPET_Tn_CMP(timer)); /* - * We need to read back the CMP register on certain HPET - * implementations (ATI chipsets) which seem to delay the - * transfer of the compare register into the internal compare - * logic. With small deltas this might actually be too late as - * the counter could already be higher than the compare value - * at that point and we would wait for the next hpet interrupt - * forever. We found out that reading the CMP register back - * forces the transfer so we can rely on the comparison with - * the counter register below. If the read back from the - * compare register does not match the value we programmed - * then we might have a real hardware problem. We can not do - * much about it here, but at least alert the user/admin with - * a prominent warning. - * - * An erratum on some chipsets (ICH9,..), results in - * comparator read immediately following a write returning old - * value. Workaround for this is to read this value second - * time, when first read returns old value. - * - * In fact the write to the comparator register is delayed up - * to two HPET cycles so the workaround we tried to restrict - * the readback to those known to be borked ATI chipsets - * failed miserably. So we give up on optimizations forever - * and penalize all HPET incarnations unconditionally. + * HPETs are a complete disaster. The compare register is + * based on a equal comparison and neither provides a less + * than or equal functionality (which would require to take + * the wraparound into account) nor a simple count down event + * mode. Further the write to the comparator register is + * delayed internally up to two HPET clock cycles in certain + * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even + * longer delays. We worked around that by reading back the + * compare register, but that required another workaround for + * ICH9,10 chips where the first readout after write can + * return the old stale value. We already had a minimum + * programming delta of 5us enforced, but a NMI or SMI hitting + * between the counter readout and the comparator write can + * move us behind that point easily. Now instead of reading + * the compare register back several times, we make the ETIME + * decision based on the following: Return ETIME if the + * counter value after the write is less than HPET_MIN_CYCLES + * away from the event or if the counter is already ahead of + * the event. The minimum programming delta for the generic + * clockevents code is set to 1.5 * HPET_MIN_CYCLES. */ - if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { - if (hpet_readl(HPET_Tn_CMP(timer)) != cnt) - printk_once(KERN_WARNING - "hpet: compare register read back failed.\n"); - } + res = (s32)(cnt - hpet_readl(HPET_COUNTER)); - return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; + return res < HPET_MIN_CYCLES ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -722,7 +719,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_ONLINE: - INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index ff15c9dcc25d..42c594254507 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; + /* If it's a single step, TRAP bits are random */ + if (dr6 & DR_STEP) + return NOTIFY_DONE; + /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 10709f29d166..96656f207751 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -17,6 +17,7 @@ #include <linux/delay.h> #include <linux/uaccess.h> #include <linux/percpu.h> +#include <linux/mm.h> #include <asm/apic.h> @@ -49,21 +50,17 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -#ifdef CONFIG_4KSTACKS /* * per-CPU IRQ handling contexts (thread information and stack) */ union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -} __attribute__((aligned(PAGE_SIZE))); +} __attribute__((aligned(THREAD_SIZE))); static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); - static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -129,7 +126,9 @@ void __cpuinit irq_ctx_init(int cpu) if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = &per_cpu(hardirq_stack, cpu); + irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + THREAD_FLAGS, + THREAD_ORDER)); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; @@ -138,7 +137,9 @@ void __cpuinit irq_ctx_init(int cpu) per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = &per_cpu(softirq_stack, cpu); + irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + THREAD_FLAGS, + THREAD_ORDER)); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; @@ -151,11 +152,6 @@ void __cpuinit irq_ctx_init(int cpu) cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } -void irq_ctx_exit(int cpu) -{ - per_cpu(hardirq_ctx, cpu) = NULL; -} - asmlinkage void do_softirq(void) { unsigned long flags; @@ -187,11 +183,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } -#else -static inline int -execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } -#endif - bool handle_irq(unsigned irq, struct pt_regs *regs) { struct irq_desc *desc; diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 8afd9f321f10..90fcf62854bb 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file) static const struct file_operations fops_setup_data = { .read = setup_data_read, .open = setup_data_open, + .llseek = default_llseek, }; static int __init diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 852b81967a37..cd21b654dec6 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -315,14 +315,18 @@ static void kgdb_remove_all_hw_break(void) if (!breakinfo[i].enabled) continue; bp = *per_cpu_ptr(breakinfo[i].pev, cpu); - if (bp->attr.disabled == 1) + if (!bp->attr.disabled) { + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; continue; + } if (dbg_is_early) early_dr7 &= ~encode_dr7(i, breakinfo[i].len, breakinfo[i].type); - else - arch_uninstall_hw_breakpoint(bp); - bp->attr.disabled = 1; + else if (hw_break_release_slot(i)) + printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n", + breakinfo[i].addr); + breakinfo[i].enabled = 0; } } @@ -387,7 +391,7 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) * disable hardware debugging while it is processing gdb packets or * handling exception. */ -void kgdb_disable_hw_debug(struct pt_regs *regs) +static void kgdb_disable_hw_debug(struct pt_regs *regs) { int i; int cpu = raw_smp_processor_id(); @@ -477,8 +481,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, raw_smp_processor_id()); } - kgdb_correct_hw_break(); - return 0; } @@ -621,7 +623,12 @@ int kgdb_arch_init(void) static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); + struct task_struct *tsk = current; + int i; + + for (i = 0; i < 4; i++) + if (breakinfo[i].enabled) + tsk->thread.debugreg6 |= (DR_TRAP0 << i); } void kgdb_arch_late(void) @@ -644,7 +651,7 @@ void kgdb_arch_late(void) if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); - if (IS_ERR(breakinfo[i].pev)) { + if (IS_ERR((void * __force)breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); breakinfo[i].pev = NULL; @@ -721,6 +728,7 @@ struct kgdb_arch arch_kgdb_ops = { .flags = KGDB_HW_BREAKPOINT, .set_hw_breakpoint = kgdb_set_hw_break, .remove_hw_breakpoint = kgdb_remove_hw_break, + .disable_hw_break = kgdb_disable_hw_debug, .remove_all_hw_break = kgdb_remove_all_hw_break, .correct_hw_break = kgdb_correct_hw_break, }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index eb9b76c716c2..ca43ce31a19c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -128,13 +128,15 @@ static struct clocksource kvm_clock = { static int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); - int low, high; + int low, high, ret; + low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); - return native_write_msr_safe(msr_kvm_system_time, low, high); + return ret; } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e1af7c055c7d..ce0cb4721c9a 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -212,7 +212,7 @@ static int install_equiv_cpu_table(const u8 *buf) return 0; } - equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); + equiv_cpu_table = vmalloc(size); if (!equiv_cpu_table) { pr_err("failed to allocate equivalent CPU table\n"); return 0; diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fa6551d36c10..1cca374a2bac 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -12,7 +12,7 @@ * Software Developer's Manual * Order Number 253668 or free download from: * - * http://developer.intel.com/design/pentium4/manuals/253668.htm + * http://developer.intel.com/Assets/PDF/manual/253668.pdf * * For more information, go to http://www.urbanmyth.org/microcode * @@ -232,6 +232,7 @@ static const struct file_operations microcode_fops = { .owner = THIS_MODULE, .write = microcode_write, .open = microcode_open, + .llseek = no_llseek, }; static struct miscdevice microcode_dev = { diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 356170262a93..1a1b606d3e92 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -12,7 +12,7 @@ * Software Developer's Manual * Order Number 253668 or free download from: * - * http://developer.intel.com/design/pentium4/manuals/253668.htm + * http://developer.intel.com/Assets/PDF/manual/253668.pdf * * For more information, go to http://www.urbanmyth.org/microcode * @@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, /* For performance reasons, reuse mc area when possible */ if (!mc || mc_size > curr_mc_size) { - if (mc) - vfree(mc); + vfree(mc); mc = vmalloc(mc_size); if (!mc) break; @@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, if (get_ucode_data(mc, ucode_ptr, mc_size) || microcode_sanity_check(mc) < 0) { - vfree(mc); break; } if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; mc = NULL; /* trigger new vmalloc */ @@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (mc) - vfree(mc); + vfree(mc); if (leftover) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); state = UCODE_ERROR; goto out; } @@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, goto out; } - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 71825806cd44..ac861b8348e2 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -25,7 +25,6 @@ struct pci_hostbridge_probe { }; static u64 __cpuinitdata fam10h_pci_mmconf_base; -static int __cpuinitdata fam10h_pci_mmconf_base_status; static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, @@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2) return start1 - start2; } -/*[47:0] */ -/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ +#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT) +#define MMCONF_MASK (~(MMCONF_UNIT - 1)) +#define MMCONF_SIZE (MMCONF_UNIT << 8) +/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */ #define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) -#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) +#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) static void __cpuinit get_fam10h_pci_mmconf_base(void) { int i; @@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) struct range range[8]; /* only try to get setting from BSP */ - /* -1 or 1 */ - if (fam10h_pci_mmconf_base_status) + if (fam10h_pci_mmconf_base) return; if (!early_pci_allowed()) - goto fail; + return; found = 0; for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { @@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) } if (!found) - goto fail; + return; /* SYS_CFG */ address = MSR_K8_SYSCFG; @@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) /* TOP_MEM2 is not enabled? */ if (!(val & (1<<21))) { - tom2 = 0; + tom2 = 1ULL << 32; } else { /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; rdmsrl(address, val); - tom2 = val & (0xffffULL<<32); + tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); } if (base <= tom2) - base = tom2 + (1ULL<<32); + base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK; /* * need to check if the range is in the high mmio range that is @@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) if (!(reg & 3)) continue; - start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/ reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); - end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/ - if (!end) + if (end < tom2) continue; range[hi_mmio_num].start = start; @@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) if (range[hi_mmio_num - 1].end < base) goto out; - if (range[0].start > base) + if (range[0].start > base + MMCONF_SIZE) goto out; /* need to find one window */ - base = range[0].start - (1ULL << 32); + base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT; if ((base > tom2) && BASE_VALID(base)) goto out; - base = range[hi_mmio_num - 1].end + (1ULL << 32); - if ((base > tom2) && BASE_VALID(base)) + base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK; + if (BASE_VALID(base)) goto out; /* need to find window between ranges */ - if (hi_mmio_num > 1) - for (i = 0; i < hi_mmio_num - 1; i++) { - if (range[i + 1].start > (range[i].end + (1ULL << 32))) { - base = range[i].end + (1ULL << 32); - if ((base > tom2) && BASE_VALID(base)) - goto out; - } + for (i = 1; i < hi_mmio_num; i++) { + base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK; + val = range[i].start & MMCONF_MASK; + if (val >= base + MMCONF_SIZE && BASE_VALID(base)) + goto out; } - -fail: - fam10h_pci_mmconf_base_status = -1; return; + out: fam10h_pci_mmconf_base = base; - fam10h_pci_mmconf_base_status = 1; } void __cpuinit fam10h_check_enable_mmcfg(void) @@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void) /* only trust the one handle 256 buses, if acpi=off */ if (!acpi_pci_disabled || busnbits >= 8) { - u64 base; - base = val & (0xffffULL << 32); - if (fam10h_pci_mmconf_base_status <= 0) { + u64 base = val & MMCONF_MASK; + + if (!fam10h_pci_mmconf_base) { fam10h_pci_mmconf_base = base; - fam10h_pci_mmconf_base_status = 1; return; } else if (fam10h_pci_mmconf_base == base) return; @@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void) * with 256 buses */ get_fam10h_pci_mmconf_base(); - if (fam10h_pci_mmconf_base_status <= 0) + if (!fam10h_pci_mmconf_base) { + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; return; + } printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | @@ -217,13 +213,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void) wrmsrl(address, val); } -static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) +static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) { pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; return 0; } -static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { +static const struct dmi_system_id __initconst mmconf_dmi_table[] = { { .callback = set_check_enable_amd_mmconf, .ident = "Sun Microsystems Machine", @@ -234,7 +230,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { {} }; -void __cpuinit check_enable_amd_mmconf_dmi(void) +/* Called from a __cpuinit function, but only on the BSP. */ +void __ref check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); } diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c deleted file mode 100644 index 79ae68154e87..000000000000 --- a/arch/x86/kernel/mrst.c +++ /dev/null @@ -1,311 +0,0 @@ -/* - * mrst.c: Intel Moorestown platform specific setup code - * - * (C) Copyright 2008 Intel Corporation - * Author: Jacob Pan (jacob.jun.pan@intel.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sfi.h> -#include <linux/irq.h> -#include <linux/module.h> - -#include <asm/setup.h> -#include <asm/mpspec_def.h> -#include <asm/hw_irq.h> -#include <asm/apic.h> -#include <asm/io_apic.h> -#include <asm/mrst.h> -#include <asm/io.h> -#include <asm/i8259.h> -#include <asm/apb_timer.h> - -/* - * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, - * cmdline option x86_mrst_timer can be used to override the configuration - * to prefer one or the other. - * at runtime, there are basically three timer configurations: - * 1. per cpu apbt clock only - * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only - * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. - * - * by default (without cmdline option), platform code first detects cpu type - * to see if we are on lincroft or penwell, then set up both lapic or apbt - * clocks accordingly. - * i.e. by default, medfield uses configuration #2, moorestown uses #1. - * config #3 is supported but not recommended on medfield. - * - * rating and feature summary: - * lapic (with C3STOP) --------- 100 - * apbt (always-on) ------------ 110 - * lapic (always-on,ARAT) ------ 150 - */ - -__cpuinitdata enum mrst_timer_options mrst_timer_options; - -static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; -static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -enum mrst_cpu_type __mrst_cpu_chip; -EXPORT_SYMBOL_GPL(__mrst_cpu_chip); - -int sfi_mtimer_num; - -struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; -EXPORT_SYMBOL_GPL(sfi_mrtc_array); -int sfi_mrtc_num; - -static inline void assign_to_mp_irq(struct mpc_intsrc *m, - struct mpc_intsrc *mp_irq) -{ - memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); -} - -static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, - struct mpc_intsrc *m) -{ - return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); -} - -static void save_mp_irq(struct mpc_intsrc *m) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - if (!mp_irq_cmp(&mp_irqs[i], m)) - return; - } - - assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} - -/* parse all the mtimer info to a static mtimer array */ -static int __init sfi_parse_mtmr(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_timer_table_entry *pentry; - struct mpc_intsrc mp_irq; - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mtimer_num) { - sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_timer_table_entry); - pentry = (struct sfi_timer_table_entry *) sb->pentry; - totallen = sfi_mtimer_num * sizeof(*pentry); - memcpy(sfi_mtimer_array, pentry, totallen); - } - - printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); - pentry = sfi_mtimer_array; - for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { - printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," - " irq = %d\n", totallen, (u32)pentry->phys_addr, - pentry->freq_hz, pentry->irq); - if (!pentry->irq) - continue; - mp_irq.type = MP_IOAPIC; - mp_irq.irqtype = mp_INT; -/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ - mp_irq.irqflag = 5; - mp_irq.srcbus = 0; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - save_mp_irq(&mp_irq); - } - - return 0; -} - -struct sfi_timer_table_entry *sfi_get_mtmr(int hint) -{ - int i; - if (hint < sfi_mtimer_num) { - if (!sfi_mtimer_usage[hint]) { - pr_debug("hint taken for timer %d irq %d\n",\ - hint, sfi_mtimer_array[hint].irq); - sfi_mtimer_usage[hint] = 1; - return &sfi_mtimer_array[hint]; - } - } - /* take the first timer available */ - for (i = 0; i < sfi_mtimer_num;) { - if (!sfi_mtimer_usage[i]) { - sfi_mtimer_usage[i] = 1; - return &sfi_mtimer_array[i]; - } - i++; - } - return NULL; -} - -void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) -{ - int i; - for (i = 0; i < sfi_mtimer_num;) { - if (mtmr->irq == sfi_mtimer_array[i].irq) { - sfi_mtimer_usage[i] = 0; - return; - } - i++; - } -} - -/* parse all the mrtc info to a global mrtc array */ -int __init sfi_parse_mrtc(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_rtc_table_entry *pentry; - struct mpc_intsrc mp_irq; - - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mrtc_num) { - sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_rtc_table_entry); - pentry = (struct sfi_rtc_table_entry *)sb->pentry; - totallen = sfi_mrtc_num * sizeof(*pentry); - memcpy(sfi_mrtc_array, pentry, totallen); - } - - printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); - pentry = sfi_mrtc_array; - for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { - printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", - totallen, (u32)pentry->phys_addr, pentry->irq); - mp_irq.type = MP_IOAPIC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = 0; - mp_irq.srcbus = 0; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - save_mp_irq(&mp_irq); - } - return 0; -} - -static unsigned long __init mrst_calibrate_tsc(void) -{ - unsigned long flags, fast_calibrate; - - local_irq_save(flags); - fast_calibrate = apbt_quick_calibrate(); - local_irq_restore(flags); - - if (fast_calibrate) - return fast_calibrate; - - return 0; -} - -void __init mrst_time_init(void) -{ - switch (mrst_timer_options) { - case MRST_TIMER_APBT_ONLY: - break; - case MRST_TIMER_LAPIC_APBT: - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - break; - default: - if (!boot_cpu_has(X86_FEATURE_ARAT)) - break; - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - return; - } - /* we need at least one APB timer */ - sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); - pre_init_apic_IRQ0(); - apbt_time_init(); -} - -void __init mrst_rtc_init(void) -{ - sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); -} - -void __cpuinit mrst_arch_setup(void) -{ - if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) - __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; - else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; - else { - pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", - boot_cpu_data.x86, boot_cpu_data.x86_model); - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; - } - pr_debug("Moorestown CPU %s identified\n", - (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? - "Lincroft" : "Penwell"); -} - -/* MID systems don't have i8042 controller */ -static int mrst_i8042_detect(void) -{ - return 0; -} - -/* - * Moorestown specific x86_init function overrides and early setup - * calls. - */ -void __init x86_mrst_early_setup(void) -{ - x86_init.resources.probe_roms = x86_init_noop; - x86_init.resources.reserve_resources = x86_init_noop; - - x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = x86_init_noop; - - x86_init.irqs.pre_vector_init = x86_init_noop; - - x86_init.oem.arch_setup = mrst_arch_setup; - - x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; - - x86_platform.calibrate_tsc = mrst_calibrate_tsc; - x86_platform.i8042_detect = mrst_i8042_detect; - x86_init.pci.init = pci_mrst_init; - x86_init.pci.fixup_irqs = x86_init_noop; - - legacy_pic = &null_legacy_pic; - - /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - -} - -/* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - mrst_timer_options = MRST_TIMER_APBT_ONLY; - else if (strcmp("lapic_and_apbt", arg) == 0) - mrst_timer_options = MRST_TIMER_LAPIC_APBT; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7bf2dc4c8f70..12fcbe2c143e 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -30,7 +30,6 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/device.h> diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c deleted file mode 100644 index f5442c03abc3..000000000000 --- a/arch/x86/kernel/olpc-xo1.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Support for features of the OLPC XO-1 laptop - * - * Copyright (C) 2010 One Laptop per Child - * Copyright (C) 2006 Red Hat, Inc. - * Copyright (C) 2006 Advanced Micro Devices, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/pci.h> -#include <linux/pci_ids.h> -#include <linux/platform_device.h> -#include <linux/pm.h> - -#include <asm/io.h> -#include <asm/olpc.h> - -#define DRV_NAME "olpc-xo1" - -#define PMS_BAR 4 -#define ACPI_BAR 5 - -/* PMC registers (PMS block) */ -#define PM_SCLK 0x10 -#define PM_IN_SLPCTL 0x20 -#define PM_WKXD 0x34 -#define PM_WKD 0x30 -#define PM_SSC 0x54 - -/* PM registers (ACPI block) */ -#define PM1_CNT 0x08 -#define PM_GPE0_STS 0x18 - -static unsigned long acpi_base; -static unsigned long pms_base; - -static void xo1_power_off(void) -{ - printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); - - /* Enable all of these controls with 0 delay */ - outl(0x40000000, pms_base + PM_SCLK); - outl(0x40000000, pms_base + PM_IN_SLPCTL); - outl(0x40000000, pms_base + PM_WKXD); - outl(0x40000000, pms_base + PM_WKD); - - /* Clear status bits (possibly unnecessary) */ - outl(0x0002ffff, pms_base + PM_SSC); - outl(0xffffffff, acpi_base + PM_GPE0_STS); - - /* Write SLP_EN bit to start the machinery */ - outl(0x00002000, acpi_base + PM1_CNT); -} - -/* Read the base addresses from the PCI BAR info */ -static int __devinit setup_bases(struct pci_dev *pdev) -{ - int r; - - r = pci_enable_device_io(pdev); - if (r) { - dev_err(&pdev->dev, "can't enable device IO\n"); - return r; - } - - r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); - if (r) { - dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); - return r; - } - - r = pci_request_region(pdev, PMS_BAR, DRV_NAME); - if (r) { - dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); - pci_release_region(pdev, ACPI_BAR); - return r; - } - - acpi_base = pci_resource_start(pdev, ACPI_BAR); - pms_base = pci_resource_start(pdev, PMS_BAR); - - return 0; -} - -static int __devinit olpc_xo1_probe(struct platform_device *pdev) -{ - struct pci_dev *pcidev; - int r; - - pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, - NULL); - if (!pdev) - return -ENODEV; - - r = setup_bases(pcidev); - if (r) - return r; - - pm_power_off = xo1_power_off; - - printk(KERN_INFO "OLPC XO-1 support registered\n"); - return 0; -} - -static int __devexit olpc_xo1_remove(struct platform_device *pdev) -{ - pm_power_off = NULL; - return 0; -} - -static struct platform_driver olpc_xo1_driver = { - .driver = { - .name = DRV_NAME, - .owner = THIS_MODULE, - }, - .probe = olpc_xo1_probe, - .remove = __devexit_p(olpc_xo1_remove), -}; - -static int __init olpc_xo1_init(void) -{ - return platform_driver_register(&olpc_xo1_driver); -} - -static void __exit olpc_xo1_exit(void) -{ - platform_driver_unregister(&olpc_xo1_driver); -} - -MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:olpc-xo1"); - -module_init(olpc_xo1_init); -module_exit(olpc_xo1_exit); diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c deleted file mode 100644 index edaf3fe8dc5e..000000000000 --- a/arch/x86/kernel/olpc.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Support for the OLPC DCON and OLPC EC access - * - * Copyright © 2006 Advanced Micro Devices, Inc. - * Copyright © 2007-2008 Andres Salomon <dilinger@debian.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/io.h> -#include <linux/string.h> -#include <linux/platform_device.h> - -#include <asm/geode.h> -#include <asm/setup.h> -#include <asm/olpc.h> -#include <asm/olpc_ofw.h> - -struct olpc_platform_t olpc_platform_info; -EXPORT_SYMBOL_GPL(olpc_platform_info); - -static DEFINE_SPINLOCK(ec_lock); - -/* what the timeout *should* be (in ms) */ -#define EC_BASE_TIMEOUT 20 - -/* the timeout that bugs in the EC might force us to actually use */ -static int ec_timeout = EC_BASE_TIMEOUT; - -static int __init olpc_ec_timeout_set(char *str) -{ - if (get_option(&str, &ec_timeout) != 1) { - ec_timeout = EC_BASE_TIMEOUT; - printk(KERN_ERR "olpc-ec: invalid argument to " - "'olpc_ec_timeout=', ignoring!\n"); - } - printk(KERN_DEBUG "olpc-ec: using %d ms delay for EC commands.\n", - ec_timeout); - return 1; -} -__setup("olpc_ec_timeout=", olpc_ec_timeout_set); - -/* - * These {i,o}bf_status functions return whether the buffers are full or not. - */ - -static inline unsigned int ibf_status(unsigned int port) -{ - return !!(inb(port) & 0x02); -} - -static inline unsigned int obf_status(unsigned int port) -{ - return inb(port) & 0x01; -} - -#define wait_on_ibf(p, d) __wait_on_ibf(__LINE__, (p), (d)) -static int __wait_on_ibf(unsigned int line, unsigned int port, int desired) -{ - unsigned int timeo; - int state = ibf_status(port); - - for (timeo = ec_timeout; state != desired && timeo; timeo--) { - mdelay(1); - state = ibf_status(port); - } - - if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) && - timeo < (ec_timeout - EC_BASE_TIMEOUT)) { - printk(KERN_WARNING "olpc-ec: %d: waited %u ms for IBF!\n", - line, ec_timeout - timeo); - } - - return !(state == desired); -} - -#define wait_on_obf(p, d) __wait_on_obf(__LINE__, (p), (d)) -static int __wait_on_obf(unsigned int line, unsigned int port, int desired) -{ - unsigned int timeo; - int state = obf_status(port); - - for (timeo = ec_timeout; state != desired && timeo; timeo--) { - mdelay(1); - state = obf_status(port); - } - - if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) && - timeo < (ec_timeout - EC_BASE_TIMEOUT)) { - printk(KERN_WARNING "olpc-ec: %d: waited %u ms for OBF!\n", - line, ec_timeout - timeo); - } - - return !(state == desired); -} - -/* - * This allows the kernel to run Embedded Controller commands. The EC is - * documented at <http://wiki.laptop.org/go/Embedded_controller>, and the - * available EC commands are here: - * <http://wiki.laptop.org/go/Ec_specification>. Unfortunately, while - * OpenFirmware's source is available, the EC's is not. - */ -int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, - unsigned char *outbuf, size_t outlen) -{ - unsigned long flags; - int ret = -EIO; - int i; - int restarts = 0; - - spin_lock_irqsave(&ec_lock, flags); - - /* Clear OBF */ - for (i = 0; i < 10 && (obf_status(0x6c) == 1); i++) - inb(0x68); - if (i == 10) { - printk(KERN_ERR "olpc-ec: timeout while attempting to " - "clear OBF flag!\n"); - goto err; - } - - if (wait_on_ibf(0x6c, 0)) { - printk(KERN_ERR "olpc-ec: timeout waiting for EC to " - "quiesce!\n"); - goto err; - } - -restart: - /* - * Note that if we time out during any IBF checks, that's a failure; - * we have to return. There's no way for the kernel to clear that. - * - * If we time out during an OBF check, we can restart the command; - * reissuing it will clear the OBF flag, and we should be alright. - * The OBF flag will sometimes misbehave due to what we believe - * is a hardware quirk.. - */ - pr_devel("olpc-ec: running cmd 0x%x\n", cmd); - outb(cmd, 0x6c); - - if (wait_on_ibf(0x6c, 0)) { - printk(KERN_ERR "olpc-ec: timeout waiting for EC to read " - "command!\n"); - goto err; - } - - if (inbuf && inlen) { - /* write data to EC */ - for (i = 0; i < inlen; i++) { - if (wait_on_ibf(0x6c, 0)) { - printk(KERN_ERR "olpc-ec: timeout waiting for" - " EC accept data!\n"); - goto err; - } - pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); - outb(inbuf[i], 0x68); - } - } - if (outbuf && outlen) { - /* read data from EC */ - for (i = 0; i < outlen; i++) { - if (wait_on_obf(0x6c, 1)) { - printk(KERN_ERR "olpc-ec: timeout waiting for" - " EC to provide data!\n"); - if (restarts++ < 10) - goto restart; - goto err; - } - outbuf[i] = inb(0x68); - pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); - } - } - - ret = 0; -err: - spin_unlock_irqrestore(&ec_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(olpc_ec_cmd); - -static bool __init check_ofw_architecture(void) -{ - size_t propsize; - char olpc_arch[5]; - const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res)) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return false; - } - return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; -} - -static u32 __init get_board_revision(void) -{ - size_t propsize; - __be32 rev; - const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res) || propsize != 4) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return cpu_to_be32(0); - } - return be32_to_cpu(rev); -} - -static bool __init platform_detect(void) -{ - if (!check_ofw_architecture()) - return false; - olpc_platform_info.flags |= OLPC_F_PRESENT; - olpc_platform_info.boardrev = get_board_revision(); - return true; -} - -static int __init add_xo1_platform_devices(void) -{ - struct platform_device *pdev; - - pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - return 0; -} - -static int __init olpc_init(void) -{ - int r = 0; - - if (!olpc_ofw_present() || !platform_detect()) - return 0; - - spin_lock_init(&ec_lock); - - /* assume B1 and above models always have a DCON */ - if (olpc_board_at_least(olpc_board(0xb1))) - olpc_platform_info.flags |= OLPC_F_DCON; - - /* get the EC revision */ - olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, - (unsigned char *) &olpc_platform_info.ecver, 1); - -#ifdef CONFIG_PCI_OLPC - /* If the VSA exists let it emulate PCI, if not emulate in kernel. - * XO-1 only. */ - if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && - !cs5535_has_vsa2()) - x86_init.pci.arch_init = pci_olpc_init; -#endif - - printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", - ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", - olpc_platform_info.boardrev >> 4, - olpc_platform_info.ecver); - - if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */ - r = add_xo1_platform_devices(); - if (r) - return r; - } - - return 0; -} - -postcore_initcall(olpc_init); diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c deleted file mode 100644 index 787320464379..000000000000 --- a/arch/x86/kernel/olpc_ofw.c +++ /dev/null @@ -1,112 +0,0 @@ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <asm/page.h> -#include <asm/setup.h> -#include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/olpc_ofw.h> - -/* address of OFW callback interface; will be NULL if OFW isn't found */ -static int (*olpc_ofw_cif)(int *); - -/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ -u32 olpc_ofw_pgd __initdata; - -static DEFINE_SPINLOCK(ofw_lock); - -#define MAXARGS 10 - -void __init setup_olpc_ofw_pgd(void) -{ - pgd_t *base, *ofw_pde; - - if (!olpc_ofw_cif) - return; - - /* fetch OFW's PDE */ - base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); - if (!base) { - printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); - olpc_ofw_cif = NULL; - return; - } - ofw_pde = &base[OLPC_OFW_PDE_NR]; - - /* install OFW's PDE permanently into the kernel's pgtable */ - set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); - /* implicit optimization barrier here due to uninline function return */ - - early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); -} - -int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, - void **res) -{ - int ofw_args[MAXARGS + 3]; - unsigned long flags; - int ret, i, *p; - - BUG_ON(nr_args + nr_res > MAXARGS); - - if (!olpc_ofw_cif) - return -EIO; - - ofw_args[0] = (int)name; - ofw_args[1] = nr_args; - ofw_args[2] = nr_res; - - p = &ofw_args[3]; - for (i = 0; i < nr_args; i++, p++) - *p = (int)args[i]; - - /* call into ofw */ - spin_lock_irqsave(&ofw_lock, flags); - ret = olpc_ofw_cif(ofw_args); - spin_unlock_irqrestore(&ofw_lock, flags); - - if (!ret) { - for (i = 0; i < nr_res; i++, p++) - *((int *)res[i]) = *p; - } - - return ret; -} -EXPORT_SYMBOL_GPL(__olpc_ofw); - -bool olpc_ofw_present(void) -{ - return olpc_ofw_cif != NULL; -} -EXPORT_SYMBOL_GPL(olpc_ofw_present); - -/* OFW cif _should_ be above this address */ -#define OFW_MIN 0xff000000 - -/* OFW starts on a 1MB boundary */ -#define OFW_BOUND (1<<20) - -void __init olpc_ofw_detect(void) -{ - struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; - unsigned long start; - - /* ensure OFW booted us by checking for "OFW " string */ - if (hdr->ofw_magic != OLPC_OFW_SIG) - return; - - olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; - - if ((unsigned long)olpc_ofw_cif < OFW_MIN) { - printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", - (unsigned long)olpc_ofw_cif); - olpc_ofw_cif = NULL; - return; - } - - /* determine where OFW starts in memory */ - start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); - printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", - (unsigned long)olpc_ofw_cif, (-start) >> 20); - reserve_top_address(-start); -} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 70c4872cd8aa..45892dc4b72a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -801,7 +801,8 @@ void ptrace_disable(struct task_struct *child) static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif -long arch_ptrace(struct task_struct *child, long request, long addr, long data) +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) { int ret; unsigned long __user *datap = (unsigned long __user *)data; @@ -812,8 +813,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) unsigned long tmp; ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; tmp = 0; /* Default return condition */ @@ -830,8 +830,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; if (addr < sizeof(struct user_regs_struct)) @@ -888,17 +887,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case PTRACE_GET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, - (struct user_desc __user *) data); + (struct user_desc __user *)data); break; case PTRACE_SET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, - (struct user_desc __user *) data, 0); + (struct user_desc __user *)data, 0); break; #endif diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 239427ca02af..42eb3300dfc6 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -41,48 +41,11 @@ void pvclock_set_flags(u8 flags) valid_flags = flags; } -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif defined(__x86_64__) - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) { u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); + return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, + shadow->tsc_shift); } /* @@ -120,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) static atomic64_t last_value = ATOMIC64_INIT(0); +void pvclock_resume(void) +{ + atomic64_set(&last_value, 0); +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 939b9e98245f..8bbe8c56916d 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, vt8237_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700, + vt8237_force_enable_hpet); static void ati_force_hpet_resume(void) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 7a4cf14223ba..c495aa8d4815 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -371,16 +371,10 @@ void machine_real_restart(const unsigned char *code, int length) CMOS_WRITE(0x00, 0x8f); spin_unlock(&rtc_lock); - /* Remap the kernel at virtual address zero, as well as offset zero - from the kernel segment. This assumes the kernel segment starts at - virtual address PAGE_OFFSET. */ - memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); - /* - * Use `swapper_pg_dir' as our page directory. + * Switch back to the initial page table. */ - load_cr3(swapper_pg_dir); + load_cr3(initial_page_table); /* Write 0x1234 to absolute memory location 0x472. The BIOS reads this on booting to tell it to "Bypass memory test (also warm @@ -641,7 +635,7 @@ void native_machine_shutdown(void) /* O.K Now that I'm on the appropriate processor, * stop all of the others. */ - smp_send_stop(); + stop_other_cpus(); #endif lapic_shutdown(); diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c new file mode 100644 index 000000000000..2a26819bb6a8 --- /dev/null +++ b/arch/x86/kernel/resource.c @@ -0,0 +1,48 @@ +#include <linux/ioport.h> +#include <asm/e820.h> + +static void resource_clip(struct resource *res, resource_size_t start, + resource_size_t end) +{ + resource_size_t low = 0, high = 0; + + if (res->end < start || res->start > end) + return; /* no conflict */ + + if (res->start < start) + low = start - res->start; + + if (res->end > end) + high = res->end - end; + + /* Keep the area above or below the conflict, whichever is larger */ + if (low > high) + res->end = start - 1; + else + res->start = end + 1; +} + +static void remove_e820_regions(struct resource *avail) +{ + int i; + struct e820entry *entry; + + for (i = 0; i < e820.nr_map; i++) { + entry = &e820.map[i]; + + resource_clip(avail, entry->addr, + entry->addr + entry->size - 1); + } +} + +void arch_remove_reservations(struct resource *avail) +{ + /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */ + if (avail->flags & IORESOURCE_MEM) { + if (avail->start < BIOS_END) + avail->start = BIOS_END; + resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); + + remove_e820_regions(avail); + } +} diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c deleted file mode 100644 index 7e004acbe526..000000000000 --- a/arch/x86/kernel/scx200_32.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com> - * - * National Semiconductor SCx200 support. - */ - -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mutex.h> -#include <linux/pci.h> - -#include <linux/scx200.h> -#include <linux/scx200_gpio.h> - -/* Verify that the configuration block really is there */ -#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) - -#define NAME "scx200" - -MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>"); -MODULE_DESCRIPTION("NatSemi SCx200 Driver"); -MODULE_LICENSE("GPL"); - -unsigned scx200_gpio_base = 0; -unsigned long scx200_gpio_shadow[2]; - -unsigned scx200_cb_base = 0; - -static struct pci_device_id scx200_tbl[] = { - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, - { }, -}; -MODULE_DEVICE_TABLE(pci,scx200_tbl); - -static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *); - -static struct pci_driver scx200_pci_driver = { - .name = "scx200", - .id_table = scx200_tbl, - .probe = scx200_probe, -}; - -static DEFINE_MUTEX(scx200_gpio_config_lock); - -static void __devinit scx200_init_shadow(void) -{ - int bank; - - /* read the current values driven on the GPIO signals */ - for (bank = 0; bank < 2; ++bank) - scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); -} - -static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) -{ - unsigned base; - - if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || - pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) { - base = pci_resource_start(pdev, 0); - printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); - - if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) { - printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); - return -EBUSY; - } - - scx200_gpio_base = base; - scx200_init_shadow(); - - } else { - /* find the base of the Configuration Block */ - if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) { - scx200_cb_base = SCx200_CB_BASE_FIXED; - } else { - pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); - if (scx200_cb_probe(base)) { - scx200_cb_base = base; - } else { - printk(KERN_WARNING NAME ": Configuration Block not found\n"); - return -ENODEV; - } - } - printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base); - } - - return 0; -} - -u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits) -{ - u32 config, new_config; - - mutex_lock(&scx200_gpio_config_lock); - - outl(index, scx200_gpio_base + 0x20); - config = inl(scx200_gpio_base + 0x24); - - new_config = (config & mask) | bits; - outl(new_config, scx200_gpio_base + 0x24); - - mutex_unlock(&scx200_gpio_config_lock); - - return config; -} - -static int __init scx200_init(void) -{ - printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); - - return pci_register_driver(&scx200_pci_driver); -} - -static void __exit scx200_cleanup(void) -{ - pci_unregister_driver(&scx200_pci_driver); - release_region(scx200_gpio_base, SCx200_GPIO_SIZE); -} - -module_init(scx200_init); -module_exit(scx200_cleanup); - -EXPORT_SYMBOL(scx200_gpio_base); -EXPORT_SYMBOL(scx200_gpio_shadow); -EXPORT_SYMBOL(scx200_gpio_configure); -EXPORT_SYMBOL(scx200_cb_base); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b8982e0fc0c2..a25ce8864f1f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -501,7 +501,18 @@ static inline unsigned long long get_total_mem(void) return total << PAGE_SHIFT; } -#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF +/* + * Keep the crash kernel below this limit. On 32 bits earlier kernels + * would limit the kernel to the low 512 MiB due to mapping restrictions. + * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this + * limit once kexec-tools are fixed. + */ +#ifdef CONFIG_X86_32 +# define CRASH_KERNEL_ADDR_MAX (512 << 20) +#else +# define CRASH_KERNEL_ADDR_MAX (896 << 20) +#endif + static void __init reserve_crashkernel(void) { unsigned long long total_mem; @@ -520,10 +531,10 @@ static void __init reserve_crashkernel(void) const unsigned long long alignment = 16<<20; /* 16M */ /* - * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX + * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ crash_base = memblock_find_in_range(alignment, - DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment); + CRASH_KERNEL_ADDR_MAX, crash_size, alignment); if (crash_base == MEMBLOCK_ERROR) { pr_info("crashkernel reservation failed - No suitable area found.\n"); @@ -700,6 +711,17 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); + + /* + * copy kernel address range established so far and switch + * to the proper swapper page table + */ + clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY, + initial_page_table + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif @@ -758,6 +780,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); /* update the e820_saved too */ @@ -985,7 +1008,12 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); - setup_trampoline_page_table(); +#ifdef CONFIG_X86_32 + /* sync back kernel address range */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); +#endif tboot_probe(); diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c deleted file mode 100644 index dd4c281ffe57..000000000000 --- a/arch/x86/kernel/sfi.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * sfi.c - x86 architecture SFI support. - * - * Copyright (c) 2009, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ - -#define KMSG_COMPONENT "SFI" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/io.h> - -#include <asm/io_apic.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/apic.h> - -#ifdef CONFIG_X86_LOCAL_APIC -static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; - -static void __init mp_sfi_register_lapic_address(unsigned long address) -{ - mp_lapic_addr = address; - - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); - - pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); -} - -/* All CPUs enumerated by SFI must be present and enabled */ -static void __cpuinit mp_sfi_register_lapic(u8 id) -{ - if (MAX_APICS - id <= 0) { - pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - pr_info("registering lapic[%d]\n", id); - - generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); -} - -static int __init sfi_parse_cpus(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_cpu_table_entry *pentry; - int i; - int cpu_num; - - sb = (struct sfi_table_simple *)table; - cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); - pentry = (struct sfi_cpu_table_entry *)sb->pentry; - - for (i = 0; i < cpu_num; i++) { - mp_sfi_register_lapic(pentry->apic_id); - pentry++; - } - - smp_found_config = 1; - return 0; -} -#endif /* CONFIG_X86_LOCAL_APIC */ - -#ifdef CONFIG_X86_IO_APIC - -static int __init sfi_parse_ioapic(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_apic_table_entry *pentry; - int i, num; - - sb = (struct sfi_table_simple *)table; - num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); - pentry = (struct sfi_apic_table_entry *)sb->pentry; - - for (i = 0; i < num; i++) { - mp_register_ioapic(i, pentry->phys_addr, gsi_top); - pentry++; - } - - WARN(pic_mode, KERN_WARNING - "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); - pic_mode = 0; - return 0; -} -#endif /* CONFIG_X86_IO_APIC */ - -/* - * sfi_platform_init(): register lapics & io-apics - */ -int __init sfi_platform_init(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - mp_sfi_register_lapic_address(sfi_lapic_addr); - sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); -#endif -#ifdef CONFIG_X86_IO_APIC - sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); -#endif - return 0; -} diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d801210945d6..513deac7228d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_smp_send_stop(void) +static void native_stop_other_cpus(int wait) { unsigned long flags; - unsigned long wait; + unsigned long timeout; if (reboot_force) return; @@ -179,9 +179,12 @@ static void native_smp_send_stop(void) if (num_online_cpus() > 1) { apic->send_IPI_allbutself(REBOOT_VECTOR); - /* Don't wait longer than a second */ - wait = USEC_PER_SEC; - while (num_online_cpus() > 1 && wait--) + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } @@ -227,7 +230,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index dfb50890b5b7..083e99d1b7df 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -299,22 +299,16 @@ notrace static void __cpuinit start_secondary(void *unused) * fragile that we want to limit the things done here to the * most necessary things. */ + cpu_init(); + preempt_disable(); + smp_callin(); #ifdef CONFIG_X86_32 - /* - * Switch away from the trampoline page-table - * - * Do this before cpu_init() because it needs to access per-cpu - * data which may not be mapped in the trampoline page-table. - */ + /* switch away from the initial page table */ load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - cpu_init(); - preempt_disable(); - smp_callin(); - /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* @@ -753,7 +747,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); + INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); @@ -785,7 +779,6 @@ do_rest: #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); - initial_page_table = __pa(&trampoline_pg_dir); #else clear_tsk_thread_flag(c_idle.idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); @@ -934,7 +927,6 @@ int __cpuinit native_cpu_up(unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; err = do_boot_cpu(apicid, cpu); - if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; @@ -1381,7 +1373,6 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - irq_ctx_exit(raw_smp_processor_id()); c1e_remove_cpu(raw_smp_processor_id()); mb(); diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c deleted file mode 100644 index 312ef0292815..000000000000 --- a/arch/x86/kernel/tlb_uv.c +++ /dev/null @@ -1,1655 +0,0 @@ -/* - * SGI UltraViolet TLB flush routines. - * - * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI. - * - * This code is released under the GNU General Public License version 2 or - * later. - */ -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/debugfs.h> -#include <linux/kernel.h> -#include <linux/slab.h> - -#include <asm/mmu_context.h> -#include <asm/uv/uv.h> -#include <asm/uv/uv_mmrs.h> -#include <asm/uv/uv_hub.h> -#include <asm/uv/uv_bau.h> -#include <asm/apic.h> -#include <asm/idle.h> -#include <asm/tsc.h> -#include <asm/irq_vectors.h> -#include <asm/timer.h> - -/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ -static int timeout_base_ns[] = { - 20, - 160, - 1280, - 10240, - 81920, - 655360, - 5242880, - 167772160 -}; -static int timeout_us; -static int nobau; -static int baudisabled; -static spinlock_t disable_lock; -static cycles_t congested_cycles; - -/* tunables: */ -static int max_bau_concurrent = MAX_BAU_CONCURRENT; -static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; -static int plugged_delay = PLUGGED_DELAY; -static int plugsb4reset = PLUGSB4RESET; -static int timeoutsb4reset = TIMEOUTSB4RESET; -static int ipi_reset_limit = IPI_RESET_LIMIT; -static int complete_threshold = COMPLETE_THRESHOLD; -static int congested_response_us = CONGESTED_RESPONSE_US; -static int congested_reps = CONGESTED_REPS; -static int congested_period = CONGESTED_PERIOD; -static struct dentry *tunables_dir; -static struct dentry *tunables_file; - -static int __init setup_nobau(char *arg) -{ - nobau = 1; - return 0; -} -early_param("nobau", setup_nobau); - -/* base pnode in this partition */ -static int uv_partition_base_pnode __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; -static unsigned long uv_mmask __read_mostly; - -static DEFINE_PER_CPU(struct ptc_stats, ptcstats); -static DEFINE_PER_CPU(struct bau_control, bau_control); -static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); - -/* - * Determine the first node on a uvhub. 'Nodes' are used for kernel - * memory allocation. - */ -static int __init uvhub_to_first_node(int uvhub) -{ - int node, b; - - for_each_online_node(node) { - b = uv_node_to_blade_id(node); - if (uvhub == b) - return node; - } - return -1; -} - -/* - * Determine the apicid of the first cpu on a uvhub. - */ -static int __init uvhub_to_first_apicid(int uvhub) -{ - int cpu; - - for_each_present_cpu(cpu) - if (uvhub == uv_cpu_to_blade_id(cpu)) - return per_cpu(x86_cpu_to_apicid, cpu); - return -1; -} - -/* - * Free a software acknowledge hardware resource by clearing its Pending - * bit. This will return a reply to the sender. - * If the message has timed out, a reply has already been sent by the - * hardware but the resource has not been released. In that case our - * clear of the Timeout bit (as well) will free the resource. No reply will - * be sent (the hardware will only do one reply per message). - */ -static inline void uv_reply_to_message(struct msg_desc *mdp, - struct bau_control *bcp) -{ - unsigned long dw; - struct bau_payload_queue_entry *msg; - - msg = mdp->msg; - if (!msg->canceled) { - dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | - msg->sw_ack_vector; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); - } - msg->replied_to = 1; - msg->sw_ack_vector = 0; -} - -/* - * Process the receipt of a RETRY message - */ -static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, - struct bau_control *bcp) -{ - int i; - int cancel_count = 0; - int slot2; - unsigned long msg_res; - unsigned long mmr = 0; - struct bau_payload_queue_entry *msg; - struct bau_payload_queue_entry *msg2; - struct ptc_stats *stat; - - msg = mdp->msg; - stat = bcp->statp; - stat->d_retries++; - /* - * cancel any message from msg+1 to the retry itself - */ - for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { - if (msg2 > mdp->va_queue_last) - msg2 = mdp->va_queue_first; - if (msg2 == msg) - break; - - /* same conditions for cancellation as uv_do_reset */ - if ((msg2->replied_to == 0) && (msg2->canceled == 0) && - (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & - msg->sw_ack_vector) == 0) && - (msg2->sending_cpu == msg->sending_cpu) && - (msg2->msg_type != MSG_NOOP)) { - slot2 = msg2 - mdp->va_queue_first; - mmr = uv_read_local_mmr - (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = msg2->sw_ack_vector; - /* - * This is a message retry; clear the resources held - * by the previous message only if they timed out. - * If it has not timed out we have an unexpected - * situation to report. - */ - if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { - /* - * is the resource timed out? - * make everyone ignore the cancelled message. - */ - msg2->canceled = 1; - stat->d_canceled++; - cancel_count++; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << UV_SW_ACK_NPENDING) | - msg_res); - } - } - } - if (!cancel_count) - stat->d_nocanceled++; -} - -/* - * Do all the things a cpu should do for a TLB shootdown message. - * Other cpu's may come here at the same time for this message. - */ -static void uv_bau_process_message(struct msg_desc *mdp, - struct bau_control *bcp) -{ - int msg_ack_count; - short socket_ack_count = 0; - struct ptc_stats *stat; - struct bau_payload_queue_entry *msg; - struct bau_control *smaster = bcp->socket_master; - - /* - * This must be a normal message, or retry of a normal message - */ - msg = mdp->msg; - stat = bcp->statp; - if (msg->address == TLB_FLUSH_ALL) { - local_flush_tlb(); - stat->d_alltlb++; - } else { - __flush_tlb_one(msg->address); - stat->d_onetlb++; - } - stat->d_requestee++; - - /* - * One cpu on each uvhub has the additional job on a RETRY - * of releasing the resource held by the message that is - * being retried. That message is identified by sending - * cpu number. - */ - if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) - uv_bau_process_retry_msg(mdp, bcp); - - /* - * This is a sw_ack message, so we have to reply to it. - * Count each responding cpu on the socket. This avoids - * pinging the count's cache line back and forth between - * the sockets. - */ - socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) - &smaster->socket_acknowledge_count[mdp->msg_slot]); - if (socket_ack_count == bcp->cpus_in_socket) { - /* - * Both sockets dump their completed count total into - * the message's count. - */ - smaster->socket_acknowledge_count[mdp->msg_slot] = 0; - msg_ack_count = atomic_add_short_return(socket_ack_count, - (struct atomic_short *)&msg->acknowledge_count); - - if (msg_ack_count == bcp->cpus_in_uvhub) { - /* - * All cpus in uvhub saw it; reply - */ - uv_reply_to_message(mdp, bcp); - } - } - - return; -} - -/* - * Determine the first cpu on a uvhub. - */ -static int uvhub_to_first_cpu(int uvhub) -{ - int cpu; - for_each_present_cpu(cpu) - if (uvhub == uv_cpu_to_blade_id(cpu)) - return cpu; - return -1; -} - -/* - * Last resort when we get a large number of destination timeouts is - * to clear resources held by a given cpu. - * Do this with IPI so that all messages in the BAU message queue - * can be identified by their nonzero sw_ack_vector field. - * - * This is entered for a single cpu on the uvhub. - * The sender want's this uvhub to free a specific message's - * sw_ack resources. - */ -static void -uv_do_reset(void *ptr) -{ - int i; - int slot; - int count = 0; - unsigned long mmr; - unsigned long msg_res; - struct bau_control *bcp; - struct reset_args *rap; - struct bau_payload_queue_entry *msg; - struct ptc_stats *stat; - - bcp = &per_cpu(bau_control, smp_processor_id()); - rap = (struct reset_args *)ptr; - stat = bcp->statp; - stat->d_resets++; - - /* - * We're looking for the given sender, and - * will free its sw_ack resource. - * If all cpu's finally responded after the timeout, its - * message 'replied_to' was set. - */ - for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { - /* uv_do_reset: same conditions for cancellation as - uv_bau_process_retry_msg() */ - if ((msg->replied_to == 0) && - (msg->canceled == 0) && - (msg->sending_cpu == rap->sender) && - (msg->sw_ack_vector) && - (msg->msg_type != MSG_NOOP)) { - /* - * make everyone else ignore this message - */ - msg->canceled = 1; - slot = msg - bcp->va_queue_first; - count++; - /* - * only reset the resource if it is still pending - */ - mmr = uv_read_local_mmr - (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = msg->sw_ack_vector; - if (mmr & msg_res) { - stat->d_rcanceled++; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << UV_SW_ACK_NPENDING) | - msg_res); - } - } - } - return; -} - -/* - * Use IPI to get all target uvhubs to release resources held by - * a given sending cpu number. - */ -static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, - int sender) -{ - int uvhub; - int cpu; - cpumask_t mask; - struct reset_args reset_args; - - reset_args.sender = sender; - - cpus_clear(mask); - /* find a single cpu for each uvhub in this distribution mask */ - for (uvhub = 0; - uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; - uvhub++) { - if (!bau_uvhub_isset(uvhub, distribution)) - continue; - /* find a cpu for this uvhub */ - cpu = uvhub_to_first_cpu(uvhub); - cpu_set(cpu, mask); - } - /* IPI all cpus; Preemption is already disabled */ - smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); - return; -} - -static inline unsigned long -cycles_2_us(unsigned long long cyc) -{ - unsigned long long ns; - unsigned long us; - ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) - >> CYC2NS_SCALE_FACTOR; - us = ns / 1000; - return us; -} - -/* - * wait for all cpus on this hub to finish their sends and go quiet - * leaves uvhub_quiesce set so that no new broadcasts are started by - * bau_flush_send_and_wait() - */ -static inline void -quiesce_local_uvhub(struct bau_control *hmaster) -{ - atomic_add_short_return(1, (struct atomic_short *) - &hmaster->uvhub_quiesce); -} - -/* - * mark this quiet-requestor as done - */ -static inline void -end_uvhub_quiesce(struct bau_control *hmaster) -{ - atomic_add_short_return(-1, (struct atomic_short *) - &hmaster->uvhub_quiesce); -} - -/* - * Wait for completion of a broadcast software ack message - * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP - */ -static int uv_wait_completion(struct bau_desc *bau_desc, - unsigned long mmr_offset, int right_shift, int this_cpu, - struct bau_control *bcp, struct bau_control *smaster, long try) -{ - unsigned long descriptor_status; - cycles_t ttime; - struct ptc_stats *stat = bcp->statp; - struct bau_control *hmaster; - - hmaster = bcp->uvhub_master; - - /* spin on the status MMR, waiting for it to go idle */ - while ((descriptor_status = (((unsigned long) - uv_read_local_mmr(mmr_offset) >> - right_shift) & UV_ACT_STATUS_MASK)) != - DESC_STATUS_IDLE) { - /* - * Our software ack messages may be blocked because there are - * no swack resources available. As long as none of them - * has timed out hardware will NACK our message and its - * state will stay IDLE. - */ - if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { - stat->s_stimeout++; - return FLUSH_GIVEUP; - } else if (descriptor_status == - DESC_STATUS_DESTINATION_TIMEOUT) { - stat->s_dtimeout++; - ttime = get_cycles(); - - /* - * Our retries may be blocked by all destination - * swack resources being consumed, and a timeout - * pending. In that case hardware returns the - * ERROR that looks like a destination timeout. - */ - if (cycles_2_us(ttime - bcp->send_message) < - timeout_us) { - bcp->conseccompletes = 0; - return FLUSH_RETRY_PLUGGED; - } - - bcp->conseccompletes = 0; - return FLUSH_RETRY_TIMEOUT; - } else { - /* - * descriptor_status is still BUSY - */ - cpu_relax(); - } - } - bcp->conseccompletes++; - return FLUSH_COMPLETE; -} - -static inline cycles_t -sec_2_cycles(unsigned long sec) -{ - unsigned long ns; - cycles_t cyc; - - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - -/* - * conditionally add 1 to *v, unless *v is >= u - * return 0 if we cannot add 1 to *v because it is >= u - * return 1 if we can add 1 to *v because it is < u - * the add is atomic - * - * This is close to atomic_add_unless(), but this allows the 'u' value - * to be lowered below the current 'v'. atomic_add_unless can only stop - * on equal. - */ -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) -{ - spin_lock(lock); - if (atomic_read(v) >= u) { - spin_unlock(lock); - return 0; - } - atomic_inc(v); - spin_unlock(lock); - return 1; -} - -/* - * Our retries are blocked by all destination swack resources being - * in use, and a timeout is pending. In that case hardware immediately - * returns the ERROR that looks like a destination timeout. - */ -static void -destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, - struct bau_control *hmaster, struct ptc_stats *stat) -{ - udelay(bcp->plugged_delay); - bcp->plugged_tries++; - if (bcp->plugged_tries >= bcp->plugsb4reset) { - bcp->plugged_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_plug++; - } -} - -static void -destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, - struct bau_control *hmaster, struct ptc_stats *stat) -{ - hmaster->max_bau_concurrent = 1; - bcp->timeout_tries++; - if (bcp->timeout_tries >= bcp->timeoutsb4reset) { - bcp->timeout_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_timeout++; - } -} - -/* - * Completions are taking a very long time due to a congested numalink - * network. - */ -static void -disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) -{ - int tcpu; - struct bau_control *tbcp; - - /* let only one cpu do this disabling */ - spin_lock(&disable_lock); - if (!baudisabled && bcp->period_requests && - ((bcp->period_time / bcp->period_requests) > congested_cycles)) { - /* it becomes this cpu's job to turn on the use of the - BAU again */ - baudisabled = 1; - bcp->set_bau_off = 1; - bcp->set_bau_on_time = get_cycles() + - sec_2_cycles(bcp->congested_period); - stat->s_bau_disabled++; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 1; - } - } - spin_unlock(&disable_lock); -} - -/** - * uv_flush_send_and_wait - * - * Send a broadcast and wait for it to complete. - * - * The flush_mask contains the cpus the broadcast is to be sent to including - * cpus that are on the local uvhub. - * - * Returns 0 if all flushing represented in the mask was done. - * Returns 1 if it gives up entirely and the original cpu mask is to be - * returned to the kernel. - */ -int uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, struct bau_control *bcp) -{ - int right_shift; - int completion_status = 0; - int seq_number = 0; - long try = 0; - int cpu = bcp->uvhub_cpu; - int this_cpu = bcp->cpu; - unsigned long mmr_offset; - unsigned long index; - cycles_t time1; - cycles_t time2; - cycles_t elapsed; - struct ptc_stats *stat = bcp->statp; - struct bau_control *smaster = bcp->socket_master; - struct bau_control *hmaster = bcp->uvhub_master; - - if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, - &hmaster->active_descriptor_count, - hmaster->max_bau_concurrent)) { - stat->s_throttles++; - do { - cpu_relax(); - } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, - &hmaster->active_descriptor_count, - hmaster->max_bau_concurrent)); - } - while (hmaster->uvhub_quiesce) - cpu_relax(); - - if (cpu < UV_CPUS_PER_ACT_STATUS) { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; - } else { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = - ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); - } - time1 = get_cycles(); - do { - if (try == 0) { - bau_desc->header.msg_type = MSG_REGULAR; - seq_number = bcp->message_number++; - } else { - bau_desc->header.msg_type = MSG_RETRY; - stat->s_retry_messages++; - } - bau_desc->header.sequence = seq_number; - index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | - bcp->uvhub_cpu; - bcp->send_message = get_cycles(); - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); - try++; - completion_status = uv_wait_completion(bau_desc, mmr_offset, - right_shift, this_cpu, bcp, smaster, try); - - if (completion_status == FLUSH_RETRY_PLUGGED) { - destination_plugged(bau_desc, bcp, hmaster, stat); - } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - destination_timeout(bau_desc, bcp, hmaster, stat); - } - if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { - bcp->ipi_attempts = 0; - completion_status = FLUSH_GIVEUP; - break; - } - cpu_relax(); - } while ((completion_status == FLUSH_RETRY_PLUGGED) || - (completion_status == FLUSH_RETRY_TIMEOUT)); - time2 = get_cycles(); - bcp->plugged_tries = 0; - bcp->timeout_tries = 0; - if ((completion_status == FLUSH_COMPLETE) && - (bcp->conseccompletes > bcp->complete_threshold) && - (hmaster->max_bau_concurrent < - hmaster->max_bau_concurrent_constant)) - hmaster->max_bau_concurrent++; - while (hmaster->uvhub_quiesce) - cpu_relax(); - atomic_dec(&hmaster->active_descriptor_count); - if (time2 > time1) { - elapsed = time2 - time1; - stat->s_time += elapsed; - if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { - bcp->period_requests++; - bcp->period_time += elapsed; - if ((elapsed > congested_cycles) && - (bcp->period_requests > bcp->congested_reps)) { - disable_for_congestion(bcp, stat); - } - } - } else - stat->s_requestor--; - if (completion_status == FLUSH_COMPLETE && try > 1) - stat->s_retriesok++; - else if (completion_status == FLUSH_GIVEUP) { - stat->s_giveup++; - return 1; - } - return 0; -} - -/** - * uv_flush_tlb_others - globally purge translation cache of a virtual - * address or all TLB's - * @cpumask: mask of all cpu's in which the address is to be removed - * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) - * @cpu: the current cpu - * - * This is the entry point for initiating any UV global TLB shootdown. - * - * Purges the translation caches of all specified processors of the given - * virtual address, or purges all TLB's on specified processors. - * - * The caller has derived the cpumask from the mm_struct. This function - * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) - * - * The cpumask is converted into a uvhubmask of the uvhubs containing - * those cpus. - * - * Note that this function should be called with preemption disabled. - * - * Returns NULL if all remote flushing was done. - * Returns pointer to cpumask if some remote flushing remains to be - * done. The returned pointer is valid till preemption is re-enabled. - */ -const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long va, unsigned int cpu) -{ - int tcpu; - int uvhub; - int locals = 0; - int remotes = 0; - int hubs = 0; - struct bau_desc *bau_desc; - struct cpumask *flush_mask; - struct ptc_stats *stat; - struct bau_control *bcp; - struct bau_control *tbcp; - - /* kernel was booted 'nobau' */ - if (nobau) - return cpumask; - - bcp = &per_cpu(bau_control, cpu); - stat = bcp->statp; - - /* bau was disabled due to slow response */ - if (bcp->baudisabled) { - /* the cpu that disabled it must re-enable it */ - if (bcp->set_bau_off) { - if (get_cycles() >= bcp->set_bau_on_time) { - stat->s_bau_reenabled++; - baudisabled = 0; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 0; - tbcp->period_requests = 0; - tbcp->period_time = 0; - } - } - } - return cpumask; - } - - /* - * Each sending cpu has a per-cpu mask which it fills from the caller's - * cpu mask. All cpus are converted to uvhubs and copied to the - * activation descriptor. - */ - flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); - /* don't actually do a shootdown of the local cpu */ - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); - if (cpu_isset(cpu, *cpumask)) - stat->s_ntargself++; - - bau_desc = bcp->descriptor_base; - bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; - bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - - /* cpu statistics */ - for_each_cpu(tcpu, flush_mask) { - uvhub = uv_cpu_to_blade_id(tcpu); - bau_uvhub_set(uvhub, &bau_desc->distribution); - if (uvhub == bcp->uvhub) - locals++; - else - remotes++; - } - if ((locals + remotes) == 0) - return NULL; - stat->s_requestor++; - stat->s_ntargcpu += remotes + locals; - stat->s_ntargremotes += remotes; - stat->s_ntarglocals += locals; - remotes = bau_uvhub_weight(&bau_desc->distribution); - - /* uvhub statistics */ - hubs = bau_uvhub_weight(&bau_desc->distribution); - if (locals) { - stat->s_ntarglocaluvhub++; - stat->s_ntargremoteuvhub += (hubs - 1); - } else - stat->s_ntargremoteuvhub += hubs; - stat->s_ntarguvhub += hubs; - if (hubs >= 16) - stat->s_ntarguvhub16++; - else if (hubs >= 8) - stat->s_ntarguvhub8++; - else if (hubs >= 4) - stat->s_ntarguvhub4++; - else if (hubs >= 2) - stat->s_ntarguvhub2++; - else - stat->s_ntarguvhub1++; - - bau_desc->payload.address = va; - bau_desc->payload.sending_cpu = cpu; - - /* - * uv_flush_send_and_wait returns 0 if all cpu's were messaged, - * or 1 if it gave up and the original cpumask should be returned. - */ - if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) - return NULL; - else - return cpumask; -} - -/* - * The BAU message interrupt comes here. (registered by set_intr_gate) - * See entry_64.S - * - * We received a broadcast assist message. - * - * Interrupts are disabled; this interrupt could represent - * the receipt of several messages. - * - * All cores/threads on this hub get this interrupt. - * The last one to see it does the software ack. - * (the resource will not be freed until noninterruptable cpus see this - * interrupt; hardware may timeout the s/w ack and reply ERROR) - */ -void uv_bau_message_interrupt(struct pt_regs *regs) -{ - int count = 0; - cycles_t time_start; - struct bau_payload_queue_entry *msg; - struct bau_control *bcp; - struct ptc_stats *stat; - struct msg_desc msgdesc; - - time_start = get_cycles(); - bcp = &per_cpu(bau_control, smp_processor_id()); - stat = bcp->statp; - msgdesc.va_queue_first = bcp->va_queue_first; - msgdesc.va_queue_last = bcp->va_queue_last; - msg = bcp->bau_msg_head; - while (msg->sw_ack_vector) { - count++; - msgdesc.msg_slot = msg - msgdesc.va_queue_first; - msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; - msgdesc.msg = msg; - uv_bau_process_message(&msgdesc, bcp); - msg++; - if (msg > msgdesc.va_queue_last) - msg = msgdesc.va_queue_first; - bcp->bau_msg_head = msg; - } - stat->d_time += (get_cycles() - time_start); - if (!count) - stat->d_nomsg++; - else if (count > 1) - stat->d_multmsg++; - ack_APIC_irq(); -} - -/* - * uv_enable_timeouts - * - * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have - * shootdown message timeouts enabled. The timeout does not cause - * an interrupt, but causes an error message to be returned to - * the sender. - */ -static void uv_enable_timeouts(void) -{ - int uvhub; - int nuvhubs; - int pnode; - unsigned long mmr_image; - - nuvhubs = uv_num_possible_blades(); - - for (uvhub = 0; uvhub < nuvhubs; uvhub++) { - if (!uv_blade_nr_possible_cpus(uvhub)) - continue; - - pnode = uv_blade_to_pnode(uvhub); - mmr_image = - uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); - /* - * Set the timeout period and then lock it in, in three - * steps; captures and locks in the period. - * - * To program the period, the SOFT_ACK_MODE must be off. - */ - mmr_image &= ~((unsigned long)1 << - UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); - /* - * Set the 4-bit period. - */ - mmr_image &= ~((unsigned long)0xf << - UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); - mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << - UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); - /* - * Subsequent reversals of the timebase bit (3) cause an - * immediate timeout of one or all INTD resources as - * indicated in bits 2:0 (7 causes all of them to timeout). - */ - mmr_image |= ((unsigned long)1 << - UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); - } -} - -static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) -{ - if (*offset < num_possible_cpus()) - return offset; - return NULL; -} - -static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) -{ - (*offset)++; - if (*offset < num_possible_cpus()) - return offset; - return NULL; -} - -static void uv_ptc_seq_stop(struct seq_file *file, void *data) -{ -} - -static inline unsigned long long -microsec_2_cycles(unsigned long microsec) -{ - unsigned long ns; - unsigned long long cyc; - - ns = microsec * 1000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - -/* - * Display the statistics thru /proc. - * 'data' points to the cpu number - */ -static int uv_ptc_seq_show(struct seq_file *file, void *data) -{ - struct ptc_stats *stat; - int cpu; - - cpu = *(loff_t *)data; - - if (!cpu) { - seq_printf(file, - "# cpu sent stime self locals remotes ncpus localhub "); - seq_printf(file, - "remotehub numuvhubs numuvhubs16 numuvhubs8 "); - seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto "); - seq_printf(file, - "retries rok resetp resett giveup sto bz throt "); - seq_printf(file, - "sw_ack recv rtime all "); - seq_printf(file, - "one mult none retry canc nocan reset rcan "); - seq_printf(file, - "disable enable\n"); - } - if (cpu < num_possible_cpus() && cpu_online(cpu)) { - stat = &per_cpu(ptcstats, cpu); - /* source side statistics */ - seq_printf(file, - "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - cpu, stat->s_requestor, cycles_2_us(stat->s_time), - stat->s_ntargself, stat->s_ntarglocals, - stat->s_ntargremotes, stat->s_ntargcpu, - stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, - stat->s_ntarguvhub, stat->s_ntarguvhub16); - seq_printf(file, "%ld %ld %ld %ld %ld ", - stat->s_ntarguvhub8, stat->s_ntarguvhub4, - stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_dtimeout); - seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", - stat->s_retry_messages, stat->s_retriesok, - stat->s_resets_plug, stat->s_resets_timeout, - stat->s_giveup, stat->s_stimeout, - stat->s_busy, stat->s_throttles); - - /* destination side statistics */ - seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - uv_read_global_mmr64(uv_cpu_to_pnode(cpu), - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), - stat->d_requestee, cycles_2_us(stat->d_time), - stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, - stat->d_nomsg, stat->d_retries, stat->d_canceled, - stat->d_nocanceled, stat->d_resets, - stat->d_rcanceled); - seq_printf(file, "%ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled); - } - - return 0; -} - -/* - * Display the tunables thru debugfs - */ -static ssize_t tunables_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - char buf[300]; - int ret; - - ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", - "max_bau_concurrent plugged_delay plugsb4reset", - "timeoutsb4reset ipi_reset_limit complete_threshold", - "congested_response_us congested_reps congested_period", - max_bau_concurrent, plugged_delay, plugsb4reset, - timeoutsb4reset, ipi_reset_limit, complete_threshold, - congested_response_us, congested_reps, congested_period); - - return simple_read_from_buffer(userbuf, count, ppos, buf, ret); -} - -/* - * -1: resetf the statistics - * 0: display meaning of the statistics - */ -static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, - size_t count, loff_t *data) -{ - int cpu; - long input_arg; - char optstr[64]; - struct ptc_stats *stat; - - if (count == 0 || count > sizeof(optstr)) - return -EINVAL; - if (copy_from_user(optstr, user, count)) - return -EFAULT; - optstr[count - 1] = '\0'; - if (strict_strtol(optstr, 10, &input_arg) < 0) { - printk(KERN_DEBUG "%s is invalid\n", optstr); - return -EINVAL; - } - - if (input_arg == 0) { - printk(KERN_DEBUG "# cpu: cpu number\n"); - printk(KERN_DEBUG "Sender statistics:\n"); - printk(KERN_DEBUG - "sent: number of shootdown messages sent\n"); - printk(KERN_DEBUG - "stime: time spent sending messages\n"); - printk(KERN_DEBUG - "numuvhubs: number of hubs targeted with shootdown\n"); - printk(KERN_DEBUG - "numuvhubs16: number times 16 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs8: number times 8 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs4: number times 4 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs2: number times 2 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs1: number times 1 hub targeted\n"); - printk(KERN_DEBUG - "numcpus: number of cpus targeted with shootdown\n"); - printk(KERN_DEBUG - "dto: number of destination timeouts\n"); - printk(KERN_DEBUG - "retries: destination timeout retries sent\n"); - printk(KERN_DEBUG - "rok: : destination timeouts successfully retried\n"); - printk(KERN_DEBUG - "resetp: ipi-style resource resets for plugs\n"); - printk(KERN_DEBUG - "resett: ipi-style resource resets for timeouts\n"); - printk(KERN_DEBUG - "giveup: fall-backs to ipi-style shootdowns\n"); - printk(KERN_DEBUG - "sto: number of source timeouts\n"); - printk(KERN_DEBUG - "bz: number of stay-busy's\n"); - printk(KERN_DEBUG - "throt: number times spun in throttle\n"); - printk(KERN_DEBUG "Destination side statistics:\n"); - printk(KERN_DEBUG - "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); - printk(KERN_DEBUG - "recv: shootdown messages received\n"); - printk(KERN_DEBUG - "rtime: time spent processing messages\n"); - printk(KERN_DEBUG - "all: shootdown all-tlb messages\n"); - printk(KERN_DEBUG - "one: shootdown one-tlb messages\n"); - printk(KERN_DEBUG - "mult: interrupts that found multiple messages\n"); - printk(KERN_DEBUG - "none: interrupts that found no messages\n"); - printk(KERN_DEBUG - "retry: number of retry messages processed\n"); - printk(KERN_DEBUG - "canc: number messages canceled by retries\n"); - printk(KERN_DEBUG - "nocan: number retries that found nothing to cancel\n"); - printk(KERN_DEBUG - "reset: number of ipi-style reset requests processed\n"); - printk(KERN_DEBUG - "rcan: number messages canceled by reset requests\n"); - printk(KERN_DEBUG - "disable: number times use of the BAU was disabled\n"); - printk(KERN_DEBUG - "enable: number times use of the BAU was re-enabled\n"); - } else if (input_arg == -1) { - for_each_present_cpu(cpu) { - stat = &per_cpu(ptcstats, cpu); - memset(stat, 0, sizeof(struct ptc_stats)); - } - } - - return count; -} - -static int local_atoi(const char *name) -{ - int val = 0; - - for (;; name++) { - switch (*name) { - case '0' ... '9': - val = 10*val+(*name-'0'); - break; - default: - return val; - } - } -} - -/* - * set the tunables - * 0 values reset them to defaults - */ -static ssize_t tunables_write(struct file *file, const char __user *user, - size_t count, loff_t *data) -{ - int cpu; - int cnt = 0; - int val; - char *p; - char *q; - char instr[64]; - struct bau_control *bcp; - - if (count == 0 || count > sizeof(instr)-1) - return -EINVAL; - if (copy_from_user(instr, user, count)) - return -EFAULT; - - instr[count] = '\0'; - /* count the fields */ - p = instr + strspn(instr, WHITESPACE); - q = p; - for (; *p; p = q + strspn(q, WHITESPACE)) { - q = p + strcspn(p, WHITESPACE); - cnt++; - if (q == p) - break; - } - if (cnt != 9) { - printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); - return -EINVAL; - } - - p = instr + strspn(instr, WHITESPACE); - q = p; - for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { - q = p + strcspn(p, WHITESPACE); - val = local_atoi(p); - switch (cnt) { - case 0: - if (val == 0) { - max_bau_concurrent = MAX_BAU_CONCURRENT; - max_bau_concurrent_constant = - MAX_BAU_CONCURRENT; - continue; - } - bcp = &per_cpu(bau_control, smp_processor_id()); - if (val < 1 || val > bcp->cpus_in_uvhub) { - printk(KERN_DEBUG - "Error: BAU max concurrent %d is invalid\n", - val); - return -EINVAL; - } - max_bau_concurrent = val; - max_bau_concurrent_constant = val; - continue; - case 1: - if (val == 0) - plugged_delay = PLUGGED_DELAY; - else - plugged_delay = val; - continue; - case 2: - if (val == 0) - plugsb4reset = PLUGSB4RESET; - else - plugsb4reset = val; - continue; - case 3: - if (val == 0) - timeoutsb4reset = TIMEOUTSB4RESET; - else - timeoutsb4reset = val; - continue; - case 4: - if (val == 0) - ipi_reset_limit = IPI_RESET_LIMIT; - else - ipi_reset_limit = val; - continue; - case 5: - if (val == 0) - complete_threshold = COMPLETE_THRESHOLD; - else - complete_threshold = val; - continue; - case 6: - if (val == 0) - congested_response_us = CONGESTED_RESPONSE_US; - else - congested_response_us = val; - continue; - case 7: - if (val == 0) - congested_reps = CONGESTED_REPS; - else - congested_reps = val; - continue; - case 8: - if (val == 0) - congested_period = CONGESTED_PERIOD; - else - congested_period = val; - continue; - } - if (q == p) - break; - } - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->max_bau_concurrent = max_bau_concurrent; - bcp->max_bau_concurrent_constant = max_bau_concurrent; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->congested_response_us = congested_response_us; - bcp->congested_reps = congested_reps; - bcp->congested_period = congested_period; - } - return count; -} - -static const struct seq_operations uv_ptc_seq_ops = { - .start = uv_ptc_seq_start, - .next = uv_ptc_seq_next, - .stop = uv_ptc_seq_stop, - .show = uv_ptc_seq_show -}; - -static int uv_ptc_proc_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &uv_ptc_seq_ops); -} - -static int tunables_open(struct inode *inode, struct file *file) -{ - return 0; -} - -static const struct file_operations proc_uv_ptc_operations = { - .open = uv_ptc_proc_open, - .read = seq_read, - .write = uv_ptc_proc_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations tunables_fops = { - .open = tunables_open, - .read = tunables_read, - .write = tunables_write, -}; - -static int __init uv_ptc_init(void) -{ - struct proc_dir_entry *proc_uv_ptc; - - if (!is_uv_system()) - return 0; - - proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, - &proc_uv_ptc_operations); - if (!proc_uv_ptc) { - printk(KERN_ERR "unable to create %s proc entry\n", - UV_PTC_BASENAME); - return -EINVAL; - } - - tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); - if (!tunables_dir) { - printk(KERN_ERR "unable to create debugfs directory %s\n", - UV_BAU_TUNABLES_DIR); - return -EINVAL; - } - tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, - tunables_dir, NULL, &tunables_fops); - if (!tunables_file) { - printk(KERN_ERR "unable to create debugfs file %s\n", - UV_BAU_TUNABLES_FILE); - return -EINVAL; - } - return 0; -} - -/* - * initialize the sending side's sending buffers - */ -static void -uv_activation_descriptor_init(int node, int pnode) -{ - int i; - int cpu; - unsigned long pa; - unsigned long m; - unsigned long n; - struct bau_desc *bau_desc; - struct bau_desc *bd2; - struct bau_control *bcp; - - /* - * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) - * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub - */ - bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* - UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); - BUG_ON(!bau_desc); - - pa = uv_gpa(bau_desc); /* need the real nasid*/ - n = pa >> uv_nshift; - m = pa & uv_mmask; - - uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - - /* - * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each - * cpu even though we only use the first one; one descriptor can - * describe a broadcast to 256 uv hubs. - */ - for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); - i++, bd2++) { - memset(bd2, 0, sizeof(struct bau_desc)); - bd2->header.sw_ack_flag = 1; - /* - * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub - * in the partition. The bit map will indicate uvhub numbers, - * which are 0-N in a partition. Pnodes are unique system-wide. - */ - bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; - bd2->header.dest_subnodeid = 0x10; /* the LB */ - bd2->header.command = UV_NET_ENDPOINT_INTD; - bd2->header.int_both = 1; - /* - * all others need to be set to zero: - * fairness chaining multilevel count replied_to - */ - } - for_each_present_cpu(cpu) { - if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) - continue; - bcp = &per_cpu(bau_control, cpu); - bcp->descriptor_base = bau_desc; - } -} - -/* - * initialize the destination side's receiving buffers - * entered for each uvhub in the partition - * - node is first node (kernel memory notion) on the uvhub - * - pnode is the uvhub's physical identifier - */ -static void -uv_payload_queue_init(int node, int pnode) -{ - int pn; - int cpu; - char *cp; - unsigned long pa; - struct bau_payload_queue_entry *pqp; - struct bau_payload_queue_entry *pqp_malloc; - struct bau_control *bcp; - - pqp = (struct bau_payload_queue_entry *) kmalloc_node( - (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), - GFP_KERNEL, node); - BUG_ON(!pqp); - pqp_malloc = pqp; - - cp = (char *)pqp + 31; - pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); - - for_each_present_cpu(cpu) { - if (pnode != uv_cpu_to_pnode(cpu)) - continue; - /* for every cpu on this pnode: */ - bcp = &per_cpu(bau_control, cpu); - bcp->va_queue_first = pqp; - bcp->bau_msg_head = pqp; - bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); - } - /* - * need the pnode of where the memory was really allocated - */ - pa = uv_gpa(pqp); - pn = pa >> uv_nshift; - uv_write_global_mmr64(pnode, - UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, - ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, - (unsigned long) - uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1))); - /* in effect, all msg_type's are set to MSG_NOOP */ - memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); -} - -/* - * Initialization of each UV hub's structures - */ -static void __init uv_init_uvhub(int uvhub, int vector) -{ - int node; - int pnode; - unsigned long apicid; - - node = uvhub_to_first_node(uvhub); - pnode = uv_blade_to_pnode(uvhub); - uv_activation_descriptor_init(node, pnode); - uv_payload_queue_init(node, pnode); - /* - * the below initialization can't be in firmware because the - * messaging IRQ will be determined by the OS - */ - apicid = uvhub_to_first_apicid(uvhub); - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, - ((apicid << 32) | vector)); -} - -/* - * We will set BAU_MISC_CONTROL with a timeout period. - * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. - * So the destination timeout period has be be calculated from them. - */ -static int -calculate_destination_timeout(void) -{ - unsigned long mmr_image; - int mult1; - int mult2; - int index; - int base; - int ret; - unsigned long ts_ns; - - mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; - mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); - index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; - mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); - mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; - base = timeout_base_ns[index]; - ts_ns = base * mult1 * mult2; - ret = ts_ns / 1000; - return ret; -} - -/* - * initialize the bau_control structure for each cpu - */ -static void __init uv_init_per_cpu(int nuvhubs) -{ - int i; - int cpu; - int pnode; - int uvhub; - int have_hmaster; - short socket = 0; - unsigned short socket_mask; - unsigned char *uvhub_mask; - struct bau_control *bcp; - struct uvhub_desc *bdp; - struct socket_desc *sdp; - struct bau_control *hmaster = NULL; - struct bau_control *smaster = NULL; - struct socket_desc { - short num_cpus; - short cpu_number[16]; - }; - struct uvhub_desc { - unsigned short socket_mask; - short num_cpus; - short uvhub; - short pnode; - struct socket_desc socket[2]; - }; - struct uvhub_desc *uvhub_descs; - - timeout_us = calculate_destination_timeout(); - - uvhub_descs = (struct uvhub_desc *) - kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); - memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); - uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - memset(bcp, 0, sizeof(struct bau_control)); - pnode = uv_cpu_hub_info(cpu)->pnode; - uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; - *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); - bdp = &uvhub_descs[uvhub]; - bdp->num_cpus++; - bdp->uvhub = uvhub; - bdp->pnode = pnode; - /* kludge: 'assuming' one node per socket, and assuming that - disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1); - bdp->socket_mask |= (1 << socket); - sdp = &bdp->socket[socket]; - sdp->cpu_number[sdp->num_cpus] = cpu; - sdp->num_cpus++; - } - for (uvhub = 0; uvhub < nuvhubs; uvhub++) { - if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) - continue; - have_hmaster = 0; - bdp = &uvhub_descs[uvhub]; - socket_mask = bdp->socket_mask; - socket = 0; - while (socket_mask) { - if (!(socket_mask & 1)) - goto nextsocket; - sdp = &bdp->socket[socket]; - for (i = 0; i < sdp->num_cpus; i++) { - cpu = sdp->cpu_number[i]; - bcp = &per_cpu(bau_control, cpu); - bcp->cpu = cpu; - if (i == 0) { - smaster = bcp; - if (!have_hmaster) { - have_hmaster++; - hmaster = bcp; - } - } - bcp->cpus_in_uvhub = bdp->num_cpus; - bcp->cpus_in_socket = sdp->num_cpus; - bcp->socket_master = smaster; - bcp->uvhub = bdp->uvhub; - bcp->uvhub_master = hmaster; - bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> - blade_processor_id; - } -nextsocket: - socket++; - socket_mask = (socket_mask >> 1); - } - } - kfree(uvhub_descs); - kfree(uvhub_mask); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->baudisabled = 0; - bcp->statp = &per_cpu(ptcstats, cpu); - /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = microsec_2_cycles(2*timeout_us); - bcp->max_bau_concurrent = max_bau_concurrent; - bcp->max_bau_concurrent_constant = max_bau_concurrent; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->congested_response_us = congested_response_us; - bcp->congested_reps = congested_reps; - bcp->congested_period = congested_period; - } -} - -/* - * Initialization of BAU-related structures - */ -static int __init uv_bau_init(void) -{ - int uvhub; - int pnode; - int nuvhubs; - int cur_cpu; - int vector; - unsigned long mmr; - - if (!is_uv_system()) - return 0; - - if (nobau) - return 0; - - for_each_possible_cpu(cur_cpu) - zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), - GFP_KERNEL, cpu_to_node(cur_cpu)); - - uv_nshift = uv_hub_info->m_val; - uv_mmask = (1UL << uv_hub_info->m_val) - 1; - nuvhubs = uv_num_possible_blades(); - spin_lock_init(&disable_lock); - congested_cycles = microsec_2_cycles(congested_response_us); - - uv_init_per_cpu(nuvhubs); - - uv_partition_base_pnode = 0x7fffffff; - for (uvhub = 0; uvhub < nuvhubs; uvhub++) - if (uv_blade_nr_possible_cpus(uvhub) && - (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) - uv_partition_base_pnode = uv_blade_to_pnode(uvhub); - - vector = UV_BAU_MESSAGE; - for_each_possible_blade(uvhub) - if (uv_blade_nr_possible_cpus(uvhub)) - uv_init_uvhub(uvhub, vector); - - uv_enable_timeouts(); - alloc_intr_gate(vector, uv_bau_message_intr1); - - for_each_possible_blade(uvhub) { - if (uv_blade_nr_possible_cpus(uvhub)) { - pnode = uv_blade_to_pnode(uvhub); - /* INIT the bau */ - uv_write_global_mmr64(pnode, - UVH_LB_BAU_SB_ACTIVATION_CONTROL, - ((unsigned long)1 << 63)); - mmr = 1; /* should be 1 to broadcast to both sockets */ - uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, - mmr); - } - } - - return 0; -} -core_initcall(uv_bau_init); -fs_initcall(uv_ptc_init); diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 4c3da5674e67..a375616d77f7 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -38,19 +38,3 @@ unsigned long __trampinit setup_trampoline(void) memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } - -void __init setup_trampoline_page_table(void) -{ -#ifdef CONFIG_X86_32 - /* Copy kernel address range */ - clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - - /* Initialize low mappings */ - clone_pgd_range(trampoline_pg_dir, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, - KERNEL_PGD_BOUNDARY)); -#endif -} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d43968503dd2..cb838ca42c96 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -575,6 +575,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); return; } diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c deleted file mode 100644 index 7b24460917d5..000000000000 --- a/arch/x86/kernel/uv_irq.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * SGI UV IRQ functions - * - * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. - */ - -#include <linux/module.h> -#include <linux/rbtree.h> -#include <linux/slab.h> -#include <linux/irq.h> - -#include <asm/apic.h> -#include <asm/uv/uv_irq.h> -#include <asm/uv/uv_hub.h> - -/* MMR offset and pnode of hub sourcing interrupts for a given irq */ -struct uv_irq_2_mmr_pnode{ - struct rb_node list; - unsigned long offset; - int pnode; - int irq; -}; - -static spinlock_t uv_irq_lock; -static struct rb_root uv_irq_root; - -static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); - -static void uv_noop(struct irq_data *data) { } - -static void uv_ack_apic(struct irq_data *data) -{ - ack_APIC_irq(); -} - -static struct irq_chip uv_irq_chip = { - .name = "UV-CORE", - .irq_mask = uv_noop, - .irq_unmask = uv_noop, - .irq_eoi = uv_ack_apic, - .irq_set_affinity = uv_set_irq_affinity, -}; - -/* - * Add offset and pnode information of the hub sourcing interrupts to the - * rb tree for a specific irq. - */ -static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) -{ - struct rb_node **link = &uv_irq_root.rb_node; - struct rb_node *parent = NULL; - struct uv_irq_2_mmr_pnode *n; - struct uv_irq_2_mmr_pnode *e; - unsigned long irqflags; - - n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, - uv_blade_to_memory_nid(blade)); - if (!n) - return -ENOMEM; - - n->irq = irq; - n->offset = offset; - n->pnode = uv_blade_to_pnode(blade); - spin_lock_irqsave(&uv_irq_lock, irqflags); - /* Find the right place in the rbtree: */ - while (*link) { - parent = *link; - e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); - - if (unlikely(irq == e->irq)) { - /* irq entry exists */ - e->pnode = uv_blade_to_pnode(blade); - e->offset = offset; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - kfree(n); - return 0; - } - - if (irq < e->irq) - link = &(*link)->rb_left; - else - link = &(*link)->rb_right; - } - - /* Insert the node into the rbtree. */ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &uv_irq_root); - - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; -} - -/* Retrieve offset and pnode information from the rb tree for a specific irq */ -int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) -{ - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - - if (e->irq == irq) { - *offset = e->offset; - *pnode = e->pnode; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; - } - - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return -1; -} - -/* - * Re-target the irq to the specified CPU and enable the specified MMR located - * on the specified blade to allow the sending of MSIs to the specified CPU. - */ -static int -arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) -{ - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg = get_irq_chip_data(irq); - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode, err; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - if (limit == UV_AFFINITY_CPU) - irq_set_status_flags(irq, IRQ_NO_BALANCING); - else - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; -} - -/* - * Disable the specified MMR located on the specified blade so that MSIs are - * longer allowed to be sent. - */ -static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} - -static int -uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest; - unsigned long mmr_value, mmr_offset; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - if (__ioapic_set_affinity(data, mask, &dest)) - return -1; - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) - return -1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return 0; -} - -/* - * Set up a mapping of an available irq and vector, and enable the specified - * MMR that defines the MSI that is to be sent to the specified CPU when an - * interrupt is raised. - */ -int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) -{ - int irq, ret; - - irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - - if (irq <= 0) - return -EBUSY; - - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, - limit); - if (ret == irq) - uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); - else - destroy_irq(irq); - - return ret; -} -EXPORT_SYMBOL_GPL(uv_setup_irq); - -/* - * Tear down a mapping of an irq and vector, and disable the specified MMR that - * defined the MSI that was to be sent to the specified CPU when an interrupt - * was raised. - * - * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). - */ -void uv_teardown_irq(unsigned int irq) -{ - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - if (e->irq == irq) { - arch_disable_uv_irq(e->pnode, e->offset); - rb_erase(n, &uv_irq_root); - kfree(e); - break; - } - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - destroy_irq(irq); -} -EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c deleted file mode 100644 index 309c70fb7759..000000000000 --- a/arch/x86/kernel/uv_sysfs.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson - */ - -#include <linux/sysdev.h> -#include <asm/uv/bios.h> -#include <asm/uv/uv.h> - -struct kobject *sgi_uv_kobj; - -static ssize_t partition_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); -} - -static ssize_t coherence_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); -} - -static struct kobj_attribute partition_id_attr = - __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); - -static struct kobj_attribute coherence_id_attr = - __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); - - -static int __init sgi_uv_sysfs_init(void) -{ - unsigned long ret; - - if (!is_uv_system()) - return -ENODEV; - - if (!sgi_uv_kobj) - sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); - if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); - return -EINVAL; - } - - ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); - return ret; - } - - ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); - return ret; - } - - return 0; -} - -device_initcall(sgi_uv_sysfs_init); diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c deleted file mode 100644 index 56e421bc379b..000000000000 --- a/arch/x86/kernel/uv_time.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * SGI RTC clock/timer routines. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Dimitri Sivanich - */ -#include <linux/clockchips.h> -#include <linux/slab.h> - -#include <asm/uv/uv_mmrs.h> -#include <asm/uv/uv_hub.h> -#include <asm/uv/bios.h> -#include <asm/uv/uv.h> -#include <asm/apic.h> -#include <asm/cpu.h> - -#define RTC_NAME "sgi_rtc" - -static cycle_t uv_read_rtc(struct clocksource *cs); -static int uv_rtc_next_event(unsigned long, struct clock_event_device *); -static void uv_rtc_timer_setup(enum clock_event_mode, - struct clock_event_device *); - -static struct clocksource clocksource_uv = { - .name = RTC_NAME, - .rating = 400, - .read = uv_read_rtc, - .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, - .shift = 10, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -static struct clock_event_device clock_event_device_uv = { - .name = RTC_NAME, - .features = CLOCK_EVT_FEAT_ONESHOT, - .shift = 20, - .rating = 400, - .irq = -1, - .set_next_event = uv_rtc_next_event, - .set_mode = uv_rtc_timer_setup, - .event_handler = NULL, -}; - -static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); - -/* There is one of these allocated per node */ -struct uv_rtc_timer_head { - spinlock_t lock; - /* next cpu waiting for timer, local node relative: */ - int next_cpu; - /* number of cpus on this node: */ - int ncpus; - struct { - int lcpu; /* systemwide logical cpu number */ - u64 expires; /* next timer expiration for this cpu */ - } cpu[1]; -}; - -/* - * Access to uv_rtc_timer_head via blade id. - */ -static struct uv_rtc_timer_head **blade_info __read_mostly; - -static int uv_rtc_evt_enable; - -/* - * Hardware interface routines - */ - -/* Send IPIs to another node */ -static void uv_rtc_send_IPI(int cpu) -{ - unsigned long apicid, val; - int pnode; - - apicid = cpu_physical_id(cpu); - pnode = uv_apicid_to_pnode(apicid); - val = (1UL << UVH_IPI_INT_SEND_SHFT) | - (apicid << UVH_IPI_INT_APIC_ID_SHFT) | - (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); - - uv_write_global_mmr64(pnode, UVH_IPI_INT, val); -} - -/* Check for an RTC interrupt pending */ -static int uv_intr_pending(int pnode) -{ - return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & - UVH_EVENT_OCCURRED0_RTC1_MASK; -} - -/* Setup interrupt and return non-zero if early expiration occurred. */ -static int uv_setup_intr(int cpu, u64 expires) -{ - u64 val; - int pnode = uv_cpu_to_pnode(cpu); - - uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, - UVH_RTC1_INT_CONFIG_M_MASK); - uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); - - uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, - UVH_EVENT_OCCURRED0_RTC1_MASK); - - val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | - ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); - - /* Set configuration */ - uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); - /* Initialize comparator value */ - uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); - - if (uv_read_rtc(NULL) <= expires) - return 0; - - return !uv_intr_pending(pnode); -} - -/* - * Per-cpu timer tracking routines - */ - -static __init void uv_rtc_deallocate_timers(void) -{ - int bid; - - for_each_possible_blade(bid) { - kfree(blade_info[bid]); - } - kfree(blade_info); -} - -/* Allocate per-node list of cpu timer expiration times. */ -static __init int uv_rtc_allocate_timers(void) -{ - int cpu; - - blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); - if (!blade_info) - return -ENOMEM; - memset(blade_info, 0, uv_possible_blades * sizeof(void *)); - - for_each_present_cpu(cpu) { - int nid = cpu_to_node(cpu); - int bid = uv_cpu_to_blade_id(cpu); - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; - struct uv_rtc_timer_head *head = blade_info[bid]; - - if (!head) { - head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + - (uv_blade_nr_possible_cpus(bid) * - 2 * sizeof(u64)), - GFP_KERNEL, nid); - if (!head) { - uv_rtc_deallocate_timers(); - return -ENOMEM; - } - spin_lock_init(&head->lock); - head->ncpus = uv_blade_nr_possible_cpus(bid); - head->next_cpu = -1; - blade_info[bid] = head; - } - - head->cpu[bcpu].lcpu = cpu; - head->cpu[bcpu].expires = ULLONG_MAX; - } - - return 0; -} - -/* Find and set the next expiring timer. */ -static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) -{ - u64 lowest = ULLONG_MAX; - int c, bcpu = -1; - - head->next_cpu = -1; - for (c = 0; c < head->ncpus; c++) { - u64 exp = head->cpu[c].expires; - if (exp < lowest) { - bcpu = c; - lowest = exp; - } - } - if (bcpu >= 0) { - head->next_cpu = bcpu; - c = head->cpu[bcpu].lcpu; - if (uv_setup_intr(c, lowest)) - /* If we didn't set it up in time, trigger */ - uv_rtc_send_IPI(c); - } else { - uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, - UVH_RTC1_INT_CONFIG_M_MASK); - } -} - -/* - * Set expiration time for current cpu. - * - * Returns 1 if we missed the expiration time. - */ -static int uv_rtc_set_timer(int cpu, u64 expires) -{ - int pnode = uv_cpu_to_pnode(cpu); - int bid = uv_cpu_to_blade_id(cpu); - struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; - u64 *t = &head->cpu[bcpu].expires; - unsigned long flags; - int next_cpu; - - spin_lock_irqsave(&head->lock, flags); - - next_cpu = head->next_cpu; - *t = expires; - - /* Will this one be next to go off? */ - if (next_cpu < 0 || bcpu == next_cpu || - expires < head->cpu[next_cpu].expires) { - head->next_cpu = bcpu; - if (uv_setup_intr(cpu, expires)) { - *t = ULLONG_MAX; - uv_rtc_find_next_timer(head, pnode); - spin_unlock_irqrestore(&head->lock, flags); - return -ETIME; - } - } - - spin_unlock_irqrestore(&head->lock, flags); - return 0; -} - -/* - * Unset expiration time for current cpu. - * - * Returns 1 if this timer was pending. - */ -static int uv_rtc_unset_timer(int cpu, int force) -{ - int pnode = uv_cpu_to_pnode(cpu); - int bid = uv_cpu_to_blade_id(cpu); - struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; - u64 *t = &head->cpu[bcpu].expires; - unsigned long flags; - int rc = 0; - - spin_lock_irqsave(&head->lock, flags); - - if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) - rc = 1; - - if (rc) { - *t = ULLONG_MAX; - /* Was the hardware setup for this timer? */ - if (head->next_cpu == bcpu) - uv_rtc_find_next_timer(head, pnode); - } - - spin_unlock_irqrestore(&head->lock, flags); - - return rc; -} - - -/* - * Kernel interface routines. - */ - -/* - * Read the RTC. - * - * Starting with HUB rev 2.0, the UV RTC register is replicated across all - * cachelines of it's own page. This allows faster simultaneous reads - * from a given socket. - */ -static cycle_t uv_read_rtc(struct clocksource *cs) -{ - unsigned long offset; - - if (uv_get_min_hub_revision_id() == 1) - offset = 0; - else - offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; - - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); -} - -/* - * Program the next event, relative to now - */ -static int uv_rtc_next_event(unsigned long delta, - struct clock_event_device *ced) -{ - int ced_cpu = cpumask_first(ced->cpumask); - - return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL)); -} - -/* - * Setup the RTC timer in oneshot mode - */ -static void uv_rtc_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - int ced_cpu = cpumask_first(evt->cpumask); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here yet */ - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu, 1); - break; - } -} - -static void uv_rtc_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *ced = &per_cpu(cpu_ced, cpu); - - if (!ced || !ced->event_handler) - return; - - if (uv_rtc_unset_timer(cpu, 0) != 1) - return; - - ced->event_handler(ced); -} - -static int __init uv_enable_evt_rtc(char *str) -{ - uv_rtc_evt_enable = 1; - - return 1; -} -__setup("uvrtcevt", uv_enable_evt_rtc); - -static __init void uv_rtc_register_clockevents(struct work_struct *dummy) -{ - struct clock_event_device *ced = &__get_cpu_var(cpu_ced); - - *ced = clock_event_device_uv; - ced->cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(ced); -} - -static __init int uv_rtc_setup_clock(void) -{ - int rc; - - if (!is_uv_system()) - return -ENODEV; - - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_uv.shift); - - /* If single blade, prefer tsc */ - if (uv_num_possible_blades() == 1) - clocksource_uv.rating = 250; - - rc = clocksource_register(&clocksource_uv); - if (rc) - printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); - else - printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", - sn_rtc_cycles_per_second/(unsigned long)1E6); - - if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback) - return rc; - - /* Setup and register clockevents */ - rc = uv_rtc_allocate_timers(); - if (rc) - goto error; - - x86_platform_ipi_callback = uv_rtc_interrupt; - - clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, - NSEC_PER_SEC, clock_event_device_uv.shift); - - clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / - sn_rtc_cycles_per_second; - - clock_event_device_uv.max_delta_ns = clocksource_uv.mask * - (NSEC_PER_SEC / sn_rtc_cycles_per_second); - - rc = schedule_on_each_cpu(uv_rtc_register_clockevents); - if (rc) { - x86_platform_ipi_callback = NULL; - uv_rtc_deallocate_timers(); - goto error; - } - - printk(KERN_INFO "UV RTC clockevents registered\n"); - - return 0; - -error: - clocksource_unregister(&clocksource_uv); - printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc); - - return rc; -} -arch_initcall(uv_rtc_setup_clock); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c deleted file mode 100644 index 3371bd053b89..000000000000 --- a/arch/x86/kernel/visws_quirks.c +++ /dev/null @@ -1,614 +0,0 @@ -/* - * SGI Visual Workstation support and quirks, unmaintained. - * - * Split out from setup.c by davej@suse.de - * - * Copyright (C) 1999 Bent Hagemark, Ingo Molnar - * - * SGI Visual Workstation interrupt controller - * - * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC - * which serves as the main interrupt controller in the system. Non-legacy - * hardware in the system uses this controller directly. Legacy devices - * are connected to the PIIX4 which in turn has its 8259(s) connected to - * a of the Cobalt APIC entry. - * - * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com - * - * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru> - */ -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/visws/cobalt.h> -#include <asm/visws/piix4.h> -#include <asm/io_apic.h> -#include <asm/fixmap.h> -#include <asm/reboot.h> -#include <asm/setup.h> -#include <asm/apic.h> -#include <asm/e820.h> -#include <asm/time.h> -#include <asm/io.h> - -#include <linux/kernel_stat.h> - -#include <asm/i8259.h> -#include <asm/irq_vectors.h> -#include <asm/visws/lithium.h> - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/pci_ids.h> - -extern int no_broadcast; - -char visws_board_type = -1; -char visws_board_rev = -1; - -static void __init visws_time_init(void) -{ - printk(KERN_INFO "Starting Cobalt Timer system clock\n"); - - /* Set the countdown value */ - co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); - - /* Start the timer */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); - - /* Enable (unmask) the timer interrupt */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - - setup_default_timer_irq(); -} - -/* Replaces the default init_ISA_irqs in the generic setup */ -static void __init visws_pre_intr_init(void); - -/* Quirk for machine specific memory setup. */ - -#define MB (1024 * 1024) - -unsigned long sgivwfb_mem_phys; -unsigned long sgivwfb_mem_size; -EXPORT_SYMBOL(sgivwfb_mem_phys); -EXPORT_SYMBOL(sgivwfb_mem_size); - -long long mem_size __initdata = 0; - -static char * __init visws_memory_setup(void) -{ - long long gfx_mem_size = 8 * MB; - - mem_size = boot_params.alt_mem_k; - - if (!mem_size) { - printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); - mem_size = 128 * MB; - } - - /* - * this hardcodes the graphics memory to 8 MB - * it really should be sized dynamically (or at least - * set as a boot param) - */ - if (!sgivwfb_mem_size) { - printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n"); - sgivwfb_mem_size = 8 * MB; - } - - /* - * Trim to nearest MB - */ - sgivwfb_mem_size &= ~((1 << 20) - 1); - sgivwfb_mem_phys = mem_size - gfx_mem_size; - - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); - e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); - - return "PROM"; -} - -static void visws_machine_emergency_restart(void) -{ - /* - * Visual Workstations restart after this - * register is poked on the PIIX4 - */ - outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); -} - -static void visws_machine_power_off(void) -{ - unsigned short pm_status; -/* extern unsigned int pci_bus0; */ - - while ((pm_status = inw(PMSTS_PORT)) & 0x100) - outw(pm_status, PMSTS_PORT); - - outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); - - mdelay(10); - -#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ - (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) - -/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */ - outl(PIIX_SPECIAL_STOP, 0xCFC); -} - -static void __init visws_get_smp_config(unsigned int early) -{ -} - -/* - * The Visual Workstation is Intel MP compliant in the hardware - * sense, but it doesn't have a BIOS(-configuration table). - * No problem for Linux. - */ - -static void __init MP_processor_info(struct mpc_cpu *m) -{ - int ver, logical_apicid; - physid_mask_t apic_cpus; - - if (!(m->cpuflag & CPU_ENABLED)) - return; - - logical_apicid = m->apicid; - printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", - m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", - m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, - (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver); - - if (m->cpuflag & CPU_BOOTPROCESSOR) - boot_cpu_physical_apicid = m->apicid; - - ver = m->apicver; - if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->apicid, MAX_APICS); - return; - } - - apic->apicid_to_cpu_present(m->apicid, &apic_cpus); - physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - m->apicid); - ver = 0x10; - } - apic_version[m->apicid] = ver; -} - -static void __init visws_find_smp_config(void) -{ - struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); - unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); - - if (ncpus > CO_CPU_MAX) { - printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", - ncpus, mp); - - ncpus = CO_CPU_MAX; - } - - if (ncpus > setup_max_cpus) - ncpus = setup_max_cpus; - -#ifdef CONFIG_X86_LOCAL_APIC - smp_found_config = 1; -#endif - while (ncpus--) - MP_processor_info(mp++); - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; -} - -static void visws_trap_init(void); - -void __init visws_early_detect(void) -{ - int raw; - - visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) - >> PIIX_GPI_BD_SHIFT; - - if (visws_board_type < 0) - return; - - /* - * Override the default platform setup functions - */ - x86_init.resources.memory_setup = visws_memory_setup; - x86_init.mpparse.get_smp_config = visws_get_smp_config; - x86_init.mpparse.find_smp_config = visws_find_smp_config; - x86_init.irqs.pre_vector_init = visws_pre_intr_init; - x86_init.irqs.trap_init = visws_trap_init; - x86_init.timers.timer_init = visws_time_init; - x86_init.pci.init = pci_visws_init; - x86_init.pci.init_irq = x86_init_noop; - - /* - * Install reboot quirks: - */ - pm_power_off = visws_machine_power_off; - machine_ops.emergency_restart = visws_machine_emergency_restart; - - /* - * Do not use broadcast IPIs: - */ - no_broadcast = 0; - -#ifdef CONFIG_X86_IO_APIC - /* - * Turn off IO-APIC detection and initialization: - */ - skip_ioapic_setup = 1; -#endif - - /* - * Get Board rev. - * First, we have to initialize the 307 part to allow us access - * to the GPIO registers. Let's map them at 0x0fc0 which is right - * after the PIIX4 PM section. - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable GPIO registers. */ - - /* - * Now, we have to map the power management section to write - * a bit which enables access to the GPIO registers. - * What lunatic came up with this shit? - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable PM registers. */ - - /* - * Now, write the PM register which enables the GPIO registers. - */ - outb_p(SIO_PM_FER2, SIO_PM_INDEX); - outb_p(SIO_PM_GP_EN, SIO_PM_DATA); - - /* - * Now, initialize the GPIO registers. - * We want them all to be inputs which is the - * power on default, so let's leave them alone. - * So, let's just read the board rev! - */ - raw = inb_p(SIO_GP_DATA1); - raw &= 0x7f; /* 7 bits of valid board revision ID. */ - - if (visws_board_type == VISWS_320) { - if (raw < 0x6) { - visws_board_rev = 4; - } else if (raw < 0xc) { - visws_board_rev = 5; - } else { - visws_board_rev = 6; - } - } else if (visws_board_type == VISWS_540) { - visws_board_rev = 2; - } else { - visws_board_rev = raw; - } - - printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", - (visws_board_type == VISWS_320 ? "320" : - (visws_board_type == VISWS_540 ? "540" : - "unknown")), visws_board_rev); -} - -#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) -#define BCD (LI_INTB | LI_INTC | LI_INTD) -#define ALLDEVS (A01234 | BCD) - -static __init void lithium_init(void) -{ - set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); - set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); - - if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); -/* panic("This machine is not SGI Visual Workstation 320/540"); */ - } - - if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); -/* panic("This machine is not SGI Visual Workstation 320/540"); */ - } - - li_pcia_write16(LI_PCI_INTEN, ALLDEVS); - li_pcib_write16(LI_PCI_INTEN, ALLDEVS); -} - -static __init void cobalt_init(void) -{ - /* - * On normal SMP PC this is used only with SMP, but we have to - * use it and set it up here to start the Cobalt clock - */ - set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); - setup_local_APIC(); - printk(KERN_INFO "Local APIC Version %#x, ID %#x\n", - (unsigned int)apic_read(APIC_LVR), - (unsigned int)apic_read(APIC_ID)); - - set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); - set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); - printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", - co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); - - /* Enable Cobalt APIC being careful to NOT change the ID! */ - co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); - - printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", - co_apic_read(CO_APIC_ID)); -} - -static void __init visws_trap_init(void) -{ - lithium_init(); - cobalt_init(); -} - -/* - * IRQ controller / APIC support: - */ - -static DEFINE_SPINLOCK(cobalt_lock); - -/* - * Set the given Cobalt APIC Redirection Table entry to point - * to the given IDT vector/index. - */ -static inline void co_apic_set(int entry, int irq) -{ - co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); - co_apic_write(CO_APIC_HI(entry), 0); -} - -/* - * Cobalt (IO)-APIC functions to handle PCI devices. - */ -static inline int co_apic_ide0_hack(void) -{ - extern char visws_board_type; - extern char visws_board_rev; - - if (visws_board_type == VISWS_320 && visws_board_rev == 5) - return 5; - return CO_APIC_IDE0; -} - -static int is_co_apic(unsigned int irq) -{ - if (IS_CO_APIC(irq)) - return CO_APIC(irq); - - switch (irq) { - case 0: return CO_APIC_CPU; - case CO_IRQ_IDE0: return co_apic_ide0_hack(); - case CO_IRQ_IDE1: return CO_APIC_IDE1; - default: return -1; - } -} - - -/* - * This is the SGI Cobalt (IO-)APIC: - */ -static void enable_cobalt_irq(struct irq_data *data) -{ - co_apic_set(is_co_apic(data->irq), data->irq); -} - -static void disable_cobalt_irq(struct irq_data *data) -{ - int entry = is_co_apic(data->irq); - - co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); - co_apic_read(CO_APIC_LO(entry)); -} - -static void ack_cobalt_irq(struct irq_data *data) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - disable_cobalt_irq(data); - apic_write(APIC_EOI, APIC_EIO_ACK); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static struct irq_chip cobalt_irq_type = { - .name = "Cobalt-APIC", - .irq_enable = enable_cobalt_irq, - .irq_disable = disable_cobalt_irq, - .irq_ack = ack_cobalt_irq, -}; - - -/* - * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt - * -- not the manner expected by the code in i8259.c. - * - * there is a 'master' physical interrupt source that gets sent to - * the CPU. But in the chipset there are various 'virtual' interrupts - * waiting to be handled. We represent this to Linux through a 'master' - * interrupt controller type, and through a special virtual interrupt- - * controller. Device drivers only see the virtual interrupt sources. - */ -static unsigned int startup_piix4_master_irq(struct irq_data *data) -{ - legacy_pic->init(0); - enable_cobalt_irq(data); -} - -static void end_piix4_master_irq(struct irq_data *data) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - enable_cobalt_irq(data); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static struct irq_chip piix4_master_irq_type = { - .name = "PIIX4-master", - .irq_startup = startup_piix4_master_irq, - .irq_ack = ack_cobalt_irq, -}; - -static void pii4_mask(struct irq_data *data) { } - -static struct irq_chip piix4_virtual_irq_type = { - .name = "PIIX4-virtual", - .mask = pii4_mask, -}; - -/* - * PIIX4-8259 master/virtual functions to handle interrupt requests - * from legacy devices: floppy, parallel, serial, rtc. - * - * None of these get Cobalt APIC entries, neither do they have IDT - * entries. These interrupts are purely virtual and distributed from - * the 'master' interrupt source: CO_IRQ_8259. - * - * When the 8259 interrupts its handler figures out which of these - * devices is interrupting and dispatches to its handler. - * - * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/ - * enable_irq gets the right irq. This 'master' irq is never directly - * manipulated by any driver. - */ -static irqreturn_t piix4_master_intr(int irq, void *dev_id) -{ - unsigned long flags; - int realirq; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - - /* Find out what's interrupting in the PIIX4 master 8259 */ - outb(0x0c, 0x20); /* OCW3 Poll command */ - realirq = inb(0x20); - - /* - * Bit 7 == 0 means invalid/spurious - */ - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq &= 7; - - if (unlikely(realirq == 2)) { - outb(0x0c, 0xa0); - realirq = inb(0xa0); - - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq = (realirq & 7) + 8; - } - - /* mask and ack interrupt */ - cached_irq_mask |= 1 << realirq; - if (unlikely(realirq > 7)) { - inb(0xa1); - outb(cached_slave_mask, 0xa1); - outb(0x60 + (realirq & 7), 0xa0); - outb(0x60 + 2, 0x20); - } else { - inb(0x21); - outb(cached_master_mask, 0x21); - outb(0x60 + realirq, 0x20); - } - - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - - /* - * handle this 'virtual interrupt' as a Cobalt one now. - */ - generic_handle_irq(realirq); - - return IRQ_HANDLED; - -out_unlock: - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return IRQ_NONE; -} - -static struct irqaction master_action = { - .handler = piix4_master_intr, - .name = "PIIX4-8259", -}; - -static struct irqaction cascade_action = { - .handler = no_action, - .name = "cascade", -}; - -static inline void set_piix4_virtual_irq_type(void) -{ - piix4_virtual_irq_type.enable = i8259A_chip.unmask; - piix4_virtual_irq_type.disable = i8259A_chip.mask; - piix4_virtual_irq_type.unmask = i8259A_chip.unmask; -} - -static void __init visws_pre_intr_init(void) -{ - int i; - - set_piix4_virtual_irq_type(); - - for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - struct irq_chip *chip = NULL; - - if (i == 0) - chip = &cobalt_irq_type; - else if (i == CO_IRQ_IDE0) - chip = &cobalt_irq_type; - else if (i == CO_IRQ_IDE1) - >chip = &cobalt_irq_type; - else if (i == CO_IRQ_8259) - chip = &piix4_master_irq_type; - else if (i < CO_IRQ_APIC0) - chip = &piix4_virtual_irq_type; - else if (IS_CO_APIC(i)) - chip = &cobalt_irq_type; - - if (chip) - set_irq_chip(i, chip); - } - - setup_irq(CO_IRQ_8259, &master_action); - setup_irq(2, &cascade_action); -} diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5ffb5622f793..61fb98519622 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -551,8 +551,14 @@ cannot_handle: int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) { if (VMPI.is_vm86pus) { - if ((trapno == 3) || (trapno == 1)) - return_to_32bit(regs, VM86_TRAP + (trapno << 8)); + if ((trapno == 3) || (trapno == 1)) { + KVM86->regs32->ax = VM86_TRAP + (trapno << 8); + /* setting this flag forces the code in entry_32.S to + call save_v86_state() and change the stack pointer + to KVM86->regs32 */ + set_thread_flag(TIF_IRET); + return 0; + } do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); return 0; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 38e2b67807e1..e03530aebfd0 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -301,7 +301,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(PAGE_SIZE) + PERCPU(THREAD_SIZE) #endif . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index cd6da6bf3eca..ceb2911aa439 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,10 +6,12 @@ #include <linux/init.h> #include <linux/ioport.h> #include <linux/module.h> +#include <linux/pci.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> #include <asm/pci_x86.h> +#include <asm/pci.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> @@ -99,3 +101,8 @@ struct x86_platform_ops x86_platform = { }; EXPORT_SYMBOL_GPL(x86_platform); +struct x86_msi_ops x86_msi = { + .setup_msi_irqs = native_setup_msi_irqs, + .teardown_msi_irq = native_teardown_msi_irq, + .teardown_msi_irqs = default_teardown_msi_irqs, +}; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 9c253bd65e24..547128546cc3 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -394,7 +394,8 @@ static void __init setup_xstate_init(void) * Setup init_xstate_buf to represent the init state of * all the features managed by the xsave */ - init_xstate_buf = alloc_bootmem(xstate_size); + init_xstate_buf = alloc_bootmem_align(xstate_size, + __alignof__(struct xsave_struct)); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; clts(); |