From 6fd592daae2182adc47f405e20d07f34f52d07dd Mon Sep 17 00:00:00 2001 From: "Carlos R. Mafra" Date: Mon, 5 May 2008 20:11:22 -0300 Subject: x86: clean up computation of HPET .mult variables While reading through the HPET code I realized that the computation of .mult variables could be done with less lines of code, resulting in a 1.6% text size saving for hpet.o So I propose the following patch, which applies against today's Linus -git tree. >From 0c6507e400e9ca5f7f14331e18f8c12baf75a9d3 Mon Sep 17 00:00:00 2001 From: Carlos R. Mafra Date: Mon, 5 May 2008 19:38:53 -0300 The computation of clocksource_hpet.mult tmp = (u64)hpet_period << HPET_SHIFT; do_div(tmp, FSEC_PER_NSEC); clocksource_hpet.mult = (u32)tmp; can be streamlined if we note that it is equal to clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); Furthermore, the computation of hpet_clockevent.mult uint64_t hpet_freq; hpet_freq = 1000000000000000ULL; do_div(hpet_freq, hpet_period); hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, NSEC_PER_SEC, hpet_clockevent.shift); can also be streamlined with the observation that hpet_period and hpet_freq are inverse to each other (in proper units). So instead of computing hpet_freq and using (schematically) div_sc(hpet_freq, 10^9, shift) we use the trick of calling with the arguments in reverse order, div_sc(10^6, hpet_period, shift). The different power of ten is due to frequency being in Hertz (1/sec) and the period being in units of femtosecond. Explicitly, mult = (hpet_freq * 2^shift)/10^9 (before) mult = (10^6 * 2^shift)/hpet_period (after) because hpet_freq = 10^15/hpet_period. The comments in the code are also updated to reflect the changes. As a result, text data bss dec hex filename 2957 425 92 3474 d92 arch/x86/kernel/hpet.o 3006 425 92 3523 dc3 arch/x86/kernel/hpet.o.old a 1.6% reduction in text size. Signed-off-by: Carlos R. Mafra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 9b5cfcdfc426..ea230ec69057 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -17,7 +17,7 @@ /* FSEC = 10^-15 NSEC = 10^-9 */ -#define FSEC_PER_NSEC 1000000 +#define FSEC_PER_NSEC 1000000L /* * HPET address is set in acpi/boot.c, when an ACPI entry exists @@ -206,20 +206,19 @@ static void hpet_enable_legacy_int(void) static void hpet_legacy_clockevent_register(void) { - uint64_t hpet_freq; - /* Start HPET legacy interrupts */ hpet_enable_legacy_int(); /* - * The period is a femto seconds value. We need to calculate the - * scaled math multiplication factor for nanosecond to hpet tick - * conversion. + * The mult factor is defined as (include/linux/clockchips.h) + * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h) + * hpet_period is in units of femtoseconds (per cycle), so + * mult/2^shift = cyc/ns = 10^6/hpet_period + * mult = (10^6 * 2^shift)/hpet_period + * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period */ - hpet_freq = 1000000000000000ULL; - do_div(hpet_freq, hpet_period); - hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, - NSEC_PER_SEC, hpet_clockevent.shift); + hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC, + hpet_period, hpet_clockevent.shift); /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); @@ -324,7 +323,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { - u64 tmp, start, now; + u64 start, now; cycle_t t1; /* Start the counter */ @@ -351,21 +350,15 @@ static int hpet_clocksource_register(void) return -ENODEV; } - /* Initialize and register HPET clocksource - * - * hpet period is in femto seconds per cycle - * so we need to convert this to ns/cyc units - * approximated by mult/2^shift - * - * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift - * fsec/cyc * 1ns/1000000fsec * 2^shift = mult - * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult - * (fsec/cyc << shift)/1000000 = mult - * (hpet_period << shift)/FSEC_PER_NSEC = mult + /* + * The definition of mult is (include/linux/clocksource.h) + * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc + * so we first need to convert hpet_period to ns/cyc units: + * mult/2^shift = ns/cyc = hpet_period/10^6 + * mult = (hpet_period * 2^shift)/10^6 + * mult = (hpet_period << shift)/FSEC_PER_NSEC */ - tmp = (u64)hpet_period << HPET_SHIFT; - do_div(tmp, FSEC_PER_NSEC); - clocksource_hpet.mult = (u32)tmp; + clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); clocksource_register(&clocksource_hpet); -- cgit v1.2.3 From e8aa4667baf74dfd85fbaab86861465acb811085 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 9 May 2008 11:49:11 +0200 Subject: x86: enable hpet=force for AMD SB400 Add quirk to allow forced usage of HPET on ATI SB400. I stumbled over machines where HPET is enabled but not reported by BIOS. This patch configures the HPET base address and makes it known to the OS. Signed-off-by: Andreas Herrmann Signed-off-by: Thomas Gleixner --- arch/x86/kernel/quirks.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d89a648fe710..5fe6bd5cc4c7 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -65,6 +65,7 @@ static enum { ICH_FORCE_HPET_RESUME, VT8237_FORCE_HPET_RESUME, NVIDIA_FORCE_HPET_RESUME, + ATI_FORCE_HPET_RESUME, } force_hpet_resume_type; static void __iomem *rcba_base; @@ -330,6 +331,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, vt8237_force_enable_hpet); +static void ati_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x14, 0xfed00000); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static void ati_force_enable_hpet(struct pci_dev *dev) +{ + u32 uninitialized_var(val); + + if (!hpet_force_user || hpet_address || force_hpet_address) + return; + + pci_write_config_dword(dev, 0x14, 0xfed00000); + pci_read_config_dword(dev, 0x14, &val); + force_hpet_address = val; + force_hpet_resume_type = ATI_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; + return; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + ati_force_enable_hpet); + /* * Undocumented chipset feature taken from LinuxBIOS. */ @@ -397,6 +423,9 @@ void force_hpet_resume(void) case NVIDIA_FORCE_HPET_RESUME: nvidia_force_hpet_resume(); return; + case ATI_FORCE_HPET_RESUME: + ati_force_hpet_resume(); + return; default: break; } -- cgit v1.2.3 From 7c4728f4a865067d96fb84f1d9c65e0ccd1f355d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 10 May 2008 21:42:14 +0200 Subject: x86: print info about available HPET quirk We have a lot of HPET quirks available which might force enable HPET even when the BIOS does not enable it. Some of those quirks depend on the command line option "hpet=force". Andrew pointed out that hoping that the user will find out about this boot option is not really helpful. Emit a kernel info which informs the user about the "hpet=force" boot option when we enter a quirk which depends on this option and the user did not provide it. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/quirks.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 5fe6bd5cc4c7..ddbf34d16a01 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -175,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, static struct pci_dev *cached_dev; +static void hpet_print_force_info(void) +{ + printk(KERN_INFO "HPET not enabled in BIOS. " + "You might try hpet=force boot option\n"); +} + static void old_ich_force_hpet_resume(void) { u32 val; @@ -254,6 +260,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev) { if (hpet_force_user) old_ich_force_enable_hpet(dev); + else + hpet_print_force_info(); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, @@ -291,8 +299,13 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev) { u32 uninitialized_var(val); - if (!hpet_force_user || hpet_address || force_hpet_address) + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); return; + } pci_read_config_dword(dev, 0x68, &val); /* @@ -341,8 +354,13 @@ static void ati_force_enable_hpet(struct pci_dev *dev) { u32 uninitialized_var(val); - if (!hpet_force_user || hpet_address || force_hpet_address) + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); return; + } pci_write_config_dword(dev, 0x14, 0xfed00000); pci_read_config_dword(dev, 0x14, &val); @@ -369,8 +387,13 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev) { u32 uninitialized_var(val); - if (!hpet_force_user || hpet_address || force_hpet_address) + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); return; + } pci_write_config_dword(dev, 0x44, 0xfed00001); pci_read_config_dword(dev, 0x44, &val); -- cgit v1.2.3 From 8edc5cc5ec880c96de8e6686fb0d7a5231e91c05 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Mon, 12 May 2008 15:43:34 +0200 Subject: x86: remove 6 bank limitation in 64 bit MCE reporting code Eliminate the 6 bank restriction in 64 bit mce reporting code. This restriction is artificial (due to static creation of sysfs files) and 32 bit code does not have any such restriction. This change helps in reporting the details of machine checks on a machine check exception with errors in bank 6 and above on CPUs that support those banks. Without the patch, machine check errors in those banks are not reported. We still have 128 (MCE_EXTENDED_BANK) bank restriction instead of max 256 supported in hardware. That is not changed in the patch below as it will have some user level mcelog utility dependency, with bank 128 being used for thermal reporting currently. The patch below does not create sysfs control (bankNctl) for banks higher than 6 as well. That needs some pre-cleanup in /sysfs mce layout, removal of per cpu /sysfs entries for bankctl as they are really global system level control today. That change will follow. This basic change is critical to report the detailed errors on banks higher than 6. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e07e8c068ae0..f1f3f5e163b7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -31,7 +31,7 @@ #include #define MISC_MCELOG_MINOR 227 -#define NR_BANKS 6 +#define NR_SYSFS_BANKS 6 atomic_t mce_entry; @@ -46,7 +46,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; +static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -209,7 +209,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (!bank[i]) + if (i < NR_SYSFS_BANKS && !bank[i]) continue; m.misc = 0; @@ -444,9 +444,10 @@ static void mce_init(void *dummy) rdmsrl(MSR_IA32_MCG_CAP, cap); banks = cap & 0xff; - if (banks > NR_BANKS) { - printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); - banks = NR_BANKS; + if (banks > MCE_EXTENDED_BANK) { + printk(KERN_INFO "MCE: warning: using only %d banks\n", + MCE_EXTENDED_BANK); + banks = MCE_EXTENDED_BANK; } /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) @@ -462,7 +463,7 @@ static void mce_init(void *dummy) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -766,7 +767,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce); } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* TBD should generate these dynamically based on number of available banks */ +/* + * TBD should generate these dynamically based on number of available banks. + * Have only 6 contol banks in /sysfs until then. + */ ACCESSOR(bank0ctl,bank[0],mce_restart()) ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) -- cgit v1.2.3 From 9b7dc567d03d74a1fbae84e88949b6a60d922d82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 May 2008 20:10:09 +0200 Subject: x86: unify interrupt vector defines The interrupt vector defines are copied 4 times around with minimal differences. Move them all into asm-x86/irq_vectors.h Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/vmiclock_32.c | 3 +- arch/x86/mach-visws/visws_apic.c | 3 +- include/asm-x86/hw_irq.h | 12 +- include/asm-x86/hw_irq_64.h | 71 --------- include/asm-x86/irq_32.h | 2 +- include/asm-x86/irq_64.h | 29 +--- include/asm-x86/irq_vectors.h | 168 ++++++++++++++++++++++ include/asm-x86/mach-default/irq_vectors.h | 96 ------------- include/asm-x86/mach-default/irq_vectors_limits.h | 16 --- include/asm-x86/mach-visws/irq_vectors.h | 62 -------- include/asm-x86/mach-voyager/irq_vectors.h | 79 ---------- 12 files changed, 184 insertions(+), 359 deletions(-) create mode 100644 include/asm-x86/irq_vectors.h delete mode 100644 include/asm-x86/mach-default/irq_vectors.h delete mode 100644 include/asm-x86/mach-default/irq_vectors_limits.h delete mode 100644 include/asm-x86/mach-visws/irq_vectors.h delete mode 100644 include/asm-x86/mach-voyager/irq_vectors.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2a609dc3271c..0c7f64b99e17 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -51,7 +51,7 @@ #include #include #include -#include "irq_vectors.h" +#include /* * We use macros for low-level operations which need to be overridden diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index a2b030780aa9..ba7d19e102b1 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -33,8 +33,7 @@ #include #include #include - -#include +#include #define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c index cef9cb1d15ac..d8b2cfd85d92 100644 --- a/arch/x86/mach-visws/visws_apic.c +++ b/arch/x86/mach-visws/visws_apic.c @@ -21,10 +21,9 @@ #include #include #include +#include #include "cobalt.h" -#include "irq_vectors.h" - static DEFINE_SPINLOCK(cobalt_lock); diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index 38af08ed43cf..a8c5e8bdaa49 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -13,7 +13,7 @@ * unified by tglx */ -#define NMI_VECTOR 0x02 +#include #ifndef __ASSEMBLY__ @@ -75,6 +75,16 @@ extern void send_IPI(int dest, int vector); extern atomic_t irq_err_count; extern atomic_t irq_mis_count; +/* Voyager functions */ +extern asmlinkage void vic_cpi_interrupt(void); +extern asmlinkage void vic_sys_interrupt(void); +extern asmlinkage void vic_cmn_interrupt(void); +extern asmlinkage void qic_timer_interrupt(void); +extern asmlinkage void qic_invalidate_interrupt(void); +extern asmlinkage void qic_reschedule_interrupt(void); +extern asmlinkage void qic_enable_irq_interrupt(void); +extern asmlinkage void qic_call_function_interrupt(void); + #endif /* !ASSEMBLY_ */ #ifdef CONFIG_X86_32 diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h index 28674576e9fe..98c9d494a711 100644 --- a/include/asm-x86/hw_irq_64.h +++ b/include/asm-x86/hw_irq_64.h @@ -1,74 +1,3 @@ -/* - * IDT vectors usable for external interrupt sources start - * at 0x20: - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define IA32_SYSCALL_VECTOR 0x80 - - -/* Reserve the lowest usable priority level 0x20 - 0x2f for triggering - * cleanup after irq migration. - */ -#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR - -/* - * Vectors 0x30-0x3f are used for ISA interrupts. - */ -#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) -#define IRQ1_VECTOR (IRQ0_VECTOR + 1) -#define IRQ2_VECTOR (IRQ0_VECTOR + 2) -#define IRQ3_VECTOR (IRQ0_VECTOR + 3) -#define IRQ4_VECTOR (IRQ0_VECTOR + 4) -#define IRQ5_VECTOR (IRQ0_VECTOR + 5) -#define IRQ6_VECTOR (IRQ0_VECTOR + 6) -#define IRQ7_VECTOR (IRQ0_VECTOR + 7) -#define IRQ8_VECTOR (IRQ0_VECTOR + 8) -#define IRQ9_VECTOR (IRQ0_VECTOR + 9) -#define IRQ10_VECTOR (IRQ0_VECTOR + 10) -#define IRQ11_VECTOR (IRQ0_VECTOR + 11) -#define IRQ12_VECTOR (IRQ0_VECTOR + 12) -#define IRQ13_VECTOR (IRQ0_VECTOR + 13) -#define IRQ14_VECTOR (IRQ0_VECTOR + 14) -#define IRQ15_VECTOR (IRQ0_VECTOR + 15) - -/* - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff - * - * some of the following vectors are 'rare', they are merged - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. - * TLB, reschedule and local APIC vectors are performance-critical. - */ -#define SPURIOUS_APIC_VECTOR 0xff -#define ERROR_APIC_VECTOR 0xfe -#define RESCHEDULE_VECTOR 0xfd -#define CALL_FUNCTION_VECTOR 0xfc -/* fb free - please don't readd KDB here because it's useless - (hint - think what a NMI bit does to a vector) */ -#define THERMAL_APIC_VECTOR 0xfa -#define THRESHOLD_APIC_VECTOR 0xf9 -/* f8 free */ -#define INVALIDATE_TLB_VECTOR_END 0xf7 -#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */ - -#define NUM_INVALIDATE_TLB_VECTORS 8 - -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef - -/* - * First APIC vector available to drivers: (vectors 0x30-0xee) - * we start at 0x41 to spread out vectors evenly between priority - * levels. (0x80 is the syscall vector) - */ -#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) -#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in irq.h */ - - #ifndef __ASSEMBLY__ typedef int vector_irq_t[NR_VECTORS]; diff --git a/include/asm-x86/irq_32.h b/include/asm-x86/irq_32.h index 0b79f3185243..c5c7542f79ad 100644 --- a/include/asm-x86/irq_32.h +++ b/include/asm-x86/irq_32.h @@ -12,7 +12,7 @@ #include /* include comes from machine specific directory */ -#include "irq_vectors.h" +#include #include static inline int irq_canonicalize(int irq) diff --git a/include/asm-x86/irq_64.h b/include/asm-x86/irq_64.h index 7608176590b6..3037ec667bfb 100644 --- a/include/asm-x86/irq_64.h +++ b/include/asm-x86/irq_64.h @@ -11,34 +11,7 @@ */ #include - -#define TIMER_IRQ 0 - -/* - * 16 8259A IRQ's, 208 potential APIC interrupt sources. - * Right now the APIC is mostly only used for SMP. - * 256 vectors is an architectural limit. (we can have - * more than 256 devices theoretically, but they will - * have to use shared interrupts) - * Since vectors 0x00-0x1f are used/reserved for the CPU, - * the usable vector space is 0x20-0xff (224 vectors) - */ - -/* - * The maximum number of vectors supported by x86_64 processors - * is limited to 256. For processors other than x86_64, NR_VECTORS - * should be changed accordingly. - */ -#define NR_VECTORS 256 - -#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in hw_irq.h */ - -#if NR_CPUS < MAX_IO_APICS -#define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) -#else -#define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) -#endif -#define NR_IRQ_VECTORS NR_IRQS +#include static inline int irq_canonicalize(int irq) { diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h new file mode 100644 index 000000000000..daceaaf0a3a4 --- /dev/null +++ b/include/asm-x86/irq_vectors.h @@ -0,0 +1,168 @@ +#ifndef _ASM_IRQ_VECTORS_H +#define _ASM_IRQ_VECTORS_H + +#include + +#define NMI_VECTOR 0x02 + +/* + * IDT vectors usable for external interrupt sources start + * at 0x20: + */ +#define FIRST_EXTERNAL_VECTOR 0x20 + +#ifdef CONFIG_X86_32 +# define SYSCALL_VECTOR 0x80 +#else +# define IA32_SYSCALL_VECTOR 0x80 +#endif + +/* + * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit. + * + * Reserve the lowest usable priority level 0x20 - 0x2f for triggering + * cleanup after irq migration on 64 bit. + */ +#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR + +/* + * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit + */ +#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) +#define IRQ1_VECTOR (IRQ0_VECTOR + 1) +#define IRQ2_VECTOR (IRQ0_VECTOR + 2) +#define IRQ3_VECTOR (IRQ0_VECTOR + 3) +#define IRQ4_VECTOR (IRQ0_VECTOR + 4) +#define IRQ5_VECTOR (IRQ0_VECTOR + 5) +#define IRQ6_VECTOR (IRQ0_VECTOR + 6) +#define IRQ7_VECTOR (IRQ0_VECTOR + 7) +#define IRQ8_VECTOR (IRQ0_VECTOR + 8) +#define IRQ9_VECTOR (IRQ0_VECTOR + 9) +#define IRQ10_VECTOR (IRQ0_VECTOR + 10) +#define IRQ11_VECTOR (IRQ0_VECTOR + 11) +#define IRQ12_VECTOR (IRQ0_VECTOR + 12) +#define IRQ13_VECTOR (IRQ0_VECTOR + 13) +#define IRQ14_VECTOR (IRQ0_VECTOR + 14) +#define IRQ15_VECTOR (IRQ0_VECTOR + 15) + +/* + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff + * + * some of the following vectors are 'rare', they are merged + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. + * TLB, reschedule and local APIC vectors are performance-critical. + * + * Vectors 0xf0-0xfa are free (reserved for future Linux use). + */ +#ifdef CONFIG_X86_32 + +# define SPURIOUS_APIC_VECTOR 0xff +# define ERROR_APIC_VECTOR 0xfe +# define INVALIDATE_TLB_VECTOR 0xfd +# define RESCHEDULE_VECTOR 0xfc +# define CALL_FUNCTION_VECTOR 0xfb +# define THERMAL_APIC_VECTOR 0xf0 + +#else + +#define SPURIOUS_APIC_VECTOR 0xff +#define ERROR_APIC_VECTOR 0xfe +#define RESCHEDULE_VECTOR 0xfd +#define CALL_FUNCTION_VECTOR 0xfc +#define THERMAL_APIC_VECTOR 0xfa +#define THRESHOLD_APIC_VECTOR 0xf9 +#define INVALIDATE_TLB_VECTOR_END 0xf7 +#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */ + +#define NUM_INVALIDATE_TLB_VECTORS 8 + +#endif + +/* + * Local APIC timer IRQ vector is on a different priority level, + * to work around the 'lost local interrupt if more than 2 IRQ + * sources per level' errata. + */ +#define LOCAL_TIMER_VECTOR 0xef + +/* + * First APIC vector available to drivers: (vectors 0x30-0xee) we + * start at 0x31(0x41) to spread out vectors evenly between priority + * levels. (0x80 is the syscall vector) + */ +#ifdef CONFIG_X86_32 +# define FIRST_DEVICE_VECTOR 0x31 +#else +# define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) +#endif + +#define FIRST_SYSTEM_VECTOR 0xef + +#define NR_VECTORS 256 + +#define FPU_IRQ 13 + +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) + +#if !defined(CONFIG_X86_VISWS) && !defined(CONFIG_X86_VOYAGER) + +# if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT) + +# define NR_IRQS 224 + +# if (224 >= 32 * NR_CPUS) +# define NR_IRQ_VECTORS NR_IRQS +# else +# define NR_IRQ_VECTORS (32 * NR_CPUS) +# endif + +# else /* IO_APIC || PARAVIRT */ + +# define NR_IRQS 16 +# define NR_IRQ_VECTORS NR_IRQS + +# endif + +#else /* !VISWS && !VOYAGER */ + +# define NR_IRQS 224 +# define NR_IRQ_VECTORS NR_IRQS + +#endif /* VISWS */ + +/* Voyager specific defines */ +/* These define the CPIs we use in linux */ +#define VIC_CPI_LEVEL0 0 +#define VIC_CPI_LEVEL1 1 +/* now the fake CPIs */ +#define VIC_TIMER_CPI 2 +#define VIC_INVALIDATE_CPI 3 +#define VIC_RESCHEDULE_CPI 4 +#define VIC_ENABLE_IRQ_CPI 5 +#define VIC_CALL_FUNCTION_CPI 6 + +/* Now the QIC CPIs: Since we don't need the two initial levels, + * these are 2 less than the VIC CPIs */ +#define QIC_CPI_OFFSET 1 +#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET) +#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET) +#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET) +#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET) +#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET) + +#define VIC_START_FAKE_CPI VIC_TIMER_CPI +#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_CPI + +/* this is the SYS_INT CPI. */ +#define VIC_SYS_INT 8 +#define VIC_CMN_INT 15 + +/* This is the boot CPI for alternate processors. It gets overwritten + * by the above once the system has activated all available processors */ +#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0 +#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8) + + +#endif /* _ASM_IRQ_VECTORS_H */ diff --git a/include/asm-x86/mach-default/irq_vectors.h b/include/asm-x86/mach-default/irq_vectors.h deleted file mode 100644 index 881c63ca61ad..000000000000 --- a/include/asm-x86/mach-default/irq_vectors.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * This file should contain #defines for all of the interrupt vector - * numbers used by this architecture. - * - * In addition, there are some standard defines: - * - * FIRST_EXTERNAL_VECTOR: - * The first free place for external interrupts - * - * SYSCALL_VECTOR: - * The IRQ vector a syscall makes the user to kernel transition - * under. - * - * TIMER_IRQ: - * The IRQ number the timer interrupt comes in at. - * - * NR_IRQS: - * The total number of interrupt vectors (including all the - * architecture specific interrupts) needed. - * - */ -#ifndef _ASM_IRQ_VECTORS_H -#define _ASM_IRQ_VECTORS_H - -/* - * IDT vectors usable for external interrupt sources start - * at 0x20: - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define SYSCALL_VECTOR 0x80 - -/* - * Vectors 0x20-0x2f are used for ISA interrupts. - */ - -/* - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff - * - * some of the following vectors are 'rare', they are merged - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. - * TLB, reschedule and local APIC vectors are performance-critical. - * - * Vectors 0xf0-0xfa are free (reserved for future Linux use). - */ -#define SPURIOUS_APIC_VECTOR 0xff -#define ERROR_APIC_VECTOR 0xfe -#define INVALIDATE_TLB_VECTOR 0xfd -#define RESCHEDULE_VECTOR 0xfc -#define CALL_FUNCTION_VECTOR 0xfb - -#define THERMAL_APIC_VECTOR 0xf0 -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef - -/* - * First APIC vector available to drivers: (vectors 0x30-0xee) - * we start at 0x31 to spread out vectors evenly between priority - * levels. (0x80 is the syscall vector) - */ -#define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xef - -#define TIMER_IRQ 0 - -/* - * 16 8259A IRQ's, 208 potential APIC interrupt sources. - * Right now the APIC is mostly only used for SMP. - * 256 vectors is an architectural limit. (we can have - * more than 256 devices theoretically, but they will - * have to use shared interrupts) - * Since vectors 0x00-0x1f are used/reserved for the CPU, - * the usable vector space is 0x20-0xff (224 vectors) - */ - -/* - * The maximum number of vectors supported by i386 processors - * is limited to 256. For processors other than i386, NR_VECTORS - * should be changed accordingly. - */ -#define NR_VECTORS 256 - -#include "irq_vectors_limits.h" - -#define FPU_IRQ 13 - -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) - - -#endif /* _ASM_IRQ_VECTORS_H */ diff --git a/include/asm-x86/mach-default/irq_vectors_limits.h b/include/asm-x86/mach-default/irq_vectors_limits.h deleted file mode 100644 index a90c7a60109f..000000000000 --- a/include/asm-x86/mach-default/irq_vectors_limits.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _ASM_IRQ_VECTORS_LIMITS_H -#define _ASM_IRQ_VECTORS_LIMITS_H - -#if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT) -#define NR_IRQS 224 -# if (224 >= 32 * NR_CPUS) -# define NR_IRQ_VECTORS NR_IRQS -# else -# define NR_IRQ_VECTORS (32 * NR_CPUS) -# endif -#else -#define NR_IRQS 16 -#define NR_IRQ_VECTORS NR_IRQS -#endif - -#endif /* _ASM_IRQ_VECTORS_LIMITS_H */ diff --git a/include/asm-x86/mach-visws/irq_vectors.h b/include/asm-x86/mach-visws/irq_vectors.h deleted file mode 100644 index cb572d8db505..000000000000 --- a/include/asm-x86/mach-visws/irq_vectors.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef _ASM_IRQ_VECTORS_H -#define _ASM_IRQ_VECTORS_H - -/* - * IDT vectors usable for external interrupt sources start - * at 0x20: - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define SYSCALL_VECTOR 0x80 - -/* - * Vectors 0x20-0x2f are used for ISA interrupts. - */ - -/* - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff - * - * some of the following vectors are 'rare', they are merged - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. - * TLB, reschedule and local APIC vectors are performance-critical. - * - * Vectors 0xf0-0xfa are free (reserved for future Linux use). - */ -#define SPURIOUS_APIC_VECTOR 0xff -#define ERROR_APIC_VECTOR 0xfe -#define INVALIDATE_TLB_VECTOR 0xfd -#define RESCHEDULE_VECTOR 0xfc -#define CALL_FUNCTION_VECTOR 0xfb - -#define THERMAL_APIC_VECTOR 0xf0 -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef - -/* - * First APIC vector available to drivers: (vectors 0x30-0xee) - * we start at 0x31 to spread out vectors evenly between priority - * levels. (0x80 is the syscall vector) - */ -#define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xef - -#define TIMER_IRQ 0 - -/* - * IRQ definitions - */ -#define NR_VECTORS 256 -#define NR_IRQS 224 -#define NR_IRQ_VECTORS NR_IRQS - -#define FPU_IRQ 13 - -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) - -#endif /* _ASM_IRQ_VECTORS_H */ diff --git a/include/asm-x86/mach-voyager/irq_vectors.h b/include/asm-x86/mach-voyager/irq_vectors.h deleted file mode 100644 index 165421f5821c..000000000000 --- a/include/asm-x86/mach-voyager/irq_vectors.h +++ /dev/null @@ -1,79 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8 -*- */ - -/* Copyright (C) 2002 - * - * Author: James.Bottomley@HansenPartnership.com - * - * linux/arch/i386/voyager/irq_vectors.h - * - * This file provides definitions for the VIC and QIC CPIs - */ - -#ifndef _ASM_IRQ_VECTORS_H -#define _ASM_IRQ_VECTORS_H - -/* - * IDT vectors usable for external interrupt sources start - * at 0x20: - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define SYSCALL_VECTOR 0x80 - -/* - * Vectors 0x20-0x2f are used for ISA interrupts. - */ - -/* These define the CPIs we use in linux */ -#define VIC_CPI_LEVEL0 0 -#define VIC_CPI_LEVEL1 1 -/* now the fake CPIs */ -#define VIC_TIMER_CPI 2 -#define VIC_INVALIDATE_CPI 3 -#define VIC_RESCHEDULE_CPI 4 -#define VIC_ENABLE_IRQ_CPI 5 -#define VIC_CALL_FUNCTION_CPI 6 - -/* Now the QIC CPIs: Since we don't need the two initial levels, - * these are 2 less than the VIC CPIs */ -#define QIC_CPI_OFFSET 1 -#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET) -#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET) -#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET) -#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET) -#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET) - -#define VIC_START_FAKE_CPI VIC_TIMER_CPI -#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_CPI - -/* this is the SYS_INT CPI. */ -#define VIC_SYS_INT 8 -#define VIC_CMN_INT 15 - -/* This is the boot CPI for alternate processors. It gets overwritten - * by the above once the system has activated all available processors */ -#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0 -#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8) - -#define NR_VECTORS 256 -#define NR_IRQS 224 -#define NR_IRQ_VECTORS NR_IRQS - -#define FPU_IRQ 13 - -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) - -#ifndef __ASSEMBLY__ -extern asmlinkage void vic_cpi_interrupt(void); -extern asmlinkage void vic_sys_interrupt(void); -extern asmlinkage void vic_cmn_interrupt(void); -extern asmlinkage void qic_timer_interrupt(void); -extern asmlinkage void qic_invalidate_interrupt(void); -extern asmlinkage void qic_reschedule_interrupt(void); -extern asmlinkage void qic_enable_irq_interrupt(void); -extern asmlinkage void qic_call_function_interrupt(void); -#endif /* !__ASSEMBLY__ */ - -#endif /* _ASM_IRQ_VECTORS_H */ -- cgit v1.2.3 From 0bc471d93051a19545257909bc2ed2ad3b389b54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 May 2008 21:55:12 +0200 Subject: x86: move BUILD_IRQ macro magic to i8259_64.c i8259_64.c is the only place which uses those macros. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8259_64.c | 14 ++++++++++++++ include/asm-x86/hw_irq_64.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index fa57a1568508..c4ae4769ce67 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -34,6 +34,20 @@ * interrupt-controller happy. */ +#define IRQ_NAME2(nr) nr##_interrupt(void) +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) + +/* + * SMP has a few special interrupts for IPI messages + */ + +#define BUILD_IRQ(nr) \ + asmlinkage void IRQ_NAME(nr); \ + asm("\n.p2align\n" \ + "IRQ" #nr "_interrupt:\n\t" \ + "push $~(" #nr ") ; " \ + "jmp common_interrupt"); + #define BI(x,y) \ BUILD_IRQ(x##y) diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h index 98c9d494a711..9305f7456a7f 100644 --- a/include/asm-x86/hw_irq_64.h +++ b/include/asm-x86/hw_irq_64.h @@ -17,18 +17,4 @@ extern void native_init_IRQ(void); #include -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt"); - #endif -- cgit v1.2.3 From 1a331957efd214fc3a84f70956dfaec65e70c031 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 3 May 2008 00:30:50 +0200 Subject: x86: move eisa_set_level_irq declaration to header Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 -- include/asm-x86/hw_irq.h | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c49ebcc6c41e..671591fcc61d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -507,8 +507,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - extern void eisa_set_level_irq(unsigned int irq); - if (triggering == ACPI_LEVEL_SENSITIVE) eisa_set_level_irq(gsi); } diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index 1db2dff1ef49..1428b41dcbb9 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -79,6 +79,9 @@ extern void send_IPI(int dest, int vector); extern atomic_t irq_err_count; extern atomic_t irq_mis_count; +/* EISA */ +extern void eisa_set_level_irq(unsigned int irq); + /* Voyager functions */ extern asmlinkage void vic_cpi_interrupt(void); extern asmlinkage void vic_sys_interrupt(void); -- cgit v1.2.3 From 305b92a2323eeaa4b481f409d54f778dd7e21a46 Mon Sep 17 00:00:00 2001 From: Alan Mayer Date: Tue, 15 Apr 2008 15:36:56 -0500 Subject: x86: change FIRST_SYSTEM_VECTOR to a variable The SGI UV system needs several more system vectors than a vanilla x86_64 system. Rather than burden the other archs with extra system vectors that they don't use, change FIRST_SYSTEM_VECTOR to a variable, so that it can be dynamic. Signed-off-by: Alan Mayer Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic_32.c | 14 +++++++------- arch/x86/kernel/i8259_64.c | 30 +++++++++++++++--------------- arch/x86/kernel/io_apic_32.c | 8 ++++++-- arch/x86/kernel/io_apic_64.c | 6 +++++- include/asm-x86/desc.h | 22 ++++++++++++++++++++++ include/asm-x86/irq_vectors.h | 2 -- 6 files changed, 55 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1bdeb6c..807158e4b5d0 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1351,13 +1351,13 @@ void __init smp_intr_init(void) * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); /* IPI for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); } #endif @@ -1370,15 +1370,15 @@ void __init apic_intr_init(void) smp_intr_init(); #endif /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* thermal monitor LVT interrupt */ #ifdef CONFIG_X86_MCE_P4THERMAL - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif } diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index c4ae4769ce67..1870e0e86559 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -493,33 +493,33 @@ void __init native_init_IRQ(void) * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); /* IPIs for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a40d54fc1fdd..0ae4a9d00ce8 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -83,6 +83,10 @@ int mp_irq_entries; static int disable_timer_pin_1 __initdata; +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. @@ -1176,7 +1180,7 @@ static int __assign_irq_vector(int irq) offset = current_offset; next: vector += 8; - if (vector >= FIRST_SYSTEM_VECTOR) { + if (vector >= first_system_vector) { offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; } @@ -2269,7 +2273,7 @@ void __init setup_IO_APIC(void) int i; /* Reserve all the system vectors. */ - for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++) + for (i = first_system_vector; i < NR_VECTORS; i++) set_bit(i, used_vectors); enable_IO_APIC(); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8dfcc529..f1e1ae3e5c7d 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -82,6 +82,10 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { static int assign_irq_vector(int irq, cpumask_t mask); +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + #define __apicdebuginit __init int sis_apic_bug; /* not actually supported, dummy for compile */ @@ -730,7 +734,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) offset = current_offset; next: vector += 8; - if (vector >= FIRST_SYSTEM_VECTOR) { + if (vector >= first_system_vector) { /* If we run out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h index 268a012bcd79..b3875d4b4fab 100644 --- a/include/asm-x86/desc.h +++ b/include/asm-x86/desc.h @@ -311,6 +311,28 @@ static inline void set_intr_gate(unsigned int n, void *addr) _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); } +#define SYS_VECTOR_FREE 0 +#define SYS_VECTOR_ALLOCED 1 + +extern int first_system_vector; +extern char system_vectors[]; + +static inline void alloc_system_vector(int vector) +{ + if (system_vectors[vector] == SYS_VECTOR_FREE) { + system_vectors[vector] = SYS_VECTOR_ALLOCED; + if (first_system_vector > vector) + first_system_vector = vector; + } else + BUG(); +} + +static inline void alloc_intr_gate(unsigned int n, void *addr) +{ + alloc_system_vector(n); + set_intr_gate(n, addr); +} + /* * This routine sets up an interrupt gate at directory privilege level 3. */ diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h index daceaaf0a3a4..3cb6d8c77b39 100644 --- a/include/asm-x86/irq_vectors.h +++ b/include/asm-x86/irq_vectors.h @@ -96,8 +96,6 @@ # define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) #endif -#define FIRST_SYSTEM_VECTOR 0xef - #define NR_VECTORS 256 #define FPU_IRQ 13 -- cgit v1.2.3 From ce17833183bf0a08ce3d174a2088eff0a06f2080 Mon Sep 17 00:00:00 2001 From: Alan Mayer Date: Wed, 16 Apr 2008 15:17:20 -0500 Subject: x86: change FIRST_SYSTEM_VECTOR to a variable, fix Fixes the build error introduced by my FIRST_SYSTEM_VECTOR patch Signed-off-by: Alan Mayer Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic_32.c | 4 ++++ arch/x86/kernel/io_apic_32.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 807158e4b5d0..d5767cb19d56 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -71,6 +71,10 @@ int local_apic_timer_disabled; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + /* * Debug level, exported for io_apic.c */ diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 0ae4a9d00ce8..76769ccb1425 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -83,10 +83,6 @@ int mp_irq_entries; static int disable_timer_pin_1 __initdata; -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. -- cgit v1.2.3 From 04b361abfdc522239e3a071f3afdebf5787d9f03 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 5 May 2008 12:36:38 +0200 Subject: i386: Execute stack overflow warning on interrupt stack v2 Previously the reporting printk would run on the process stack, which risks overflow an already low stack. Instead execute it on the interrupt stack. This makes it more likely for the printk to make it actually out. It adds one not taken test/branch more to the interrupt path when stack overflow checking is enabled. We could avoid that by duplicating more code, but that seemed not worth it. Based on an observation by Eric Sandeen. v2: Fix warnings in some configs Signed-off-by: Andi Kleen Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 49 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 147352df28b9..3f76561da815 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -61,6 +61,26 @@ static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; #endif +static void stack_overflow(void) +{ + printk("low stack detected by irq handler\n"); + dump_stack(); +} + +static inline void call_on_stack2(void *func, void *stack, + unsigned long arg1, unsigned long arg2) +{ + unsigned long bx; + asm volatile( + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (bx) + : "0" (arg1), "1" (arg2), "2" (stack), + "D" (func) + : "memory", "cc", "ecx"); +} + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific @@ -76,6 +96,7 @@ unsigned int do_IRQ(struct pt_regs *regs) union irq_ctx *curctx, *irqctx; u32 *isp; #endif + int overflow = 0; if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -92,11 +113,8 @@ unsigned int do_IRQ(struct pt_regs *regs) __asm__ __volatile__("andl %%esp,%0" : "=r" (sp) : "0" (THREAD_SIZE - 1)); - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", - sp - sizeof(struct thread_info)); - dump_stack(); - } + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) + overflow = 1; } #endif @@ -112,8 +130,6 @@ unsigned int do_IRQ(struct pt_regs *regs) * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, bx; - /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); irqctx->tinfo.task = curctx->tinfo.task; @@ -127,18 +143,19 @@ unsigned int do_IRQ(struct pt_regs *regs) (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | (curctx->tinfo.preempt_count & SOFTIRQ_MASK); - asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (bx) - : "0" (irq), "1" (desc), "2" (isp), - "D" (desc->handle_irq) - : "memory", "cc", "ecx" - ); + /* Execute warning on interrupt stack */ + if (unlikely(overflow)) + call_on_stack2(stack_overflow, isp, 0, 0); + + call_on_stack2(desc->handle_irq, isp, irq, (unsigned long)desc); } else #endif + { + /* AK: Slightly bogus here */ + if (overflow) + stack_overflow(); desc->handle_irq(irq, desc); + } irq_exit(); set_irq_regs(old_regs); -- cgit v1.2.3 From de9b10af1287bf25b9c0433de53a2e95ef611aa7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 May 2008 15:58:15 +0200 Subject: x86: janitor stack overflow warning patch Add KERN_WARNING to the printk as this could not be done in the original patch, which allegedly only moves code around. Un#ifdef do_IRQ. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 136 ++++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 61 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 3f76561da815..1c470d2e5af7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq) #endif } +#ifdef CONFIG_DEBUG_STACKOVERFLOW +/* Debugging check for stack overflow: is there less than 1KB free? */ +static int check_stack_overflow(void) +{ + long sp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (sp) : "0" (THREAD_SIZE - 1)); + + return sp < (sizeof(struct thread_info) + STACK_WARN); +} + +static void print_stack_overflow(void) +{ + printk(KERN_WARNING "low stack detected by irq handler\n"); + dump_stack(); +} + +#else +static inline int check_stack_overflow(void) { return 0; } +static inline void print_stack_overflow(void) { } +#endif + #ifdef CONFIG_4KSTACKS /* * per-CPU IRQ handling contexts (thread information and stack) @@ -59,18 +82,12 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -#endif - -static void stack_overflow(void) -{ - printk("low stack detected by irq handler\n"); - dump_stack(); -} -static inline void call_on_stack2(void *func, void *stack, - unsigned long arg1, unsigned long arg2) +static inline void call_on_stack(void *func, void *stack, + unsigned long arg1, void *arg2) { unsigned long bx; + asm volatile( " xchgl %%ebx,%%esp \n" " call *%%edi \n" @@ -81,22 +98,61 @@ static inline void call_on_stack2(void *func, void *stack, : "memory", "cc", "ecx"); } +static inline int +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) +{ + union irq_ctx *curctx, *irqctx; + u32 *isp; + + curctx = (union irq_ctx *) current_thread_info(); + irqctx = hardirq_ctx[smp_processor_id()]; + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (unlikely(curctx == irqctx)) + return 0; + + /* build the stack frame on the IRQ stack */ + isp = (u32 *) ((char*)irqctx + sizeof(*irqctx)); + irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* + * Copy the softirq bits in preempt_count so that the + * softirq checks work in the hardirq context. + */ + irqctx->tinfo.preempt_count = + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | + (curctx->tinfo.preempt_count & SOFTIRQ_MASK); + + if (unlikely(overflow)) + call_on_stack(print_stack_overflow, isp, 0, NULL); + + call_on_stack(desc->handle_irq, isp, irq, desc); + + return 1; +} + +#else +static inline int +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } +#endif + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). */ unsigned int do_IRQ(struct pt_regs *regs) -{ +{ struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int irq = ~regs->orig_ax; + int overflow, irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; -#ifdef CONFIG_4KSTACKS - union irq_ctx *curctx, *irqctx; - u32 *isp; -#endif - int overflow = 0; if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -106,54 +162,12 @@ unsigned int do_IRQ(struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); -#ifdef CONFIG_DEBUG_STACKOVERFLOW - /* Debugging check for stack overflow: is there less than 1KB free? */ - { - long sp; - - __asm__ __volatile__("andl %%esp,%0" : - "=r" (sp) : "0" (THREAD_SIZE - 1)); - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) - overflow = 1; - } -#endif - -#ifdef CONFIG_4KSTACKS - curctx = (union irq_ctx *) current_thread_info(); - irqctx = hardirq_ctx[smp_processor_id()]; + overflow = check_stack_overflow(); - /* - * this is where we switch to the IRQ stack. However, if we are - * already using the IRQ stack (because we interrupted a hardirq - * handler) we can't do that and just have to keep using the - * current stack (which is the irq stack already after all) - */ - if (curctx != irqctx) { - /* build the stack frame on the IRQ stack */ - isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); - irqctx->tinfo.task = curctx->tinfo.task; - irqctx->tinfo.previous_esp = current_stack_pointer; - - /* - * Copy the softirq bits in preempt_count so that the - * softirq checks work in the hardirq context. - */ - irqctx->tinfo.preempt_count = - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | - (curctx->tinfo.preempt_count & SOFTIRQ_MASK); - - /* Execute warning on interrupt stack */ + if (!execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) - call_on_stack2(stack_overflow, isp, 0, 0); - - call_on_stack2(desc->handle_irq, isp, irq, (unsigned long)desc); - } else -#endif - { - /* AK: Slightly bogus here */ - if (overflow) - stack_overflow(); + print_stack_overflow(); desc->handle_irq(irq, desc); } -- cgit v1.2.3 From 403d8efc94cd02ae36e7db13c4edf1d06d7b7fac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 May 2008 18:13:50 +0200 Subject: x86: irq_32 move 4kstacks code to one place Move the 4KSTACKS related code to one place. This allows to un#ifdef do_IRQ() and share the executed on stack for the stack overflow printk and the softirq call. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 163 ++++++++++++++++++++++------------------------- 1 file changed, 77 insertions(+), 86 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1c470d2e5af7..4e3e8ec60276 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -83,26 +83,28 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static inline void call_on_stack(void *func, void *stack, - unsigned long arg1, void *arg2) +static char softirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__section__(".bss.page_aligned"))); + +static char hardirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__section__(".bss.page_aligned"))); + +static void call_on_stack(void *func, void *stack) { - unsigned long bx; - - asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (bx) - : "0" (arg1), "1" (arg2), "2" (stack), - "D" (func) - : "memory", "cc", "ecx"); + asm volatile("xchgl %%ebx,%%esp \n" + "call *%%edi \n" + "movl %%ebx,%%esp \n" + : "=b" (stack) + : "0" (stack), + "D"(func) + : "memory", "cc", "edx", "ecx", "eax"); } static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { union irq_ctx *curctx, *irqctx; - u32 *isp; + u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); irqctx = hardirq_ctx[smp_processor_id()]; @@ -130,64 +132,22 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) (curctx->tinfo.preempt_count & SOFTIRQ_MASK); if (unlikely(overflow)) - call_on_stack(print_stack_overflow, isp, 0, NULL); - - call_on_stack(desc->handle_irq, isp, irq, desc); - - return 1; -} - -#else -static inline int -execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } -#endif - -/* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - */ -unsigned int do_IRQ(struct pt_regs *regs) -{ - struct pt_regs *old_regs; - /* high bit used in ret_from_ code */ - int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_desc + irq; - - if (unlikely((unsigned)irq >= NR_IRQS)) { - printk(KERN_EMERG "%s: cannot handle IRQ %d\n", - __func__, irq); - BUG(); - } - - old_regs = set_irq_regs(regs); - irq_enter(); - - overflow = check_stack_overflow(); - - if (!execute_on_irq_stack(overflow, desc, irq)) { - if (unlikely(overflow)) - print_stack_overflow(); - desc->handle_irq(irq, desc); - } - - irq_exit(); - set_irq_regs(old_regs); + call_on_stack(print_stack_overflow, isp); + + asm volatile("xchgl %%ebx,%%esp \n" + "call *%%edi \n" + "movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (isp) + : "0" (irq), "1" (desc), "2" (isp), + "D" (desc->handle_irq) + : "memory", "cc", "ecx"); return 1; } -#ifdef CONFIG_4KSTACKS - -static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - -static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - /* * allocate per-cpu stacks for hardirq and for softirq processing */ -void irq_ctx_init(int cpu) +void __cpuinit irq_ctx_init(int cpu) { union irq_ctx *irqctx; @@ -195,25 +155,25 @@ void irq_ctx_init(int cpu) return; irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; - irqctx->tinfo.task = NULL; - irqctx->tinfo.exec_domain = NULL; - irqctx->tinfo.cpu = cpu; - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); hardirq_ctx[cpu] = irqctx; irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; - irqctx->tinfo.task = NULL; - irqctx->tinfo.exec_domain = NULL; - irqctx->tinfo.cpu = cpu; - irqctx->tinfo.preempt_count = 0; - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = 0; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); softirq_ctx[cpu] = irqctx; - printk("CPU %u irqstacks, hard=%p soft=%p\n", - cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); } void irq_ctx_exit(int cpu) @@ -242,24 +202,55 @@ asmlinkage void do_softirq(void) /* build the stack frame on the softirq stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); - asm volatile( - " xchgl %%ebx,%%esp \n" - " call __do_softirq \n" - " movl %%ebx,%%esp \n" - : "=b"(isp) - : "0"(isp) - : "memory", "cc", "edx", "ecx", "eax" - ); + call_on_stack(__do_softirq, isp); /* * Shouldnt happen, we returned above if in_interrupt(): - */ + */ WARN_ON_ONCE(softirq_count()); } local_irq_restore(flags); } + +#else +static inline int +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } #endif +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs; + /* high bit used in ret_from_ code */ + int overflow, irq = ~regs->orig_ax; + struct irq_desc *desc = irq_desc + irq; + + if (unlikely((unsigned)irq >= NR_IRQS)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", + __func__, irq); + BUG(); + } + + old_regs = set_irq_regs(regs); + irq_enter(); + + overflow = check_stack_overflow(); + + if (!execute_on_irq_stack(overflow, desc, irq)) { + if (unlikely(overflow)) + print_stack_overflow(); + desc->handle_irq(irq, desc); + } + + irq_exit(); + set_irq_regs(old_regs); + return 1; +} + /* * Interrupt statistics: */ -- cgit v1.2.3 From aa134f1b09df6beaa4d031a50d5fda1f3cebce6c Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 8 Apr 2008 10:49:03 +0200 Subject: x86: iommu: use symbolic constants, not hardcoded numbers Move symbolic constants into gart.h, and use them instead of hardcoded constant. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/pci-gart_64.c | 10 +++++----- drivers/char/agp/amd64-agp.c | 25 +++---------------------- include/asm-x86/gart.h | 21 +++++++++++++++++++++ 3 files changed, 29 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c07455d1695f..bffcf455c857 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -598,13 +598,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) dev = k8_northbridges[i]; gatt_reg = __pa(gatt) >> 12; gatt_reg <<= 4; - pci_write_config_dword(dev, 0x98, gatt_reg); - pci_read_config_dword(dev, 0x90, &ctl); + pci_write_config_dword(dev, AMD64_GARTTABLEBASE, gatt_reg); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - ctl |= 1; - ctl &= ~((1<<4) | (1<<5)); + ctl |= GARTEN; + ctl &= ~(DISGARTCPU | DISGARTIO); - pci_write_config_dword(dev, 0x90, ctl); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } flush_gart(); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index d8200ac8f8cb..25d64224cdbb 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -16,28 +16,9 @@ #include /* PAGE_SIZE */ #include #include +#include #include "agp.h" -/* PTE bits. */ -#define GPTE_VALID 1 -#define GPTE_COHERENT 2 - -/* Aperture control register bits. */ -#define GARTEN (1<<0) -#define DISGARTCPU (1<<4) -#define DISGARTIO (1<<5) - -/* GART cache control register bits. */ -#define INVGART (1<<0) -#define GARTPTEERR (1<<1) - -/* K8 On-cpu GART registers */ -#define AMD64_GARTAPERTURECTL 0x90 -#define AMD64_GARTAPERTUREBASE 0x94 -#define AMD64_GARTTABLEBASE 0x98 -#define AMD64_GARTCACHECTL 0x9c -#define AMD64_GARTEN (1<<0) - /* NVIDIA K8 registers */ #define NVIDIA_X86_64_0_APBASE 0x10 #define NVIDIA_X86_64_1_APBASE1 0x50 @@ -165,7 +146,7 @@ static int amd64_fetch_size(void) * In a multiprocessor x86-64 system, this function gets * called once for each CPU. */ -static u64 amd64_configure (struct pci_dev *hammer, u64 gatt_table) +static u64 amd64_configure(struct pci_dev *hammer, u64 gatt_table) { u64 aperturebase; u32 tmp; @@ -181,7 +162,7 @@ static u64 amd64_configure (struct pci_dev *hammer, u64 gatt_table) addr >>= 12; tmp = (u32) addr<<4; tmp &= ~0xf; - pci_write_config_dword (hammer, AMD64_GARTTABLEBASE, tmp); + pci_write_config_dword(hammer, AMD64_GARTTABLEBASE, tmp); /* Enable GART translation for this hammer. */ pci_read_config_dword(hammer, AMD64_GARTAPERTURECTL, &tmp); diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h index 90958ed993fa..248e5778e928 100644 --- a/include/asm-x86/gart.h +++ b/include/asm-x86/gart.h @@ -5,6 +5,7 @@ extern void pci_iommu_shutdown(void); extern void no_iommu_init(void); extern int force_iommu, no_iommu; extern int iommu_detected; +extern int agp_amd64_init(void); #ifdef CONFIG_GART_IOMMU extern void gart_iommu_init(void); extern void gart_iommu_shutdown(void); @@ -31,4 +32,24 @@ static inline void gart_iommu_shutdown(void) #endif +/* PTE bits. */ +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 + +/* Aperture control register bits. */ +#define GARTEN (1<<0) +#define DISGARTCPU (1<<4) +#define DISGARTIO (1<<5) + +/* GART cache control register bits. */ +#define INVGART (1<<0) +#define GARTPTEERR (1<<1) + +/* K8 On-cpu GART registers */ +#define AMD64_GARTAPERTURECTL 0x90 +#define AMD64_GARTAPERTUREBASE 0x94 +#define AMD64_GARTTABLEBASE 0x98 +#define AMD64_GARTCACHECTL 0x9c +#define AMD64_GARTEN (1<<0) + #endif -- cgit v1.2.3 From 1edc1ab3f68168ec6815e6d630f38948a6da005a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 13 Apr 2008 01:11:41 -0700 Subject: x86: agp_gart size checking for buggy device while looking at Rafael J. Wysocki's system boot log, I found a funny printout: Node 0: aperture @ de000000 size 32 MB Aperture too small (32 MB) AGP bridge at 00:04:00 Aperture from AGP @ de000000 size 4096 MB (APSIZE 0) Aperture too small (0 MB) Your BIOS doesn't leave a aperture memory hole Please enable the IOMMU option in the BIOS setup This costs you 64 MB of RAM Mapping aperture over 65536 KB of RAM @ 4000000 ... agpgart: Detected AGP bridge 20 agpgart: Aperture pointing to RAM agpgart: Aperture from AGP @ de000000 size 4096 MB agpgart: Aperture too small (0 MB) agpgart: No usable aperture found. agpgart: Consider rebooting with iommu=memaper=2 to get a good aperture. it means BIOS allocated the correct gart on the NB and AGP bridge, but because a bug in the silicon (the agp bridge reports the wrong order, it wants 4G instead) the kernel will reject that allocation. Also, because the size is only 32MB, and we try to get another 64M for gart, late fix_northbridge can not revert that change because it still reads the wrong size from agp bridge. So try to double check the order value from the agp bridge, before calling aperture_valid(). [ mingo@elte.hu: 32-bit fix. ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/aperture_64.c | 14 ++++++++++++++ drivers/char/agp/amd64-agp.c | 11 +++++++++++ 2 files changed, 25 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 479926d9e004..2e93b3132dff 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -138,6 +138,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) int nbits; u32 aper_low, aper_hi; u64 aper; + u32 old_order; printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); @@ -146,6 +147,9 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) return 0; } + /* old_order could be the value from NB gart setting */ + old_order = *order; + apsize = apsizereg & 0xfff; /* Some BIOS use weird encodings not in the AGPv3 table. */ if (apsize & 0xff) @@ -159,6 +163,16 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) aper_hi = read_pci_config(num, slot, func, 0x14); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + /* + * On some sick chips, APSIZE is 0. It means it wants 4G + * so let double check that order, and lets trust AMD NB settings: + */ + if (aper + (32UL<<(20 + *order)) > 0x100000000UL) { + printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", + 32 << *order, apsizereg); + *order = old_order; + } + printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, 32 << *order, apsizereg); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 25d64224cdbb..9e3939db76e8 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -312,6 +312,17 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, pci_read_config_dword(agp, 0x10, &aper_low); pci_read_config_dword(agp, 0x14, &aper_hi); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + /* + * On some sick chips APSIZE is 0. This means it wants 4G + * so let double check that order, and lets trust the AMD NB settings + */ + if (aper + (32ULL<<(20 + order)) > 0x100000000ULL) { + printk(KERN_INFO "Aperture size %u MB is not right, using settings from NB\n", + 32 << order); + order = nb_order; + } + printk(KERN_INFO PFX "Aperture from AGP @ %Lx size %u MB\n", aper, 32 << order); if (order < 0 || !aperture_valid(aper, (32*1024*1024)< Date: Sun, 13 Apr 2008 18:42:31 -0700 Subject: x86: checking aperture size order some systems are using 32M for gart and agp when memory is less than 4G. Kernel will reject and try to allcate another 64M that is not needed, and we will waste 64M of perfectly good RAM. this patch adds a workaround by checking aper_base/order between NB and agp bridge. If they are the same, and memory size is less than 4G, it will allow it. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/aperture_64.c | 47 ++++++++++++++++++++++++++++++++----------- drivers/char/agp/amd64-agp.c | 12 +++++------ 2 files changed, 41 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 2e93b3132dff..8c325b7f2d9b 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -83,7 +83,7 @@ static u32 __init allocate_aperture(void) return (u32)__pa(p); } -static int __init aperture_valid(u64 aper_base, u32 aper_size) +static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) { if (!aper_base) return 0; @@ -96,8 +96,9 @@ static int __init aperture_valid(u64 aper_base, u32 aper_size) printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n"); return 0; } - if (aper_size < 64*1024*1024) { - printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20); + if (aper_size < min_size) { + printk(KERN_ERR "Aperture too small (%d MB) than (%d MB)\n", + aper_size>>20, min_size>>20); return 0; } @@ -167,7 +168,9 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) * On some sick chips, APSIZE is 0. It means it wants 4G * so let double check that order, and lets trust AMD NB settings: */ - if (aper + (32UL<<(20 + *order)) > 0x100000000UL) { + printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n", + aper, 32 << old_order); + if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", 32 << *order, apsizereg); *order = old_order; @@ -176,7 +179,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, 32 << *order, apsizereg); - if (!aperture_valid(aper, (32*1024*1024) << *order)) + if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) return 0; return (u32)aper; } @@ -302,8 +305,8 @@ void __init early_gart_iommu_check(void) fix = 1; if (gart_fix_e820 && !fix && aper_enabled) { - if (e820_any_mapped(aper_base, aper_base + aper_size, - E820_RAM)) { + if (!e820_all_mapped(aper_base, aper_base + aper_size, + E820_RESERVED)) { /* reserved it, so we can resuse it in second kernel */ printk(KERN_INFO "update e820 for GART\n"); add_memory_region(aper_base, aper_size, E820_RESERVED); @@ -324,8 +327,11 @@ void __init early_gart_iommu_check(void) } +static int __initdata printed_gart_size_msg; + void __init gart_iommu_hole_init(void) { + u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; int fix, num, valid_agp = 0; @@ -337,6 +343,9 @@ void __init gart_iommu_hole_init(void) printk(KERN_INFO "Checking aperture...\n"); + if (!fallback_aper_force) + agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + fix = 0; node = 0; for (num = 24; num < 32; num++) { @@ -355,9 +364,21 @@ void __init gart_iommu_hole_init(void) node, aper_base, aper_size >> 20); node++; - if (!aperture_valid(aper_base, aper_size)) { - fix = 1; - break; + if (!aperture_valid(aper_base, aper_size, 64<<20)) { + if (valid_agp && agp_aper_base && + agp_aper_base == aper_base && + agp_aper_order == aper_order) { + /* the same between two setting from NB and agp */ + if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { + printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); + printk(KERN_ERR "please increase GART size in your BIOS setup\n"); + printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); + printed_gart_size_msg = 1; + } + } else { + fix = 1; + break; + } } if ((last_aper_order && aper_order != last_aper_order) || @@ -378,8 +399,10 @@ void __init gart_iommu_hole_init(void) return; } - if (!fallback_aper_force) - aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + if (!fallback_aper_force) { + aper_alloc = agp_aper_base; + aper_order = agp_aper_order; + } if (aper_alloc) { /* Got the aperture from the AGP bridge */ diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 9e3939db76e8..9c24470a8252 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -245,11 +245,7 @@ static int __devinit aperture_valid(u64 aper, u32 size) printk(KERN_ERR PFX "No aperture\n"); return 0; } - if (size < 32*1024*1024) { - printk(KERN_ERR PFX "Aperture too small (%d MB)\n", size>>20); - return 0; - } - if ((u64)aper + size > 0x100000000ULL) { + if ((u64)aper + size > 0x100000000ULL) { printk(KERN_ERR PFX "Aperture out of bounds\n"); return 0; } @@ -257,6 +253,10 @@ static int __devinit aperture_valid(u64 aper, u32 size) printk(KERN_ERR PFX "Aperture pointing to RAM\n"); return 0; } + if (size < 32*1024*1024) { + printk(KERN_ERR PFX "Aperture too small (%d MB)\n", size>>20); + return 0; + } /* Request the Aperture. This catches cases when someone else already put a mapping in there - happens with some very broken BIOS @@ -317,7 +317,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, * On some sick chips APSIZE is 0. This means it wants 4G * so let double check that order, and lets trust the AMD NB settings */ - if (aper + (32ULL<<(20 + order)) > 0x100000000ULL) { + if (order >=0 && aper + (32ULL<<(20 + order)) > 0x100000000ULL) { printk(KERN_INFO "Aperture size %u MB is not right, using settings from NB\n", 32 << order); order = nb_order; -- cgit v1.2.3 From 7677b2ef6c0c4fddc84f6473f3863f40eb71821b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 14 Apr 2008 20:40:37 -0700 Subject: x86_64: allocate gart aperture from 512M because we try to reserve dma32 early, so we have chance to get aperture from 64M. with some sequence aperture allocated from RAM, could become E820_RESERVED. and then if doing a kexec with a big kernel that uncompressed size is above 64M we could have a range conflict with still using gart. So allocate gart aperture from 512M instead. Also change the fallback_aper_order to 5, because we don't have chance to get 2G or 4G aperture. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/aperture_64.c | 20 +++++++++++++++++--- arch/x86/kernel/pci-dma.c | 6 +++++- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 8c325b7f2d9b..c63f8d9fad3e 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -55,8 +55,9 @@ static u32 __init allocate_aperture(void) u32 aper_size; void *p; - if (fallback_aper_order > 7) - fallback_aper_order = 7; + /* aper_size should <= 1G */ + if (fallback_aper_order > 5) + fallback_aper_order = 5; aper_size = (32 * 1024 * 1024) << fallback_aper_order; /* @@ -65,7 +66,20 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); + /* + * using 512M as goal, in case kexec will load kernel_big + * that will do the on position decompress, and could overlap with + * that positon with gart that is used. + * sequende: + * kernel_small + * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) + * ==> kernel_small(gart area become e820_reserved) + * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) + * ==> kerne_big (uncompressed size will be big than 64M or 128M) + * so don't use 512M below as gart iommu, leave the space for kernel + * code for safe + */ + p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); if (!p || __pa(p)+aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%p,%uK)\n", diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0c37f16b6950..1a017f00e867 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -77,10 +77,14 @@ void __init dma32_reserve_bootmem(void) if (end_pfn <= MAX_DMA32_PFN) return; + /* + * check aperture_64.c allocate_aperture() for reason about + * using 512M as goal + */ align = 64ULL<<20; size = round_up(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, - __pa(MAX_DMA_ADDRESS)); + 512ULL<<20); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else -- cgit v1.2.3 From 55c0d721df80dcc505dc888e85d4ca51ea150ce9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Apr 2008 01:31:11 -0700 Subject: x86: clean up aperture_64.c 1. use symbolic register names where appropriate. 2. num to bus or slot changing 3. handle for new opteron for bus other than 0 Signed-off-by: Yinghai Lu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 230 +++++++++++++++++++++++++----------------- 1 file changed, 139 insertions(+), 91 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index c63f8d9fad3e..02f4dbaa4df4 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -35,6 +35,18 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; +struct bus_dev_range { + int bus; + int dev_base; + int dev_limit; +}; + +static struct bus_dev_range bus_dev_ranges[] __initdata = { + { 0x00, 0x18, 0x20}, + { 0xff, 0x00, 0x20}, + { 0xfe, 0x00, 0x20} +}; + static struct resource gart_resource = { .name = "GART", .flags = IORESOURCE_MEM, @@ -120,33 +132,33 @@ static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) } /* Find a PCI capability */ -static __u32 __init find_cap(int num, int slot, int func, int cap) +static __u32 __init find_cap(int bus, int slot, int func, int cap) { int bytes; u8 pos; - if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) & PCI_STATUS_CAP_LIST)) return 0; - pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST); for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { u8 id; pos &= ~3; - id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); + id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID); if (id == 0xff) break; if (id == cap) return pos; - pos = read_pci_config_byte(num, slot, func, + pos = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_NEXT); } return 0; } /* Read a standard AGPv3 bridge header */ -static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) +static __u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) { u32 apsize; u32 apsizereg; @@ -155,8 +167,8 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) u64 aper; u32 old_order; - printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); - apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); + printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func); + apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); return 0; @@ -174,8 +186,8 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) if ((int)*order < 0) /* < 32MB */ *order = 0; - aper_low = read_pci_config(num, slot, func, 0x10); - aper_hi = read_pci_config(num, slot, func, 0x14); + aper_low = read_pci_config(bus, slot, func, 0x10); + aper_hi = read_pci_config(bus, slot, func, 0x14); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); /* @@ -213,15 +225,15 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) */ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) { - int num, slot, func; + int bus, slot, func; /* Poor man's PCI discovery */ - for (num = 0; num < 256; num++) { + for (bus = 0; bus < 256; bus++) { for (slot = 0; slot < 32; slot++) { for (func = 0; func < 8; func++) { u32 class, cap; u8 type; - class = read_pci_config(num, slot, func, + class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); if (class == 0xffffffff) break; @@ -230,17 +242,17 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) case PCI_CLASS_BRIDGE_HOST: case PCI_CLASS_BRIDGE_OTHER: /* needed? */ /* AGP bridge? */ - cap = find_cap(num, slot, func, + cap = find_cap(bus, slot, func, PCI_CAP_ID_AGP); if (!cap) break; *valid_agp = 1; - return read_agp(num, slot, func, cap, + return read_agp(bus, slot, func, cap, order); } /* No multi-function device? */ - type = read_pci_config_byte(num, slot, func, + type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE); if (!(type & 0x80)) break; @@ -280,38 +292,49 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - int fix, num; + int fix, slot; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; u64 aper_base = 0, last_aper_base = 0; int aper_enabled = 0, last_aper_enabled = 0; + int i; if (!early_pci_allowed()) return; fix = 0; - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - ctl = read_pci_config(0, num, 3, 0x90); - aper_enabled = ctl & 1; - aper_order = (ctl >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; - aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; - - if ((last_aper_order && aper_order != last_aper_order) || - (last_aper_base && aper_base != last_aper_base) || - (last_aper_enabled && aper_enabled != last_aper_enabled)) { - fix = 1; - break; + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + int bus; + int dev_base, dev_limit; + + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); + aper_enabled = ctl & AMD64_GARTEN; + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; + aper_base <<= 25; + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base) || + (last_aper_enabled && aper_enabled != last_aper_enabled)) { + fix = 1; + goto out; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + last_aper_enabled = aper_enabled; } - last_aper_order = aper_order; - last_aper_base = aper_base; - last_aper_enabled = aper_enabled; } +out: if (!fix && !aper_enabled) return; @@ -330,13 +353,22 @@ void __init early_gart_iommu_check(void) } /* different nodes have different setting, disable them all at first*/ - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + int bus; + int dev_base, dev_limit; + + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; - ctl = read_pci_config(0, num, 3, 0x90); - ctl &= ~1; - write_pci_config(0, num, 3, 0x90, ctl); + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); + ctl &= ~AMD64_GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + } } } @@ -348,8 +380,8 @@ void __init gart_iommu_hole_init(void) u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; - int fix, num, valid_agp = 0; - int node; + int fix, slot, valid_agp = 0; + int i, node; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) @@ -362,48 +394,58 @@ void __init gart_iommu_hole_init(void) fix = 0; node = 0; - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - iommu_detected = 1; - gart_iommu_aperture = 1; - - aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; - aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; - - printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", - node, aper_base, aper_size >> 20); - node++; - - if (!aperture_valid(aper_base, aper_size, 64<<20)) { - if (valid_agp && agp_aper_base && - agp_aper_base == aper_base && - agp_aper_order == aper_order) { - /* the same between two setting from NB and agp */ - if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { - printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); - printk(KERN_ERR "please increase GART size in your BIOS setup\n"); - printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); - printed_gart_size_msg = 1; + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + int bus; + int dev_base, dev_limit; + + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + iommu_detected = 1; + gart_iommu_aperture = 1; + + aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; + aper_base <<= 25; + + printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", + node, aper_base, aper_size >> 20); + node++; + + if (!aperture_valid(aper_base, aper_size, 64<<20)) { + if (valid_agp && agp_aper_base && + agp_aper_base == aper_base && + agp_aper_order == aper_order) { + /* the same between two setting from NB and agp */ + if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { + printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); + printk(KERN_ERR "please increase GART size in your BIOS setup\n"); + printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); + printed_gart_size_msg = 1; + } + } else { + fix = 1; + goto out; } - } else { - fix = 1; - break; } - } - if ((last_aper_order && aper_order != last_aper_order) || - (last_aper_base && aper_base != last_aper_base)) { - fix = 1; - break; + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base)) { + fix = 1; + goto out; + } + last_aper_order = aper_order; + last_aper_base = aper_base; } - last_aper_order = aper_order; - last_aper_base = aper_base; } +out: if (!fix && !fallback_aper_force) { if (last_aper_base) { unsigned long n = (32 * 1024 * 1024) << last_aper_order; @@ -452,16 +494,22 @@ void __init gart_iommu_hole_init(void) } /* Fix up the north bridges */ - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - /* - * Don't enable translation yet. That is done later. - * Assume this BIOS didn't initialise the GART so - * just overwrite all previous bits - */ - write_pci_config(0, num, 3, 0x90, aper_order<<1); - write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + int bus; + int dev_base, dev_limit; + + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + /* Don't enable translation yet. That is done later. + Assume this BIOS didn't initialise the GART so + just overwrite all previous bits */ + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); + } } } -- cgit v1.2.3 From 330fce23dab6e6a3d1979e55f27aba4c0c301331 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Apr 2008 01:31:45 -0700 Subject: x86: reserve dma32 early for gart fix we can use free_bootmem() directly. Signed-off-by: Yinghai Lu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a017f00e867..81862d0c7a9a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -92,7 +92,6 @@ void __init dma32_reserve_bootmem(void) } static void __init dma32_free_bootmem(void) { - int node; if (end_pfn <= MAX_DMA32_PFN) return; @@ -100,9 +99,7 @@ static void __init dma32_free_bootmem(void) if (!dma32_bootmem_ptr) return; - for_each_online_node(node) - free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr), - dma32_bootmem_size); + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size); dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; -- cgit v1.2.3 From 3bb6fbf9969a8bbe4892968659239273d092e78a Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 15 Apr 2008 12:43:57 +0200 Subject: x86 gart: factor out common code Cleanup gart handling on amd64 a bit: move common code into enable_gart_translation , and use symbolic register names where appropriate. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/pci-gart_64.c | 23 ++++++----------------- drivers/char/agp/amd64-agp.c | 29 +++++++++-------------------- include/asm-x86/gart.h | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index bffcf455c857..1f99b62ff616 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -533,8 +533,8 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) unsigned aper_size = 0, aper_base_32, aper_order; u64 aper_base; - pci_read_config_dword(dev, 0x94, &aper_base_32); - pci_read_config_dword(dev, 0x90, &aper_order); + pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); aper_order = (aper_order >> 1) & 7; aper_base = aper_base_32 & 0x7fff; @@ -592,19 +592,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; for (i = 0; i < num_k8_northbridges; i++) { - u32 gatt_reg; - u32 ctl; - dev = k8_northbridges[i]; - gatt_reg = __pa(gatt) >> 12; - gatt_reg <<= 4; - pci_write_config_dword(dev, AMD64_GARTTABLEBASE, gatt_reg); - pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - - ctl |= GARTEN; - ctl &= ~(DISGARTCPU | DISGARTIO); - - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + enable_gart_translation(dev, __pa(gatt)); } flush_gart(); @@ -648,11 +637,11 @@ void gart_iommu_shutdown(void) u32 ctl; dev = k8_northbridges[i]; - pci_read_config_dword(dev, 0x90, &ctl); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - ctl &= ~1; + ctl &= ~GARTEN; - pci_write_config_dword(dev, 0x90, ctl); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } } diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 9c24470a8252..e3c7ea07f57c 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -150,25 +150,14 @@ static u64 amd64_configure(struct pci_dev *hammer, u64 gatt_table) { u64 aperturebase; u32 tmp; - u64 addr, aper_base; + u64 aper_base; /* Address to map to */ - pci_read_config_dword (hammer, AMD64_GARTAPERTUREBASE, &tmp); + pci_read_config_dword(hammer, AMD64_GARTAPERTUREBASE, &tmp); aperturebase = tmp << 25; aper_base = (aperturebase & PCI_BASE_ADDRESS_MEM_MASK); - /* address of the mappings table */ - addr = (u64) gatt_table; - addr >>= 12; - tmp = (u32) addr<<4; - tmp &= ~0xf; - pci_write_config_dword(hammer, AMD64_GARTTABLEBASE, tmp); - - /* Enable GART translation for this hammer. */ - pci_read_config_dword(hammer, AMD64_GARTAPERTURECTL, &tmp); - tmp |= GARTEN; - tmp &= ~(DISGARTCPU | DISGARTIO); - pci_write_config_dword(hammer, AMD64_GARTAPERTURECTL, tmp); + enable_gart_translation(hammer, gatt_table); return aper_base; } @@ -207,9 +196,9 @@ static void amd64_cleanup(void) for (i = 0; i < num_k8_northbridges; i++) { struct pci_dev *dev = k8_northbridges[i]; /* disable gart translation */ - pci_read_config_dword (dev, AMD64_GARTAPERTURECTL, &tmp); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); tmp &= ~AMD64_GARTEN; - pci_write_config_dword (dev, AMD64_GARTAPERTURECTL, tmp); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp); } } @@ -289,9 +278,9 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, u32 nb_order, nb_base; u16 apsize; - pci_read_config_dword(nb, 0x90, &nb_order); + pci_read_config_dword(nb, AMD64_GARTAPERTURECTL, &nb_order); nb_order = (nb_order >> 1) & 7; - pci_read_config_dword(nb, 0x94, &nb_base); + pci_read_config_dword(nb, AMD64_GARTAPERTUREBASE, &nb_base); nb_aper = nb_base << 25; if (aperture_valid(nb_aper, (32*1024*1024)<> 25); + pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1); + pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25); return 0; } diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h index 248e5778e928..6f22786d2f0c 100644 --- a/include/asm-x86/gart.h +++ b/include/asm-x86/gart.h @@ -52,4 +52,21 @@ static inline void gart_iommu_shutdown(void) #define AMD64_GARTCACHECTL 0x9c #define AMD64_GARTEN (1<<0) +static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) +{ + u32 tmp, ctl; + + /* address of the mappings table */ + addr >>= 12; + tmp = (u32) addr<<4; + tmp &= ~0xf; + pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); + + /* Enable GART translation for this hammer. */ + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + ctl |= GARTEN; + ctl &= ~(DISGARTCPU | DISGARTIO); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); +} + #endif -- cgit v1.2.3 From f784946ded3e734524d1f48a6a8286b8a152c3b9 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 2 May 2008 16:45:08 -0700 Subject: x86: use per_cpu data in nmi_32 use per_cpu for per CPU data. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_32.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 11b14bbaa61e..69bdae555c18 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -293,9 +293,9 @@ void stop_apic_nmi_watchdog(void *unused) * here too!] */ -static unsigned int - last_irq_sums [NR_CPUS], - alert_counter [NR_CPUS]; +static DEFINE_PER_CPU(unsigned, last_irq_sum); +static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(int, nmi_touch); void touch_nmi_watchdog(void) { @@ -303,12 +303,13 @@ void touch_nmi_watchdog(void) unsigned cpu; /* - * Just reset the alert counters, (other CPUs might be - * spinning on locks we hold): + * Tell other CPUs to reset their alert counters. We cannot + * do it ourselves because the alert count increase is not + * atomic. */ for_each_present_cpu(cpu) { - if (alert_counter[cpu]) - alert_counter[cpu] = 0; + if (per_cpu(nmi_touch, cpu) != 1) + per_cpu(nmi_touch, cpu) = 1; } } @@ -358,22 +359,26 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) */ sum = per_cpu(irq_stat, cpu).apic_timer_irqs + per_cpu(irq_stat, cpu).irq0_irqs; + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && last_irq_sums[cpu] == sum) { + if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) + local_inc(&__get_cpu_var(alert_counter)); + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); } else { - last_irq_sums[cpu] = sum; - alert_counter[cpu] = 0; + __get_cpu_var(last_irq_sum) = sum; + local_set(&__get_cpu_var(alert_counter), 0); } /* see if the nmi watchdog went off */ if (!__get_cpu_var(wd_enabled)) -- cgit v1.2.3 From f59a9310b97879159ef4d25227bc270278f1ee71 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 2 May 2008 16:44:52 -0700 Subject: x86: nmi and irq counters are unsigned in nmi_64.c __nmi_count, apic_timer_irqs and irq0_irqs are unsigned. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 5a29ded994fa..33cb8ecbb6dd 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -79,7 +79,7 @@ static __init void nmi_cpu_busy(void *data) int __init check_nmi_watchdog(void) { - int *prev_nmi_count; + unsigned int *prev_nmi_count; int cpu; if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) @@ -316,7 +316,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog); notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { - int sum; + unsigned int sum; int touched = 0; int cpu = smp_processor_id(); int rc = 0; -- cgit v1.2.3 From 205f93288093df69f9ab5f6981aef27b91088b28 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 5 May 2008 17:52:52 -0400 Subject: x86: add new cache descriptor The latest rev of Intel doc AP-485 details a new cache descriptor that we don't yet support. A 6MB 24-way assoc L2 cache. Signed-off-by: Dave Jones Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_cacheinfo.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 26d615dcb149..2c8afafa18e8 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -62,6 +62,7 @@ static struct _cache_table cache_table[] __cpuinitdata = { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ + { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ -- cgit v1.2.3 From 5119e92efc733d730b34f9605a5ae61fdc4bf649 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 15 May 2008 09:12:01 -0600 Subject: x86: cdev lock_kernel() pushdown Push the cdev lock_kernel() call down into the x86 msr and cpuid drivers. Signed-off-by: Jonathan Corbet --- arch/x86/kernel/cpuid.c | 25 +++++++++++++++++-------- arch/x86/kernel/msr.c | 16 ++++++++++++---- 2 files changed, 29 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index daff52a62248..71f1c2654bec 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -107,15 +108,23 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, static int cpuid_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ + unsigned int cpu; + struct cpuinfo_x86 *c; + int ret = 0; + + lock_kernel(); + + cpu = iminor(file->f_path.dentry->d_inode); + if (cpu >= NR_CPUS || !cpu_online(cpu)) { + ret = -ENXIO; /* No such CPU */ + goto out; + } + c = &cpu_data(cpu); if (c->cpuid_level < 0) - return -EIO; /* CPUID not supported */ - - return 0; + ret = -EIO; /* CPUID not supported */ +out: + unlock_kernel(); + return ret; } /* diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 1f3abe048e93..a153b3905f60 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -117,12 +117,20 @@ static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); struct cpuinfo_x86 *c = &cpu_data(cpu); + int ret = 0; - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ - if (!cpu_has(c, X86_FEATURE_MSR)) - return -EIO; /* MSR not supported */ + lock_kernel(); + cpu = iminor(file->f_path.dentry->d_inode); + if (cpu >= NR_CPUS || !cpu_online(cpu)) { + ret = -ENXIO; /* No such CPU */ + goto out; + } + c = &cpu_data(cpu); + if (!cpu_has(c, X86_FEATURE_MSR)) + ret = -EIO; /* MSR not supported */ +out: + unlock_kernel(); return 0; } -- cgit v1.2.3 From 0abbc78a0137fee60ef092f0b20a3d3d7e7e0cc2 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 20 May 2008 16:27:17 +0200 Subject: x86, aperture_64: use symbolic constants Factor-out common aperture_valid code. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/aperture_64.c | 23 +---------------------- drivers/char/agp/amd64-agp.c | 22 ++++------------------ include/asm-x86/gart.h | 24 ++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 02f4dbaa4df4..5373f7834d8a 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -109,27 +109,6 @@ static u32 __init allocate_aperture(void) return (u32)__pa(p); } -static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) -{ - if (!aper_base) - return 0; - - if (aper_base + aper_size > 0x100000000UL) { - printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n"); - return 0; - } - if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n"); - return 0; - } - if (aper_size < min_size) { - printk(KERN_ERR "Aperture too small (%d MB) than (%d MB)\n", - aper_size>>20, min_size>>20); - return 0; - } - - return 1; -} /* Find a PCI capability */ static __u32 __init find_cap(int bus, int slot, int func, int cap) @@ -344,7 +323,7 @@ out: if (gart_fix_e820 && !fix && aper_enabled) { if (!e820_all_mapped(aper_base, aper_base + aper_size, E820_RESERVED)) { - /* reserved it, so we can resuse it in second kernel */ + /* reserve it, so we can reuse it in second kernel */ printk(KERN_INFO "update e820 for GART\n"); add_memory_region(aper_base, aper_size, E820_RESERVED); update_e820(); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index e3c7ea07f57c..f5af65ac8c78 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -228,24 +228,10 @@ static const struct agp_bridge_driver amd_8151_driver = { }; /* Some basic sanity checks for the aperture. */ -static int __devinit aperture_valid(u64 aper, u32 size) +static int __devinit agp_aperture_valid(u64 aper, u32 size) { - if (aper == 0) { - printk(KERN_ERR PFX "No aperture\n"); + if (!aperture_valid(aper, size, 32*1024*1024)) return 0; - } - if ((u64)aper + size > 0x100000000ULL) { - printk(KERN_ERR PFX "Aperture out of bounds\n"); - return 0; - } - if (e820_any_mapped(aper, aper + size, E820_RAM)) { - printk(KERN_ERR PFX "Aperture pointing to RAM\n"); - return 0; - } - if (size < 32*1024*1024) { - printk(KERN_ERR PFX "Aperture too small (%d MB)\n", size>>20); - return 0; - } /* Request the Aperture. This catches cases when someone else already put a mapping in there - happens with some very broken BIOS @@ -282,7 +268,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, nb_order = (nb_order >> 1) & 7; pci_read_config_dword(nb, AMD64_GARTAPERTUREBASE, &nb_base); nb_aper = nb_base << 25; - if (aperture_valid(nb_aper, (32*1024*1024)< + extern void pci_iommu_shutdown(void); extern void no_iommu_init(void); extern int force_iommu, no_iommu; @@ -69,4 +71,26 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } +static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) +{ + if (!aper_base) + return 0; + + if (aper_base + aper_size > 0x100000000ULL) { + printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n"); + return 0; + } + if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n"); + return 0; + } + if (aper_size < min_size) { + printk(KERN_ERR "Aperture too small (%d MB) than (%d MB)\n", + aper_size>>20, min_size>>20); + return 0; + } + + return 1; +} + #endif -- cgit v1.2.3 From 0748aca6f0af917591dcfd70dccac770a25d4f71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 May 2008 10:34:20 +0200 Subject: x86: remove useless static current_tsc_khz variable current_tsc_khz is just written by the init code and never used again. Remove it. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_32.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index e4790728b224..d53b1040cbb7 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -283,7 +283,6 @@ core_initcall(cpufreq_tsc); /* clock source code */ -static unsigned long current_tsc_khz; static struct clocksource clocksource_tsc; /* @@ -434,9 +433,8 @@ void __init tsc_init(void) unsynchronized_tsc(); check_geode_tsc_reliable(); - current_tsc_khz = tsc_khz; - clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, - clocksource_tsc.shift); + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, + clocksource_tsc.shift); /* lower the rating if we already know its unstable: */ if (check_tsc_unstable()) { clocksource_tsc.rating = 0; -- cgit v1.2.3 From b478458aeebfc55fe409abec43794ac72a623c79 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:39 +0200 Subject: x86: avoid re-loading LDT in unrelated address spaces Performance optimization. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ldt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 0224c3637c73..21f2bae98c15 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -20,9 +20,9 @@ #include #ifdef CONFIG_SMP -static void flush_ldt(void *null) +static void flush_ldt(void *current_mm) { - if (current->active_mm) + if (current->active_mm == current_mm) load_LDT(¤t->active_mm->context); } #endif @@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); + smp_call_function(flush_ldt, current->mm, 1, 1); preempt_enable(); #else load_LDT(pc); -- cgit v1.2.3 From 873b274a41c0cfe58b2eb0a7722424eb367b905b Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 22 May 2008 13:02:23 -0400 Subject: x86: Add Centaur and Transmeta CPUs to PAT whitelist Unconditionally enable PAT support on Centaur and Transmeta CPUs. All known models that advertise PAT have no known errata. Signed-off-by: Dave Jones Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index c2e1ce33c7cb..d8b3e4a9d663 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -62,6 +62,9 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) return; break; + case X86_VENDOR_CENTAUR: + case X86_VENDOR_TRANSMETA: + return; } pat_disable(cpu_has_pat ? -- cgit v1.2.3 From 23adec554a7648f99c8acc0caf49c66320cd2b84 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:41 +0200 Subject: x86: add notrace annotations to vsyscall. Add the notrace annotations to the vsyscall functions - there we are not in kernel context yet, so the tracer function cannot (and must not) be called. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 3 ++- arch/x86/vdso/vclock_gettime.c | 15 ++++++++------- arch/x86/vdso/vgetcpu.c | 3 ++- include/asm-x86/vsyscall.h | 3 ++- 4 files changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 61efa2f7d564..4063dfa2a02d 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -42,7 +42,8 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) \ + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","cx","memory" /* diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 23476c2ebfc4..5cb8f754c52d 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -23,7 +23,7 @@ #define gtod vdso_vsyscall_gtod_data -static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; asm("syscall" : "=a" (ret) : @@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts) return ret; } -static inline long vgetns(void) +notrace static inline long vgetns(void) { long v; cycles_t (*vread)(void); @@ -40,7 +40,7 @@ static inline long vgetns(void) return (v * gtod->clock.mult) >> gtod->clock.shift; } -static noinline int do_realtime(struct timespec *ts) +notrace static noinline int do_realtime(struct timespec *ts) { unsigned long seq, ns; do { @@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts) } /* Copy of the version in kernel/time.c which we cannot directly access */ -static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +notrace static void +vset_normalized_timespec(struct timespec *ts, long sec, long nsec) { while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; @@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) ts->tv_nsec = nsec; } -static noinline int do_monotonic(struct timespec *ts) +notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; do { @@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts) return 0; } -int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled && gtod->clock.vread)) switch (clock) { @@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index c8097f17f8a9..9fbc6b20026b 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -13,7 +13,8 @@ #include #include "vextern.h" -long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; diff --git a/include/asm-x86/vsyscall.h b/include/asm-x86/vsyscall.h index 17b3700949bf..6b66ff905af0 100644 --- a/include/asm-x86/vsyscall.h +++ b/include/asm-x86/vsyscall.h @@ -24,7 +24,8 @@ enum vsyscall_num { ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) #define __section_vsyscall_clock __attribute__ \ ((unused, __section__ (".vsyscall_clock"),aligned(16))) -#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) +#define __vsyscall_fn \ + __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 -- cgit v1.2.3 From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: add basic support for gcc profiler instrumentation If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is set to a non-zero value the ftrace routine will be called everytime we enter a kernel function that is not marked with the "notrace" attribute. The ftrace routine will then call a registered function if a function happens to be registered. [ This code has been highly hacked by Steven Rostedt and Ingo Molnar, so don't blame Arnaldo for all of this ;-) ] Update: It is now possible to register more than one ftrace function. If only one ftrace function is registered, that will be the function that ftrace calls directly. If more than one function is registered, then ftrace will call a function that will loop through the functions to call. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Makefile | 4 ++ arch/x86/Kconfig | 1 + arch/x86/kernel/entry_32.S | 27 +++++++++ arch/x86/kernel/entry_64.S | 37 ++++++++++++ include/linux/ftrace.h | 38 +++++++++++++ kernel/Makefile | 1 + kernel/trace/Kconfig | 5 ++ kernel/trace/Makefile | 3 + kernel/trace/ftrace.c | 138 +++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 2 + 10 files changed, 256 insertions(+) create mode 100644 include/linux/ftrace.h create mode 100644 kernel/trace/Kconfig create mode 100644 kernel/trace/Makefile create mode 100644 kernel/trace/ftrace.c (limited to 'arch/x86/kernel') diff --git a/Makefile b/Makefile index 20b32351906b..b4a273f19b52 100644 --- a/Makefile +++ b/Makefile @@ -528,6 +528,10 @@ KBUILD_CFLAGS += -g KBUILD_AFLAGS += -gdwarf-2 endif +ifdef CONFIG_FTRACE +KBUILD_CFLAGS += -pg +endif + # We trigger additional mismatches with less inlining ifdef CONFIG_DEBUG_SECTION_MISMATCH KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe361ae7ef2f..c742dfeb0dbe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2a609dc3271c..f47b9b5440d2 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1109,6 +1109,33 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ +#ifdef CONFIG_FTRACE +ENTRY(mcount) + cmpl $ftrace_stub, ftrace_trace_function + jnz trace + +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + + jmp ftrace_stub +END(mcount) +#endif + .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df522a7..f046e0c64883 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -54,6 +54,43 @@ .code64 +#ifdef CONFIG_FTRACE +ENTRY(mcount) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + retq + +trace: + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + + call *ftrace_trace_function + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + jmp ftrace_stub +END(mcount) +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h new file mode 100644 index 000000000000..b96ef14c249a --- /dev/null +++ b/include/linux/ftrace.h @@ -0,0 +1,38 @@ +#ifndef _LINUX_FTRACE_H +#define _LINUX_FTRACE_H + +#ifdef CONFIG_FTRACE + +#include + +#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) + +typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); + +struct ftrace_ops { + ftrace_func_t func; + struct ftrace_ops *next; +}; + +/* + * The ftrace_ops must be a static and should also + * be read_mostly. These functions do modify read_mostly variables + * so use them sparely. Never free an ftrace_op or modify the + * next pointer after it has been registered. Even after unregistering + * it, the next pointer may still be used internally. + */ +int register_ftrace_function(struct ftrace_ops *ops); +int unregister_ftrace_function(struct ftrace_ops *ops); +void clear_ftrace_function(void); + +extern void ftrace_stub(unsigned long a0, unsigned long a1); +extern void mcount(void); + +#else /* !CONFIG_FTRACE */ +# define register_ftrace_function(ops) do { } while (0) +# define unregister_ftrace_function(ops) do { } while (0) +# define clear_ftrace_function(ops) do { } while (0) +#endif /* CONFIG_FTRACE */ +#endif /* _LINUX_FTRACE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9d..fa05f6d8bdbf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_FTRACE) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 000000000000..8185c91417bc --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,5 @@ +# +# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# +config HAVE_FTRACE + bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 000000000000..bf4fd215a6a9 --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FTRACE) += libftrace.o + +libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 000000000000..b6a80b98a3fb --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,138 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2004-2008 Ingo Molnar + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include + +static DEFINE_SPINLOCK(ftrace_func_lock); +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; + +/* mcount is defined per arch in assembly */ +EXPORT_SYMBOL(mcount); + +notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op = ftrace_list; + + /* in case someone actually ports this to alpha! */ + read_barrier_depends(); + + while (op != &ftrace_list_end) { + /* silly alpha */ + read_barrier_depends(); + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + + spin_lock_irqsave(&ftrace_func_lock, flags); + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the ftrace_list. + */ + smp_wmb(); + ftrace_list = ops; + /* + * For one func, simply call it directly. + * For more than one func, call the chain. + */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return 0; +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + struct ftrace_ops **p; + int ret = 0; + + spin_lock_irqsave(&ftrace_func_lock, flags); + + /* + * If we are the only function, then the ftrace pointer is + * pointing directly to that function. + */ + if (ftrace_list == ops && ops->next == &ftrace_list_end) { + ftrace_trace_function = ftrace_stub; + ftrace_list = &ftrace_list_end; + goto out; + } + + for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) { + ret = -1; + goto out; + } + + *p = (*p)->next; + + /* If we only have one func left, then call that directly */ + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + + out: + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return 0; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; +} diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d2099f41aa1e..d8b6279a9b42 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -634,6 +634,8 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. +source kernel/trace/Kconfig + config PROVIDE_OHCI1394_DMA_INIT bool "Remote debugging over FireWire early on boot" depends on PCI && X86 -- cgit v1.2.3 From 81d68a96a39844853b37f20cc8282d9b65b78ef3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace irq disabled critical timings This patch adds latency tracing for critical timings (how long interrupts are disabled for). "irqsoff" is added to /debugfs/tracing/available_tracers Note: tracing_max_latency also holds the max latency for irqsoff (in usecs). (default to large number so one must start latency tracing) tracing_thresh threshold (in usecs) to always print out if irqs off is detected to be longer than stated here. If irq_thresh is non-zero, then max_irq_latency is ignored. Here's an example of a trace with ftrace_enabled = 0 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 100 us, #3/3, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1d.s3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1d.s3 100us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1d.s3 100us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= And this is a trace with ftrace_enabled == 1 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 -------------------------------------------------------------------- latency: 102 us, #12/12, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1dNs3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_read_phy_reg+0x16/0x225 [e1000] (e1000_update_stats+0x5e2/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_swfw_sync_acquire+0x10/0x99 [e1000] (e1000_read_phy_reg+0x49/0x225 [e1000]) swapper-0 1dNs3 46us : e1000_get_hw_eeprom_semaphore+0x12/0xa6 [e1000] (e1000_swfw_sync_acquire+0x36/0x99 [e1000]) swapper-0 1dNs3 47us : __const_udelay+0x9/0x47 (e1000_read_phy_reg+0x116/0x225 [e1000]) swapper-0 1dNs3 47us+: __delay+0x9/0x50 (__const_udelay+0x45/0x47) swapper-0 1dNs3 97us : preempt_schedule+0xc/0x84 (__delay+0x4e/0x50) swapper-0 1dNs3 98us : e1000_swfw_sync_release+0xc/0x55 [e1000] (e1000_read_phy_reg+0x211/0x225 [e1000]) swapper-0 1dNs3 99us+: e1000_put_hw_eeprom_semaphore+0x9/0x35 [e1000] (e1000_swfw_sync_release+0x50/0x55 [e1000]) swapper-0 1dNs3 101us : _spin_unlock_irqrestore+0xe/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 3 + arch/x86/lib/Makefile | 1 + arch/x86/lib/thunk_32.S | 47 +++++ arch/x86/lib/thunk_64.S | 19 +- include/asm-x86/irqflags.h | 24 +-- include/linux/ftrace.h | 8 + include/linux/irqflags.h | 12 +- kernel/fork.c | 2 +- kernel/lockdep.c | 23 ++- kernel/printk.c | 2 + kernel/trace/Kconfig | 18 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 402 +++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 531 insertions(+), 31 deletions(-) create mode 100644 arch/x86/lib/thunk_32.S create mode 100644 kernel/trace/trace_irqsoff.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988b..dd349c92f051 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -165,7 +165,10 @@ void cpu_idle(void) */ local_irq_disable(); enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 76f60f52a885..84aa2883fe15 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SMP) := msr-on-cpu.o lib-y := delay_$(BITS).o +lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o lib-y += memcpy_$(BITS).o diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S new file mode 100644 index 000000000000..650b11e00ecc --- /dev/null +++ b/arch/x86/lib/thunk_32.S @@ -0,0 +1,47 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + + #include + +#define ARCH_TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#ifdef CONFIG_TRACE_IRQFLAGS + /* put return address in eax (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + call \func + popl %edx + popl %ecx + popl %eax + ret + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#endif diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index e009251d4e9f..bf9a7d5a5428 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -2,6 +2,7 @@ * Save registers before calling assembly functions. This avoids * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. * Subject to the GNU public license, v.2. No warranty of any kind. */ @@ -42,8 +43,22 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + /* put return address in rdi (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + /* SAVE_ARGS pushs 9 elements */ + /* the next element would be the rip */ + movq 9*8(%rsp), %rdi + call \func + jmp restore + CFI_ENDPROC + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index c242527f970e..24d71b1eb189 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -179,8 +179,6 @@ static inline void trace_hardirqs_fixup(void) * have a reliable stack. x86_64 only. */ #define SWAPGS_UNSAFE_STACK swapgs -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ @@ -192,24 +190,6 @@ static inline void trace_hardirqs_fixup(void) TRACE_IRQS_OFF; #else -#define ARCH_TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - #define ARCH_LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ @@ -223,8 +203,8 @@ static inline void trace_hardirqs_fixup(void) #endif #ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else # define TRACE_IRQS_ON # define TRACE_IRQS_OFF diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db8a5e7abe41..0a20445dcbcc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -50,4 +50,12 @@ extern void mcount(void); # define CALLER_ADDR5 0UL #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); +#else +# define time_hardirqs_on(a0, a1) do { } while (0) +# define time_hardirqs_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index e600c4e9b8c5..5b711d4e9fd9 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -12,10 +12,10 @@ #define _LINUX_TRACE_IRQFLAGS_H #ifdef CONFIG_TRACE_IRQFLAGS - extern void trace_hardirqs_on(void); - extern void trace_hardirqs_off(void); extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); + extern void trace_hardirqs_on(void); + extern void trace_hardirqs_off(void); # define trace_hardirq_context(p) ((p)->hardirq_context) # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) @@ -41,6 +41,14 @@ # define INIT_TRACE_IRQFLAGS #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void stop_critical_timings(void); + extern void start_critical_timings(void); +#else +# define stop_critical_timings() do { } while (0) +# define start_critical_timings() do { } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #include diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf80..d66d676dc362 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a3f087..e21924365ea3 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -982,7 +983,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) return 1; } -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -1680,7 +1681,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2013,11 +2014,13 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void notrace trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; + time_hardirqs_on(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2055,16 +2058,23 @@ void trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } +EXPORT_SYMBOL(trace_hardirqs_on_caller); +void notrace trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void notrace trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; + time_hardirqs_off(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2082,7 +2092,12 @@ void trace_hardirqs_off(void) } else debug_atomic_inc(&redundant_hardirqs_off); } +EXPORT_SYMBOL(trace_hardirqs_off_caller); +void notrace trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_off); /* diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3b..ae7d5b9e535d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1041,7 +1041,9 @@ void release_console_sem(void) _log_end = log_end; con_start = log_end; /* Flush */ spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); + start_critical_timings(); local_irq_restore(flags); } console_locked = 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 892ecc94a82b..896df1cf6adc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -26,6 +26,24 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5508cdb19aea..46be8647fb65 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 000000000000..a9131b0cf1a5 --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,402 @@ +/* + * trace irqs off criticall timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void notrace +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (likely(!tracer_enabled)) + return; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = irqsoff_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? + */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void notrace +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, T2, delta; + unsigned long flags; + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = now(cpu); + + delta = T2-T0; + + latency = nsecs_to_usecs(delta); + + if (data->critical_sequence != max_sequence) + goto out; + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + data->critical_end = parent_ip; + + update_max_tr_single(tr, current, cpu); + + if (tracing_thresh) + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh), t0); + else + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol(KERN_CONT "<%s>\n", data->critical_start); + printk(KERN_CONT " => ended at timestamp %lu: ", t1); + print_symbol(KERN_CONT "<%s>\n", data->critical_end); + dump_stack(); + t1 = nsecs_to_usecs(now(cpu)); + printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + + max_sequence++; + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + tracing_reset(data); + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); +} + +static inline void notrace +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + data->critical_start = parent_ip; + tracing_reset(data); + + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static inline void notrace +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + check_critical_timing(tr, data, parent_ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +void notrace start_critical_timings(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +void notrace stop_critical_timings(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +#ifdef CONFIG_PROVE_LOCKING +void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(a0, a1); +} + +void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(a0, a1); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void notrace trace_hardirqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void notrace trace_hardirqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ + +static void start_irqsoff_tracer(struct trace_array *tr) +{ + tracer_enabled = 1; + register_ftrace_function(&trace_ops); +} + +static void stop_irqsoff_tracer(struct trace_array *tr) +{ + unregister_ftrace_function(&trace_ops); + tracer_enabled = 0; +} + +static void irqsoff_tracer_init(struct trace_array *tr) +{ + irqsoff_trace = tr; + /* make sure that the tracer is visibel */ + smp_wmb(); + + if (tr->ctrl) + start_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_irqsoff_tracer(tr); + else + stop_irqsoff_tracer(tr); +} + +static void notrace irqsoff_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_irqsoff_tracer(iter->tr); +} + +static void notrace irqsoff_tracer_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + start_irqsoff_tracer(iter->tr); +} + +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +__init static int init_irqsoff_tracer(void) +{ + register_tracer(&irqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); -- cgit v1.2.3 From 6cd8a4bb2f97527a9ceb30bc77ea4e959c6a95e3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace preempt off critical timings Add preempt off timings. A lot of kernel core code is taken from the RT patch latency trace that was written by Ingo Molnar. This adds "preemptoff" and "preemptirqsoff" to /debugfs/tracing/available_tracers Now instead of just tracing irqs off, preemption off can be selected to be recorded. When this is selected, it shares the same files as irqs off timings. One can either trace preemption off, irqs off, or one or the other off. By echoing "preemptoff" into /debugfs/tracing/current_tracer, recording of preempt off only is performed. "irqsoff" will only record the time irqs are disabled, but "preemptirqsoff" will take the total time irqs or preemption are disabled. Runtime switching of these options is now supported by simpling echoing in the appropriate trace name into /debugfs/tracing/current_tracer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_32.c | 3 + include/linux/ftrace.h | 8 ++ include/linux/irqflags.h | 3 +- include/linux/preempt.h | 2 +- kernel/sched.c | 24 +++++- kernel/trace/Kconfig | 25 ++++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 184 +++++++++++++++++++++++++++++++------------ 8 files changed, 197 insertions(+), 53 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60d..a30aa1f2607a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -185,7 +185,10 @@ void cpu_idle(void) local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0a20445dcbcc..740c97dcf9cb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,4 +58,12 @@ extern void mcount(void); # define time_hardirqs_off(a0, a1) do { } while (0) #endif +#ifdef CONFIG_PREEMPT_TRACER + extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1); + extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1); +#else +# define trace_preempt_on(a0, a1) do { } while (0) +# define trace_preempt_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5b711d4e9fd9..2b1c2e58566e 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -41,7 +41,8 @@ # define INIT_TRACE_IRQFLAGS #endif -#ifdef CONFIG_IRQSOFF_TRACER +#if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); extern void start_critical_timings(void); #else diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 36b03d50bf40..72b1a10a59b6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,7 +10,7 @@ #include #include -#ifdef CONFIG_DEBUG_PREEMPT +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else diff --git a/kernel/sched.c b/kernel/sched.c index 73e600852365..328494e28df2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4365,26 +4366,44 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ @@ -4396,7 +4415,10 @@ void __kprobes sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 896df1cf6adc..6430016b98e8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,6 +44,31 @@ config IRQSOFF_TRACER echo 0 > /debugfs/tracing/tracing_max_latency + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on GENERIC_TIME + depends on PREEMPT + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in preemption off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46be8647fb65..3fec653d6533 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a9131b0cf1a5..8b1231633dc5 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -21,6 +21,36 @@ static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; +static DEFINE_PER_CPU(int, tracing_cpu); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +#ifdef CONFIG_PREEMPT_TRACER +static inline int notrace +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int notrace +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -44,14 +74,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (likely(!tracer_enabled)) + if (likely(!__get_cpu_var(tracing_cpu))) return; local_save_flags(flags); - if (!irqs_disabled_flags(flags)) - return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -171,23 +198,29 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (likely(!tracer_enabled)) return; + if (__get_cpu_var(tracing_cpu)) + return; + cpu = raw_smp_processor_id(); data = tr->data[cpu]; if (unlikely(!data) || unlikely(!data->trace) || - data->critical_start || atomic_read(&data->disabled)) + atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); data->critical_sequence = max_sequence; data->preempt_timestamp = now(cpu); - data->critical_start = parent_ip; + data->critical_start = parent_ip ? : ip; tracing_reset(data); local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + __get_cpu_var(tracing_cpu) = 1; + atomic_dec(&data->disabled); } @@ -199,7 +232,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(__get_cpu_var(tracing_cpu))) + __get_cpu_var(tracing_cpu) = 0; + else + return; + + if (!tracer_enabled) return; cpu = raw_smp_processor_id(); @@ -212,49 +251,35 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); ftrace(tr, data, ip, parent_ip, flags); - check_critical_timing(tr, data, parent_ip, cpu); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); } +/* start and stop critical timings used to for stoppage (in idle) */ void notrace start_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } void notrace stop_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +#ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); } @@ -289,49 +314,46 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void notrace trace_hardirqs_on(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_on); void notrace trace_hardirqs_off(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_off); void notrace trace_hardirqs_on_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); void notrace trace_hardirqs_off_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void notrace trace_preempt_on(unsigned long a0, unsigned long a1) +{ + stop_critical_timing(a0, a1); +} + +void notrace trace_preempt_off(unsigned long a0, unsigned long a1) +{ + start_critical_timing(a0, a1); +} +#endif /* CONFIG_PREEMPT_TRACER */ static void start_irqsoff_tracer(struct trace_array *tr) { @@ -345,7 +367,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) tracer_enabled = 0; } -static void irqsoff_tracer_init(struct trace_array *tr) +static void __irqsoff_tracer_init(struct trace_array *tr) { irqsoff_trace = tr; /* make sure that the tracer is visibel */ @@ -382,6 +404,13 @@ static void notrace irqsoff_tracer_close(struct trace_iterator *iter) start_irqsoff_tracer(iter->tr); } +#ifdef CONFIG_IRQSOFF_TRACER +static void irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + __irqsoff_tracer_init(tr); +} static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", @@ -392,10 +421,65 @@ static struct tracer irqsoff_tracer __read_mostly = .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, }; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static void preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static void preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif __init static int init_irqsoff_tracer(void) { - register_tracer(&irqsoff_tracer); + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); return 0; } -- cgit v1.2.3 From 3d0833953e1b98b79ddf491dd49229eef9baeac1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: dynamic enabling/disabling of function calls This patch adds a feature to dynamically replace the ftrace code with the jmps to allow a kernel with ftrace configured to run as fast as it can without it configured. The way this works, is on bootup (if ftrace is enabled), a ftrace function is registered to record the instruction pointer of all places that call the function. Later, if there's still any code to patch, a kthread is awoken (rate limited to at most once a second) that performs a stop_machine, and replaces all the code that was called with a jmp over the call to ftrace. It only replaces what was found the previous time. Typically the system reaches equilibrium quickly after bootup and there's no code patching needed at all. e.g. call ftrace /* 5 bytes */ is replaced with jmp 3f /* jmp is 2 bytes and we jump 3 forward */ 3: When we want to enable ftrace for function tracing, the IP recording is removed, and stop_machine is called again to replace all the locations of that were recorded back to the call of ftrace. When it is disabled, we replace the code back to the jmp. Allocation is done by the kthread. If the ftrace recording function is called, and we don't have any record slots available, then we simply skip that call. Once a second a new page (if needed) is allocated for recording new ftrace function calls. A large batch is allocated at boot up to get most of the calls there. Because we do this via stop_machine, we don't have to worry about another CPU executing a ftrace call as we modify it. But we do need to worry about NMI's so all functions that might be called via nmi must be annotated with notrace_nmi. When this code is configured in, the NMI code will not call notrace. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ftrace.c | 237 +++++++++++++++++++++++++++++++ include/linux/ftrace.h | 18 +++ kernel/trace/Kconfig | 17 +++ kernel/trace/ftrace.c | 356 ++++++++++++++++++++++++++++++++++++++++++----- 5 files changed, 597 insertions(+), 32 deletions(-) create mode 100644 arch/x86/kernel/ftrace.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..e142091524b0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 000000000000..5dd58136ef02 --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,237 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#include +#include +#include +#include +#include +#include + +#define CALL_BACK 5 + +#define JMPFWD 0x03eb + +static unsigned short ftrace_jmp = JMPFWD; + +struct ftrace_record { + struct dyn_ftrace rec; + int failed; +} __attribute__((packed)); + +struct ftrace_page { + struct ftrace_page *next; + int index; + struct ftrace_record records[]; +} __attribute__((packed)); + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +#define MCOUNT_ADDR ((long)(&mcount)) + +union ftrace_code_union { + char code[5]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + struct ftrace_record *rec; + unsigned short save; + + ip -= CALL_BACK; + save = *(short *)ip; + + /* If this was already converted, skip it */ + if (save == JMPFWD) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + rec = &ftrace_pages->records[ftrace_pages->index++]; + + return &rec->rec; +} + +static int notrace +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned short old = *(unsigned short *)old_code; + unsigned short new = *(unsigned short *)new_code; + unsigned short replaced; + int faulted = 0; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. + */ + asm volatile ( + "1: lock\n" + " cmpxchg %w3, (%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + " movl $1, %0\n" + "3: jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r"(faulted), "=a"(replaced) + : "r"(ip), "r"(new), "0"(faulted), "a"(old) + : "memory"); + sync_core(); + + if (replaced != old) + faulted = 2; + + return faulted; +} + +static int notrace ftrace_calc_offset(long ip) +{ + return (int)(MCOUNT_ADDR - ip); +} + +notrace void ftrace_code_disable(struct dyn_ftrace *rec) +{ + unsigned long ip; + union ftrace_code_union save; + struct ftrace_record *r = + container_of(rec, struct ftrace_record, rec); + + ip = rec->ip; + + save.e8 = 0xe8; + save.offset = ftrace_calc_offset(ip); + + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + + r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp); +} + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct ftrace_record *rec; + struct ftrace_page *pg; + unsigned long ip; + int i; + + if (saved) + old = (char *)&ftrace_jmp; + else + new = (char *)&ftrace_jmp; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + union ftrace_code_union calc; + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->failed) + continue; + + ip = rec->rec.ip; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip); + + if (saved) + new = calc.code; + else + old = calc.code; + + ip -= CALL_BACK; + + rec->failed = ftrace_modify_code(ip, old, new); + } + } + +} + +notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} + +notrace int ftrace_shutdown_arch_init(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 740c97dcf9cb..90dbc0ee2046 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -32,6 +32,24 @@ extern void mcount(void); # define clear_ftrace_function(ops) do { } while (0) #endif /* CONFIG_FTRACE */ +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_HASHBITS 10 +# define FTRACE_HASHSIZE (1< +#include +#include +#include +#include +#include #include +#include +#include +#include + +#include "trace.h" -static DEFINE_SPINLOCK(ftrace_func_lock); +static DEFINE_SPINLOCK(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -44,21 +53,21 @@ notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) } /** - * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. - * - * Register a function to be called by all functions in the - * kernel. + * clear_ftrace_function - reset the ftrace function * - * Note: @ops->func and all the functions it calls must be labeled - * with "notrace", otherwise it will go into a - * recursive loop. + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag */ -int register_ftrace_function(struct ftrace_ops *ops) +void clear_ftrace_function(void) { - unsigned long flags; + ftrace_trace_function = ftrace_stub; +} + +static int notrace __register_ftrace_function(struct ftrace_ops *ops) +{ + /* Should never be called by interrupts */ + spin_lock(&ftrace_lock); - spin_lock_irqsave(&ftrace_func_lock, flags); ops->next = ftrace_list; /* * We are entering ops into the ftrace_list but another @@ -68,6 +77,7 @@ int register_ftrace_function(struct ftrace_ops *ops) */ smp_wmb(); ftrace_list = ops; + /* * For one func, simply call it directly. * For more than one func, call the chain. @@ -76,28 +86,22 @@ int register_ftrace_function(struct ftrace_ops *ops) ftrace_trace_function = ops->func; else ftrace_trace_function = ftrace_list_func; - spin_unlock_irqrestore(&ftrace_func_lock, flags); + + spin_unlock(&ftrace_lock); return 0; } -/** - * unregister_ftrace_function - unresgister a function for profiling. - * @ops - ops structure that holds the function to unregister - * - * Unregister a function that was added to be called by ftrace profiling. - */ -int unregister_ftrace_function(struct ftrace_ops *ops) +static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) { - unsigned long flags; struct ftrace_ops **p; int ret = 0; - spin_lock_irqsave(&ftrace_func_lock, flags); + spin_lock(&ftrace_lock); /* - * If we are the only function, then the ftrace pointer is - * pointing directly to that function. + * If we are removing the last function, then simply point + * to the ftrace_stub. */ if (ftrace_list == ops && ops->next == &ftrace_list_end) { ftrace_trace_function = ftrace_stub; @@ -117,22 +121,310 @@ int unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) ftrace_trace_function = ftrace_list->func; out: - spin_unlock_irqrestore(&ftrace_func_lock, flags); + spin_unlock(&ftrace_lock); + + return ret; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; + +static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); + +static DEFINE_SPINLOCK(ftrace_shutdown_lock); +static DEFINE_MUTEX(ftraced_lock); + +static int ftraced_trigger; +static int ftraced_suspend; + +static int ftrace_record_suspend; + +static inline int +notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key) +{ + struct dyn_ftrace *p; + struct hlist_node *t; + int found = 0; + + hlist_for_each_entry(p, t, &ftrace_hash[key], node) { + if (p->ip == ip) { + found = 1; + break; + } + } + + return found; +} + +static inline void notrace +ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) +{ + hlist_add_head(&node->node, &ftrace_hash[key]); +} + +static void notrace +ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *node; + unsigned long flags; + unsigned long key; + int resched; + int atomic; + + resched = need_resched(); + preempt_disable_notrace(); + + /* We simply need to protect against recursion */ + __get_cpu_var(ftrace_shutdown_disable_cpu)++; + if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1) + goto out; + + if (unlikely(ftrace_record_suspend)) + goto out; + + key = hash_long(ip, FTRACE_HASHBITS); + + WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + + if (ftrace_ip_in_hash(ip, key)) + goto out; + + atomic = irqs_disabled(); + + spin_lock_irqsave(&ftrace_shutdown_lock, flags); + + /* This ip may have hit the hash before the lock */ + if (ftrace_ip_in_hash(ip, key)) + goto out_unlock; + + /* + * There's a slight race that the ftraced will update the + * hash and reset here. The arch alloc is responsible + * for seeing if the IP has already changed, and if + * it has, the alloc will fail. + */ + node = ftrace_alloc_shutdown_node(ip); + if (!node) + goto out_unlock; + + node->ip = ip; + + ftrace_add_hash(node, key); + + ftraced_trigger = 1; + + out_unlock: + spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + out: + __get_cpu_var(ftrace_shutdown_disable_cpu)--; + + /* prevent recursion with scheduler */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops ftrace_shutdown_ops __read_mostly = +{ + .func = ftrace_record_ip, +}; + + +static int notrace __ftrace_modify_code(void *data) +{ + void (*func)(void) = data; + + func(); + return 0; +} + +static void notrace ftrace_run_startup_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); +} + +static void notrace ftrace_run_shutdown_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); +} + +static void notrace ftrace_startup(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend++; + if (ftraced_suspend != 1) + goto out; + __unregister_ftrace_function(&ftrace_shutdown_ops); + + ftrace_run_startup_code(); + out: + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend--; + if (ftraced_suspend) + goto out; + + ftrace_run_shutdown_code(); + + __register_ftrace_function(&ftrace_shutdown_ops); + out: + mutex_unlock(&ftraced_lock); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int notrace __ftrace_update_code(void *ignore) +{ + struct dyn_ftrace *p; + struct hlist_head head; + struct hlist_node *t; + cycle_t start, stop; + int i; + + /* Don't be calling ftrace ops now */ + __unregister_ftrace_function(&ftrace_shutdown_ops); + + start = now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + /* No locks needed, the machine is stopped! */ + for (i = 0; i < FTRACE_HASHSIZE; i++) { + if (hlist_empty(&ftrace_hash[i])) + continue; + + head = ftrace_hash[i]; + INIT_HLIST_HEAD(&ftrace_hash[i]); + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry(p, t, &head, node) { + ftrace_code_disable(p); + ftrace_update_cnt++; + } + + } + + stop = now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + + __register_ftrace_function(&ftrace_shutdown_ops); return 0; } +static void notrace ftrace_update_code(void) +{ + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); +} + +static int notrace ftraced(void *ignore) +{ + unsigned long usecs; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + /* check once a second */ + schedule_timeout(HZ); + + mutex_lock(&ftraced_lock); + if (ftraced_trigger && !ftraced_suspend) { + ftrace_record_suspend++; + ftrace_update_code(); + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? "s" : ""); + WARN_ON_ONCE(1); + } + ftraced_trigger = 0; + ftrace_record_suspend--; + } + mutex_unlock(&ftraced_lock); + + ftrace_shutdown_replenish(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __init notrace ftrace_shutdown_init(void) +{ + struct task_struct *p; + int ret; + + ret = ftrace_shutdown_arch_init(); + if (ret) + return ret; + + p = kthread_run(ftraced, NULL, "ftraced"); + if (IS_ERR(p)) + return -1; + + __register_ftrace_function(&ftrace_shutdown_ops); + + return 0; +} + +core_initcall(ftrace_shutdown_init); +#else +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + /** - * clear_ftrace_function - reset the ftrace function + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. */ -void clear_ftrace_function(void) +int register_ftrace_function(struct ftrace_ops *ops) { - ftrace_trace_function = ftrace_stub; + ftrace_startup(); + + return __register_ftrace_function(ops); +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + ret = __unregister_ftrace_function(ops); + + if (ftrace_list == &ftrace_list_end) + ftrace_shutdown(); + + return ret; } -- cgit v1.2.3 From dfa60aba04dae7833d75b2e2be124bb7cfb8239f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: use nops instead of jmp This patch patches the call to mcount with nops instead of a jmp over the mcount call. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/alternative.c | 4 ++-- arch/x86/kernel/ftrace.c | 40 ++++++++++++++++++++++++---------------- include/asm-x86/alternative.h | 2 ++ 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 65c7857a90dd..de240ba2e288 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { #ifdef CONFIG_X86_64 extern char __vsyscall_0; -static inline const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; @@ -162,7 +162,7 @@ static const struct nop { { -1, NULL } }; -static const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { const unsigned char *const *noptable = intel_nops; int i; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5dd58136ef02..2e060c58b860 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -16,11 +16,12 @@ #include #include -#define CALL_BACK 5 +#include -#define JMPFWD 0x03eb +#define CALL_BACK 5 -static unsigned short ftrace_jmp = JMPFWD; +/* Long is fine, even if it is only 4 bytes ;-) */ +static long *ftrace_nop; struct ftrace_record { struct dyn_ftrace rec; @@ -55,13 +56,13 @@ static struct ftrace_page *ftrace_pages; notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) { struct ftrace_record *rec; - unsigned short save; + unsigned long save; ip -= CALL_BACK; - save = *(short *)ip; + save = *(long *)ip; /* If this was already converted, skip it */ - if (save == JMPFWD) + if (save == *ftrace_nop) return NULL; if (ftrace_pages->index == ENTRIES_PER_PAGE) { @@ -79,9 +80,10 @@ static int notrace ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { - unsigned short old = *(unsigned short *)old_code; - unsigned short new = *(unsigned short *)new_code; - unsigned short replaced; + unsigned replaced; + unsigned old = *(unsigned *)old_code; /* 4 bytes */ + unsigned new = *(unsigned *)new_code; /* 4 bytes */ + unsigned char newch = new_code[4]; int faulted = 0; /* @@ -94,7 +96,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, */ asm volatile ( "1: lock\n" - " cmpxchg %w3, (%2)\n" + " cmpxchg %3, (%2)\n" + " jnz 2f\n" + " movb %b4, 4(%2)\n" "2:\n" ".section .fixup, \"ax\"\n" " movl $1, %0\n" @@ -102,11 +106,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, ".previous\n" _ASM_EXTABLE(1b, 3b) : "=r"(faulted), "=a"(replaced) - : "r"(ip), "r"(new), "0"(faulted), "a"(old) + : "r"(ip), "r"(new), "r"(newch), + "0"(faulted), "a"(old) : "memory"); sync_core(); - if (replaced != old) + if (replaced != old && replaced != new) faulted = 2; return faulted; @@ -132,7 +137,7 @@ notrace void ftrace_code_disable(struct dyn_ftrace *rec) /* move the IP back to the start of the call */ ip -= CALL_BACK; - r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp); + r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop); } static void notrace ftrace_replace_code(int saved) @@ -144,9 +149,9 @@ static void notrace ftrace_replace_code(int saved) int i; if (saved) - old = (char *)&ftrace_jmp; + old = (char *)ftrace_nop; else - new = (char *)&ftrace_jmp; + new = (char *)ftrace_nop; for (pg = ftrace_pages_start; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { @@ -194,12 +199,15 @@ notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -notrace int ftrace_shutdown_arch_init(void) +notrace int __init ftrace_shutdown_arch_init(void) { + const unsigned char *const *noptable = find_nop_table(); struct ftrace_page *pg; int cnt; int i; + ftrace_nop = (unsigned long *)noptable[CALL_BACK]; + /* allocate a few pages */ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); if (!ftrace_pages_start) diff --git a/include/asm-x86/alternative.h b/include/asm-x86/alternative.h index 1f6a9ca10126..f6aa18eadf71 100644 --- a/include/asm-x86/alternative.h +++ b/include/asm-x86/alternative.h @@ -72,6 +72,8 @@ static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} #endif /* CONFIG_SMP */ +const unsigned char *const *find_nop_table(void); + /* * Alternative instructions for different CPU types or capabilities. * -- cgit v1.2.3 From 3c1720f00bb619302ba19d55986ab565e74d06db Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: move memory management out of arch code This patch moves the memory management of the ftrace records out of the arch code and into the generic code making the arch code simpler. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ftrace.c | 183 ++++++++--------------------------------------- include/linux/ftrace.h | 18 +++-- kernel/trace/ftrace.c | 154 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 192 insertions(+), 163 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2e060c58b860..b69795efa226 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -23,25 +23,6 @@ /* Long is fine, even if it is only 4 bytes ;-) */ static long *ftrace_nop; -struct ftrace_record { - struct dyn_ftrace rec; - int failed; -} __attribute__((packed)); - -struct ftrace_page { - struct ftrace_page *next; - int index; - struct ftrace_record records[]; -} __attribute__((packed)); - -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -#define MCOUNT_ADDR ((long)(&mcount)) - union ftrace_code_union { char code[5]; struct { @@ -50,33 +31,41 @@ union ftrace_code_union { } __attribute__((packed)); }; -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +notrace int ftrace_ip_converted(unsigned long ip) { - struct ftrace_record *rec; unsigned long save; ip -= CALL_BACK; save = *(long *)ip; - /* If this was already converted, skip it */ - if (save == *ftrace_nop) - return NULL; + return save == *ftrace_nop; +} - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) - return NULL; - ftrace_pages = ftrace_pages->next; - } +static int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} - rec = &ftrace_pages->records[ftrace_pages->index++]; +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; - return &rec->rec; + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. + */ + return calc.code; } -static int notrace +notrace int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -86,6 +75,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char newch = new_code[4]; int faulted = 0; + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -117,129 +109,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -static int notrace ftrace_calc_offset(long ip) -{ - return (int)(MCOUNT_ADDR - ip); -} - -notrace void ftrace_code_disable(struct dyn_ftrace *rec) -{ - unsigned long ip; - union ftrace_code_union save; - struct ftrace_record *r = - container_of(rec, struct ftrace_record, rec); - - ip = rec->ip; - - save.e8 = 0xe8; - save.offset = ftrace_calc_offset(ip); - - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - - r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop); -} - -static void notrace ftrace_replace_code(int saved) -{ - unsigned char *new = NULL, *old = NULL; - struct ftrace_record *rec; - struct ftrace_page *pg; - unsigned long ip; - int i; - - if (saved) - old = (char *)ftrace_nop; - else - new = (char *)ftrace_nop; - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - union ftrace_code_union calc; - rec = &pg->records[i]; - - /* don't modify code that has already faulted */ - if (rec->failed) - continue; - - ip = rec->rec.ip; - - calc.e8 = 0xe8; - calc.offset = ftrace_calc_offset(ip); - - if (saved) - new = calc.code; - else - old = calc.code; - - ip -= CALL_BACK; - - rec->failed = ftrace_modify_code(ip, old, new); - } - } - -} - -notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - -notrace void ftrace_shutdown_replenish(void) -{ - if (ftrace_pages->next) - return; - - /* allocate another page */ - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); -} - -notrace int __init ftrace_shutdown_arch_init(void) +int __init ftrace_dyn_arch_init(void) { const unsigned char *const *noptable = find_nop_table(); - struct ftrace_page *pg; - int cnt; - int i; ftrace_nop = (unsigned long *)noptable[CALL_BACK]; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; - - /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. - */ - - pg = ftrace_pages = ftrace_pages_start; - - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; - - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); - - /* If we fail, we'll try later anyway */ - if (!pg->next) - break; - - pg = pg->next; - } - return 0; } + diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ccd8537dbdb7..d509ad6c9cb8 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -42,19 +42,23 @@ extern void mcount(void); # define FTRACE_HASHBITS 10 # define FTRACE_HASHSIZE (1<node, &ftrace_hash[key]); } +static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + /* If this was already converted, skip it */ + if (ftrace_ip_converted(ip)) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + static void notrace ftrace_record_ip(unsigned long ip, unsigned long parent_ip) { @@ -252,6 +282,62 @@ static struct ftrace_ops ftrace_shutdown_ops __read_mostly = .func = ftrace_record_ip, }; +#define MCOUNT_ADDR ((long)(&mcount)) + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long ip; + int failed; + int i; + + if (saved) + old = ftrace_nop_replace(); + else + new = ftrace_nop_replace(); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->flags & FTRACE_FL_FAILED) + continue; + + ip = rec->ip; + + if (saved) + new = ftrace_call_replace(ip, MCOUNT_ADDR); + else + old = ftrace_call_replace(ip, MCOUNT_ADDR); + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; + } + } +} + +static notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +static notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +static notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} static int notrace __ftrace_modify_code(void *data) { @@ -261,6 +347,23 @@ static int notrace __ftrace_modify_code(void *data) return 0; } +static notrace void +ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip; + unsigned char *nop, *call; + int failed; + + ip = rec->ip; + + nop = ftrace_nop_replace(); + call = ftrace_call_replace(ip, addr); + + failed = ftrace_modify_code(ip, call, nop); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + static void notrace ftrace_run_startup_code(void) { stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); @@ -346,7 +449,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p); + ftrace_code_disable(p, MCOUNT_ADDR); ftrace_update_cnt++; } @@ -407,12 +510,59 @@ static int notrace ftraced(void *ignore) return 0; } +static int __init ftrace_dyn_table_alloc(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + int ret; + + ret = ftrace_dyn_arch_init(); + if (ret) + return ret; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + static int __init notrace ftrace_shutdown_init(void) { struct task_struct *p; int ret; - ret = ftrace_shutdown_arch_init(); + ret = ftrace_dyn_table_alloc(); if (ret) return ret; -- cgit v1.2.3 From d61f82d06672f57fca410da6f7fffd15867db622 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: use dynamic patching for updating mcount calls This patch replaces the indirect call to the mcount function pointer with a direct call that will be patched by the dynamic ftrace routines. On boot up, the mcount function calls the ftace_stub function. When the dynamic ftrace code is initialized, the ftrace_stub is replaced with a call to the ftrace_record_ip, which records the instruction pointers of the locations that call it. Later, the ftraced daemon will call kstop_machine and patch all the locations to nops. When a ftrace is enabled, the original calls to mcount will now be set top call ftrace_caller, which will do a direct call to the registered ftrace function. This direct call is also patched when the function that should be called is updated. All patching is performed by a kstop_machine routine to prevent any type of race conditions that is associated with modifying code on the fly. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 47 +++++++++++- arch/x86/kernel/entry_64.S | 67 ++++++++++++++++- arch/x86/kernel/ftrace.c | 41 +++++++++- include/linux/ftrace.h | 7 +- kernel/trace/ftrace.c | 183 ++++++++++++++++++++++++++------------------- 5 files changed, 261 insertions(+), 84 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f47b9b5440d2..e6517ce0b824 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1110,10 +1110,50 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + +.globl mcount_call +mcount_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace - .globl ftrace_stub ftrace_stub: ret @@ -1126,7 +1166,7 @@ trace: movl 0xc(%esp), %eax movl 0x4(%ebp), %edx - call *ftrace_trace_function + call *ftrace_trace_function popl %edx popl %ecx @@ -1134,7 +1174,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f046e0c64883..fe25e5febca3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,70 @@ .code64 #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + +.globl mcount_call +mcount_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + retq +END(mcount) + +ENTRY(ftrace_caller) + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + +.globl ftrace_stub +ftrace_stub: + retq +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -89,7 +153,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b69795efa226..9f44623e0072 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -109,10 +109,49 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -int __init ftrace_dyn_arch_init(void) +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[5], *new; + int ret; + + ip += CALL_BACK; + + memcpy(old, &ftrace_call, 5); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[5], *new; + + /* ip is at the location, but modify code will subtact this */ + ip += CALL_BACK; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. + */ + memcpy(old, &mcount_call, 5); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) { const unsigned char *const *noptable = find_nop_table(); + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + ftrace_nop = (unsigned long *)noptable[CALL_BACK]; return 0; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d509ad6c9cb8..b0dd0093058f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -56,9 +56,14 @@ struct dyn_ftrace { extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr); -extern int ftrace_dyn_arch_init(void); +extern int ftrace_dyn_arch_init(void *data); +extern int ftrace_mcount_set(unsigned long *data); extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code); +extern int ftrace_update_ftrace_func(ftrace_func_t func); +extern void ftrace_caller(void); +extern void ftrace_call(void); +extern void mcount_call(void); #endif #ifdef CONFIG_FRAME_POINTER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6d9af3bf66b..88544f9bc0ed 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -26,14 +26,8 @@ #include "trace.h" -#ifdef CONFIG_DYNAMIC_FTRACE -# define FTRACE_ENABLED_INIT 1 -#else -# define FTRACE_ENABLED_INIT 0 -#endif - -int ftrace_enabled = FTRACE_ENABLED_INIT; -static int last_ftrace_enabled = FTRACE_ENABLED_INIT; +int ftrace_enabled; +static int last_ftrace_enabled; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -149,6 +143,14 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), +}; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); @@ -199,12 +201,8 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head(&node->node, &ftrace_hash[key]); } -static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - /* If this was already converted, skip it */ - if (ftrace_ip_converted(ip)) - return NULL; - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) return NULL; @@ -215,7 +213,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) } static void notrace -ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; unsigned long flags; @@ -223,6 +221,9 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) int resched; int atomic; + if (!ftrace_enabled) + return; + resched = need_resched(); preempt_disable_notrace(); @@ -251,11 +252,12 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) /* * There's a slight race that the ftraced will update the - * hash and reset here. The arch alloc is responsible - * for seeing if the IP has already changed, and if - * it has, the alloc will fail. + * hash and reset here. If it is already converted, skip it. */ - node = ftrace_alloc_shutdown_node(ip); + if (ftrace_ip_converted(ip)) + goto out_unlock; + + node = ftrace_alloc_dyn_node(ip); if (!node) goto out_unlock; @@ -277,11 +279,7 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } -static struct ftrace_ops ftrace_shutdown_ops __read_mostly = -{ - .func = ftrace_record_ip, -}; - +#define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) static void notrace ftrace_replace_code(int saved) @@ -309,9 +307,9 @@ static void notrace ftrace_replace_code(int saved) ip = rec->ip; if (saved) - new = ftrace_call_replace(ip, MCOUNT_ADDR); + new = ftrace_call_replace(ip, FTRACE_ADDR); else - old = ftrace_call_replace(ip, MCOUNT_ADDR); + old = ftrace_call_replace(ip, FTRACE_ADDR); failed = ftrace_modify_code(ip, old, new); if (failed) @@ -320,16 +318,6 @@ static void notrace ftrace_replace_code(int saved) } } -static notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -static notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - static notrace void ftrace_shutdown_replenish(void) { if (ftrace_pages->next) @@ -339,16 +327,8 @@ static notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static int notrace __ftrace_modify_code(void *data) -{ - void (*func)(void) = data; - - func(); - return 0; -} - static notrace void -ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; unsigned char *nop, *call; @@ -357,67 +337,113 @@ ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, addr); + call = ftrace_call_replace(ip, MCOUNT_ADDR); failed = ftrace_modify_code(ip, call, nop); if (failed) rec->flags |= FTRACE_FL_FAILED; } -static void notrace ftrace_run_startup_code(void) +static int notrace __ftrace_modify_code(void *data) { - stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); + unsigned long addr; + int *command = data; + + if (*command & FTRACE_ENABLE_CALLS) + ftrace_replace_code(1); + else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_ENABLE_MCOUNT) { + addr = (unsigned long)ftrace_record_ip; + ftrace_mcount_set(&addr); + } else if (*command & FTRACE_DISABLE_MCOUNT) { + addr = (unsigned long)ftrace_stub; + ftrace_mcount_set(&addr); + } + + return 0; } -static void notrace ftrace_run_shutdown_code(void) +static void notrace ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); + stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } +static ftrace_func_t saved_ftrace_func; + static void notrace ftrace_startup(void) { + int command = 0; + mutex_lock(&ftraced_lock); ftraced_suspend++; - if (ftraced_suspend != 1) + if (ftraced_suspend == 1) + command |= FTRACE_ENABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) goto out; - __unregister_ftrace_function(&ftrace_shutdown_ops); - if (ftrace_enabled) - ftrace_run_startup_code(); + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown(void) { + int command = 0; + mutex_lock(&ftraced_lock); ftraced_suspend--; - if (ftraced_suspend) - goto out; + if (!ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; - if (ftrace_enabled) - ftrace_run_shutdown_code(); + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } - __register_ftrace_function(&ftrace_shutdown_ops); + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_startup_sysctl(void) { + int command = FTRACE_ENABLE_MCOUNT; + mutex_lock(&ftraced_lock); + /* Force update next time */ + saved_ftrace_func = NULL; /* ftraced_suspend is true if we want ftrace running */ if (ftraced_suspend) - ftrace_run_startup_code(); + command |= FTRACE_ENABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown_sysctl(void) { + int command = FTRACE_DISABLE_MCOUNT; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) - ftrace_run_shutdown_code(); + command |= FTRACE_DISABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } @@ -430,11 +456,13 @@ static int notrace __ftrace_update_code(void *ignore) struct dyn_ftrace *p; struct hlist_head head; struct hlist_node *t; + int save_ftrace_enabled; cycle_t start, stop; int i; - /* Don't be calling ftrace ops now */ - __unregister_ftrace_function(&ftrace_shutdown_ops); + /* Don't be recording funcs now */ + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; start = now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -449,7 +477,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p, MCOUNT_ADDR); + ftrace_code_disable(p); ftrace_update_cnt++; } @@ -459,7 +487,7 @@ static int notrace __ftrace_update_code(void *ignore) ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - __register_ftrace_function(&ftrace_shutdown_ops); + ftrace_enabled = save_ftrace_enabled; return 0; } @@ -515,11 +543,6 @@ static int __init ftrace_dyn_table_alloc(void) struct ftrace_page *pg; int cnt; int i; - int ret; - - ret = ftrace_dyn_arch_init(); - if (ret) - return ret; /* allocate a few pages */ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); @@ -557,11 +580,19 @@ static int __init ftrace_dyn_table_alloc(void) return 0; } -static int __init notrace ftrace_shutdown_init(void) +static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; + unsigned long addr; int ret; + addr = (unsigned long)ftrace_record_ip; + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + return addr; + ret = ftrace_dyn_table_alloc(); if (ret) return ret; @@ -570,12 +601,12 @@ static int __init notrace ftrace_shutdown_init(void) if (IS_ERR(p)) return -1; - __register_ftrace_function(&ftrace_shutdown_ops); + last_ftrace_enabled = ftrace_enabled = 1; return 0; } -core_initcall(ftrace_shutdown_init); +core_initcall(ftrace_dynamic_init); #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) @@ -599,9 +630,8 @@ int register_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_sysctl_lock); - ftrace_startup(); - ret = __register_ftrace_function(ops); + ftrace_startup(); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -619,10 +649,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - - if (ftrace_list == &ftrace_list_end) - ftrace_shutdown(); - + ftrace_shutdown(); mutex_unlock(&ftrace_sysctl_lock); return ret; -- cgit v1.2.3 From f43fdad8627fec2d21df92799b254dceb66c9c3c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: fix kexec disable the tracer while kexec pulls the rug from under the old kernel. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/machine_kexec_32.c | 4 ++++ arch/x86/kernel/machine_kexec_64.c | 4 ++++ include/linux/ftrace.h | 7 +++++++ 3 files changed, 15 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d0b234c9fc31..88923fd7a6fc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 576a03db4511..1558fdc174f9 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f5911d2d42c3..a42390c1d6e1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -68,6 +68,13 @@ extern void ftrace_call(void); extern void mcount_call(void); #endif +static inline void tracer_disable(void) +{ +#ifdef CONFIG_FTRACE + ftrace_enabled = 0; +#endif +} + #ifdef CONFIG_FRAME_POINTER /* TODO: need to fix this for ARM */ # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -- cgit v1.2.3 From a56be3fe2f65f9f776e727bfd382e35db75911d6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:56 +0200 Subject: ftrace: fix the fault label in updating code The fault label to jump to on fault of updating the code was misplaced preventing the fault from being recorded. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9f44623e0072..498608c015fb 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -93,8 +93,8 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, " movb %b4, 4(%2)\n" "2:\n" ".section .fixup, \"ax\"\n" - " movl $1, %0\n" - "3: jmp 2b\n" + "3: movl $1, %0\n" + " jmp 2b\n" ".previous\n" _ASM_EXTABLE(1b, 3b) : "=r"(faulted), "=a"(replaced) -- cgit v1.2.3 From 2f1dafe50cc4e58a239fd81bd47f87f32042a1ee Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:01 +0200 Subject: x86: fix SMP alternatives: use mutex instead of spinlock, text_poke is sleepable text_poke is sleepable. The original fix by Mathieu Desnoyers . Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/alternative.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de240ba2e288..2763cb37b553 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -279,7 +279,7 @@ struct smp_alt_module { struct list_head next; }; static LIST_HEAD(smp_alt_modules); -static DEFINE_SPINLOCK(smp_alt); +static DEFINE_MUTEX(smp_alt); static int smp_mode = 1; /* protected by smp_alt */ void alternatives_smp_module_add(struct module *mod, char *name, @@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name, __func__, smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_add_tail(&smp->next, &smp_alt_modules); if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(smp->locks, smp->locks_end, smp->text, smp->text_end); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_module_del(struct module *mod) @@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod) if (smp_alt_once || noreplace_smp) return; - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); DPRINTK("%s: %s\n", __func__, item->name); kfree(item); return; } - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_switch(int smp) @@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp) return; BUG_ON(!smp && (num_online_cpus() > 1)); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); /* * Avoid unnecessary switches because it forces JIT based VMs to @@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp) mod->text, mod->text_end); } smp_mode = smp; - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } #endif -- cgit v1.2.3 From 37135677e653537ffc6e7def679443272a1c03c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 May 2008 08:10:31 +0200 Subject: ftrace: fix mcount export bug David S. Miller noticed the following bug: the -pg instrumentation function callback is named differently on each platform. On x86 it is mcount, on sparc it is _mcount. So the export does not make sense in kernel/trace/ftrace.c - move it to x86. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i386_ksyms_32.c | 9 ++++++++- arch/x86/kernel/x8664_ksyms_64.c | 11 +++++++++-- kernel/trace/ftrace.c | 3 --- 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index deb43785e923..29999dbb754c 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,7 +1,14 @@ +#include #include + #include -#include #include +#include + +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index f6c05d0410fb..122885bc5f3b 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,15 +1,22 @@ /* Exports for assembly files. All C exports should go in the respective C files. */ +#include #include -#include #include +#include + #include -#include #include +#include #include +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif + EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(__get_user_1); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 07b2a14943f8..a3e47f43f8a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -50,9 +50,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -/* mcount is defined per arch in assembly */ -EXPORT_SYMBOL(mcount); - void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; -- cgit v1.2.3 From 7fa09f24b477ad41b821713eba757b3aa7a2864a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 21:30:32 -0400 Subject: ftrace: use the new kbuild CFLAGS_REMOVE for x86/kernel directory This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro in the x86/kernel directory. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e142091524b0..739d49acd2f1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -6,6 +6,13 @@ extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) +ifdef CONFIG_FTRACE +# Do not profile debug utilities +CFLAGS_REMOVE_tsc_64.o = -pg +CFLAGS_REMOVE_tsc_32.o = -pg +CFLAGS_REMOVE_rtc.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: -- cgit v1.2.3 From 40bd21740012132afb62d78ac3e6b82372b2fbc2 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:44:02 +0200 Subject: x86: automatical unification of i8259.c Make conversion of i8259 very mechanical -- i8259 was generated by --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259.c | 368 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/i8259_32.c | 295 ------------------------------------ arch/x86/kernel/i8259_64.c | 309 ------------------------------------- include/asm-x86/i8259.h | 2 + 5 files changed, 371 insertions(+), 605 deletions(-) create mode 100644 arch/x86/kernel/i8259.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..f71a76ef2596 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c new file mode 100644 index 000000000000..2decba6b0101 --- /dev/null +++ b/arch/x86/kernel/i8259.c @@ -0,0 +1,368 @@ +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#ifndef CONFIG_X86_64 +#include +#else /* CONFIG_X86_64 */ +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#ifndef CONFIG_X86_64 +#include +#endif /* ! CONFIG_X86_64 */ +#include + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + */ + +static int i8259A_auto_eoi; +DEFINE_SPINLOCK(i8259A_lock); +static void mask_and_ack_8259A(unsigned int); + +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .disable = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7),PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to master */ + outb(0x60+irq,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { +#ifndef CONFIG_X86_64 + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); +#else /* CONFIG_X86_64 */ + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); +#endif /* CONFIG_X86_64 */ + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + .name = "i8259", + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x04, PIC_MASTER_IMR); +#endif /* CONFIG_X86_64 */ + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + +#endif /* CONFIG_X86_64 */ + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index fe631967d625..d66914287ee1 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -21,302 +21,7 @@ #include #include -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index fa57a1568508..a95d2bd27e9d 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -87,315 +87,6 @@ static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = #undef IRQ #undef IRQLIST_16 -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; - -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); - /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG - "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); - /* 8259A-1 (the master) has a slave on IR2 */ - outb_pic(0x04, PIC_MASTER_IMR); - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); - /* 8259A-2 is a slave on master's IR2 */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); - /* (slave's support for AEOI in flat mode is to be investigated) */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); - - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h index 45d4df3e51e6..2f98df91f1f2 100644 --- a/include/asm-x86/i8259.h +++ b/include/asm-x86/i8259.h @@ -55,4 +55,6 @@ static inline void outb_pic(unsigned char value, unsigned int port) udelay(2); } +extern struct irq_chip i8259A_chip; + #endif /* __ASM_I8259_H__ */ -- cgit v1.2.3 From 6b4d3afbe722ee71b8da346c2c4393938440c471 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:47:24 +0200 Subject: x86: i8259.c: remove #ifdefs around includes Remove #ifdefs around includes; including too much should be always safe. Signed-off-by: Pavel Machek Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i8259.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 2decba6b0101..8b7eb0cb2e7e 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -1,14 +1,10 @@ -#ifdef CONFIG_X86_64 #include -#endif /* CONFIG_X86_64 */ #include #include #include #include #include -#ifdef CONFIG_X86_64 #include -#endif /* CONFIG_X86_64 */ #include #include #include @@ -16,24 +12,17 @@ #include #include -#ifdef CONFIG_X86_64 #include -#endif /* CONFIG_X86_64 */ #include #include #include -#ifndef CONFIG_X86_64 #include -#else /* CONFIG_X86_64 */ #include -#endif /* CONFIG_X86_64 */ #include #include #include #include -#ifndef CONFIG_X86_64 #include -#endif /* ! CONFIG_X86_64 */ #include /* -- cgit v1.2.3 From 4f6f3bac18c80ff6cf072d90796755c69cc4c748 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:52:52 +0200 Subject: x86: i8259.c: remove trivial ifdefs Remove #ifdefs where the only difference is formatting of comments. Signed-off-by: Pavel Machek Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i8259.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 8b7eb0cb2e7e..433e49edfd4f 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -175,24 +175,14 @@ handle_real_irq: if (irq & 8) { inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ outb(cached_slave_mask, PIC_SLAVE_IMR); -#ifndef CONFIG_X86_64 - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ -#else /* CONFIG_X86_64 */ /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); + outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); -#endif /* CONFIG_X86_64 */ + outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); } else { inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ outb(cached_master_mask, PIC_MASTER_IMR); -#ifndef CONFIG_X86_64 - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ -#else /* CONFIG_X86_64 */ - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); -#endif /* CONFIG_X86_64 */ + outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } spin_unlock_irqrestore(&i8259A_lock, flags); return; @@ -215,12 +205,8 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { -#ifndef CONFIG_X86_64 - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); -#else /* CONFIG_X86_64 */ printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); -#endif /* CONFIG_X86_64 */ spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); -- cgit v1.2.3 From 4d9a6e6128dd8ada0d2feed2a9bb6506043e4655 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:57:52 +0200 Subject: x86: i8259: cleanup codingstyle Signed-off-by: Pavel Machek Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i8259.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 433e49edfd4f..7a0fda8f01b5 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -129,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq) int irqmask = 1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ return value; } -- cgit v1.2.3 From f20b11e716936f85850654c49e06a1f955b9e5fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 15:59:58 +0200 Subject: x86: rename the i8259_32/64.c leftovers to initirq_32/64.c The leftovers of the i8259 unification have nothing to do with i8259 at all. They contain interrupt init code and the i8259_xx name is just misleading now. Rename them to initirq_32/64.c Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259_32.c | 114 ------------------------ arch/x86/kernel/i8259_64.c | 203 ------------------------------------------- arch/x86/kernel/irqinit_32.c | 114 ++++++++++++++++++++++++ arch/x86/kernel/irqinit_64.c | 203 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 318 insertions(+), 318 deletions(-) delete mode 100644 arch/x86/kernel/i8259_32.c delete mode 100644 arch/x86/kernel/i8259_64.c create mode 100644 arch/x86/kernel/irqinit_32.c create mode 100644 arch/x86/kernel/irqinit_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f71a76ef2596..2a53ad2cb450 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c deleted file mode 100644 index d66914287ee1..000000000000 --- a/arch/x86/kernel/i8259_32.c +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - extern void math_error(void __user *); - outb(0,0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .mask = CPU_MASK_NONE, - .name = "fpu", -}; - -void __init init_ISA_irqs (void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < 16; i++) { - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) - break; - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); - } - - /* setup after call gates are initialised (usually add in - * the architecture specific gates) - */ - intr_init_hook(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c deleted file mode 100644 index a95d2bd27e9d..000000000000 --- a/arch/x86/kernel/i8259_64.c +++ /dev/null @@ -1,203 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { - .handler = no_action, - .mask = CPU_MASK_NONE, - .name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -void __init init_ISA_irqs (void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } - } -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); -#endif - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c new file mode 100644 index 000000000000..d66914287ee1 --- /dev/null +++ b/arch/x86/kernel/irqinit_32.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + extern void math_error(void __user *); + outb(0,0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .mask = CPU_MASK_NONE, + .name = "fpu", +}; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < 16; i++) { + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c new file mode 100644 index 000000000000..a95d2bd27e9d --- /dev/null +++ b/arch/x86/kernel/irqinit_64.c @@ -0,0 +1,203 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Common place to define all x86 IRQ vectors + * + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. + */ + +#define BI(x,y) \ + BUILD_IRQ(x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) + +#undef BUILD_16_IRQS +#undef BI + + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +/* for the irq vectors */ +static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { + IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) +}; + +#undef IRQ +#undef IRQLIST_16 + + + + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + +void __init init_ISA_irqs (void) +{ + int i; + + init_bsp_APIC(); + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } +} + +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + +#ifdef CONFIG_SMP + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); +#endif + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); + + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} -- cgit v1.2.3 From fce39665abb71d01d74ac74eb13dd69a799dfc2f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: make init_ISA_irqs() static Moved to i8259 branch to avoid conflicts. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index a95d2bd27e9d..64bc0f14285f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -120,7 +120,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; -void __init init_ISA_irqs (void) +static void __init init_ISA_irqs (void) { int i; -- cgit v1.2.3 From 21fd5132b223a10bdf17713dd0bf321cbd6471d2 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:44:02 +0200 Subject: x86: automatical unification of i8259.c Make conversion of i8259 very mechanical -- i8259 was generated by diff -D, with too different parts left in i8259_32 and i8259_64.c. Only "by hand" changes were removal of #ifdef from middle of the comment (prevented compilation) and removal of one static to allow splitting into files. Of course, it will need some cleanups now, and those will follow. Signed-of-by: Pavel Machek --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259.c | 368 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/i8259_32.c | 295 ------------------------------------ arch/x86/kernel/i8259_64.c | 309 ------------------------------------- include/asm-x86/i8259.h | 2 + 5 files changed, 371 insertions(+), 605 deletions(-) create mode 100644 arch/x86/kernel/i8259.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..f71a76ef2596 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c new file mode 100644 index 000000000000..2decba6b0101 --- /dev/null +++ b/arch/x86/kernel/i8259.c @@ -0,0 +1,368 @@ +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#ifndef CONFIG_X86_64 +#include +#else /* CONFIG_X86_64 */ +#include +#endif /* CONFIG_X86_64 */ +#include +#include +#include +#include +#ifndef CONFIG_X86_64 +#include +#endif /* ! CONFIG_X86_64 */ +#include + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + */ + +static int i8259A_auto_eoi; +DEFINE_SPINLOCK(i8259A_lock); +static void mask_and_ack_8259A(unsigned int); + +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .disable = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7),PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to master */ + outb(0x60+irq,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { +#ifndef CONFIG_X86_64 + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); +#else /* CONFIG_X86_64 */ + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); +#endif /* CONFIG_X86_64 */ + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + .name = "i8259", + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x04, PIC_MASTER_IMR); +#endif /* CONFIG_X86_64 */ + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + +#endif /* CONFIG_X86_64 */ + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index fe631967d625..d66914287ee1 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -21,302 +21,7 @@ #include #include -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index 1870e0e86559..b44095efcf83 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -101,315 +101,6 @@ static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = #undef IRQ #undef IRQLIST_16 -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; - -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); - /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG - "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); - /* 8259A-1 (the master) has a slave on IR2 */ - outb_pic(0x04, PIC_MASTER_IMR); - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); - /* 8259A-2 is a slave on master's IR2 */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); - /* (slave's support for AEOI in flat mode is to be investigated) */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); - - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h index 45d4df3e51e6..2f98df91f1f2 100644 --- a/include/asm-x86/i8259.h +++ b/include/asm-x86/i8259.h @@ -55,4 +55,6 @@ static inline void outb_pic(unsigned char value, unsigned int port) udelay(2); } +extern struct irq_chip i8259A_chip; + #endif /* __ASM_I8259_H__ */ -- cgit v1.2.3 From 15d613cb25efd978dd55592d011a6ffc487b3432 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:47:24 +0200 Subject: x86: i8259.c: remove #ifdefs around includes Remove #ifdefs around includes; including too much should be always safe. Signed-off-by: Pavel Machek Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i8259.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 2decba6b0101..8b7eb0cb2e7e 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -1,14 +1,10 @@ -#ifdef CONFIG_X86_64 #include -#endif /* CONFIG_X86_64 */ #include #include #include #include #include -#ifdef CONFIG_X86_64 #include -#endif /* CONFIG_X86_64 */ #include #include #include @@ -16,24 +12,17 @@ #include #include -#ifdef CONFIG_X86_64 #include -#endif /* CONFIG_X86_64 */ #include #include #include -#ifndef CONFIG_X86_64 #include -#else /* CONFIG_X86_64 */ #include -#endif /* CONFIG_X86_64 */ #include #include #include #include -#ifndef CONFIG_X86_64 #include -#endif /* ! CONFIG_X86_64 */ #include /* -- cgit v1.2.3 From 3e8631d27088f394b6788829e238a60bf07d47ab Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:52:52 +0200 Subject: x86: i8259.c: remove trivial ifdefs Remove #ifdefs where the only difference is formatting of comments. Signed-off-by: Pavel Machek Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i8259.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 8b7eb0cb2e7e..433e49edfd4f 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -175,24 +175,14 @@ handle_real_irq: if (irq & 8) { inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ outb(cached_slave_mask, PIC_SLAVE_IMR); -#ifndef CONFIG_X86_64 - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ -#else /* CONFIG_X86_64 */ /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); + outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); -#endif /* CONFIG_X86_64 */ + outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); } else { inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ outb(cached_master_mask, PIC_MASTER_IMR); -#ifndef CONFIG_X86_64 - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ -#else /* CONFIG_X86_64 */ - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); -#endif /* CONFIG_X86_64 */ + outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } spin_unlock_irqrestore(&i8259A_lock, flags); return; @@ -215,12 +205,8 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { -#ifndef CONFIG_X86_64 - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); -#else /* CONFIG_X86_64 */ printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); -#endif /* CONFIG_X86_64 */ spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); -- cgit v1.2.3 From 680afbf989d697b9ba1382017a73c8cfa53b3f89 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 21 May 2008 11:57:52 +0200 Subject: x86: i8259: cleanup codingstyle Signed-off-by: Pavel Machek Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i8259.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 433e49edfd4f..7a0fda8f01b5 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -129,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq) int irqmask = 1<> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ return value; } -- cgit v1.2.3 From d23b200a75f78e62744c3c25d078855eec8a689b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: make init_ISA_irqs() static Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8259_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index b44095efcf83..31f49e8f46a7 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -134,7 +134,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; -void __init init_ISA_irqs (void) +static void __init init_ISA_irqs (void) { int i; -- cgit v1.2.3 From ec42418f1973e1f77da1fcca3be59c0acb878a9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 15:59:58 +0200 Subject: x86: rename the i8259_32/64.c leftovers to irqinit_32/64.c The leftovers of the i8259 unification have nothing to do with i8259 at all. They contain interrupt init code and the i8259_xx name is just misleading now. Rename them to irqinit_32/64.c Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259_32.c | 114 ----------------------- arch/x86/kernel/i8259_64.c | 217 ------------------------------------------- arch/x86/kernel/irqinit_32.c | 114 +++++++++++++++++++++++ arch/x86/kernel/irqinit_64.c | 217 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 332 insertions(+), 332 deletions(-) delete mode 100644 arch/x86/kernel/i8259_32.c delete mode 100644 arch/x86/kernel/i8259_64.c create mode 100644 arch/x86/kernel/irqinit_32.c create mode 100644 arch/x86/kernel/irqinit_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f71a76ef2596..2a53ad2cb450 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c deleted file mode 100644 index d66914287ee1..000000000000 --- a/arch/x86/kernel/i8259_32.c +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - extern void math_error(void __user *); - outb(0,0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .mask = CPU_MASK_NONE, - .name = "fpu", -}; - -void __init init_ISA_irqs (void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < 16; i++) { - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) - break; - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); - } - - /* setup after call gates are initialised (usually add in - * the architecture specific gates) - */ - intr_init_hook(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c deleted file mode 100644 index 31f49e8f46a7..000000000000 --- a/arch/x86/kernel/i8259_64.c +++ /dev/null @@ -1,217 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt"); - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { - .handler = no_action, - .mask = CPU_MASK_NONE, - .name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -static void __init init_ISA_irqs (void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } - } -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); -#endif - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c new file mode 100644 index 000000000000..d66914287ee1 --- /dev/null +++ b/arch/x86/kernel/irqinit_32.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + extern void math_error(void __user *); + outb(0,0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .mask = CPU_MASK_NONE, + .name = "fpu", +}; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < 16; i++) { + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c new file mode 100644 index 000000000000..31f49e8f46a7 --- /dev/null +++ b/arch/x86/kernel/irqinit_64.c @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Common place to define all x86 IRQ vectors + * + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. + */ + +#define IRQ_NAME2(nr) nr##_interrupt(void) +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) + +/* + * SMP has a few special interrupts for IPI messages + */ + +#define BUILD_IRQ(nr) \ + asmlinkage void IRQ_NAME(nr); \ + asm("\n.p2align\n" \ + "IRQ" #nr "_interrupt:\n\t" \ + "push $~(" #nr ") ; " \ + "jmp common_interrupt"); + +#define BI(x,y) \ + BUILD_IRQ(x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) + +#undef BUILD_16_IRQS +#undef BI + + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +/* for the irq vectors */ +static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { + IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) +}; + +#undef IRQ +#undef IRQLIST_16 + + + + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + +static void __init init_ISA_irqs (void) +{ + int i; + + init_bsp_APIC(); + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } +} + +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + +#ifdef CONFIG_SMP + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); +#endif + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); + + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} -- cgit v1.2.3 From 6360b1fbb4a939efd34fc770c2ebd927c55506e0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:41 +0200 Subject: move BUG_TABLE into RODATA Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/avr32/kernel/vmlinux.lds.S | 2 -- arch/parisc/kernel/vmlinux.lds.S | 1 - arch/powerpc/kernel/vmlinux.lds.S | 2 -- arch/s390/kernel/vmlinux.lds.S | 1 - arch/sh/kernel/vmlinux_32.lds.S | 1 - arch/sh/kernel/vmlinux_64.lds.S | 1 - arch/x86/kernel/vmlinux_32.lds.S | 8 +++----- arch/x86/kernel/vmlinux_64.lds.S | 10 ++++------ include/asm-generic/vmlinux.lds.h | 6 ++++++ 9 files changed, 13 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index 481cfd40c053..bc932c9b4272 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -93,8 +93,6 @@ SECTIONS __stop___ex_table = .; } - BUG_TABLE - RODATA . = ALIGN(THREAD_SIZE); diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 50b4a3a25d0a..ff7d4ff4675a 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -66,7 +66,6 @@ SECTIONS _etext = .; RODATA - BUG_TABLE /* writeable */ /* Make sure this is page aligned so diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 0c3000bf8d75..53d57d17a894 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -64,8 +64,6 @@ SECTIONS NOTES - BUG_TABLE - /* * Init sections discarded at runtime */ diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index b4607155e8d0..76c1e60c92f3 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -40,7 +40,6 @@ SECTIONS _etext = .; /* End of text section */ NOTES :text :note - BUG_TABLE :text RODATA diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S index c7113786ecd4..7b4b82bd1156 100644 --- a/arch/sh/kernel/vmlinux_32.lds.S +++ b/arch/sh/kernel/vmlinux_32.lds.S @@ -44,7 +44,6 @@ SECTIONS _etext = .; /* End of text section */ - BUG_TABLE NOTES RO_DATA(PAGE_SIZE) diff --git a/arch/sh/kernel/vmlinux_64.lds.S b/arch/sh/kernel/vmlinux_64.lds.S index d1e177009a41..33fa46451406 100644 --- a/arch/sh/kernel/vmlinux_64.lds.S +++ b/arch/sh/kernel/vmlinux_64.lds.S @@ -65,7 +65,6 @@ SECTIONS _etext = .; /* End of text section */ - BUG_TABLE NOTES RO_DATA(PAGE_SIZE) diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index ce5ed083a1e9..aa0855471c79 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -49,16 +49,14 @@ SECTIONS _etext = .; /* End of text section */ } :text = 0x9090 + NOTES :text :note + . = ALIGN(16); /* Exception table */ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; *(__ex_table) __stop___ex_table = .; - } - - NOTES :text :note - - BUG_TABLE :text + } :text = 0x9090 . = ALIGN(4); .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fad3674b06a5..d123747af1e4 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -19,7 +19,7 @@ PHDRS { data PT_LOAD FLAGS(7); /* RWE */ user PT_LOAD FLAGS(7); /* RWE */ data.init PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(4); /* R__ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { @@ -40,16 +40,14 @@ SECTIONS _etext = .; /* End of text section */ } :text = 0x9090 + NOTES :text :note + . = ALIGN(16); /* Exception table */ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; *(__ex_table) __stop___ex_table = .; - } - - NOTES :text :note - - BUG_TABLE :text + } :text = 0x9090 RODATA diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f054778e916c..dd2cc8122ad8 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -67,6 +67,8 @@ *(.rodata1) \ } \ \ + BUG_TABLE \ + \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ @@ -310,6 +312,7 @@ .stab.indexstr 0 : { *(.stab.indexstr) } \ .comment 0 : { *(.comment) } +#ifdef CONFIG_GENERIC_BUG #define BUG_TABLE \ . = ALIGN(8); \ __bug_table : AT(ADDR(__bug_table) - LOAD_OFFSET) { \ @@ -317,6 +320,9 @@ *(__bug_table) \ __stop___bug_table = .; \ } +#else +#define BUG_TABLE +#endif #define NOTES \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ -- cgit v1.2.3 From 63687a528c39a67c1a213cdffa09feb0e6af9dbe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:41 +0200 Subject: x86: move tracedata to RODATA .. allowing it to be write-protected just as other read-only data under CONFIG_DEBUG_RODATA. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vmlinux_32.lds.S | 7 ------- arch/x86/kernel/vmlinux_64.lds.S | 7 ------- drivers/base/power/trace.c | 2 +- include/asm-generic/vmlinux.lds.h | 14 ++++++++++++++ include/asm-x86/resume-trace.h | 2 +- include/linux/resume-trace.h | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index ce5ed083a1e9..2674f5796275 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -60,13 +60,6 @@ SECTIONS BUG_TABLE :text - . = ALIGN(4); - .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { - __tracedata_start = .; - *(.tracedata) - __tracedata_end = .; - } - RODATA /* writeable */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fad3674b06a5..687041bfbae5 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -53,13 +53,6 @@ SECTIONS RODATA - . = ALIGN(4); - .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { - __tracedata_start = .; - *(.tracedata) - __tracedata_end = .; - } - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 2b4b392dcbc1..87a7f1d02578 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(set_trace_device); * it's not any guarantee, but it's a high _likelihood_ that * the match is valid). */ -void generate_resume_trace(void *tracedata, unsigned int user) +void generate_resume_trace(const void *tracedata, unsigned int user) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f054778e916c..f1992dc5c424 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -93,6 +93,8 @@ VMLINUX_SYMBOL(__end_rio_route_ops) = .; \ } \ \ + TRACEDATA \ + \ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab) = .; \ @@ -318,6 +320,18 @@ __stop___bug_table = .; \ } +#ifdef CONFIG_PM_TRACE +#define TRACEDATA \ + . = ALIGN(4); \ + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { \ + __tracedata_start = .; \ + *(.tracedata) \ + __tracedata_end = .; \ + } +#else +#define TRACEDATA +#endif + #define NOTES \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_notes) = .; \ diff --git a/include/asm-x86/resume-trace.h b/include/asm-x86/resume-trace.h index 2557514d7ef6..8d9f0b41ee86 100644 --- a/include/asm-x86/resume-trace.h +++ b/include/asm-x86/resume-trace.h @@ -6,7 +6,7 @@ #define TRACE_RESUME(user) \ do { \ if (pm_trace_enabled) { \ - void *tracedata; \ + const void *tracedata; \ asm volatile(_ASM_MOV_UL " $1f,%0\n" \ ".section .tracedata,\"a\"\n" \ "1:\t.word %c1\n\t" \ diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h index f3f4f28c6960..c9ba2fdf807d 100644 --- a/include/linux/resume-trace.h +++ b/include/linux/resume-trace.h @@ -8,7 +8,7 @@ extern int pm_trace_enabled; struct device; extern void set_trace_device(struct device *); -extern void generate_resume_trace(void *tracedata, unsigned int user); +extern void generate_resume_trace(const void *tracedata, unsigned int user); #define TRACE_DEVICE(dev) do { \ if (pm_trace_enabled) \ -- cgit v1.2.3 From a2eddfa95919a730e0e5ed17e9c303fe5ba249cd Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:41 +0200 Subject: x86: make /proc/stat account for all interrupts LAPIC interrupts, which don't go through the generic interrupt handling code, aren't accounted for in /proc/stat. Hence this patch adds a mechanism architectures can use to accordingly adjust the statistics. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq_32.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irq_64.c | 28 ++++++++++++++++++++++++++++ fs/proc/proc_misc.c | 9 +++++++++ include/asm-x86/hardirq.h | 6 ++++++ 4 files changed, 81 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 147352df28b9..468acd04aa2e 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -313,16 +313,20 @@ skip: per_cpu(irq_stat,j).irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif +#ifdef CONFIG_X86_MCE seq_printf(p, "TRM: "); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "SPU: "); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); +#endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); @@ -331,6 +335,40 @@ skip: return 0; } +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = nmi_count(cpu); + +#ifdef CONFIG_X86_LOCAL_APIC + sum += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif +#ifdef CONFIG_SMP + sum += per_cpu(irq_stat, cpu).irq_resched_count; + sum += per_cpu(irq_stat, cpu).irq_call_count; + sum += per_cpu(irq_stat, cpu).irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += per_cpu(irq_stat, cpu).irq_thermal_count; +#endif +#ifdef CONFIG_X86_LOCAL_APIC + sum += per_cpu(irq_stat, cpu).irq_spurious_count; +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +} + #ifdef CONFIG_HOTPLUG_CPU #include diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 3aac15466a91..1f78b238d8d2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -135,6 +135,7 @@ skip: seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif +#ifdef CONFIG_X86_MCE seq_printf(p, "TRM: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); @@ -143,6 +144,7 @@ skip: for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); +#endif seq_printf(p, "SPU: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); @@ -152,6 +154,32 @@ skip: return 0; } +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = cpu_pda(cpu)->__nmi_count; + + sum += cpu_pda(cpu)->apic_timer_irqs; +#ifdef CONFIG_SMP + sum += cpu_pda(cpu)->irq_resched_count; + sum += cpu_pda(cpu)->irq_call_count; + sum += cpu_pda(cpu)->irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += cpu_pda(cpu)->irq_thermal_count; + sum += cpu_pda(cpu)->irq_threshold_count; +#endif + sum += cpu_pda(cpu)->irq_spurious_count; + return sum; +} + +u64 arch_irq_stat(void) +{ + return atomic_read(&irq_err_count); +} + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 74a323d2b850..903e617bec58 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -472,6 +472,13 @@ static const struct file_operations proc_vmalloc_operations = { }; #endif +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif + static int show_stat(struct seq_file *p, void *v) { int i; @@ -509,7 +516,9 @@ static int show_stat(struct seq_file *p, void *v) sum += temp; per_irq_sum[j] += temp; } + sum += arch_irq_stat_cpu(i); } + sum += arch_irq_stat(); seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), diff --git a/include/asm-x86/hardirq.h b/include/asm-x86/hardirq.h index 314434d664e7..000787df66e6 100644 --- a/include/asm-x86/hardirq.h +++ b/include/asm-x86/hardirq.h @@ -3,3 +3,9 @@ #else # include "hardirq_64.h" #endif + +extern u64 arch_irq_stat_cpu(unsigned int cpu); +#define arch_irq_stat_cpu arch_irq_stat_cpu + +extern u64 arch_irq_stat(void); +#define arch_irq_stat arch_irq_stat -- cgit v1.2.3 From c2c14fb7afcc67b48724571506e095fccc65a670 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: tsc_64.c make constant UL arch/x86/kernel/tsc_64.c:245:13: warning: constant 0x100000000 is so big it is long Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index fcc16e58609e..c19b0e273ef1 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -242,7 +242,7 @@ void __init tsc_calibrate(void) if (hpet) { printk(KERN_INFO "TSC calibrated against HPET\n"); if (hpet2 < hpet1) - hpet2 += 0x100000000; + hpet2 += 0x100000000UL; hpet2 -= hpet1; tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000; } else { -- cgit v1.2.3 From eef8f871d84b5df1a24902a4e4700188be1aff2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:34 +0200 Subject: x86: vsmp_64 add missing includes sparse mutters: arch/x86/kernel/vsmp_64.c:126:5: warning: symbol 'is_vsmp_box' was not declared. Should it be static? arch/x86/kernel/vsmp_64.c:145:13: warning: symbol 'vsmp_init' was not declared. Should it be static? Include the appropriate headers. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsmp_64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index ba8c0b75ab0a..0c029e8959c7 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -15,9 +15,12 @@ #include #include #include + +#include #include #include #include +#include #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* -- cgit v1.2.3 From d1097635deed7196c2a859f285c29267779ca45a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 May 2008 23:42:01 +0200 Subject: x86: move mmconfig declarations to header arch/x86/kernel/mmconf-fam10h_64.c is missing the prototypes, which are decalred in arch/x86/kernel/setup_64.c. Move the prototypes and the inline stubs to the appropriate header file. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/mmconf-fam10h_64.c | 1 + arch/x86/kernel/setup_64.c | 13 +------------ include/asm-x86/mmconfig.h | 12 ++++++++++++ 3 files changed, 14 insertions(+), 12 deletions(-) create mode 100644 include/asm-x86/mmconfig.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index edc5fbfe85c0..fdfdc550b366 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "../pci/pci.h" diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff1286ad8a..341230db74e1 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #ifdef CONFIG_PARAVIRT @@ -293,18 +294,6 @@ static void __init parse_setup_data(void) } } -#ifdef CONFIG_PCI_MMCONFIG -extern void __cpuinit fam10h_check_enable_mmcfg(void); -extern void __init check_enable_amd_mmconf_dmi(void); -#else -void __cpuinit fam10h_check_enable_mmcfg(void) -{ -} -void __init check_enable_amd_mmconf_dmi(void) -{ -} -#endif - /* * setup_arch - architecture-specific boot-time initializations * diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h new file mode 100644 index 000000000000..95beda07c6fa --- /dev/null +++ b/include/asm-x86/mmconfig.h @@ -0,0 +1,12 @@ +#ifndef _ASM_MMCONFIG_H +#define _ASM_MMCONFIG_H + +#ifdef CONFIG_PCI_MMCONFIG +extern void __cpuinit fam10h_check_enable_mmcfg(void); +extern void __init check_enable_amd_mmconf_dmi(void); +#else +static inline void fam10h_check_enable_mmcfg(void) { } +static inline void check_enable_amd_mmconf_dmi(void) { } +#endif + +#endif -- cgit v1.2.3 From e0b32d768cad17af07af3aced67498bde98296c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: make command_line static in setup_64.c Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 341230db74e1..78c1be69c6d3 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(edid_info); extern int root_mountflags; -char __initdata command_line[COMMAND_LINE_SIZE]; +static char __initdata command_line[COMMAND_LINE_SIZE]; static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, -- cgit v1.2.3 From 311f83494830fec17f086401a5b43984bb447cd2 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: kernel/pci-dma.c cleanups This patch contains the following cleanups: - make the following needlessly global code static: - dma_alloc_pages() Signed-off-by: Adrian Bunk Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index c5ef1af8e79d..5bc29ca4196e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -357,7 +357,7 @@ int dma_supported(struct device *dev, u64 mask) EXPORT_SYMBOL(dma_supported); /* Allocate DMA memory on node near device */ -noinline struct page * +static noinline struct page * dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { int node; -- cgit v1.2.3 From ebdd561a19d7917ac49fff9c355aa4833c504bf1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86: constify data in reboot.c Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/reboot.c | 18 +++++++++--------- arch/x86/kernel/reboot_fixups_32.c | 4 ++-- include/asm-x86/reboot.h | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f6be7d5f82f8..f8a62160e151 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -27,7 +27,7 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -static long no_idt[3]; +static const struct desc_ptr no_idt = {}; static int reboot_mode; enum reboot_type reboot_type = BOOT_KBD; int reboot_force; @@ -201,15 +201,15 @@ core_initcall(reboot_init); controller to pulse the CPU reset line, which is more thorough, but doesn't work with at least one type of 486 motherboard. It is easy to stop this code working; hence the copious comments. */ -static unsigned long long +static const unsigned long long real_mode_gdt_entries [3] = { 0x0000000000000000ULL, /* Null descriptor */ - 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ - 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ + 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ + 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct desc_ptr +static const struct desc_ptr real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, real_mode_idt = { 0x3ff, 0 }; @@ -231,7 +231,7 @@ real_mode_idt = { 0x3ff, 0 }; More could be done here to set up the registers as if a CPU reset had occurred; hopefully real BIOSs don't assume much. */ -static unsigned char real_mode_switch [] = +static const unsigned char real_mode_switch [] = { 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ @@ -245,7 +245,7 @@ static unsigned char real_mode_switch [] = 0x24, 0x10, /* f: andb $0x10,al */ 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ }; -static unsigned char jump_to_bios [] = +static const unsigned char jump_to_bios [] = { 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ }; @@ -255,7 +255,7 @@ static unsigned char jump_to_bios [] = * specified by the code and length parameters. * We assume that length will aways be less that 100! */ -void machine_real_restart(unsigned char *code, int length) +void machine_real_restart(const unsigned char *code, int length) { local_irq_disable(); @@ -368,7 +368,7 @@ static void native_machine_emergency_restart(void) } case BOOT_TRIPLE: - load_idt((const struct desc_ptr *)&no_idt); + load_idt(&no_idt); __asm__ __volatile__("int3"); reboot_type = BOOT_KBD; diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index dec0b5ec25c2..61a837743fe5 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -49,7 +49,7 @@ struct device_fixup { void (*reboot_fixup)(struct pci_dev *); }; -static struct device_fixup fixups_table[] = { +static const struct device_fixup fixups_table[] = { { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, @@ -64,7 +64,7 @@ static struct device_fixup fixups_table[] = { */ void mach_reboot_fixups(void) { - struct device_fixup *cur; + const struct device_fixup *cur; struct pci_dev *dev; int i; diff --git a/include/asm-x86/reboot.h b/include/asm-x86/reboot.h index e63741f19392..206f355786dc 100644 --- a/include/asm-x86/reboot.h +++ b/include/asm-x86/reboot.h @@ -14,8 +14,8 @@ struct machine_ops { extern struct machine_ops machine_ops; -void machine_real_restart(unsigned char *code, int length); void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); +void machine_real_restart(const unsigned char *code, int length); #endif /* _ASM_REBOOT_H */ -- cgit v1.2.3 From 369101da7e3f6f971b7303922d2978b8483241c6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86: head_64.S cleanup - use predefined flags from processor-flags.h We should better use already defined flags from processor-flags.h instead of defining own ones [>>> object code check >>>] original md5sum: 9cfa6dbf045a046bb5dfb85f8bcfe8c4 arch/x86/kernel/head_64.o text data bss dec hex filename 37361 4432 8192 49985 c341 arch/x86/kernel/head_64.o patched md5sum: 9cfa6dbf045a046bb5dfb85f8bcfe8c4 arch/x86/kernel/head_64.o text data bss dec hex filename 37361 4432 8192 49985 c341 arch/x86/kernel/head_64.o [<<< object code check <<<] Signed-off-by: Cyrill Gorcunov Acked-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 10a1955bb1d1..2f59ca8bd16c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_PARAVIRT #include @@ -184,14 +185,10 @@ ENTRY(secondary_startup_64) 1: wrmsr /* Make changes effective */ /* Setup cr0 */ -#define CR0_PM 1 /* protected mode */ -#define CR0_MP (1<<1) -#define CR0_ET (1<<4) -#define CR0_NE (1<<5) -#define CR0_WP (1<<16) -#define CR0_AM (1<<18) -#define CR0_PAGING (1<<31) - movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) + movl $CR0_STATE, %eax /* Make changes effective */ movq %rax, %cr0 -- cgit v1.2.3 From 2cc74111c78dbc7b26c41b4e87cfebfc3aed49c4 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sun, 11 May 2008 19:35:54 +0800 Subject: x86: ipi.c: removed duplicated include Removed duplicated include in arch/x86/kernel/ipi.c. Signed-off-by: Huang Weiyi Cc: mingo@redhat.com Cc: hpa@zytor.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ipi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index c0df7b89ca23..9d98cda39ad9 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 883b7af932b4435eb4798cfa4fec0848639c2a87 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sun, 11 May 2008 19:36:57 +0800 Subject: x86: smpboot.c: removed duplicated include Removed duplicated include in arch/x86/kernel/smpboot.c. Signed-off-by: Huang Weiyi Cc: mingo@redhat.com Cc: hpa@zytor.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 38988491c622..0974fc0997b9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 05139d8fb445d9a3569071af1c5920fc46191b7b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 13 May 2008 21:14:22 +0400 Subject: x86: head_64.S cleanup - use straight move to CR4 register Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2f59ca8bd16c..7475b4c91f05 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -155,9 +155,7 @@ ENTRY(secondary_startup_64) */ /* Enable PAE mode and PGE */ - xorq %rax, %rax - btsq $5, %rax - btsq $7, %rax + movl $(X86_CR4_PAE | X86_CR4_PGE), %eax movq %rax, %cr4 /* Setup early boot stage 4 level pagetables. */ -- cgit v1.2.3 From 0e192b99d7d09f08b445c31ee7c7f3d0bfb27345 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 13 May 2008 20:55:40 +0400 Subject: x86: head_64.S cleanup - use PMD_SHIFT instead of numeric constant Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7475b4c91f05..d8ed3259e6e4 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -322,11 +322,11 @@ early_idt_ripmsg: ENTRY(name) /* Automate the creation of 1 to 1 mapping pmd entries */ -#define PMDS(START, PERM, COUNT) \ - i = 0 ; \ - .rept (COUNT) ; \ - .quad (START) + (i << 21) + (PERM) ; \ - i = i + 1 ; \ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ + i = i + 1 ; \ .endr /* -- cgit v1.2.3 From bfe4bb1526945e446d2912bef2e1e2cbd2c7349e Mon Sep 17 00:00:00 2001 From: Miklos Vajna Date: Sat, 17 May 2008 22:48:13 +0200 Subject: x86: janitor work in bugs.c Just moved trailing statements to the next line, removed space before open/close parenthesis, wrapped long lines. Signed-off-by: Miklos Vajna Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/bugs.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 170d2f5523b2..1b1c56bb338f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -59,8 +59,12 @@ static void __init check_fpu(void) return; } -/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */ - /* Test for the divl bug.. */ + /* + * trap_init() enabled FXSR and company _before_ testing for FP + * problems here. + * + * Test for the divl bug.. + */ __asm__("fninit\n\t" "fldl %1\n\t" "fdivl %2\n\t" @@ -108,10 +112,15 @@ static void __init check_popad(void) "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " : "=&a" (res) : "d" (inp) - : "ecx", "edi" ); - /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ - if (res != 12345678) printk( "Buggy.\n" ); - else printk( "OK.\n" ); + : "ecx", "edi"); + /* + * If this fails, it means that any user program may lock the + * CPU hard. Too bad. + */ + if (res != 12345678) + printk("Buggy.\n"); + else + printk("OK.\n"); #endif } @@ -137,7 +146,8 @@ static void __init check_config(void) * i486+ only features! (WP works in supervisor mode and the * new "invlpg" and "bswap" instructions) */ -#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP) +#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \ + defined(CONFIG_X86_BSWAP) if (boot_cpu_data.x86 == 3) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif @@ -170,6 +180,7 @@ void __init check_bugs(void) check_fpu(); check_hlt(); check_popad(); - init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + init_utsname()->machine[1] = + '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); } -- cgit v1.2.3 From 83cd1daa1dc3400e5ca2255818641233ebb30680 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:39 +0200 Subject: x86: eliminate dead code in x86_64 entry.S Remove the not longer used handlers for reserved vectors. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_64.S | 4 ---- arch/x86/kernel/traps_64.c | 3 --- 2 files changed, 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df522a7..1f1751dfb2f3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1120,10 +1120,6 @@ ENTRY(coprocessor_segment_overrun) zeroentry do_coprocessor_segment_overrun END(coprocessor_segment_overrun) -ENTRY(reserved) - zeroentry do_reserved -END(reserved) - /* runs on exception stack */ ENTRY(double_fault) XCPT_FRAME diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index adff76ea97c4..ec6d3b2130c4 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -71,7 +71,6 @@ asmlinkage void general_protection(void); asmlinkage void page_fault(void); asmlinkage void coprocessor_error(void); asmlinkage void simd_coprocessor_error(void); -asmlinkage void reserved(void); asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); @@ -702,12 +701,10 @@ DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_ERROR( 4, SIGSEGV, "overflow", overflow) DO_ERROR( 5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -DO_ERROR(18, SIGSEGV, "reserved", reserved) /* Runs on IST stack */ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) -- cgit v1.2.3 From 0da72a4aeb4482c64c1142a2e36b556d13374937 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Apr 2008 20:11:51 +0200 Subject: x86: fix sparse warning in mtrr/generic.c arch/x86/kernel/cpu/mtrr/generic.c:216:12: warning: symbol 'lo' shadows an earlier one Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 5d241ce94a44..0625d4158e58 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -213,12 +213,12 @@ void __init get_mtrr_state(void) mtrr_state.enabled = (lo & 0xc00) >> 10; if (amd_special_default_mtrr()) { - unsigned lo, hi; + unsigned low, high; /* TOP_MEM2 */ - rdmsr(MSR_K8_TOP_MEM2, lo, hi); - tom2 = hi; + rdmsr(MSR_K8_TOP_MEM2, low, high); + tom2 = high; tom2 <<= 32; - tom2 |= lo; + tom2 |= low; tom2 &= 0xffffff8000000ULL; } if (mtrr_show) { -- cgit v1.2.3 From 0dbfafa5fcd4dd189e2adc7b6ed9e0405e846d79 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 23 Apr 2008 15:09:05 +0200 Subject: x86: move i386 memory setup code to e820_32.c The x86_64 code has centralized the memory setup code in e820_64.c. This patch copies that approach to i386: - early_param("mem", ...) parsing is moved from setup_32.c to e820_32.c. - setup_memory_map() and finish_e820_parsing() are factored out from setup_arch(), and declarations are added to e820_32.h. - print_memory_map() is made static and removed from e820_32.h. - user_defined_memmap is marked as __initdata. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/e820_32.c | 60 ++++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/setup_32.c | 50 ++------------------------------------ include/asm-x86/e820_32.h | 4 +++- 3 files changed, 63 insertions(+), 51 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index ed733e7cf4e6..31ea2bb8c91a 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -30,7 +30,6 @@ unsigned long pci_mem_start = 0x10000000; #ifdef CONFIG_PCI EXPORT_SYMBOL(pci_mem_start); #endif -extern int user_defined_memmap; static struct resource system_rom_resource = { .name = "System ROM", @@ -584,7 +583,7 @@ void __init e820_register_memory(void) pci_mem_start, gapstart, gapsize); } -void __init print_memory_map(char *who) +static void __init print_memory_map(char *who) { int i; @@ -692,6 +691,54 @@ e820_all_mapped(unsigned long s, unsigned long e, unsigned type) return 0; } +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +char * __init __attribute__((weak)) memory_setup(void) +{ + return machine_specific_memory_setup(); +} + +void __init setup_memory_map(void) +{ + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + print_memory_map(memory_setup()); +} + +static int __initdata user_defined_memmap; + +/* + * "mem=nopentium" disables the 4MB page tables. + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to , overriding the bios size. + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from + * to +, overriding the bios size. + * + * HPA tells me bootloaders need to parse mem=, so no new + * option should be mem= [also see Documentation/i386/boot.txt] + */ +static int __init parse_mem(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "nopentium") == 0) { + setup_clear_cpu_cap(X86_FEATURE_PSE); + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long mem_size; + + mem_size = memparse(arg, &arg); + limit_regions(mem_size); + user_defined_memmap = 1; + } + return 0; +} +early_param("mem", parse_mem); + static int __init parse_memmap(char *arg) { if (!arg) @@ -762,6 +809,15 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, new_type); } } + +void __init finish_e820_parsing(void) +{ + if (user_defined_memmap) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); + } +} + void __init update_e820(void) { u8 nr_map; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 2c5f8b213e86..b54c79c91efd 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -237,42 +237,6 @@ static inline void copy_edd(void) } #endif -int __initdata user_defined_memmap; - -/* - * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM - * to , overriding the bios size. - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from - * to +, overriding the bios size. - * - * HPA tells me bootloaders need to parse mem=, so no new - * option should be mem= [also see Documentation/i386/boot.txt] - */ -static int __init parse_mem(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp(arg, "nopentium") == 0) { - setup_clear_cpu_cap(X86_FEATURE_PSE); - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long mem_size; - - mem_size = memparse(arg, &arg); - limit_regions(mem_size); - user_defined_memmap = 1; - } - return 0; -} -early_param("mem", parse_mem); - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. @@ -725,12 +689,6 @@ static void set_mca_bus(int x) static void set_mca_bus(int x) { } #endif -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - return machine_specific_memory_setup(); -} - #ifdef CONFIG_NUMA /* * In the golden day, when everything among i386 and x86_64 will be @@ -786,8 +744,7 @@ void __init setup_arch(char **cmdline_p) #endif ARCH_SETUP - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(memory_setup()); + setup_memory_map(); copy_edd(); @@ -807,10 +764,7 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - if (user_defined_memmap) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); - } + finish_e820_parsing(); strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index a9f7c6ec32bf..e1f10c60901f 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -18,6 +18,9 @@ #ifndef __ASSEMBLY__ +extern void setup_memory_map(void); +extern void finish_e820_parsing(void); + extern struct e820map e820; extern void update_e820(void); @@ -32,7 +35,6 @@ extern void update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); extern void e820_register_memory(void); extern void limit_regions(unsigned long long size); -extern void print_memory_map(char *who); extern void init_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); -- cgit v1.2.3 From 95ffa2438d0e9c48779f0106b1c0eb36165e759c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 29 Apr 2008 03:52:33 -0700 Subject: x86: mtrr cleanup for converting continuous to discrete layout, v8 some BIOS like to use continus MTRR layout, and X driver can not add WB entries for graphical cards when 4g or more RAM installed. the patch will change MTRR to discrete. mtrr_chunk_size= could be used to have smaller continuous block to hold holes. default is 256m, could be set according to size of graphics card memory. mtrr_gran_size= could be used to send smallest mtrr block to avoid run out of MTRRs v2: fix -1 for UC checking v3: default to disable, and need use enable_mtrr_cleanup to enable this feature skip the var state change warning. remove next_basek in range_to_mtrr() v4: correct warning mask. v5: CONFIG_MTRR_SANITIZER v6: fix 1g, 2g, 512 aligment with extra hole v7: gran_sizek to prevent running out of MTRRs. v8: fix hole_basek caculation caused when removing next_basek gran_sizek using when basek is 0. need to apply [PATCH] x86: fix trimming e820 with MTRR holes. right after this one. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 14 ++ arch/x86/Kconfig | 26 ++ arch/x86/kernel/cpu/mtrr/generic.c | 32 ++- arch/x86/kernel/cpu/mtrr/main.c | 467 +++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/mtrr/mtrr.h | 3 + 5 files changed, 528 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e07c432c731f..44427607b7c6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -599,6 +599,20 @@ and is between 256 and 4096 characters. It is defined in the file See drivers/char/README.epca and Documentation/digiepca.txt. + disable_mtrr_cleanup [X86] + enable_mtrr_cleanup [X86] + The kernel tries to adjust MTRR layout from continuous + to discrete, to make X server driver able to add WB + entry later. This parameter enables/disables that. + + mtrr_chunk_size=nn[KMG] [X86] + used for mtrr cleanup. It is largest continous chunk + that could hold holes aka. UC entries. + + mtrr_gran_size=nn[KMG] [X86] + used for mtrr cleanup. It is granity of mtrr block. + Big value could prevent small alignment use up MTRRs. + disable_mtrr_trim [X86, Intel and AMD only] By default the kernel will trim any uncacheable memory out of your available memory pool based on diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe361ae7ef2f..97a1764a3d20 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1092,6 +1092,32 @@ config MTRR See for more information. +config MTRR_SANITIZER + def_bool y + prompt "MTRR cleanup support" + depends on MTRR + help + Convert MTRR layout from continuous to discrete, so some X driver + could add WB entries. + + Say N here if you see bootup problems (boot crash, boot hang, + spontaneous reboots). + + Could be disabled with disable_mtrr_cleanup. Also mtrr_chunk_size + could be used to send largest mtrr entry size for continuous block + to hold holes (aka. UC entries) + + If unsure, say Y. + +config MTRR_SANITIZER_ENABLE_DEFAULT + def_bool y + prompt "Enable MTRR cleanup by default" + depends on MTRR_SANITIZER + help + Enable mtrr cleanup by default + + If unsure, say Y. + config X86_PAT bool prompt "x86 PAT support" diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0625d4158e58..5aae648600bf 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = { static unsigned long smp_changes_mask; static struct mtrr_state mtrr_state = {}; static int mtrr_state_set; -static u64 tom2; +u64 mtrr_tom2; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." @@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) } } - if (tom2) { - if (start >= (1ULL<<32) && (end < tom2)) + if (mtrr_tom2) { + if (start >= (1ULL<<32) && (end < mtrr_tom2)) return MTRR_TYPE_WRBACK; } @@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } +/* fill the MSR pair relating to a var range */ +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) +{ + struct mtrr_var_range *vr; + + vr = mtrr_state.var_ranges; + + vr[index].base_lo = base_lo; + vr[index].base_hi = base_hi; + vr[index].mask_lo = mask_lo; + vr[index].mask_hi = mask_hi; +} + static void get_fixed_ranges(mtrr_type * frs) { @@ -216,10 +230,10 @@ void __init get_mtrr_state(void) unsigned low, high; /* TOP_MEM2 */ rdmsr(MSR_K8_TOP_MEM2, low, high); - tom2 = high; - tom2 <<= 32; - tom2 |= low; - tom2 &= 0xffffff8000000ULL; + mtrr_tom2 = high; + mtrr_tom2 <<= 32; + mtrr_tom2 |= low; + mtrr_tom2 &= 0xffffff8000000ULL; } if (mtrr_show) { int high_width; @@ -251,9 +265,9 @@ void __init get_mtrr_state(void) else printk(KERN_INFO "MTRR %u disabled\n", i); } - if (tom2) { + if (mtrr_tom2) { printk(KERN_INFO "TOM2: %016llx aka %lldM\n", - tom2, tom2>>20); + mtrr_tom2, mtrr_tom2>>20); } } mtrr_state_set = 1; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6a1e278d9323..8a6f68b45e3e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -609,6 +610,452 @@ static struct sysdev_driver mtrr_sysdev_driver = { .resume = mtrr_restore, }; +#ifdef CONFIG_MTRR_SANITIZER + +#ifdef CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT +static int enable_mtrr_cleanup __initdata = 1; +#else +static int enable_mtrr_cleanup __initdata; +#endif + +#else + +static int enable_mtrr_cleanup __initdata = -1; + +#endif + +static int __init disable_mtrr_cleanup_setup(char *str) +{ + if (enable_mtrr_cleanup != -1) + enable_mtrr_cleanup = 0; + return 0; +} +early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); + +static int __init enable_mtrr_cleanup_setup(char *str) +{ + if (enable_mtrr_cleanup != -1) + enable_mtrr_cleanup = 1; + return 0; +} +early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); + +#define RANGE_NUM 256 + +struct res_range { + unsigned long start; + unsigned long end; +}; + +static int __init add_range(struct res_range *range, int nr_range, unsigned long start, + unsigned long end, int merge) +{ + int i; + + if (!merge) + goto addit; + + /* try to merge it with old one */ + for (i = 0; i < nr_range; i++) { + unsigned long final_start, final_end; + unsigned long common_start, common_end; + + if (!range[i].end) + continue; + + common_start = max(range[i].start, start); + common_end = min(range[i].end, end); + if (common_start > common_end + 1) + continue; + + final_start = min(range[i].start, start); + final_end = max(range[i].end, end); + + range[i].start = final_start; + range[i].end = final_end; + return nr_range; + } + +addit: + /* need to add that */ + if (nr_range >= RANGE_NUM) + return nr_range; + + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; + +} +static void __init subtract_range(struct res_range *range, unsigned long start, + unsigned long end) +{ + int i; + int j; + + for (j = 0; j < RANGE_NUM; j++) { + if (!range[j].end) + continue; + + if (start <= range[j].start && end >= range[j].end) { + range[j].start = 0; + range[j].end = 0; + continue; + } + + if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { + range[j].start = end + 1; + continue; + } + + + if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { + range[j].end = start - 1; + continue; + } + + if (start > range[j].start && end < range[j].end) { + /* find the new spare */ + for (i = 0; i < RANGE_NUM; i++) { + if (range[i].end == 0) + break; + } + if (i < RANGE_NUM) { + range[i].end = range[j].end; + range[i].start = end + 1; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start - 1; + continue; + } + } +} + +static int __init cmp_range(const void *x1, const void *x2) +{ + const struct res_range *r1 = x1; + const struct res_range *r2 = x2; + long start1, start2; + + start1 = r1->start; + start2 = r2->start; + + return start1 - start2; +} + +struct var_mtrr_state { + unsigned long range_startk, range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; + unsigned address_bits; +}; + +static void __init set_var_mtrr( + unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type, unsigned address_bits) +{ + u32 base_lo, base_hi, mask_lo, mask_hi; + unsigned address_mask_high; + + if (!sizek) { + fill_mtrr_var_range(reg, 0, 0, 0, 0); + return; + } + + address_mask_high = ((1u << (address_bits - 32u)) - 1u); + + base_hi = basek >> 22; + base_lo = basek << 10; + + if (sizek < 4*1024*1024) { + mask_hi = address_mask_high; + mask_lo = ~((sizek << 10) - 1); + } else { + mask_hi = address_mask_high & (~((sizek >> 22) - 1)); + mask_lo = 0; + } + + base_lo |= type; + mask_lo |= 0x800; + fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); +} + +static unsigned int __init range_to_mtrr(unsigned int reg, + unsigned long range_startk, unsigned long range_sizek, + unsigned char type, unsigned address_bits) +{ + if (!range_sizek || (reg >= num_var_ranges)) + return reg; + + while (range_sizek) { + unsigned long max_align, align; + unsigned long sizek; + /* Compute the maximum size I can make a range */ + if (range_startk) + max_align = ffs(range_startk) - 1; + else + max_align = 32; + align = fls(range_sizek) - 1; + if (align > max_align) + align = max_align; + + sizek = 1 << align; + printk(KERN_INFO "Setting variable MTRR %d, base: %ldMB, range: %ldMB, type %s\n", + reg, range_startk >> 10, sizek >> 10, + (type == MTRR_TYPE_UNCACHABLE)?"UC": + ((type == MTRR_TYPE_WRBACK)?"WB":"Other") + ); + set_var_mtrr(reg++, range_startk, sizek, type, address_bits); + range_startk += sizek; + range_sizek -= sizek; + if (reg >= num_var_ranges) + break; + } + return reg; +} + +static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) +{ + unsigned long hole_basek, hole_sizek; + unsigned long range0_basek, range0_sizek; + unsigned long range_basek, range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + + hole_basek = 0; + hole_sizek = 0; + chunk_sizek = state->chunk_sizek; + gran_sizek = state->gran_sizek; + + /* align with gran size, prevent small block used up MTRRs */ + range_basek = ALIGN(state->range_startk, gran_sizek); + if ((range_basek > basek) && basek) + return; + range_sizek = ALIGN(state->range_sizek - (range_basek - state->range_startk), gran_sizek); + + while (range_basek + range_sizek > (state->range_startk + state->range_sizek)) { + range_sizek -= gran_sizek; + if (!range_sizek) + return; + } + state->range_startk = range_basek; + state->range_sizek = range_sizek; + + /* try to append some small hole */ + range0_basek = state->range_startk; + range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + if ((range0_sizek == state->range_sizek) || + ((range0_basek + range0_sizek - chunk_sizek > basek) && basek)) { + printk(KERN_INFO "rangeX: %016lx - %016lx\n", range0_basek<<10, (range0_basek + state->range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + state->range_sizek, MTRR_TYPE_WRBACK, state->address_bits); + return; + } + + + range0_sizek -= chunk_sizek; + printk(KERN_INFO "range0: %016lx - %016lx\n", range0_basek<<10, (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK, state->address_bits); + + range_basek = range0_basek + range0_sizek; + range_sizek = chunk_sizek; + if (range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) { + hole_sizek = range_sizek - (state->range_sizek - range0_sizek); + hole_basek = range_basek + range_sizek - hole_sizek; + } else + range_sizek = state->range_sizek - range0_sizek; + + printk(KERN_INFO "range: %016lx - %016lx\n", range_basek<<10, (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK, state->address_bits); + if (hole_sizek) { + printk(KERN_INFO "hole: %016lx - %016lx\n", hole_basek<<10, (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE, state->address_bits); + } +} + +static void __init set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, unsigned long size_pfn) +{ + unsigned long basek, sizek; + + if (state->reg >= num_var_ranges) + return; + + basek = base_pfn << (PAGE_SHIFT - 10); + sizek = size_pfn << (PAGE_SHIFT - 10); + + /* See if I can merge with the last range */ + if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) { + unsigned long endk = basek + sizek; + state->range_sizek = endk - state->range_startk; + return; + } + /* Write the range mtrrs */ + if (state->range_sizek != 0) { + range_to_mtrr_with_hole(state, basek); + + state->range_startk = 0; + state->range_sizek = 0; + } + /* Allocate an msr */ + state->range_startk = basek; + state->range_sizek = sizek; +} + +/* mininum size of mtrr block that can take hole */ +static u64 mtrr_chunk_size __initdata = (256ULL<<20); + +static int __init parse_mtrr_chunk_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_chunk_size = memparse(p, &p); + return 0; +} +early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); + +/* granity of mtrr of block */ +static u64 mtrr_gran_size __initdata = (64ULL<<20); + +static int __init parse_mtrr_gran_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_gran_size = memparse(p, &p); + return 0; +} +early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); + +static void __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, unsigned address_bits) +{ + struct var_mtrr_state var_state; + int i; + + var_state.range_startk = 0; + var_state.range_sizek = 0; + var_state.reg = 0; + var_state.address_bits = address_bits; + var_state.chunk_sizek = mtrr_chunk_size >> 10; + var_state.gran_sizek = mtrr_gran_size >> 10; + + /* Write the range etc */ + for (i = 0; i < nr_range; i++) + set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1); + + /* Write the last range */ + range_to_mtrr_with_hole(&var_state, 0); + printk(KERN_INFO "DONE variable MTRRs\n"); + /* Clear out the extra MTRR's */ + while (var_state.reg < num_var_ranges) + set_var_mtrr(var_state.reg++, 0, 0, 0, var_state.address_bits); +} + +static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) +{ + unsigned long i, base, size; + mtrr_type type; + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type != MTRR_TYPE_WRBACK) + continue; + nr_range = add_range(range, nr_range, base, base + size - 1, 1); + } + printk(KERN_INFO "After WB checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + + /* take out UC ranges */ + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type != MTRR_TYPE_UNCACHABLE) + continue; + if (!size) + continue; + subtract_range(range, base, base + size - 1); + } + if (extra_remove_size) + subtract_range(range, extra_remove_base, extra_remove_base + extra_remove_size - 1); + + /* get new range num */ + nr_range = 0; + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + nr_range++; + } + printk(KERN_INFO "After UC checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + printk(KERN_INFO "After sorting\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + + return nr_range; +} + +static int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long i, base, size, def, dummy; + mtrr_type type; + struct res_range range[RANGE_NUM]; + int nr_range; + unsigned long extra_remove_base, extra_remove_size; + + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* check entries number */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type >= MTRR_NUM_TYPES) + continue; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* check if we got UC entries */ + if (!num[MTRR_TYPE_UNCACHABLE]) + return 0; + + /* check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + memset(range, 0, sizeof(range)); + extra_remove_size = 0; + if (mtrr_tom2) { + extra_remove_base = 1 << (32 - PAGE_SHIFT); + extra_remove_size = (mtrr_tom2>>PAGE_SHIFT) - extra_remove_base; + } + nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size); + + /* convert ranges to var ranges state */ + x86_setup_var_mtrrs(range, nr_range, address_bits); + + return 1; + +} + static int disable_mtrr_trim; static int __init disable_mtrr_trim_setup(char *str) @@ -729,18 +1176,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) */ void __init mtrr_bp_init(void) { + u32 phys_addr; init_ifs(); + phys_addr = 32; + if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; size_or_mask = 0xff000000; /* 36 bits */ size_and_mask = 0x00f00000; + phys_addr = 36; /* This is an AMD specific MSR, but we assume(hope?) that Intel will implement it to when they extend the address bus of the Xeon. */ if (cpuid_eax(0x80000000) >= 0x80000008) { - u32 phys_addr; phys_addr = cpuid_eax(0x80000008) & 0xff; /* CPUID workaround for Intel 0F33/0F34 CPU */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && @@ -758,6 +1208,7 @@ void __init mtrr_bp_init(void) don't support PAE */ size_or_mask = 0xfff00000; /* 32 bits */ size_and_mask = 0; + phys_addr = 32; } } else { switch (boot_cpu_data.x86_vendor) { @@ -791,8 +1242,13 @@ void __init mtrr_bp_init(void) if (mtrr_if) { set_num_var_ranges(); init_table(); - if (use_intel()) + if (use_intel()) { get_mtrr_state(); + + if (mtrr_cleanup(phys_addr)) + mtrr_if->set_all(); + + } } } @@ -829,9 +1285,10 @@ static int __init mtrr_init_finialize(void) { if (!mtrr_if) return 0; - if (use_intel()) - mtrr_state_warn(); - else { + if (use_intel()) { + if (enable_mtrr_cleanup < 1) + mtrr_state_warn(); + } else { /* The CPUs haven't MTRR and seem to not support SMP. They have * specific drivers, we use a tricky method to support * suspend/resume for them. diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2cc77eb6fea3..2dc4ec656b23 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt); void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); extern void set_mtrr_ops(struct mtrr_ops * ops); @@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if; #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) extern unsigned int num_var_ranges; +extern u64 mtrr_tom2; void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); -- cgit v1.2.3 From 42651f15824d003e8357693ab72c4dbb3e280836 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 29 Apr 2008 01:59:49 -0700 Subject: x86: fix trimming e820 with MTRR holes. converting MTRR layout from continous to discrete, some time could run out of MTRRs. So add gran_sizek to prevent that by dumpping small RAM piece less than gran_sizek. previous trimming only can handle highest_pfn from mtrr to end_pfn from e820. when have more than 4g RAM installed, there will be holes below 4g. so need to check ram below 4g is coverred well. need to be applied after [PATCH] x86: mtrr cleanup for converting continuous to discrete layout v7 Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/main.c | 101 +++++++++++++++++++++++++++++++++------- arch/x86/kernel/e820_32.c | 7 ++- arch/x86/kernel/e820_64.c | 6 ++- include/asm-x86/e820_32.h | 2 +- include/asm-x86/e820_64.h | 2 +- 5 files changed, 97 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8a6f68b45e3e..9ab5c16b0d52 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1095,6 +1095,17 @@ int __init amd_special_default_mtrr(void) return 0; } +static u64 __init real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) +{ + u64 trim_start, trim_size; + trim_start = start_pfn; + trim_start <<= PAGE_SHIFT; + trim_size = limit_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + return update_memory_range(trim_start, trim_size, E820_RAM, + E820_RESERVED); +} /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs * @end_pfn: ending page frame number @@ -1110,8 +1121,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; - u64 trim_start, trim_size; + struct res_range range[RANGE_NUM]; + int nr_range; + u64 total_real_trim_size; + int changed; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; /* * Make sure we only trim uncachable memory on machines that * support the Intel MTRR architecture: @@ -1123,9 +1139,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) if (def != MTRR_TYPE_UNCACHABLE) return 0; - if (amd_special_default_mtrr()) - return 0; - /* Find highest cached pfn */ for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -1145,26 +1158,80 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) return 0; } - if (highest_pfn < end_pfn) { + /* check entries number */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type >= MTRR_NUM_TYPES) + continue; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* no entry for WB? */ + if (!num[MTRR_TYPE_WRBACK]) + return 0; + + /* check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + memset(range, 0, sizeof(range)); + nr_range = 0; + if (mtrr_tom2) { + range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); + range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; + if (highest_pfn < range[nr_range].end + 1) + highest_pfn = range[nr_range].end + 1; + nr_range++; + } + nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + + changed = 0; + total_real_trim_size = 0; + + /* check the top at first */ + i = nr_range - 1; + if (range[i].end + 1 < end_pfn) { + total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn); + } + + if (total_real_trim_size) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %luMB of RAM.\n", - (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); + " all of memory, losing %lluMB of RAM.\n", + total_real_trim_size >> 20); WARN_ON(1); - printk(KERN_INFO "update e820 for mtrr\n"); - trim_start = highest_pfn; - trim_start <<= PAGE_SHIFT; - trim_size = end_pfn; - trim_size <<= PAGE_SHIFT; - trim_size -= trim_start; - update_memory_range(trim_start, trim_size, E820_RAM, - E820_RESERVED); + printk(KERN_INFO "update e820 for mtrr -- end_pfn\n"); update_e820(); - return 1; + changed = 1; } - return 0; + total_real_trim_size = 0; + if (range[0].start) + total_real_trim_size += real_trim_memory(0, range[0].start); + + for (i = 0; i < nr_range - 1; i--) { + if (range[i].end + 1 < range[i+1].start) + total_real_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); + } + + if (total_real_trim_size) { + printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" + " all of memory, losing %lluMB of RAM.\n", + total_real_trim_size >> 20); + + WARN_ON(1); + + printk(KERN_INFO "update e820 for mtrr -- holes\n"); + update_e820(); + changed = 1; + } + + return changed; } /** diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 31ea2bb8c91a..857f706273a8 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -783,10 +783,11 @@ static int __init parse_memmap(char *arg) return 0; } early_param("memmap", parse_memmap); -void __init update_memory_range(u64 start, u64 size, unsigned old_type, +u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { int i; + u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -798,6 +799,7 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, /* totally covered? */ if (ei->addr >= start && ei->size <= size) { ei->type = new_type; + real_updated_size += ei->size; continue; } /* partially covered */ @@ -807,7 +809,10 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, continue; add_memory_region(final_start, final_end - final_start, new_type); + real_updated_size += final_end - final_start; } + + return real_updated_size; } void __init finish_e820_parsing(void) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 124480c0008d..848b2cd2d1dd 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -829,10 +829,11 @@ void __init finish_e820_parsing(void) } } -void __init update_memory_range(u64 start, u64 size, unsigned old_type, +u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { int i; + u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -844,6 +845,7 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, /* totally covered? */ if (ei->addr >= start && ei->size <= size) { ei->type = new_type; + real_updated_size += ei->size; continue; } /* partially covered */ @@ -853,7 +855,9 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, continue; add_memory_region(final_start, final_end - final_start, new_type); + real_updated_size += final_end - final_start; } + return real_updated_size; } void __init update_e820(void) diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index e1f10c60901f..af0711b220df 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -31,7 +31,7 @@ extern void propagate_e820_map(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void add_memory_region(unsigned long long start, unsigned long long size, int type); -extern void update_memory_range(u64 start, u64 size, unsigned old_type, +extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); extern void e820_register_memory(void); extern void limit_regions(unsigned long long size); diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 71c4d685d30d..6ae3e2803286 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -21,7 +21,7 @@ extern unsigned long find_e820_area_size(unsigned long start, unsigned long align); extern void add_memory_region(unsigned long start, unsigned long size, int type); -extern void update_memory_range(u64 start, u64 size, unsigned old_type, +extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); extern void setup_memory_region(void); extern void contig_e820_setup(void); -- cgit v1.2.3 From 8a374026c265476b1acfc3c186a66d59ebdb2cda Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 29 Apr 2008 20:25:16 -0700 Subject: x86: fix trimming e820 with MTRR holes. - fix v2: process hole then end_pfn fix update_memory_range with whole cover comparing Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/main.c | 44 ++++++++++++++--------------------------- arch/x86/kernel/e820_32.c | 3 ++- arch/x86/kernel/e820_64.c | 3 ++- 3 files changed, 19 insertions(+), 31 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 9ab5c16b0d52..5a2a4c146333 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1098,11 +1098,12 @@ int __init amd_special_default_mtrr(void) static u64 __init real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) { u64 trim_start, trim_size; - trim_start = start_pfn; + trim_start = start_pfn; trim_start <<= PAGE_SHIFT; trim_size = limit_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; + return update_memory_range(trim_start, trim_size, E820_RAM, E820_RESERVED); } @@ -1124,7 +1125,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) struct res_range range[RANGE_NUM]; int nr_range; u64 total_real_trim_size; - int changed; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1189,49 +1189,35 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); - changed = 0; - total_real_trim_size = 0; - - /* check the top at first */ - i = nr_range - 1; - if (range[i].end + 1 < end_pfn) { - total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn); - } - - if (total_real_trim_size) { - printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %lluMB of RAM.\n", - total_real_trim_size >> 20); - - WARN_ON(1); - - printk(KERN_INFO "update e820 for mtrr -- end_pfn\n"); - update_e820(); - changed = 1; - } - total_real_trim_size = 0; + /* check the head */ if (range[0].start) total_real_trim_size += real_trim_memory(0, range[0].start); - - for (i = 0; i < nr_range - 1; i--) { + /* check the holes */ + for (i = 0; i < nr_range - 1; i++) { if (range[i].end + 1 < range[i+1].start) total_real_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); } + /* check the top */ + i = nr_range - 1; + if (range[i].end + 1 < end_pfn) + total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn); if (total_real_trim_size) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" " all of memory, losing %lluMB of RAM.\n", total_real_trim_size >> 20); - WARN_ON(1); + if (enable_mtrr_cleanup < 1) + WARN_ON(1); - printk(KERN_INFO "update e820 for mtrr -- holes\n"); + printk(KERN_INFO "update e820 for mtrr\n"); update_e820(); - changed = 1; + + return 1; } - return changed; + return 0; } /** diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 857f706273a8..751d517d802c 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -797,7 +797,8 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, if (ei->type != old_type) continue; /* totally covered? */ - if (ei->addr >= start && ei->size <= size) { + if (ei->addr >= start && + (ei->addr + ei->size) <= (start + size)) { ei->type = new_type; real_updated_size += ei->size; continue; diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 848b2cd2d1dd..c45b4dea4055 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -843,7 +843,8 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, if (ei->type != old_type) continue; /* totally covered? */ - if (ei->addr >= start && ei->size <= size) { + if (ei->addr >= start && + (ei->addr + ei->size) <= (start + size)) { ei->type = new_type; real_updated_size += ei->size; continue; -- cgit v1.2.3 From f5098d62c1d1cede8ff23d01bbf50a421f110562 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 29 Apr 2008 20:25:58 -0700 Subject: x86: mtrr cleanup for converting continuous to discrete layout v8 - fix v9: address format change requests by Ingo more case handling in range_to_var_with_hole Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 9 +-- arch/x86/kernel/cpu/mtrr/main.c | 164 ++++++++++++++++++++++------------------ 2 files changed, 94 insertions(+), 79 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 97a1764a3d20..f417fe33d852 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1110,13 +1110,12 @@ config MTRR_SANITIZER If unsure, say Y. config MTRR_SANITIZER_ENABLE_DEFAULT - def_bool y - prompt "Enable MTRR cleanup by default" + int "MTRR cleanup enable value (0-1)" + range 0 1 + default "0" depends on MTRR_SANITIZER help - Enable mtrr cleanup by default - - If unsure, say Y. + Enable mtrr cleanup default value config X86_PAT bool diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5a2a4c146333..79597d3c3431 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -611,17 +611,9 @@ static struct sysdev_driver mtrr_sysdev_driver = { }; #ifdef CONFIG_MTRR_SANITIZER - -#ifdef CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT -static int enable_mtrr_cleanup __initdata = 1; -#else -static int enable_mtrr_cleanup __initdata; -#endif - +static int enable_mtrr_cleanup __initdata = CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; #else - static int enable_mtrr_cleanup __initdata = -1; - #endif static int __init disable_mtrr_cleanup_setup(char *str) @@ -640,6 +632,7 @@ static int __init enable_mtrr_cleanup_setup(char *str) } early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); +/* should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 struct res_range { @@ -647,13 +640,27 @@ struct res_range { unsigned long end; }; -static int __init add_range(struct res_range *range, int nr_range, unsigned long start, - unsigned long end, int merge) +static int __init +add_range(struct res_range *range, int nr_range, unsigned long start, + unsigned long end) { - int i; + /* out of slots */ + if (nr_range >= RANGE_NUM) + return nr_range; - if (!merge) - goto addit; + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; +} + +static int __init +add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, + unsigned long end) +{ + int i; /* try to merge it with old one */ for (i = 0; i < nr_range; i++) { @@ -676,24 +683,14 @@ static int __init add_range(struct res_range *range, int nr_range, unsigned long return nr_range; } -addit: /* need to add that */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; - + return add_range(range, nr_range, start, end); } -static void __init subtract_range(struct res_range *range, unsigned long start, - unsigned long end) + +static void __init +subtract_range(struct res_range *range, unsigned long start, unsigned long end) { - int i; - int j; + int i, j; for (j = 0; j < RANGE_NUM; j++) { if (!range[j].end) @@ -747,46 +744,47 @@ static int __init cmp_range(const void *x1, const void *x2) } struct var_mtrr_state { - unsigned long range_startk, range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - unsigned int reg; - unsigned address_bits; + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; + unsigned int address_bits; }; -static void __init set_var_mtrr( - unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned address_bits) +static void __init +set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type, unsigned address_bits) { u32 base_lo, base_hi, mask_lo, mask_hi; - unsigned address_mask_high; + u64 base, mask; if (!sizek) { fill_mtrr_var_range(reg, 0, 0, 0, 0); return; } - address_mask_high = ((1u << (address_bits - 32u)) - 1u); + mask = (1ULL << address_bits) - 1; + mask &= ~((((u64)sizek) << 10) - 1); - base_hi = basek >> 22; - base_lo = basek << 10; + base = ((u64)basek) << 10; - if (sizek < 4*1024*1024) { - mask_hi = address_mask_high; - mask_lo = ~((sizek << 10) - 1); - } else { - mask_hi = address_mask_high & (~((sizek >> 22) - 1)); - mask_lo = 0; - } + base |= type; + mask |= 0x800; + + base_lo = base & ((1ULL<<32) - 1); + base_hi = base >> 32; + + mask_lo = mask & ((1ULL<<32) - 1); + mask_hi = mask >> 32; - base_lo |= type; - mask_lo |= 0x800; fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); } -static unsigned int __init range_to_mtrr(unsigned int reg, - unsigned long range_startk, unsigned long range_sizek, - unsigned char type, unsigned address_bits) +static unsigned int __init +range_to_mtrr(unsigned int reg, unsigned long range_startk, + unsigned long range_sizek, unsigned char type, + unsigned address_bits) { if (!range_sizek || (reg >= num_var_ranges)) return reg; @@ -794,6 +792,7 @@ static unsigned int __init range_to_mtrr(unsigned int reg, while (range_sizek) { unsigned long max_align, align; unsigned long sizek; + /* Compute the maximum size I can make a range */ if (range_startk) max_align = ffs(range_startk) - 1; @@ -818,7 +817,8 @@ static unsigned int __init range_to_mtrr(unsigned int reg, return reg; } -static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) +static void __init +range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) { unsigned long hole_basek, hole_sizek; unsigned long range0_basek, range0_sizek; @@ -848,23 +848,31 @@ static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigne /* try to append some small hole */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); - if ((range0_sizek == state->range_sizek) || - ((range0_basek + range0_sizek - chunk_sizek > basek) && basek)) { + if (range0_sizek == state->range_sizek) { printk(KERN_INFO "rangeX: %016lx - %016lx\n", range0_basek<<10, (range0_basek + state->range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, state->range_sizek, MTRR_TYPE_WRBACK, state->address_bits); return; + } else if (basek) { + while (range0_basek + range0_sizek - chunk_sizek > basek) { + range0_sizek -= chunk_sizek; + if (!range0_sizek) + break; + } } - range0_sizek -= chunk_sizek; + if (range0_sizek > chunk_sizek) + range0_sizek -= chunk_sizek; printk(KERN_INFO "range0: %016lx - %016lx\n", range0_basek<<10, (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK, state->address_bits); range_basek = range0_basek + range0_sizek; range_sizek = chunk_sizek; - if (range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) { + + if ((range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) && + (range_basek + range_sizek <= basek)) { hole_sizek = range_sizek - (state->range_sizek - range0_sizek); hole_basek = range_basek + range_sizek - hole_sizek; } else @@ -880,7 +888,9 @@ static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigne } } -static void __init set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, unsigned long size_pfn) +static void __init +set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, + unsigned long size_pfn) { unsigned long basek, sizek; @@ -921,7 +931,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); /* granity of mtrr of block */ -static u64 mtrr_gran_size __initdata = (64ULL<<20); +static u64 mtrr_gran_size __initdata = (1ULL<<20); static int __init parse_mtrr_gran_size_opt(char *p) { @@ -932,17 +942,19 @@ static int __init parse_mtrr_gran_size_opt(char *p) } early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); -static void __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, unsigned address_bits) +static void __init +x86_setup_var_mtrrs(struct res_range *range, int nr_range, + unsigned address_bits) { struct var_mtrr_state var_state; int i; - var_state.range_startk = 0; - var_state.range_sizek = 0; - var_state.reg = 0; - var_state.address_bits = address_bits; - var_state.chunk_sizek = mtrr_chunk_size >> 10; - var_state.gran_sizek = mtrr_gran_size >> 10; + var_state.range_startk = 0; + var_state.range_sizek = 0; + var_state.reg = 0; + var_state.address_bits = address_bits; + var_state.chunk_sizek = mtrr_chunk_size >> 10; + var_state.gran_sizek = mtrr_gran_size >> 10; /* Write the range etc */ for (i = 0; i < nr_range; i++) @@ -952,11 +964,16 @@ static void __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, un range_to_mtrr_with_hole(&var_state, 0); printk(KERN_INFO "DONE variable MTRRs\n"); /* Clear out the extra MTRR's */ - while (var_state.reg < num_var_ranges) - set_var_mtrr(var_state.reg++, 0, 0, 0, var_state.address_bits); + while (var_state.reg < num_var_ranges) { + set_var_mtrr(var_state.reg, 0, 0, 0, var_state.address_bits); + var_state.reg++; + } } -static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) +static int __init +x86_get_mtrr_mem_range(struct res_range *range, int nr_range, + unsigned long extra_remove_base, + unsigned long extra_remove_size) { unsigned long i, base, size; mtrr_type type; @@ -965,7 +982,7 @@ static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, mtrr_if->get(i, &base, &size, &type); if (type != MTRR_TYPE_WRBACK) continue; - nr_range = add_range(range, nr_range, base, base + size - 1, 1); + nr_range = add_range_with_merge(range, nr_range, base, base + size - 1); } printk(KERN_INFO "After WB checking\n"); for (i = 0; i < nr_range; i++) @@ -1005,11 +1022,11 @@ static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, static int __init mtrr_cleanup(unsigned address_bits) { + unsigned long extra_remove_base, extra_remove_size; unsigned long i, base, size, def, dummy; - mtrr_type type; struct res_range range[RANGE_NUM]; + mtrr_type type; int nr_range; - unsigned long extra_remove_base, extra_remove_size; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1053,7 +1070,6 @@ static int __init mtrr_cleanup(unsigned address_bits) x86_setup_var_mtrrs(range, nr_range, address_bits); return 1; - } static int disable_mtrr_trim; -- cgit v1.2.3 From 12031a624af7816ec7660b82be648aa3703b4ebe Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 2 May 2008 02:40:22 -0700 Subject: x86: mtrr cleanup for converting continuous to discrete - auto detect v4 Loop through mtrr chunk_size and gran_size from 1M to 2G to find out the optimal value so user does not need to add mtrr_chunk_size and mtrr_gran_size to the kernel command line. If optimal value is not found, print out all list to help select less optimal value. Add mtrr_spare_reg_nr= so user could set 2 instead of 1, if the card need more entries. v2: find the one with more spare entries v3: fix hole_basek offset v4: tight the compare between range and range_new loop stop with 4g Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Gabriel C Cc: Mika Fischer Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 13 +- arch/x86/Kconfig | 9 + arch/x86/kernel/cpu/mtrr/main.c | 610 +++++++++++++++++++++++++++--------- 3 files changed, 484 insertions(+), 148 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 44427607b7c6..2000977af776 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -610,8 +610,17 @@ and is between 256 and 4096 characters. It is defined in the file that could hold holes aka. UC entries. mtrr_gran_size=nn[KMG] [X86] - used for mtrr cleanup. It is granity of mtrr block. - Big value could prevent small alignment use up MTRRs. + Used for mtrr cleanup. It is granularity of mtrr block. + Default is 1. + Large value could prevent small alignment from + using up MTRRs. + + mtrr_spare_reg_nr=n [X86] + Format: + Range: 0,7 : spare reg number + Default : 1 + Used for mtrr cleanup. It is spare mtrr entries number. + Set to 2 or more if your graphical card needs more. disable_mtrr_trim [X86, Intel and AMD only] By default the kernel will trim any uncacheable diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f417fe33d852..9e9761504422 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1117,6 +1117,15 @@ config MTRR_SANITIZER_ENABLE_DEFAULT help Enable mtrr cleanup default value +config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT + int "MTRR cleanup spare reg num (0-7)" + range 0 7 + default "1" + depends on MTRR_SANITIZER + help + mtrr cleanup spare entries default, it can be changed via + mtrr_spare_reg_nr= + config X86_PAT bool prompt "x86 PAT support" diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79597d3c3431..e6c162c379ac 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -610,28 +610,6 @@ static struct sysdev_driver mtrr_sysdev_driver = { .resume = mtrr_restore, }; -#ifdef CONFIG_MTRR_SANITIZER -static int enable_mtrr_cleanup __initdata = CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; -#else -static int enable_mtrr_cleanup __initdata = -1; -#endif - -static int __init disable_mtrr_cleanup_setup(char *str) -{ - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 0; - return 0; -} -early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); - -static int __init enable_mtrr_cleanup_setup(char *str) -{ - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 1; - return 0; -} -early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); - /* should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 @@ -702,13 +680,15 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end) continue; } - if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { + if (start <= range[j].start && end < range[j].end && + range[j].start < end + 1) { range[j].start = end + 1; continue; } - if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { + if (start > range[j].start && end >= range[j].end && + range[j].end > start - 1) { range[j].end = start - 1; continue; } @@ -743,18 +723,123 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +static int __init +x86_get_mtrr_mem_range(struct res_range *range, int nr_range, + unsigned long extra_remove_base, + unsigned long extra_remove_size) +{ + unsigned long i, base, size; + mtrr_type type; + + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + nr_range = add_range_with_merge(range, nr_range, base, + base + size - 1); + } + printk(KERN_DEBUG "After WB checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + + /* take out UC ranges */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_UNCACHABLE) + continue; + size = range_state[i].size_pfn; + if (!size) + continue; + base = range_state[i].base_pfn; + subtract_range(range, base, base + size - 1); + } + if (extra_remove_size) + subtract_range(range, extra_remove_base, + extra_remove_base + extra_remove_size - 1); + + /* get new range num */ + nr_range = 0; + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + nr_range++; + } + printk(KERN_DEBUG "After UC checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + printk(KERN_DEBUG "After sorting\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + + /* clear those is not used */ + for (i = nr_range; i < RANGE_NUM; i++) + memset(&range[i], 0, sizeof(range[i])); + + return nr_range; +} + +static struct res_range __initdata range[RANGE_NUM]; + +#ifdef CONFIG_MTRR_SANITIZER + +static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +{ + unsigned long sum; + int i; + + sum = 0; + for (i = 0; i < nr_range; i++) + sum += range[i].end + 1 - range[i].start; + + return sum; +} + +static int enable_mtrr_cleanup __initdata = + CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; + +static int __init disable_mtrr_cleanup_setup(char *str) +{ + if (enable_mtrr_cleanup != -1) + enable_mtrr_cleanup = 0; + return 0; +} +early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); + +static int __init enable_mtrr_cleanup_setup(char *str) +{ + if (enable_mtrr_cleanup != -1) + enable_mtrr_cleanup = 1; + return 0; +} +early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); + struct var_mtrr_state { unsigned long range_startk; unsigned long range_sizek; unsigned long chunk_sizek; unsigned long gran_sizek; unsigned int reg; - unsigned int address_bits; }; static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned address_bits) + unsigned char type, unsigned int address_bits) { u32 base_lo, base_hi, mask_lo, mask_hi; u64 base, mask; @@ -781,10 +866,34 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); } +static void __init +save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type) +{ + range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); + range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); + range_state[reg].type = type; +} + +static void __init +set_var_mtrr_all(unsigned int address_bits) +{ + unsigned long basek, sizek; + unsigned char type; + unsigned int reg; + + for (reg = 0; reg < num_var_ranges; reg++) { + basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); + sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); + type = range_state[reg].type; + + set_var_mtrr(reg, basek, sizek, type, address_bits); + } +} + static unsigned int __init range_to_mtrr(unsigned int reg, unsigned long range_startk, - unsigned long range_sizek, unsigned char type, - unsigned address_bits) + unsigned long range_sizek, unsigned char type) { if (!range_sizek || (reg >= num_var_ranges)) return reg; @@ -803,12 +912,13 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, align = max_align; sizek = 1 << align; - printk(KERN_INFO "Setting variable MTRR %d, base: %ldMB, range: %ldMB, type %s\n", + printk(KERN_DEBUG "Setting variable MTRR %d, base: %ldMB, " + "range: %ldMB, type %s\n", reg, range_startk >> 10, sizek >> 10, (type == MTRR_TYPE_UNCACHABLE)?"UC": ((type == MTRR_TYPE_WRBACK)?"WB":"Other") ); - set_var_mtrr(reg++, range_startk, sizek, type, address_bits); + save_var_mtrr(reg++, range_startk, sizek, type); range_startk += sizek; range_sizek -= sizek; if (reg >= num_var_ranges) @@ -817,10 +927,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, return reg; } -static void __init -range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) +static unsigned __init +range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, + unsigned long sizek) { unsigned long hole_basek, hole_sizek; + unsigned long second_basek, second_sizek; unsigned long range0_basek, range0_sizek; unsigned long range_basek, range_sizek; unsigned long chunk_sizek; @@ -828,64 +940,95 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) hole_basek = 0; hole_sizek = 0; + second_basek = 0; + second_sizek = 0; chunk_sizek = state->chunk_sizek; gran_sizek = state->gran_sizek; /* align with gran size, prevent small block used up MTRRs */ range_basek = ALIGN(state->range_startk, gran_sizek); if ((range_basek > basek) && basek) - return; - range_sizek = ALIGN(state->range_sizek - (range_basek - state->range_startk), gran_sizek); + return second_sizek; + state->range_sizek -= (range_basek - state->range_startk); + range_sizek = ALIGN(state->range_sizek, gran_sizek); - while (range_basek + range_sizek > (state->range_startk + state->range_sizek)) { + while (range_sizek > state->range_sizek) { range_sizek -= gran_sizek; if (!range_sizek) - return; + return 0; } - state->range_startk = range_basek; state->range_sizek = range_sizek; /* try to append some small hole */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); if (range0_sizek == state->range_sizek) { - printk(KERN_INFO "rangeX: %016lx - %016lx\n", range0_basek<<10, (range0_basek + state->range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range0_basek, - state->range_sizek, MTRR_TYPE_WRBACK, state->address_bits); - return; - } else if (basek) { - while (range0_basek + range0_sizek - chunk_sizek > basek) { + printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", range0_basek<<10, + (range0_basek + state->range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + state->range_sizek, MTRR_TYPE_WRBACK); + return 0; + } + + range0_sizek -= chunk_sizek; + if (range0_sizek && sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { range0_sizek -= chunk_sizek; if (!range0_sizek) break; } } + if (range0_sizek) { + printk(KERN_DEBUG "range0: %016lx - %016lx\n", range0_basek<<10, + (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK); - if (range0_sizek > chunk_sizek) - range0_sizek -= chunk_sizek; - printk(KERN_INFO "range0: %016lx - %016lx\n", range0_basek<<10, (range0_basek + range0_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range0_basek, - range0_sizek, MTRR_TYPE_WRBACK, state->address_bits); + } range_basek = range0_basek + range0_sizek; range_sizek = chunk_sizek; - if ((range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) && - (range_basek + range_sizek <= basek)) { - hole_sizek = range_sizek - (state->range_sizek - range0_sizek); - hole_basek = range_basek + range_sizek - hole_sizek; - } else + if (range_basek + range_sizek > basek && + range_basek + range_sizek <= (basek + sizek)) { + /* one hole */ + second_basek = basek; + second_sizek = range_basek + range_sizek - basek; + } + + /* if last piece, only could one hole near end */ + if ((second_basek || !basek) && + range_sizek - (state->range_sizek - range0_sizek) - second_sizek < + (chunk_sizek >> 1)) { + /* + * one hole in middle (second_sizek is 0) or at end + * (second_sizek is 0 ) + */ + hole_sizek = range_sizek - (state->range_sizek - range0_sizek) + - second_sizek; + hole_basek = range_basek + range_sizek - hole_sizek + - second_sizek; + } else { + /* fallback for big hole, or several holes */ range_sizek = state->range_sizek - range0_sizek; + second_basek = 0; + second_sizek = 0; + } - printk(KERN_INFO "range: %016lx - %016lx\n", range_basek<<10, (range_basek + range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range_basek, - range_sizek, MTRR_TYPE_WRBACK, state->address_bits); + printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, + MTRR_TYPE_WRBACK); if (hole_sizek) { - printk(KERN_INFO "hole: %016lx - %016lx\n", hole_basek<<10, (hole_basek + hole_sizek)<<10); - state->reg = range_to_mtrr(state->reg, hole_basek, - hole_sizek, MTRR_TYPE_UNCACHABLE, state->address_bits); + printk(KERN_DEBUG "hole: %016lx - %016lx\n", hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, + MTRR_TYPE_UNCACHABLE); + } + + return second_sizek; } static void __init @@ -893,6 +1036,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, unsigned long size_pfn) { unsigned long basek, sizek; + unsigned long second_sizek = 0; if (state->reg >= num_var_ranges) return; @@ -901,21 +1045,19 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, sizek = size_pfn << (PAGE_SHIFT - 10); /* See if I can merge with the last range */ - if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) { + if ((basek <= 1024) || + (state->range_startk + state->range_sizek == basek)) { unsigned long endk = basek + sizek; state->range_sizek = endk - state->range_startk; return; } /* Write the range mtrrs */ - if (state->range_sizek != 0) { - range_to_mtrr_with_hole(state, basek); + if (state->range_sizek != 0) + second_sizek = range_to_mtrr_with_hole(state, basek, sizek); - state->range_startk = 0; - state->range_sizek = 0; - } /* Allocate an msr */ - state->range_startk = basek; - state->range_sizek = sizek; + state->range_startk = basek + second_sizek; + state->range_sizek = sizek - second_sizek; } /* mininum size of mtrr block that can take hole */ @@ -931,7 +1073,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); /* granity of mtrr of block */ -static u64 mtrr_gran_size __initdata = (1ULL<<20); +static u64 mtrr_gran_size __initdata; static int __init parse_mtrr_gran_size_opt(char *p) { @@ -942,91 +1084,84 @@ static int __init parse_mtrr_gran_size_opt(char *p) } early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); -static void __init +static int nr_mtrr_spare_reg __initdata = + CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; + +static int __init parse_mtrr_spare_reg(char *arg) +{ + if (arg) + nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); + +static int __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, - unsigned address_bits) + u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; int i; + int num_reg; var_state.range_startk = 0; var_state.range_sizek = 0; var_state.reg = 0; - var_state.address_bits = address_bits; - var_state.chunk_sizek = mtrr_chunk_size >> 10; - var_state.gran_sizek = mtrr_gran_size >> 10; + var_state.chunk_sizek = chunk_size >> 10; + var_state.gran_sizek = gran_size >> 10; + + memset(range_state, 0, sizeof(range_state)); /* Write the range etc */ for (i = 0; i < nr_range; i++) - set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1); + set_var_mtrr_range(&var_state, range[i].start, + range[i].end - range[i].start + 1); /* Write the last range */ - range_to_mtrr_with_hole(&var_state, 0); - printk(KERN_INFO "DONE variable MTRRs\n"); + if (var_state.range_sizek != 0) + range_to_mtrr_with_hole(&var_state, 0, 0); + printk(KERN_DEBUG "DONE variable MTRRs\n"); + + num_reg = var_state.reg; /* Clear out the extra MTRR's */ while (var_state.reg < num_var_ranges) { - set_var_mtrr(var_state.reg, 0, 0, 0, var_state.address_bits); + save_var_mtrr(var_state.reg, 0, 0, 0); var_state.reg++; } -} - -static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, - unsigned long extra_remove_base, - unsigned long extra_remove_size) -{ - unsigned long i, base, size; - mtrr_type type; - - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - if (type != MTRR_TYPE_WRBACK) - continue; - nr_range = add_range_with_merge(range, nr_range, base, base + size - 1); - } - printk(KERN_INFO "After WB checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); - /* take out UC ranges */ - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - if (type != MTRR_TYPE_UNCACHABLE) - continue; - if (!size) - continue; - subtract_range(range, base, base + size - 1); - } - if (extra_remove_size) - subtract_range(range, extra_remove_base, extra_remove_base + extra_remove_size - 1); + return num_reg; +} - /* get new range num */ - nr_range = 0; - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - nr_range++; - } - printk(KERN_INFO "After UC checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); +struct mtrr_cleanup_result { + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; +}; - /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - printk(KERN_INFO "After sorting\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); +/* + * gran_size: 1M, 2M, ..., 2G + * chunk size: gran_size, ..., 4G + * so we need (2+13)*6 + */ +#define NUM_RESULT 90 +#define PSHIFT (PAGE_SHIFT - 10) - return nr_range; -} +static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; +static struct res_range __initdata range_new[RANGE_NUM]; +static unsigned long __initdata min_loss_pfn[RANGE_NUM]; static int __init mtrr_cleanup(unsigned address_bits) { unsigned long extra_remove_base, extra_remove_size; unsigned long i, base, size, def, dummy; - struct res_range range[RANGE_NUM]; mtrr_type type; - int nr_range; + int nr_range, nr_range_new; + u64 chunk_size, gran_size; + unsigned long range_sums, range_sums_new; + int index_good; + int num_reg_good; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1038,10 +1173,20 @@ static int __init mtrr_cleanup(unsigned address_bits) if (def != MTRR_TYPE_UNCACHABLE) return 0; + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + /* check entries number */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); + type = range_state[i].type; + size = range_state[i].size_pfn; if (type >= MTRR_NUM_TYPES) continue; if (!size) @@ -1062,15 +1207,172 @@ static int __init mtrr_cleanup(unsigned address_bits) extra_remove_size = 0; if (mtrr_tom2) { extra_remove_base = 1 << (32 - PAGE_SHIFT); - extra_remove_size = (mtrr_tom2>>PAGE_SHIFT) - extra_remove_base; + extra_remove_size = + (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; + } + nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, + extra_remove_size); + range_sums = sum_ranges(range, nr_range); + printk(KERN_INFO "total RAM coverred: %ldM\n", + range_sums >> (20 - PAGE_SHIFT)); + + if (mtrr_chunk_size && mtrr_gran_size) { + int num_reg; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, + mtrr_gran_size); + + /* we got new setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, + extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + i = 0; + result[i].chunk_sizek = mtrr_chunk_size >> 10; + result[i].gran_sizek = mtrr_gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; + + printk(KERN_INFO " %sgran_size: %ldM \tchunk_size: %ldM \t", + result[i].bad?" BAD ":"", result[i].gran_sizek >> 10, + result[i].chunk_sizek >> 10); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", + result[i].num_reg, result[i].bad?"-":"", + result[i].lose_cover_sizek >> 10); + if (!result[i].bad) { + set_var_mtrr_all(address_bits); + return 1; + } + printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " + "will find optimal one\n"); + memset(result, 0, sizeof(result[0])); + } + + i = 0; + memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); + memset(result, 0, sizeof(result)); + for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { + for (chunk_size = gran_size; chunk_size < (1ULL<<33); + chunk_size <<= 1) { + int num_reg; + + printk(KERN_INFO + "\ngran_size: %lldM chunk_size_size: %lldM\n", + gran_size >> 20, chunk_size >> 20); + if (i >= NUM_RESULT) + continue; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, + chunk_size, gran_size); + + /* we got new setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; + + /* double check it */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || + memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } + + if (!result[i].bad && (range_sums - range_sums_new < + min_loss_pfn[num_reg])) { + min_loss_pfn[num_reg] = + range_sums - range_sums_new; + } + i++; + } } - nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size); - /* convert ranges to var ranges state */ - x86_setup_var_mtrrs(range, nr_range, address_bits); + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) { + printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", + result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, + result[i].chunk_sizek >> 10); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", + result[i].num_reg, result[i].bad?"-":"", + result[i].lose_cover_sizek >> 10); + } - return 1; + /* try to find the optimal index */ + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; + for (i = 1; i < num_var_ranges + 1 - nr_mtrr_spare_reg; i++) { + if (!min_loss_pfn[i]) { + num_reg_good = i; + break; + } + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + if (index_good != -1) { + printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); + i = index_good; + printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", + result[i].gran_sizek >> 10, + result[i].chunk_sizek >> 10); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %ldM \n", + result[i].num_reg, + result[i].lose_cover_sizek >> 10); + /* convert ranges to var ranges state */ + chunk_size = result[i].chunk_sizek; + chunk_size <<= 10; + gran_size = result[i].gran_sizek; + gran_size <<= 10; + x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + set_var_mtrr_all(address_bits); + return 1; + } + + printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); + printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); + + return 0; } +#else +static int __init mtrr_cleanup(unsigned address_bits) +{ + return 0; +} +#endif + +static int __initdata changed_by_mtrr_cleanup; static int disable_mtrr_trim; @@ -1111,7 +1413,8 @@ int __init amd_special_default_mtrr(void) return 0; } -static u64 __init real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) +static u64 __init real_trim_memory(unsigned long start_pfn, + unsigned long limit_pfn) { u64 trim_start, trim_size; trim_start = start_pfn; @@ -1138,9 +1441,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; - struct res_range range[RANGE_NUM]; int nr_range; - u64 total_real_trim_size; + u64 total_trim_size; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1155,11 +1457,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* Find highest cached pfn */ + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Find highest cached pfn */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; if (type != MTRR_TYPE_WRBACK) continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; if (highest_pfn < base + size) highest_pfn = base + size; } @@ -1177,9 +1490,10 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* check entries number */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); + type = range_state[i].type; if (type >= MTRR_NUM_TYPES) continue; + size = range_state[i].size_pfn; if (!size) type = MTRR_NUM_TYPES; num[type]++; @@ -1205,26 +1519,28 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); - total_real_trim_size = 0; + total_trim_size = 0; /* check the head */ if (range[0].start) - total_real_trim_size += real_trim_memory(0, range[0].start); + total_trim_size += real_trim_memory(0, range[0].start); /* check the holes */ for (i = 0; i < nr_range - 1; i++) { if (range[i].end + 1 < range[i+1].start) - total_real_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); + total_trim_size += real_trim_memory(range[i].end + 1, + range[i+1].start); } /* check the top */ i = nr_range - 1; if (range[i].end + 1 < end_pfn) - total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn); + total_trim_size += real_trim_memory(range[i].end + 1, + end_pfn); - if (total_real_trim_size) { + if (total_trim_size) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" " all of memory, losing %lluMB of RAM.\n", - total_real_trim_size >> 20); + total_trim_size >> 20); - if (enable_mtrr_cleanup < 1) + if (!changed_by_mtrr_cleanup) WARN_ON(1); printk(KERN_INFO "update e820 for mtrr\n"); @@ -1314,8 +1630,10 @@ void __init mtrr_bp_init(void) if (use_intel()) { get_mtrr_state(); - if (mtrr_cleanup(phys_addr)) + if (mtrr_cleanup(phys_addr)) { + changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); + } } } @@ -1355,7 +1673,7 @@ static int __init mtrr_init_finialize(void) if (!mtrr_if) return 0; if (use_intel()) { - if (enable_mtrr_cleanup < 1) + if (!changed_by_mtrr_cleanup) mtrr_state_warn(); } else { /* The CPUs haven't MTRR and seem to not support SMP. They have -- cgit v1.2.3 From 833e78bfeeef628f0201349a0a05a54f48f07884 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 5 May 2008 15:57:38 -0700 Subject: x86: process fam 10h like k8 with fixed mtrr setting otherwise fixed MTRR for family 10h may not be changed. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 5aae648600bf..a83f5cd78885 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -342,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) if (lo != msrwords[0] || hi != msrwords[1]) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 15 && + (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) && ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) k8_enable_fixed_iorrs(); mtrr_wrmsr(msr, msrwords[0], msrwords[1]); -- cgit v1.2.3 From b79cd8f1268bab57ff85b19d131f7f23deab2dee Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 11 May 2008 00:30:15 -0700 Subject: x86: make e820.c to have common functions remove the duplicated copy of these functions. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 475 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_32.c | 411 +-------------------------------------- arch/x86/kernel/e820_64.c | 444 ------------------------------------------ arch/x86/kernel/setup_32.c | 2 +- include/asm-x86/e820.h | 14 ++ include/asm-x86/e820_32.h | 12 -- include/asm-x86/e820_64.h | 12 -- 8 files changed, 496 insertions(+), 876 deletions(-) create mode 100644 arch/x86/kernel/e820.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..5369a4e36f17 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o -obj-y += bootflag.o e820_$(BITS).o +obj-y += bootflag.o e820_$(BITS).o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c new file mode 100644 index 000000000000..2cb686f60d0d --- /dev/null +++ b/arch/x86/kernel/e820.c @@ -0,0 +1,475 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * Venkatesh Pallipadi + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct e820map e820; + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0xaeedbabe; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + +/* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(u64 start, u64 end, unsigned type) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* + * if start is now at or beyond end, we're done, full + * coverage + */ + if (start >= end) + return 1; + } + return 0; +} + +/* + * Add a memory region to the kernel e820 map. + */ +void __init add_memory_region(u64 start, u64 size, int type) +{ + int x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; +} + +void __init e820_print_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); + switch (e820.map[i].type) { + case E820_RAM: + printk(KERN_CONT "(usable)\n"); + break; + case E820_RESERVED: + printk(KERN_CONT "(reserved)\n"); + break; + case E820_ACPI: + printk(KERN_CONT "(ACPI data)\n"); + break; + case E820_NVS: + printk(KERN_CONT "(ACPI NVS)\n"); + break; + default: + printk(KERN_CONT "type %u\n", e820.map[i].type); + break; + } + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) +{ + struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ + }; + static struct change_member change_point_list[2*E820MAX] __initdata; + static struct change_member *change_point[2*E820MAX] __initdata; + static struct e820entry *overlap_list[E820MAX] __initdata; + static struct e820entry new_bios[E820MAX] __initdata; + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following + (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i = 0; i < old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i = 0; i < 2 * old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i = 0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing = 1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + + /* loop through change-points, determining affect on the new bios map */ + for (chgidx = 0; chgidx < chg_nr; chgidx++) { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ + current_type = 0; + for (i = 0; i < overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* + * continue building up new bios map based on this + * information + */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* + * move forward only if the new size + * was non-zero + */ + if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ + if (++new_bios_entry >= E820MAX) + break; + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr = change_point[chgidx]->addr; + } + last_type = current_type; + } + } + /* retain count for new bios entries */ + new_nr = new_bios_entry; + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + */ +int __init copy_e820_map(struct e820entry *biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + u64 start = biosmap->addr; + u64 size = biosmap->size; + u64 end = start + size; + u32 type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + add_memory_region(start, size, type); + } while (biosmap++, --nr_map); + return 0; +} + +u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + int i; + u64 real_updated_size = 0; + + BUG_ON(old_type == new_type); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + if (ei->type != old_type) + continue; + /* totally covered? */ + if (ei->addr >= start && + (ei->addr + ei->size) <= (start + size)) { + ei->type = new_type; + real_updated_size += ei->size; + continue; + } + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(start + size, ei->addr + ei->size); + if (final_start >= final_end) + continue; + add_memory_region(final_start, final_end - final_start, + new_type); + real_updated_size += final_end - final_start; + } + return real_updated_size; +} + +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + e820_print_map("modified"); +} + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices in. + * Hopefully the BIOS let enough space left. + */ +__init void e820_setup_gap(void) +{ + unsigned long gapstart, gapsize, round; + unsigned long long last; + int i; + int found = 0; + + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + +#ifdef CONFIG_X86_64 + if (!found) { + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " + "address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource " + "registers may break!\n"); + } +#endif + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); +} + diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 751d517d802c..dfe25751038b 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -16,21 +16,6 @@ #include #include -struct e820map e820; -struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ -}; -static struct change_member change_point_list[2*E820MAX] __initdata; -static struct change_member *change_point[2*E820MAX] __initdata; -static struct e820entry *overlap_list[E820MAX] __initdata; -static struct e820entry new_bios[E820MAX] __initdata; -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - static struct resource system_rom_resource = { .name = "System ROM", .start = 0xf0000, @@ -254,223 +239,6 @@ void __init e820_mark_nosave_regions(void) } #endif -void __init add_memory_region(unsigned long long start, - unsigned long long size, int type) -{ - int x; - - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; -} /* add_memory_region */ - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) { - return -1; - } - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; iaddr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; /* true number of change-points */ - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if > , swap */ - /* or, if current= & last=, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; ipbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; itype > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) - */ -int __init copy_e820_map(struct e820entry *biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - u64 start = biosmap->addr; - u64 size = biosmap->size; - u64 end = start + size; - u32 type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - add_memory_region(start, size, type); - } while (biosmap++, --nr_map); - - return 0; -} - /* * Find the highest page frame number we have available */ @@ -535,86 +303,12 @@ void __init register_bootmem_low_pages(unsigned long max_low_pfn) } } -void __init e820_register_memory(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long long last; - int i; - - /* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. - */ - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - } - } - if (start < last) - last = start; - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. - */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", - pci_mem_start, gapstart, gapsize); -} - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; - } - } -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr; int i; - print_memory_map("limit_regions start"); + e820_print_map("limit_regions start"); for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; if (current_addr < size) @@ -633,62 +327,10 @@ void __init limit_regions(unsigned long long size) e820.nr_map = i + 1; e820.map[i].size -= current_addr - size; } - print_memory_map("limit_regions endfor"); + e820_print_map("limit_regions endfor"); return; } - print_memory_map("limit_regions endfunc"); -} - -/* - * This function checks if any part of the range is mapped - * with type. - */ -int -e820_any_mapped(u64 start, u64 end, unsigned type) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - const struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(e820_any_mapped); - - /* - * This function checks if the entire range is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init -e820_all_mapped(unsigned long s, unsigned long e, unsigned type) -{ - u64 start = s; - u64 end = e; - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - /* if the region is at the beginning of we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full - * coverage */ - if (start >= end) - return 1; /* we're done */ - } - return 0; + e820_print_map("limit_regions endfunc"); } /* Overridden in paravirt.c if CONFIG_PARAVIRT */ @@ -700,7 +342,7 @@ char * __init __attribute__((weak)) memory_setup(void) void __init setup_memory_map(void) { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(memory_setup()); + e820_print_map(memory_setup()); } static int __initdata user_defined_memmap; @@ -783,55 +425,12 @@ static int __init parse_memmap(char *arg) return 0; } early_param("memmap", parse_memmap); -u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) -{ - int i; - u64 real_updated_size = 0; - - BUG_ON(old_type == new_type); - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 final_start, final_end; - if (ei->type != old_type) - continue; - /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { - ei->type = new_type; - real_updated_size += ei->size; - continue; - } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); - if (final_start >= final_end) - continue; - add_memory_region(final_start, final_end - final_start, - new_type); - real_updated_size += final_end - final_start; - } - - return real_updated_size; -} void __init finish_e820_parsing(void) { if (user_defined_memmap) { printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); + e820_print_map("user"); } } -void __init update_e820(void) -{ - u8 nr_map; - - nr_map = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr_map)) - return; - e820.nr_map = nr_map; - printk(KERN_INFO "modified physical RAM map:\n"); - print_memory_map("modified"); -} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index c45b4dea4055..28a14eb65d3d 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -29,8 +29,6 @@ #include #include -struct e820map e820; - /* * PFN of last memory page. */ @@ -176,62 +174,6 @@ again: } return changed; } -/* - * This function checks if any part of the range is mapped - * with type. - */ -int -e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(e820_any_mapped); - -/* - * This function checks if the entire range is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init e820_all_mapped(unsigned long start, unsigned long end, - unsigned type) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - - /* if the region is at the beginning of we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* - * if start is now at or beyond end, we're done, full - * coverage - */ - if (start >= end) - return 1; - } - return 0; -} /* * Find a free area with specified alignment in a specific range. @@ -434,24 +376,6 @@ e820_register_active_regions(int nid, unsigned long start_pfn, add_active_range(nid, ei_startpfn, ei_endpfn); } -/* - * Add a memory region to the kernel e820 map. - */ -void __init add_memory_region(unsigned long start, unsigned long size, int type) -{ - int x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; -} - /* * Find the hole size (in bytes) in the memory range. * @start: starting address of the memory range to scan @@ -473,266 +397,6 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) return end - start - (ram << PAGE_SHIFT); } -static void __init e820_print_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) - (e820.map[i].addr + e820.map[i].size)); - switch (e820.map[i].type) { - case E820_RAM: - printk(KERN_CONT "(usable)\n"); - break; - case E820_RESERVED: - printk(KERN_CONT "(reserved)\n"); - break; - case E820_ACPI: - printk(KERN_CONT "(ACPI data)\n"); - break; - case E820_NVS: - printk(KERN_CONT "(ACPI NVS)\n"); - break; - default: - printk(KERN_CONT "type %u\n", e820.map[i].type); - break; - } - } -} - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) -{ - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; - static struct change_member change_point_list[2*E820MAX] __initdata; - static struct change_member *change_point[2*E820MAX] __initdata; - static struct e820entry *overlap_list[E820MAX] __initdata; - static struct e820entry new_bios[E820MAX] __initdata; - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following - (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i = 0; i < old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) - return -1; - - /* create pointers for initial change-point information (for sorting) */ - for (i = 0; i < 2 * old_nr; i++) - change_point[i] = &change_point_list[i]; - - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i = 0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + - biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i = 1; i < chg_nr; i++) { - unsigned long long curaddr, lastaddr; - unsigned long long curpbaddr, lastpbaddr; - - curaddr = change_point[i]->addr; - lastaddr = change_point[i - 1]->addr; - curpbaddr = change_point[i]->pbios->addr; - lastpbaddr = change_point[i - 1]->pbios->addr; - - /* - * swap entries, when: - * - * curaddr > lastaddr or - * curaddr == lastaddr and curaddr == curpbaddr and - * lastaddr != lastpbaddr - */ - if (curaddr < lastaddr || - (curaddr == lastaddr && curaddr == curpbaddr && - lastaddr != lastpbaddr)) { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing = 1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries = 0; /* number of entries in the overlap table */ - new_bios_entry = 0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - - /* loop through change-points, determining affect on the new bios map */ - for (chgidx = 0; chgidx < chg_nr; chgidx++) { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == - change_point[chgidx]->pbios->addr) { - /* - * add map entry to overlap list (> 1 entry - * implies an overlap) - */ - overlap_list[overlap_entries++] = - change_point[chgidx]->pbios; - } else { - /* - * remove entry from list (order independent, - * so swap with last) - */ - for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i] == - change_point[chgidx]->pbios) - overlap_list[i] = - overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* - * if there are overlapping entries, decide which - * "type" to use (larger value takes precedence -- - * 1=usable, 2,3,4,4+=unusable) - */ - current_type = 0; - for (i = 0; i < overlap_entries; i++) - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; - /* - * continue building up new bios map based on this - * information - */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* - * move forward only if the new size - * was non-zero - */ - if (new_bios[new_bios_entry].size != 0) - /* - * no more space left for new - * bios entries ? - */ - if (++new_bios_entry >= E820MAX) - break; - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = - change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr = change_point[chgidx]->addr; - } - last_type = current_type; - } - } - /* retain count for new bios entries */ - new_nr = new_bios_entry; - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - */ -static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - u64 start = biosmap->addr; - u64 size = biosmap->size; - u64 end = start + size; - u32 type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - add_memory_region(start, size, type); - } while (biosmap++, --nr_map); - return 0; -} - static void early_panic(char *msg) { early_printk(msg); @@ -829,114 +493,6 @@ void __init finish_e820_parsing(void) } } -u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) -{ - int i; - u64 real_updated_size = 0; - - BUG_ON(old_type == new_type); - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 final_start, final_end; - if (ei->type != old_type) - continue; - /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { - ei->type = new_type; - real_updated_size += ei->size; - continue; - } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); - if (final_start >= final_end) - continue; - add_memory_region(final_start, final_end - final_start, - new_type); - real_updated_size += final_end - final_start; - } - return real_updated_size; -} - -void __init update_e820(void) -{ - u8 nr_map; - - nr_map = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr_map)) - return; - e820.nr_map = nr_map; - printk(KERN_INFO "modified physical RAM map:\n"); - e820_print_map("modified"); -} - -unsigned long pci_mem_start = 0xaeedbabe; -EXPORT_SYMBOL(pci_mem_start); - -/* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. - * Hopefully the BIOS let enough space left. - */ -__init void e820_setup_gap(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long last; - int i; - int found = 0; - - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - found = 1; - } - } - if (start < last) - last = start; - } - - if (!found) { - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. - */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk(KERN_INFO - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); -} - int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) { int i; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index b54c79c91efd..5faeab69edd9 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -875,7 +875,7 @@ void __init setup_arch(char **cmdline_p) get_smp_config(); #endif - e820_register_memory(); + e820_setup_gap(); e820_mark_nosave_regions(); #ifdef CONFIG_VT diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 7004251fc66b..b5b519feba1d 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -20,6 +20,20 @@ struct e820map { __u32 nr_map; struct e820entry map[E820MAX]; }; + +extern struct e820map e820; + +extern int e820_any_mapped(u64 start, u64 end, unsigned type); +extern int e820_all_mapped(u64 start, u64 end, unsigned type); +extern void add_memory_region(u64 start, u64 size, int type); +extern void e820_print_map(char *who); +extern int sanitize_e820_map(struct e820entry *biosmap, char *pnr_map); +extern int copy_e820_map(struct e820entry *biosmap, int nr_map); +extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, + unsigned new_type); +extern void update_e820(void); +extern void e820_setup_gap(void); + #endif /* __ASSEMBLY__ */ #define ISA_START_ADDRESS 0xa0000 diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index af0711b220df..9576b438fbd9 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -21,19 +21,8 @@ extern void setup_memory_map(void); extern void finish_e820_parsing(void); -extern struct e820map e820; -extern void update_e820(void); - -extern int e820_all_mapped(unsigned long start, unsigned long end, - unsigned type); -extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern void propagate_e820_map(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); -extern void add_memory_region(unsigned long long start, - unsigned long long size, int type); -extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type); -extern void e820_register_memory(void); extern void limit_regions(unsigned long long size); extern void init_iomem_resources(struct resource *code_resource, struct resource *data_resource, @@ -47,6 +36,5 @@ static inline void e820_mark_nosave_regions(void) } #endif - #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 6ae3e2803286..9fac77e01441 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -19,34 +19,22 @@ extern unsigned long find_e820_area(unsigned long start, unsigned long end, extern unsigned long find_e820_area_size(unsigned long start, unsigned long *sizep, unsigned long align); -extern void add_memory_region(unsigned long start, unsigned long size, - int type); -extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type); extern void setup_memory_region(void); extern void contig_e820_setup(void); extern unsigned long e820_end_of_ram(void); extern void e820_reserve_resources(void); extern void e820_mark_nosave_regions(void); -extern int e820_any_mapped(unsigned long start, unsigned long end, - unsigned type); -extern int e820_all_mapped(unsigned long start, unsigned long end, - unsigned type); extern int e820_any_non_reserved(unsigned long start, unsigned long end); extern int is_memory_any_valid(unsigned long start, unsigned long end); extern int e820_all_non_reserved(unsigned long start, unsigned long end); extern int is_memory_all_valid(unsigned long start, unsigned long end); extern unsigned long e820_hole_size(unsigned long start, unsigned long end); -extern void e820_setup_gap(void); extern void e820_register_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); extern void finish_e820_parsing(void); -extern struct e820map e820; -extern void update_e820(void); - extern void reserve_early(unsigned long start, unsigned long end, char *name); extern void free_early(unsigned long start, unsigned long end); extern void early_res_to_bootmem(unsigned long start, unsigned long end); -- cgit v1.2.3 From 8004dd965b13b01a96def054d420f6df7ff22d53 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 12 May 2008 17:40:39 -0700 Subject: x86: amd opteron TOM2 mask val fix there is a typo in the mask value, need to remove that extra 0, to avoid 4bit clearing. Signed-off-by: Yinghal Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/pci/k8-bus_64.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index a83f5cd78885..509bd3d9eacd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -233,7 +233,7 @@ void __init get_mtrr_state(void) mtrr_tom2 = high; mtrr_tom2 <<= 32; mtrr_tom2 |= low; - mtrr_tom2 &= 0xffffff8000000ULL; + mtrr_tom2 &= 0xffffff800000ULL; } if (mtrr_show) { int high_width; diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c index 5c2799c20e47..bfefdf0f40d4 100644 --- a/arch/x86/pci/k8-bus_64.c +++ b/arch/x86/pci/k8-bus_64.c @@ -384,7 +384,7 @@ static int __init early_fill_mp_bus_info(void) /* need to take out [0, TOM) for RAM*/ address = MSR_K8_TOP_MEM1; rdmsrl(address, val); - end = (val & 0xffffff8000000ULL); + end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); if (end < (1ULL<<32)) update_range(range, 0, end - 1); @@ -478,7 +478,7 @@ static int __init early_fill_mp_bus_info(void) /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; rdmsrl(address, val); - end = (val & 0xffffff8000000ULL); + end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); update_range(range, 1ULL<<32, end - 1); } -- cgit v1.2.3 From 3f03c54a34fa3da680b3341dd3ba965ef3394bd1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 9 May 2008 22:40:52 -0700 Subject: x86: mtrr cleanup for converting continuous to discrete layout - fix #2 disable the noisy print out. also use the one the less spare mtrr reg. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 79 +++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 31 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index e6c162c379ac..0642201784e0 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -730,6 +730,7 @@ struct var_mtrr_range_state { }; struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; +static int __initdata debug_print; static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, @@ -748,10 +749,12 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, nr_range = add_range_with_merge(range, nr_range, base, base + size - 1); } - printk(KERN_DEBUG "After WB checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + if (debug_print) { + printk(KERN_DEBUG "After WB checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + } /* take out UC ranges */ for (i = 0; i < num_var_ranges; i++) { @@ -775,17 +778,21 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, continue; nr_range++; } - printk(KERN_DEBUG "After UC checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); + if (debug_print) { + printk(KERN_DEBUG "After UC checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + } /* sort the ranges */ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - printk(KERN_DEBUG "After sorting\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + if (debug_print) { + printk(KERN_DEBUG "After sorting\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + } /* clear those is not used */ for (i = nr_range; i < RANGE_NUM; i++) @@ -912,12 +919,13 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, align = max_align; sizek = 1 << align; - printk(KERN_DEBUG "Setting variable MTRR %d, base: %ldMB, " - "range: %ldMB, type %s\n", - reg, range_startk >> 10, sizek >> 10, - (type == MTRR_TYPE_UNCACHABLE)?"UC": - ((type == MTRR_TYPE_WRBACK)?"WB":"Other") - ); + if (debug_print) + printk(KERN_DEBUG "Setting variable MTRR %d, " + "base: %ldMB, range: %ldMB, type %s\n", + reg, range_startk >> 10, sizek >> 10, + (type == MTRR_TYPE_UNCACHABLE)?"UC": + ((type == MTRR_TYPE_WRBACK)?"WB":"Other") + ); save_var_mtrr(reg++, range_startk, sizek, type); range_startk += sizek; range_sizek -= sizek; @@ -963,7 +971,9 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); if (range0_sizek == state->range_sizek) { - printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", range0_basek<<10, + if (debug_print) + printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", + range0_basek<<10, (range0_basek + state->range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, state->range_sizek, MTRR_TYPE_WRBACK); @@ -980,7 +990,9 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, } if (range0_sizek) { - printk(KERN_DEBUG "range0: %016lx - %016lx\n", range0_basek<<10, + if (debug_print) + printk(KERN_DEBUG "range0: %016lx - %016lx\n", + range0_basek<<10, (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK); @@ -1016,13 +1028,15 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, second_sizek = 0; } - printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, + if (debug_print) + printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, (range_basek + range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, MTRR_TYPE_WRBACK); if (hole_sizek) { - printk(KERN_DEBUG "hole: %016lx - %016lx\n", hole_basek<<10, - (hole_basek + hole_sizek)<<10); + if (debug_print) + printk(KERN_DEBUG "hole: %016lx - %016lx\n", + hole_basek<<10, (hole_basek + hole_sizek)<<10); state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, MTRR_TYPE_UNCACHABLE); @@ -1120,7 +1134,6 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, /* Write the last range */ if (var_state.range_sizek != 0) range_to_mtrr_with_hole(&var_state, 0, 0); - printk(KERN_DEBUG "DONE variable MTRRs\n"); num_reg = var_state.reg; /* Clear out the extra MTRR's */ @@ -1219,6 +1232,7 @@ static int __init mtrr_cleanup(unsigned address_bits) if (mtrr_chunk_size && mtrr_gran_size) { int num_reg; + debug_print = 1; /* convert ranges to var ranges state */ num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, mtrr_gran_size); @@ -1242,8 +1256,8 @@ static int __init mtrr_cleanup(unsigned address_bits) result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; - printk(KERN_INFO " %sgran_size: %ldM \tchunk_size: %ldM \t", - result[i].bad?" BAD ":"", result[i].gran_sizek >> 10, + printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", + result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, result[i].chunk_sizek >> 10); printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", result[i].num_reg, result[i].bad?"-":"", @@ -1254,6 +1268,7 @@ static int __init mtrr_cleanup(unsigned address_bits) } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " "will find optimal one\n"); + debug_print = 0; memset(result, 0, sizeof(result[0])); } @@ -1265,9 +1280,10 @@ static int __init mtrr_cleanup(unsigned address_bits) chunk_size <<= 1) { int num_reg; - printk(KERN_INFO + if (debug_print) + printk(KERN_INFO "\ngran_size: %lldM chunk_size_size: %lldM\n", - gran_size >> 20, chunk_size >> 20); + gran_size >> 20, chunk_size >> 20); if (i >= NUM_RESULT) continue; @@ -1310,10 +1326,10 @@ static int __init mtrr_cleanup(unsigned address_bits) /* print out all */ for (i = 0; i < NUM_RESULT; i++) { - printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", + printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", + printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", result[i].num_reg, result[i].bad?"-":"", result[i].lose_cover_sizek >> 10); } @@ -1322,7 +1338,7 @@ static int __init mtrr_cleanup(unsigned address_bits) if (nr_mtrr_spare_reg >= num_var_ranges) nr_mtrr_spare_reg = num_var_ranges - 1; num_reg_good = -1; - for (i = 1; i < num_var_ranges + 1 - nr_mtrr_spare_reg; i++) { + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { if (!min_loss_pfn[i]) { num_reg_good = i; break; @@ -1344,10 +1360,10 @@ static int __init mtrr_cleanup(unsigned address_bits) if (index_good != -1) { printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); i = index_good; - printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", + printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", result[i].gran_sizek >> 10, result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %ldM \n", + printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", result[i].num_reg, result[i].lose_cover_sizek >> 10); /* convert ranges to var ranges state */ @@ -1355,6 +1371,7 @@ static int __init mtrr_cleanup(unsigned address_bits) chunk_size <<= 10; gran_size = result[i].gran_sizek; gran_size <<= 10; + debug_print = 1; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); set_var_mtrr_all(address_bits); return 1; -- cgit v1.2.3 From b25e31cec7788ccba5749d10f8f4343fffff4750 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:28 -0700 Subject: x86 boot: minor code format fixes in e820 and efi routines Standardize a few pointer declarations to not have the extra space after the '*' character. Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820_64.c | 2 +- arch/x86/kernel/efi_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 28a14eb65d3d..c10b4aece5e4 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -404,7 +404,7 @@ static void early_panic(char *msg) } /* We're not void only for x86 32-bit compat */ -char * __init machine_specific_memory_setup(void) +char *__init machine_specific_memory_setup(void) { char *who = "BIOS-e820"; /* diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index d0060fdcccac..21a7b759687a 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -103,7 +103,7 @@ void __init efi_reserve_bootmem(void) memmap.nr_map * memmap.desc_size); } -void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) +void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) { static unsigned pages_mapped __initdata; unsigned i, pages; -- cgit v1.2.3 From c3965bd15118742d72b4bc1a290d37b3f081eb98 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:34 -0700 Subject: x86 boot: proper use of ARRAY_SIZE instead of repeated E820MAX constant This patch is motivated by a subsequent patch which will allow for more memory map entries on EFI supported systems than can be passed via the x86 legacy BIOS E820 interface. The legacy interface is limited to E820MAX == 128 memory entries, and that "E820MAX" manifest constant was used as the size for several arrays and loops over those arrays. The primary change in this patch is to change code loop sizes over those arrays from using the constant E820MAX, to using the ARRAY_SIZE() macro evaluated for the array being looped. That way, a subsequent patch can change the size of some of these arrays, without breaking this code. This patch also adds a parameter to the sanitize_e820_map() routine, which had an implicit size for the array passed it of E820MAX entries. This new parameter explicitly passes the size of said array. Once again, this will allow a subsequent patch to change that array size for some calls to sanitize_e820_map() without breaking the code. As part of enhancing the sanitize_e820_map() interface this way, I further combined the unnecessarily distinct x86_32 and x86_64 declarations for this routine into a single, commonly used, declaration. This patch in itself should make no difference to the resulting kernel binary. [ mingo@elte.hu: merged to -tip ] Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/boot/memory.c | 3 ++- arch/x86/kernel/e820.c | 9 +++++---- arch/x86/kernel/e820_64.c | 6 ++++-- arch/x86/mach-default/setup.c | 4 +++- arch/x86/mach-voyager/setup.c | 4 +++- include/asm-x86/e820.h | 3 ++- include/asm-x86/setup.h | 1 - 7 files changed, 19 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index acad32eb4290..53165c97336b 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -13,6 +13,7 @@ */ #include "boot.h" +#include #define SMAP 0x534d4150 /* ASCII "SMAP" */ @@ -53,7 +54,7 @@ static int detect_memory_e820(void) count++; desc++; - } while (next && count < E820MAX); + } while (next && count < ARRAY_SIZE(boot_params.e820_map)); return boot_params.e820_entries = count; } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 2cb686f60d0d..2396b9da8027 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -95,7 +95,7 @@ void __init add_memory_region(u64 start, u64 size, int type) { int x = e820.nr_map; - if (x == E820MAX) { + if (x == ARRAY_SIZE(e820.map)) { printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); return; } @@ -142,7 +142,8 @@ void __init e820_print_map(char *who) * replaces the original e820 map with a new one, removing overlaps. * */ -int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, + char *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -314,7 +315,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) * no more space left for new * bios entries ? */ - if (++new_bios_entry >= E820MAX) + if (++new_bios_entry >= max_nr_map) break; } if (current_type != 0) { @@ -403,7 +404,7 @@ void __init update_e820(void) u8 nr_map; nr_map = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr_map)) + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) return; e820.nr_map = nr_map; printk(KERN_INFO "modified physical RAM map:\n"); diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index c10b4aece5e4..58dc6eee4d4f 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -413,7 +413,9 @@ char *__init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + sanitize_e820_map(boot_params.e820_map, + ARRAY_SIZE(boot_params.e820_map), + &boot_params.e820_entries); if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); @@ -484,7 +486,7 @@ void __init finish_e820_parsing(void) if (userdef) { char nr = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr) < 0) + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) early_panic("Invalid user supplied memory map"); e820.nr_map = nr; diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 0c28a071824c..38a856c4fc07 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -163,7 +163,9 @@ char * __init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + sanitize_e820_map(boot_params.e820_map, + ARRAY_SIZE(boot_params.e820_map), + &boot_params.e820_entries); if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { unsigned long mem_size; diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index 5ae5466b9eb9..662b5c0a77d6 100644 --- a/arch/x86/mach-voyager/setup.c +++ b/arch/x86/mach-voyager/setup.c @@ -111,7 +111,9 @@ char *__init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + sanitize_e820_map(boot_params.e820_map, + ARRAY_SIZE(boot_params.e820_map), + &boot_params.e820_entries); if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { unsigned long mem_size; diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index b5b519feba1d..65c891d2a4cb 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -27,7 +27,8 @@ extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern void add_memory_region(u64 start, u64 size, int type); extern void e820_print_map(char *who); -extern int sanitize_e820_map(struct e820entry *biosmap, char *pnr_map); +extern int +sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, char *pnr_map); extern int copy_e820_map(struct e820entry *biosmap, int nr_map); extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index fa6763af8d26..ffa0f540fc7f 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -56,7 +56,6 @@ char * __init machine_specific_memory_setup(void); char *memory_setup(void); int __init copy_e820_map(struct e820entry *biosmap, int nr_map); -int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map); void __init add_memory_region(unsigned long long start, unsigned long long size, int type); -- cgit v1.2.3 From 028b785888c523baccdf27af0cdbf1deb92edec0 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:40 -0700 Subject: x86 boot: extend some internal memory map arrays to handle larger EFI input Extend internal boot time memory tables to allow for up to three entries per node, which may be larger than the 128 E820MAX entries handled by the legacy BIOS E820 interface. The EFI interface, if present, is capable of passing memory map entries for these larger node counts. This patch requires an earlier patch that rewrote code depending on these array sizes from using E820MAX explicitly to size loops, to instead using ARRAY_SIZE() of the applicable array. Another patch following this one will provide the code to pick up additional memory entries passed via the EFI interface from the BIOS and insert them in the following, now enlarged, arrays. Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 8 ++++---- include/asm-x86/e820.h | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 2396b9da8027..3f7777b255aa 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -149,10 +149,10 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, struct e820entry *pbios; /* pointer to original bios entry */ unsigned long long addr; /* address for this change point */ }; - static struct change_member change_point_list[2*E820MAX] __initdata; - static struct change_member *change_point[2*E820MAX] __initdata; - static struct e820entry *overlap_list[E820MAX] __initdata; - static struct e820entry new_bios[E820MAX] __initdata; +static struct change_member change_point_list[2*E820_X_MAX] __initdata; +static struct change_member *change_point[2*E820_X_MAX] __initdata; +static struct e820entry *overlap_list[E820_X_MAX] __initdata; +static struct e820entry new_bios[E820_X_MAX] __initdata; struct change_member *change_tmp; unsigned long current_type, last_type; unsigned long long last_addr; diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 65c891d2a4cb..ab18457a95c0 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -2,6 +2,41 @@ #define __ASM_E820_H #define E820MAP 0x2d0 /* our map */ #define E820MAX 128 /* number of entries in E820MAP */ + +/* + * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the + * constrained space in the zeropage. If we have more nodes than + * that, and if we've booted off EFI firmware, then the EFI tables + * passed us from the EFI firmware can list more nodes. Size our + * internal memory map tables to have room for these additional + * nodes, based on up to three entries per node for which the + * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT), + * plus E820MAX, allowing space for the possible duplicate E820 + * entries that might need room in the same arrays, prior to the + * call to sanitize_e820_map() to remove duplicates. The allowance + * of three memory map entries per node is "enough" entries for + * the initial hardware platform motivating this mechanism to make + * use of additional EFI map entries. Future platforms may want + * to allow more than three entries per node or otherwise refine + * this size. + */ + +/* + * Odd: 'make headers_check' complains about numa.h if I try + * to collapse the next two #ifdef lines to a single line: + * #if defined(__KERNEL__) && defined(CONFIG_EFI) + */ +#ifdef __KERNEL__ +#ifdef CONFIG_EFI +#include +#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) +#else /* ! CONFIG_EFI */ +#define E820_X_MAX E820MAX +#endif +#else /* ! __KERNEL__ */ +#define E820_X_MAX E820MAX +#endif + #define E820NR 0x1e8 /* # entries in E820MAP */ #define E820_RAM 1 @@ -18,7 +53,7 @@ struct e820entry { struct e820map { __u32 nr_map; - struct e820entry map[E820MAX]; + struct e820entry map[E820_X_MAX]; }; extern struct e820map e820; -- cgit v1.2.3 From 6e9bcc796b120d17b08dde7ab958b82ddb899889 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:46 -0700 Subject: x86 boot: change sanitize_e820_map parameter from byte to int to allow bigger memory maps The map size counter passed into, and back out of, sanitize_e820_map(), was an eight bit type (char or u8), as derived from its origins in legacy BIOS E820 structures. This patch changes that type to an 'int', to allow this sanitize routine to also be used on larger maps (larger than the 256 count that fits in a char). The legacy BIOS E820 interface of course does not change; that remains at 8 bits for this count, holding up to E820MAX == 128 entries. But the kernel internals can handle more when those additional memory map entries are passed from the BIOS via EFI interfaces. Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 5 +++-- arch/x86/kernel/e820_64.c | 7 +++++-- arch/x86/mach-default/setup.c | 5 ++++- arch/x86/mach-voyager/setup.c | 5 ++++- include/asm-x86/e820.h | 2 +- 5 files changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 3f7777b255aa..91abf5b2fb94 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -143,7 +143,7 @@ void __init e820_print_map(char *who) * */ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, - char *pnr_map) + int *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -204,6 +204,7 @@ static struct e820entry new_bios[E820_X_MAX] __initdata; return -1; old_nr = *pnr_map; + BUG_ON(old_nr > max_nr_map); /* bail out if we find any unreasonable addresses in bios map */ for (i = 0; i < old_nr; i++) @@ -401,7 +402,7 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, void __init update_e820(void) { - u8 nr_map; + int nr_map; nr_map = e820.nr_map; if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 58dc6eee4d4f..354fbb221709 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -407,15 +407,18 @@ static void early_panic(char *msg) char *__init machine_specific_memory_setup(void) { char *who = "BIOS-e820"; + int new_nr; /* * Try to copy the BIOS-supplied E820-map. * * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ + new_nr = boot_params.e820_entries; sanitize_e820_map(boot_params.e820_map, ARRAY_SIZE(boot_params.e820_map), - &boot_params.e820_entries); + &new_nr); + boot_params.e820_entries = new_nr; if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); @@ -484,7 +487,7 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { - char nr = e820.nr_map; + int nr = e820.nr_map; if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) early_panic("Invalid user supplied memory map"); diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 38a856c4fc07..56b4c39cb7fa 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -153,6 +153,7 @@ late_initcall(print_ipi_mode); char * __init machine_specific_memory_setup(void) { char *who; + int new_nr; who = "BIOS-e820"; @@ -163,9 +164,11 @@ char * __init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ + new_nr = boot_params.e820_entries; sanitize_e820_map(boot_params.e820_map, ARRAY_SIZE(boot_params.e820_map), - &boot_params.e820_entries); + &new_nr); + boot_params.e820_entries = new_nr; if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { unsigned long mem_size; diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index 662b5c0a77d6..f4aca9fa9546 100644 --- a/arch/x86/mach-voyager/setup.c +++ b/arch/x86/mach-voyager/setup.c @@ -62,6 +62,7 @@ void __init time_init_hook(void) char *__init machine_specific_memory_setup(void) { char *who; + int new_nr; who = "NOT VOYAGER"; @@ -111,9 +112,11 @@ char *__init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ + new_nr = boot_params.e820_entries; sanitize_e820_map(boot_params.e820_map, ARRAY_SIZE(boot_params.e820_map), - &boot_params.e820_entries); + &new_nr); + boot_params.e820_entries = new_nr; if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { unsigned long mem_size; diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index ab18457a95c0..1eb13b881437 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -63,7 +63,7 @@ extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern void add_memory_region(u64 start, u64 size, int type); extern void e820_print_map(char *who); extern int -sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, char *pnr_map); +sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map); extern int copy_e820_map(struct e820entry *biosmap, int nr_map); extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); -- cgit v1.2.3 From 5b7eb2e9ef4e467a1248537b47a63bab265be3cc Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:52 -0700 Subject: x86 boot: longer comment explaining sanitize_e820_map routine Elaborate on the comment for sanitize_e820_map(), epxlaining more what it does, what it inputs, and what it returns. Rearrange the placement of this comment to fit kernel conventions, before the routine's code rather than buried inside it. Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 94 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 38 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 91abf5b2fb94..41c480ae47df 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -139,9 +139,64 @@ void __init e820_print_map(char *who) * Sanitize the BIOS e820 map. * * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. + * replaces the original e820 map with a new one, removing overlaps, + * and resolving conflicting memory types in favor of highest + * numbered type. * + * The input parameter biosmap points to an array of 'struct + * e820entry' which on entry has elements in the range [0, *pnr_map) + * valid, and which has space for up to max_nr_map entries. + * On return, the resulting sanitized e820 map entries will be in + * overwritten in the same location, starting at biosmap. + * + * The integer pointed to by pnr_map must be valid on entry (the + * current number of valid entries located at biosmap) and will + * be updated on return, with the new number of valid entries + * (something no more than max_nr_map.) + * + * The return value from sanitize_e820_map() is zero if it + * successfully 'sanitized' the map entries passed in, and is -1 + * if it did nothing, which can happen if either of (1) it was + * only passed one map entry, or (2) any of the input map entries + * were invalid (start + size < start, meaning that the size was + * so big the described memory range wrapped around through zero.) + * + * Visually we're performing the following + * (1,2,3,4 = memory types)... + * + * Sample memory map (w/overlaps): + * ____22__________________ + * ______________________4_ + * ____1111________________ + * _44_____________________ + * 11111111________________ + * ____________________33__ + * ___________44___________ + * __________33333_________ + * ______________22________ + * ___________________2222_ + * _________111111111______ + * _____________________11_ + * _________________4______ + * + * Sanitized equivalent (no overlap): + * 1_______________________ + * _44_____________________ + * ___1____________________ + * ____22__________________ + * ______11________________ + * _________1______________ + * __________3_____________ + * ___________44___________ + * _____________33_________ + * _______________2________ + * ________________1_______ + * _________________4______ + * ___________________2____ + * ____________________33__ + * ______________________4_ */ + int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map) { @@ -162,43 +217,6 @@ static struct e820entry new_bios[E820_X_MAX] __initdata; int old_nr, new_nr, chg_nr; int i; - /* - Visually we're performing the following - (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - /* if there's only one memory region, don't bother */ if (*pnr_map < 2) return -1; -- cgit v1.2.3 From 69c9189320c46b14e5ae3ad4b3a0d35cc63cba20 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:58 -0700 Subject: x86 boot: add code to add BIOS provided EFI memory entries to kernel Add to the kernels boot memory map 'memmap' entries found in the EFI memory descriptors passed in from the BIOS. On EFI systems, up to E820MAX == 128 memory map entries can be passed via the legacy E820 interface (limited by the size of the 'zeropage'). These entries can be duplicated in the EFI descriptors also passed from the BIOS, and possibly more entries passed by the EFI interface, which does not have the E820MAX limit on number of memory map entries. This code doesn't worry about the likely duplicate, overlapping or (unlikely) conflicting entries between the EFI map and the E820 map. It just dumps all the EFI entries into the memmap[] array (which already has the E820 entries) and lets the existing routine sanitize_e820_map() sort the mess out. Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/kernel/efi.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 77d424cf68b3..4a1a26d5931f 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -213,6 +213,31 @@ unsigned long efi_get_time(void) eft.minute, eft.second); } +/* + * Tell the kernel about the EFI memory map. This might include + * more than the max 128 entries that can fit in the e820 legacy + * (zeropage) memory map. + */ + +static void __init add_efi_memmap(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + int e820_type; + + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + add_memory_region(start, size, e820_type); + } + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + #if EFI_DEBUG static void __init print_efi_memmap(void) { @@ -370,6 +395,7 @@ void __init efi_init(void) if (memmap.desc_size != sizeof(efi_memory_desc_t)) printk(KERN_WARNING "Kernel-defined memdesc" "doesn't match the one from EFI!\n"); + add_efi_memmap(); /* Setup for EFI runtime service */ reboot_type = BOOT_EFI; -- cgit v1.2.3 From a4c81cf684350797939416c99effb9d3ae46bca6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 18 May 2008 01:18:57 -0700 Subject: x86: extend e820 ealy_res support 32bit move early_res related from e820_64.c to e820.c make edba detection to be done in head32.c remove smp_alloc_memory, because we have fixed trampoline address now. Signed-off-by: Yinghai Lu arch/x86/kernel/e820.c | 214 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 196 -------------------------------- arch/x86/kernel/head32.c | 76 ++++++++++++ arch/x86/kernel/setup_32.c | 109 +++--------------- arch/x86/kernel/smpboot.c | 17 -- arch/x86/kernel/trampoline.c | 2 arch/x86/mach-voyager/voyager_smp.c | 9 - include/asm-x86/e820.h | 6 + include/asm-x86/e820_64.h | 9 - include/asm-x86/smp.h | 1 arch/x86/kernel/e820.c | 214 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 196 -------------------------------- arch/x86/kernel/head32.c | 76 ++++++++++++ arch/x86/kernel/setup_32.c | 109 +++--------------- arch/x86/kernel/smpboot.c | 17 -- arch/x86/kernel/trampoline.c | 2 arch/x86/mach-voyager/voyager_smp.c | 9 - include/asm-x86/e820.h | 6 + include/asm-x86/e820_64.h | 9 - include/asm-x86/smp.h | 1 arch/x86/kernel/e820.c | 214 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 196 -------------------------------- arch/x86/kernel/head32.c | 76 ++++++++++++ arch/x86/kernel/setup_32.c | 109 +++--------------- arch/x86/kernel/smpboot.c | 17 -- arch/x86/kernel/trampoline.c | 2 arch/x86/mach-voyager/voyager_smp.c | 9 - include/asm-x86/e820.h | 6 + include/asm-x86/e820_64.h | 9 - include/asm-x86/smp.h | 1 10 files changed, 320 insertions(+), 319 deletions(-) Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 214 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 196 --------------------------------- arch/x86/kernel/head32.c | 76 +++++++++++++ arch/x86/kernel/setup_32.c | 109 ++++-------------- arch/x86/kernel/smpboot.c | 17 --- arch/x86/kernel/trampoline.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 9 -- include/asm-x86/e820.h | 6 + include/asm-x86/e820_64.h | 9 -- include/asm-x86/smp.h | 1 - 10 files changed, 320 insertions(+), 319 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 41c480ae47df..35da8cdbe5e6 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -22,7 +22,9 @@ #include #include #include +#include #include +#include struct e820map e820; @@ -493,3 +495,215 @@ __init void e820_setup_gap(void) pci_mem_start, gapstart, gapsize); } + +/* + * Early reserved memory areas. + */ +#define MAX_EARLY_RES 20 + +struct early_res { + u64 start, end; + char name[16]; +}; +static struct early_res early_res[MAX_EARLY_RES] __initdata = { + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, +#endif +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, + /* + * Has to be in very low memory so we can execute + * real-mode AP code. + */ + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, +#endif + {} +}; + +void __init reserve_early(u64 start, u64 end, char *name) +{ + int i; + struct early_res *r; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + panic("Overlapping early reservations %llx-%llx %s to %llx-%llx %s\n", + start, end - 1, name?name:"", r->start, + r->end - 1, r->name); + } + if (i >= MAX_EARLY_RES) + panic("Too many early reservations"); + r = &early_res[i]; + r->start = start; + r->end = end; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); +} + +void __init free_early(u64 start, u64 end) +{ + struct early_res *r; + int i, j; + + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (start == r->start && end == r->end) + break; + } + if (i >= MAX_EARLY_RES || !early_res[i].end) + panic("free_early on not reserved area: %llx-%llx!", + start, end); + + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; +} + +void __init early_res_to_bootmem(u64 start, u64 end) +{ + int i; + u64 final_start, final_end; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) + continue; + printk(KERN_INFO " early res: %d [%llx-%llx] %s\n", i, + final_start, final_end - 1, r->name); +#ifdef CONFIG_X86_64 + reserve_bootmem_generic(final_start, final_end - final_start); +#else + reserve_bootmem(final_start, final_end - final_start, + BOOTMEM_DEFAULT); +#endif + } +} + +/* Check for already reserved areas */ +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) +{ + int i; + u64 addr = *addrp, last; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last >= r->start && addr < r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; + } + } + return changed; +} + +/* Check for already reserved areas */ +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + int i; + u64 addr = *addrp, last; + u64 size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr, last; + u64 ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + continue; + if (last > end) + continue; + return addr; + } + return -1ULL; +} + +/* + * Find next free range after *start + */ +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr, last; + u64 ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && + addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + continue; + return addr; + } + return -1UL; + +} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 354fbb221709..07941b554519 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -46,202 +46,6 @@ unsigned long max_pfn_mapped; */ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; -/* - * Early reserved memory areas. - */ -#define MAX_EARLY_RES 20 - -struct early_res { - unsigned long start, end; - char name[16]; -}; -static struct early_res early_res[MAX_EARLY_RES] __initdata = { - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#ifdef CONFIG_X86_TRAMPOLINE - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, -#endif - {} -}; - -void __init reserve_early(unsigned long start, unsigned long end, char *name) -{ - int i; - struct early_res *r; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n", - start, end - 1, name?name:"", r->start, r->end - 1, r->name); - } - if (i >= MAX_EARLY_RES) - panic("Too many early reservations"); - r = &early_res[i]; - r->start = start; - r->end = end; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); -} - -void __init free_early(unsigned long start, unsigned long end) -{ - struct early_res *r; - int i, j; - - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - r = &early_res[i]; - if (start == r->start && end == r->end) - break; - } - if (i >= MAX_EARLY_RES || !early_res[i].end) - panic("free_early on not reserved area: %lx-%lx!", start, end); - - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; -} - -void __init early_res_to_bootmem(unsigned long start, unsigned long end) -{ - int i; - unsigned long final_start, final_end; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) - continue; - printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i, - final_start, final_end - 1, r->name); - reserve_bootmem_generic(final_start, final_end - final_start); - } -} - -/* Check for already reserved areas */ -static inline int __init -bad_addr(unsigned long *addrp, unsigned long size, unsigned long align) -{ - int i; - unsigned long addr = *addrp, last; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last >= r->start && addr < r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init -bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align) -{ - int i; - unsigned long addr = *addrp, last; - unsigned long size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, - unsigned long size, unsigned long align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr, last; - unsigned long ei_last; - - if (ei->type != E820_RAM) - continue; - addr = round_up(ei->addr, align); - ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - continue; - if (last > end) - continue; - return addr; - } - return -1UL; -} - -/* - * Find next free range after *start - */ -unsigned long __init find_e820_area_size(unsigned long start, - unsigned long *sizep, - unsigned long align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr, last; - unsigned long ei_last; - - if (ei->type != E820_RAM) - continue; - addr = round_up(ei->addr, align); - ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && - addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - continue; - return addr; - } - return -1UL; - -} /* * Find the highest page frame number we have available */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3db059058927..c216d3c2a991 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -8,7 +8,83 @@ #include #include +#include +#include +#include +#include + +#define BIOS_LOWMEM_KILOBYTES 0x413 + +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. + */ +static void __init reserve_ebda_region(void) +{ + unsigned int lowmem, ebda_addr; + + /* To determine the position of the EBDA and the */ + /* end of conventional memory, we need to look at */ + /* the BIOS data area. In a paravirtual environment */ + /* that area is absent. We'll just have to assume */ + /* that the paravirt case can handle memory setup */ + /* correctly, without our help. */ + if (paravirt_enabled()) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* Fixup: bios puts an EBDA in the top 64K segment */ + /* of conventional memory, but does not adjust lowmem. */ + if ((lowmem - ebda_addr) <= 0x10000) + lowmem = ebda_addr; + + /* Fixup: bios does not report an EBDA at all. */ + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ + if ((ebda_addr == 0) && (lowmem >= 0x9f000)) + lowmem = 0x9f000; + + /* Paranoia: should never happen, but... */ + if ((lowmem == 0) || (lowmem >= 0x100000)) + lowmem = 0x9f000; + + /* reserve all memory between lowmem and the 1MB mark */ + reserve_early(lowmem, 0x100000, "BIOS reserved"); +} + void __init i386_start_kernel(void) { + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + +#ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD */ + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_end = ramdisk_image + ramdisk_size; + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); + } +#endif + reserve_early(__pa_symbol(&_end), init_pg_tables_end, "INIT_PG_TABLE"); + + reserve_ebda_region(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + start_kernel(); } diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 5faeab69edd9..fed482c62450 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -359,56 +359,6 @@ unsigned long __init find_max_low_pfn(void) return max_low_pfn; } -#define BIOS_LOWMEM_KILOBYTES 0x413 - -/* - * The BIOS places the EBDA/XBDA at the top of conventional - * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. - */ -static void __init reserve_ebda_region(void) -{ - unsigned int lowmem, ebda_addr; - - /* To determine the position of the EBDA and the */ - /* end of conventional memory, we need to look at */ - /* the BIOS data area. In a paravirtual environment */ - /* that area is absent. We'll just have to assume */ - /* that the paravirt case can handle memory setup */ - /* correctly, without our help. */ - if (paravirt_enabled()) - return; - - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - - /* Fixup: bios puts an EBDA in the top 64K segment */ - /* of conventional memory, but does not adjust lowmem. */ - if ((lowmem - ebda_addr) <= 0x10000) - lowmem = ebda_addr; - - /* Fixup: bios does not report an EBDA at all. */ - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ - if ((ebda_addr == 0) && (lowmem >= 0x9f000)) - lowmem = 0x9f000; - - /* Paranoia: should never happen, but... */ - if ((lowmem == 0) || (lowmem >= 0x100000)) - lowmem = 0x9f000; - - /* reserve all memory between lowmem and the 1MB mark */ - reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT); -} - #ifndef CONFIG_NEED_MULTIPLE_NODES static void __init setup_bootmem_allocator(void); static unsigned long __init setup_memory(void) @@ -522,25 +472,32 @@ static void __init reserve_initrd(void) unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; unsigned long ramdisk_here; - initrd_start = 0; - if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ + initrd_start = 0; + if (ramdisk_end < ramdisk_image) { + free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd wraps around end of memory, " "disabling initrd\n"); return; } if (ramdisk_size >= end_of_lowmem/2) { + free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; } + if (ramdisk_end <= end_of_lowmem) { /* All in lowmem, easy case */ - reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT); + /* + * don't need to reserve again, already reserved early + * in i386_start_kernel, and early_res_to_bootmem + * convert that to reserved in bootmem + */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; return; @@ -582,6 +539,8 @@ static void __init relocate_initrd(void) p = (char *)__va(ramdisk_image); memcpy(q, p, clen); q += clen; + /* need to free these low pages...*/ + free_bootmem(ramdisk_image, clen); ramdisk_image += clen; ramdisk_size -= clen; } @@ -600,47 +559,28 @@ static void __init relocate_initrd(void) ramdisk_image += clen; ramdisk_size -= clen; } + /* high pages is not converted by early_res_to_bootmem */ } #endif /* CONFIG_BLK_DEV_INITRD */ void __init setup_bootmem_allocator(void) { - unsigned long bootmap_size; + unsigned long bootmap_size, bootmap; /* * Initialize the boot-time allocator (with low memory only): */ - bootmap_size = init_bootmem(min_low_pfn, max_low_pfn); - + bootmap_size = bootmem_bootmap_pages(max_low_pfn)<> PAGE_SHIFT, max_low_pfn); register_bootmem_low_pages(max_low_pfn); + early_res_to_bootmem(0, max_low_pfn<llc_shared_map; } -#ifdef CONFIG_X86_32 -/* - * We are called very early to get the low memory for the - * SMP bootup trampoline page. - */ -void __init smp_alloc_memory(void) -{ - trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE); - /* - * Has to be in very low memory so we can execute - * real-mode AP code. - */ - if (__pa(trampoline_base) >= 0x9F000) - BUG(); -} -#endif - static void impress_friends(void) { int cpu; diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index abbf199adebb..1106fac6024d 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -2,7 +2,7 @@ #include -/* ready for x86_64, no harm for x86, since it will overwrite after alloc */ +/* ready for x86_64 and x86 */ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); /* diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 8acbf0cdf1a5..7bbebbfe8c4e 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1137,15 +1137,6 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, 0, 1, 1); } -/* used to set up the trampoline for other CPUs when the memory manager - * is sorted out */ -void __init smp_alloc_memory(void) -{ - trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE); - if (__pa(trampoline_base) >= 0x93000) - BUG(); -} - /* send a reschedule CPI to one CPU by physical CPU number*/ static void voyager_smp_send_reschedule(int cpu) { diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 1eb13b881437..5a58e2bb1d78 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -70,6 +70,12 @@ extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, extern void update_e820(void); extern void e820_setup_gap(void); +extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); +extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); +extern void reserve_early(u64 start, u64 end, char *name); +extern void free_early(u64 start, u64 end); +extern void early_res_to_bootmem(u64 start, u64 end); + #endif /* __ASSEMBLY__ */ #define ISA_START_ADDRESS 0xa0000 diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 9fac77e01441..37f176a02bc6 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -14,11 +14,6 @@ #include #ifndef __ASSEMBLY__ -extern unsigned long find_e820_area(unsigned long start, unsigned long end, - unsigned long size, unsigned long align); -extern unsigned long find_e820_area_size(unsigned long start, - unsigned long *sizep, - unsigned long align); extern void setup_memory_region(void); extern void contig_e820_setup(void); extern unsigned long e820_end_of_ram(void); @@ -35,10 +30,6 @@ extern void e820_register_active_regions(int nid, unsigned long start_pfn, extern void finish_e820_parsing(void); -extern void reserve_early(unsigned long start, unsigned long end, char *name); -extern void free_early(unsigned long start, unsigned long end); -extern void early_res_to_bootmem(unsigned long start, unsigned long end); - #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 1ebaa5cd3112..514e52b95cef 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -201,7 +201,6 @@ extern void cpu_exit_clear(void); extern void cpu_uninit(void); #endif -extern void smp_alloc_memory(void); extern void lock_ipi_call_lock(void); extern void unlock_ipi_call_lock(void); #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From ba5b14cc0311af6ed20dc25aa98a1d5ea6f2e6c0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 21 May 2008 18:40:18 -0700 Subject: x86: extend e820 ealy_res support 32bit - fix use find_e820_area to find addess for new RAMDISK, instead of using ram blindly also print out low ram and bootmap info Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/kernel/setup_32.c | 55 ++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index fed482c62450..3c451d143eba 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -466,11 +466,11 @@ static bool do_relocate_initrd = false; static void __init reserve_initrd(void) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; - unsigned long ramdisk_here; + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 ramdisk_here; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -478,14 +478,8 @@ static void __init reserve_initrd(void) initrd_start = 0; - if (ramdisk_end < ramdisk_image) { - free_bootmem(ramdisk_image, ramdisk_size); - printk(KERN_ERR "initrd wraps around end of memory, " - "disabling initrd\n"); - return; - } if (ramdisk_size >= end_of_lowmem/2) { - free_bootmem(ramdisk_image, ramdisk_size); + free_early(ramdisk_image, ramdisk_image + ramdisk_size - 1); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -495,8 +489,7 @@ static void __init reserve_initrd(void) /* All in lowmem, easy case */ /* * don't need to reserve again, already reserved early - * in i386_start_kernel, and early_res_to_bootmem - * convert that to reserved in bootmem + * in i386_start_kernel */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; @@ -504,11 +497,14 @@ static void __init reserve_initrd(void) } /* We need to move the initrd down into lowmem */ - ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK; + ramdisk_here = find_e820_area(min_low_pfn<> PAGE_SHIFT, max_low_pfn); + printk(KERN_INFO " low ram: %08lx - %08lx\n", + min_low_pfn< Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: move mp_ioapic_routing to mpparse and make it static mpparse is the only user of mp_ioapic_routing. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 -- arch/x86/kernel/mpparse.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c49ebcc6c41e..6f20fa92bbaf 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -331,8 +331,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e #ifdef CONFIG_X86_IO_APIC -struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; - static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 404683b94e79..bb7213335991 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -1,5 +1,5 @@ /* - * Intel Multiprocessor Specification 1.1 and 1.4 +2 * Intel Multiprocessor Specification 1.1 and 1.4 * compliant MP-table parsing routines. * * (c) 1995 Alan Cox, Building #3 @@ -805,7 +805,7 @@ int es7000_plat; #define MP_ISA_BUS 0 -extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; +static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; static int mp_find_ioapic(int gsi) { -- cgit v1.2.3 From a91eea6df383f26fc4fcbefcbdb5aee8facd17fd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: fix shadow variables of global end_pnf in e820_64.c Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820_64.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 07941b554519..5a9bc9244dc6 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -51,21 +51,21 @@ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; */ unsigned long __init e820_end_of_ram(void) { - unsigned long end_pfn; + unsigned long last_pfn; - end_pfn = find_max_pfn_with_active_regions(); + last_pfn = find_max_pfn_with_active_regions(); - if (end_pfn > max_pfn_mapped) - max_pfn_mapped = end_pfn; + if (last_pfn > max_pfn_mapped) + max_pfn_mapped = last_pfn; if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT) max_pfn_mapped = MAXMEM>>PAGE_SHIFT; - if (end_pfn > end_user_pfn) - end_pfn = end_user_pfn; - if (end_pfn > max_pfn_mapped) - end_pfn = max_pfn_mapped; + if (last_pfn > end_user_pfn) + last_pfn = end_user_pfn; + if (last_pfn > max_pfn_mapped) + last_pfn = max_pfn_mapped; printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped); - return end_pfn; + return last_pfn; } /* @@ -124,12 +124,12 @@ void __init e820_mark_nosave_regions(void) } /* - * Finds an active region in the address range from start_pfn to end_pfn and + * Finds an active region in the address range from start_pfn to last_pfn and * returns its range in ei_startpfn and ei_endpfn for the e820 entry. */ static int __init e820_find_active_region(const struct e820entry *ei, unsigned long start_pfn, - unsigned long end_pfn, + unsigned long last_pfn, unsigned long *ei_startpfn, unsigned long *ei_endpfn) { @@ -146,14 +146,14 @@ static int __init e820_find_active_region(const struct e820entry *ei, /* Skip if map is outside the node */ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || - *ei_startpfn >= end_pfn) + *ei_startpfn >= last_pfn) return 0; /* Check for overlaps */ if (*ei_startpfn < start_pfn) *ei_startpfn = start_pfn; - if (*ei_endpfn > end_pfn) - *ei_endpfn = end_pfn; + if (*ei_endpfn > last_pfn) + *ei_endpfn = last_pfn; /* Obey end_user_pfn to save on memmap */ if (*ei_startpfn >= end_user_pfn) @@ -167,7 +167,7 @@ static int __init e820_find_active_region(const struct e820entry *ei, /* Walk the e820 map and register active regions within a node */ void __init e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn) + unsigned long last_pfn) { unsigned long ei_startpfn; unsigned long ei_endpfn; @@ -175,7 +175,7 @@ e820_register_active_regions(int nid, unsigned long start_pfn, for (i = 0; i < e820.nr_map; i++) if (e820_find_active_region(&e820.map[i], - start_pfn, end_pfn, + start_pfn, last_pfn, &ei_startpfn, &ei_endpfn)) add_active_range(nid, ei_startpfn, ei_endpfn); } @@ -188,13 +188,13 @@ e820_register_active_regions(int nid, unsigned long start_pfn, unsigned long __init e820_hole_size(unsigned long start, unsigned long end) { unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; + unsigned long last_pfn = end >> PAGE_SHIFT; unsigned long ei_startpfn, ei_endpfn, ram = 0; int i; for (i = 0; i < e820.nr_map; i++) { if (e820_find_active_region(&e820.map[i], - start_pfn, end_pfn, + start_pfn, last_pfn, &ei_startpfn, &ei_endpfn)) ram += ei_endpfn - ei_startpfn; } -- cgit v1.2.3 From 4a139a7fdec8964cecf7a3564ee7ae327141d495 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: include pci.h in e820_64.c global pci_mem_start needs a declaration. include pci.h Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 5a9bc9244dc6..3b2e0b1bb49b 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 11a62a056093a7f25f1595fbd8bd5f93559572b6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86: cleanup print out for mptable the new output is: MPTABLE: OEM ID: SUN MPTABLE: Product ID: 4600 M2 MPTABLE: APIC at: 0x instead of it all in one line with <6> and double Product ID... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mpparse.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index bb7213335991..5a18b2b9852e 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -113,7 +113,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) #ifdef CONFIG_X86_NUMAQ mpc_oem_bus_info(m, str, translation_table[mpc_record]); #else - Dprintk("Bus #%d is %s\n", m->mpc_busid, str); + printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); #endif #if MAX_MP_BUSSES < 256 @@ -183,7 +183,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) static void __init MP_intsrc_info(struct mpc_config_intsrc *m) { mp_irqs[mp_irq_entries] = *m; - Dprintk("Int: type %d, pol %d, trig %d, bus %d," + printk(KERN_INFO "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, @@ -196,7 +196,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m) static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) { - Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC LINT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, @@ -309,16 +309,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) } memcpy(oem, mpc->mpc_oem, 8); oem[8] = 0; - printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem); + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); memcpy(str, mpc->mpc_productid, 12); str[12] = 0; - printk("Product ID: %s ", str); #ifdef CONFIG_X86_32 mps_oem_check(mpc, oem, str); #endif - printk(KERN_INFO "MPTABLE: Product ID: %s ", str); + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); @@ -689,7 +688,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, unsigned int *bp = phys_to_virt(base); struct intel_mp_floating *mpf; - Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length); + printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { -- cgit v1.2.3 From 32c5061265caf201d6a2c0d02181e2b68769c22c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Wed, 14 May 2008 19:02:51 +0400 Subject: x86: move es7000_plat out of mpparse.c Signed-off-by: Alexey Starikovskiy Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 11 ++++++----- arch/x86/mach-es7000/es7000plat.c | 2 ++ include/asm-x86/system.h | 1 - 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 5a18b2b9852e..ff1342325efc 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -793,15 +793,14 @@ void __init find_smp_config(void) ACPI-based MP Configuration -------------------------------------------------------------------------- */ -/* - * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: - */ -int es7000_plat; - #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC +#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) +extern int es7000_plat; +#endif + #define MP_ISA_BUS 0 static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; @@ -928,11 +927,13 @@ void __init mp_config_acpi_legacy_irqs(void) set_bit(MP_ISA_BUS, mp_bus_not_pci); Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); +#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) /* * Older generations of ES7000 have no legacy identity mappings */ if (es7000_plat == 1) return; +#endif /* * Locate the IOAPIC that manages the ISA IRQs (0-15). diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c index f5d6f7d8b86e..a41c77a47227 100644 --- a/arch/x86/mach-es7000/es7000plat.c +++ b/arch/x86/mach-es7000/es7000plat.c @@ -52,6 +52,8 @@ static struct mip_reg *host_reg; static int mip_port; static unsigned long mip_addr, host_addr; +int es7000_plat; + /* * GSI override for ES7000 platforms. */ diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h index a2f04cd79b29..9f7f63ba0042 100644 --- a/include/asm-x86/system.h +++ b/include/asm-x86/system.h @@ -303,7 +303,6 @@ static inline void clflush(volatile void *__p) void disable_hlt(void); void enable_hlt(void); -extern int es7000_plat; void cpu_idle_wait(void); extern unsigned long arch_align_stack(unsigned long sp); -- cgit v1.2.3 From 11113f84c72bd832dc4e76122e613b0e623e2346 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Wed, 14 May 2008 19:02:57 +0400 Subject: x86: complete move ACPI from mpparse.c Signed-off-by: Alexey Starikovskiy Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 304 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/mpparse.c | 299 +------------------------------------------ 2 files changed, 305 insertions(+), 298 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6f20fa92bbaf..b2ad09c4c6ae 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -846,6 +846,310 @@ static int __init acpi_parse_madt_lapic_entries(void) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC +#define MP_ISA_BUS 0 + +#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) +extern int es7000_plat; +#endif + +static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; + +static int mp_find_ioapic(int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_base) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +static u8 __init uniq_ioapic_id(u8 id) +{ +#ifdef CONFIG_X86_32 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +#else + int i; + DECLARE_BITMAP(used, 256); + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_config_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->mpc_apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +#endif +} + +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); +#ifdef CONFIG_X86_32 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); +#else + mp_ioapics[idx].mpc_apicver = 0; +#endif + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_base = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); + + nr_ioapics++; +} + +void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) +{ + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT; + mp_irqs[mp_irq_entries].mpc_irqflag = (trigger << 2) | polarity; + mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mpc_srcbusirq = bus_irq; /* IRQ */ + mp_irqs[mp_irq_entries].mpc_dstapic = + mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + mp_irqs[mp_irq_entries].mpc_dstirq = pin; /* INTIN# */ + + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); + +} + +void __init mp_config_acpi_legacy_irqs(void) +{ + int i = 0; + int ioapic = -1; + +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; +#endif + set_bit(MP_ISA_BUS, mp_bus_not_pci); + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + +#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) + /* + * Older generations of ES7000 have no legacy identity mappings + */ + if (es7000_plat == 1) + return; +#endif + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mpc_irqflag = 0; /* Conforming */ + mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; +#ifdef CONFIG_X86_IO_APIC + mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; +#endif + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS + && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == + mp_irqs[mp_irq_entries].mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT; + mp_irqs[mp_irq_entries].mpc_srcbusirq = i; /* Identity mapped */ + mp_irqs[mp_irq_entries].mpc_dstirq = i; + + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); + } +} + +int mp_register_gsi(u32 gsi, int triggering, int polarity) +{ + int ioapic; + int ioapic_pin; +#ifdef CONFIG_X86_32 +#define MAX_GSI_NUM 4096 +#define IRQ_COMPRESSION_START 64 + + static int pci_irq = IRQ_COMPRESSION_START; + /* + * Mapping between Global System Interrupts, which + * represent all possible interrupts, and IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; +#else + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; +#endif + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_gbl_FADT.sci_interrupt == gsi) + return gsi; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + +#ifdef CONFIG_X86_32 + if (ioapic_renumber_irq) + gsi = ioapic_renumber_irq(ioapic, gsi); +#endif + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + if (ioapic_pin > MP_MAX_IOAPIC_PIN) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); +#ifdef CONFIG_X86_32 + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); +#else + return gsi; +#endif + } + + set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); +#ifdef CONFIG_X86_32 + /* + * For GSI >= 64, use IRQ compression + */ + if ((gsi >= IRQ_COMPRESSION_START) + && (triggering == ACPI_LEVEL_SENSITIVE)) { + /* + * For PCI devices assign IRQs in order, avoiding gaps + * due to unused I/O APIC pins. + */ + int irq = gsi; + if (gsi < MAX_GSI_NUM) { + /* + * Retain the VIA chipset work-around (gsi > 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision. + */ + gsi = pci_irq++; + /* + * Don't assign IRQ used by ACPI SCI + */ + if (gsi == acpi_gbl_FADT.sci_interrupt) + gsi = pci_irq++; + gsi_to_irq[irq] = gsi; + } else { + printk(KERN_ERR "GSI %u is too high\n", gsi); + return gsi; + } + } +#endif + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; +} + /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ff1342325efc..d05b70c329c4 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -1,5 +1,5 @@ /* -2 * Intel Multiprocessor Specification 1.1 and 1.4 + * Intel Multiprocessor Specification 1.1 and 1.4 * compliant MP-table parsing routines. * * (c) 1995 Alan Cox, Building #3 @@ -788,300 +788,3 @@ void __init find_smp_config(void) { __find_smp_config(1); } - -/* -------------------------------------------------------------------------- - ACPI-based MP Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -#ifdef CONFIG_X86_IO_APIC - -#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) -extern int es7000_plat; -#endif - -#define MP_ISA_BUS 0 - -static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic(int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_base) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; -} - -static u8 __init uniq_ioapic_id(u8 id) -{ -#ifdef CONFIG_X86_32 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); - else - return id; -#else - int i; - DECLARE_BITMAP(used, 256); - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_config_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->mpc_apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -#endif -} - -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) -{ - int idx = 0; - - if (bad_ioapic(address)) - return; - - idx = nr_ioapics; - - mp_ioapics[idx].mpc_type = MP_IOAPIC; - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; - - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); -#else - mp_ioapics[idx].mpc_apicver = 0; -#endif - /* - * Build basic GSI lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI GSIs). - */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; - mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); - - nr_ioapics++; -} - -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) -{ - struct mpc_config_intsrc intsrc; - int ioapic = -1; - int pin = -1; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = gsi - mp_ioapic_routing[ioapic].gsi_base; - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_irqflag = (trigger << 2) | polarity; - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ - intsrc.mpc_dstirq = pin; /* INTIN# */ - - MP_intsrc_info(&intsrc); -} - -void __init mp_config_acpi_legacy_irqs(void) -{ - struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; - -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) - /* - * Fabricate the legacy ISA bus (bus #31). - */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; -#endif - set_bit(MP_ISA_BUS, mp_bus_not_pci); - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); - -#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) - /* - * Older generations of ES7000 have no legacy identity mappings - */ - if (es7000_plat == 1) - return; -#endif - - /* - * Locate the IOAPIC that manages the ISA IRQs (0-15). - */ - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - return; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* Conforming */ - intsrc.mpc_srcbus = MP_ISA_BUS; -#ifdef CONFIG_X86_IO_APIC - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; -#endif - /* - * Use the default configuration for the IRQs 0-15. Unless - * overridden by (MADT) interrupt source override entries. - */ - for (i = 0; i < 16; i++) { - int idx; - - for (idx = 0; idx < mp_irq_entries; idx++) { - struct mpc_config_intsrc *irq = mp_irqs + idx; - - /* Do we already have a mapping for this ISA IRQ? */ - if (irq->mpc_srcbus == MP_ISA_BUS - && irq->mpc_srcbusirq == i) - break; - - /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && - (irq->mpc_dstirq == i)) - break; - } - - if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); - continue; /* IRQ already used */ - } - - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_srcbusirq = i; /* Identity mapped */ - intsrc.mpc_dstirq = i; - - MP_intsrc_info(&intsrc); - } -} - -int mp_register_gsi(u32 gsi, int triggering, int polarity) -{ - int ioapic; - int ioapic_pin; -#ifdef CONFIG_X86_32 -#define MAX_GSI_NUM 4096 -#define IRQ_COMPRESSION_START 64 - - static int pci_irq = IRQ_COMPRESSION_START; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, and IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; -#else - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; -#endif - - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_gbl_FADT.sci_interrupt == gsi) - return gsi; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) { - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); - return gsi; - } - - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; - -#ifdef CONFIG_X86_32 - if (ioapic_renumber_irq) - gsi = ioapic_renumber_irq(ioapic, gsi); -#endif - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - if (ioapic_pin > MP_MAX_IOAPIC_PIN) { - printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, - ioapic_pin); - return gsi; - } - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); -#ifdef CONFIG_X86_32 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); -#else - return gsi; -#endif - } - - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); -#ifdef CONFIG_X86_32 - /* - * For GSI >= 64, use IRQ compression - */ - if ((gsi >= IRQ_COMPRESSION_START) - && (triggering == ACPI_LEVEL_SENSITIVE)) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - if (gsi < MAX_GSI_NUM) { - /* - * Retain the VIA chipset work-around (gsi > 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_gbl_FADT.sci_interrupt) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } -#endif - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - return gsi; -} - -#endif /* CONFIG_X86_IO_APIC */ -#endif /* CONFIG_ACPI */ -- cgit v1.2.3 From 5f8951487ddbacbc949e9ffae574f94791f9b4dd Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Wed, 14 May 2008 19:03:04 +0400 Subject: x86: make mp_ioapic_routing definition local Signed-off-by: Alexey Starikovskiy Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 7 ++++++- include/asm-x86/io_apic.h | 7 ------- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b2ad09c4c6ae..f75c2030f4b5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -852,7 +852,12 @@ static int __init acpi_parse_madt_lapic_entries(void) extern int es7000_plat; #endif -static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; +static struct { + int apic_id; + int gsi_base; + int gsi_end; + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; static int mp_find_ioapic(int gsi) { diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index d593e14f0341..2ef96f0596e2 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -112,13 +112,6 @@ extern int nr_ioapic_registers[MAX_IO_APICS]; #define MP_MAX_IOAPIC_PIN 127 -struct mp_ioapic_routing { - int apic_id; - int gsi_base; - int gsi_end; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -}; - /* I/O APIC entries */ extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; -- cgit v1.2.3 From ec2cd0a22e2715f776a934e01c4f8ea098324fe1 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Wed, 14 May 2008 19:03:10 +0400 Subject: x86: make struct config_ioapic not MPspec specific Signed-off-by: Alexey Starikovskiy Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 28 ++++++++++++------------ arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/io_apic_32.c | 52 ++++++++++++++++++++++---------------------- arch/x86/kernel/io_apic_64.c | 26 +++++++++++----------- arch/x86/kernel/mpparse.c | 8 +++++-- include/asm-x86/io_apic.h | 10 ++++++++- 6 files changed, 69 insertions(+), 57 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f75c2030f4b5..a26dd91faf0f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -887,8 +887,8 @@ static u8 __init uniq_ioapic_id(u8 id) DECLARE_BITMAP(used, 256); bitmap_zero(used, 256); for (i = 0; i < nr_ioapics; i++) { - struct mpc_config_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->mpc_apicid, used); + struct mp_config_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->mp_apicid, used); } if (!test_bit(id, used)) return id; @@ -920,29 +920,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) idx = nr_ioapics; - mp_ioapics[idx].mpc_type = MP_IOAPIC; - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; + mp_ioapics[idx].mp_type = MP_IOAPIC; + mp_ioapics[idx].mp_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mp_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); + mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id); #ifdef CONFIG_X86_32 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + mp_ioapics[idx].mp_apicver = io_apic_get_version(idx); #else - mp_ioapics[idx].mpc_apicver = 0; + mp_ioapics[idx].mp_apicver = 0; #endif /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid, + mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr, mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); nr_ioapics++; @@ -975,7 +975,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; mp_irqs[mp_irq_entries].mpc_srcbusirq = bus_irq; /* IRQ */ mp_irqs[mp_irq_entries].mpc_dstapic = - mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + mp_ioapics[ioapic].mp_apicid; /* APIC ID */ mp_irqs[mp_irq_entries].mpc_dstirq = pin; /* INTIN# */ if (++mp_irq_entries == MAX_IRQ_SOURCES) @@ -1016,7 +1016,7 @@ void __init mp_config_acpi_legacy_irqs(void) mp_irqs[mp_irq_entries].mpc_irqflag = 0; /* Conforming */ mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; #ifdef CONFIG_X86_IO_APIC - mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mp_apicid; #endif /* * Use the default configuration for the IRQs 0-15. Unless diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1bdeb6c..fa1be1d62916 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1202,7 +1202,7 @@ void __init init_apic_mappings(void) for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; + ioapic_phys = mp_ioapics[i].mp_apicaddr; if (!ioapic_phys) { printk(KERN_ERR "WARNING: bogus zero IO-APIC " diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a40d54fc1fdd..5af1b717236c 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -72,7 +72,7 @@ int sis_apic_bug = -1; int nr_ioapic_registers[MAX_IO_APICS]; /* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; /* MP IRQ source entries */ @@ -110,7 +110,7 @@ struct io_apic { static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) { return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); } static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) @@ -802,7 +802,7 @@ static int find_irq_entry(int apic, int pin, int type) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid || mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && mp_irqs[i].mpc_dstirq == pin) return i; @@ -844,7 +844,7 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic) return apic; } } @@ -872,7 +872,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) int lbus = mp_irqs[i].mpc_srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic || mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; @@ -1250,12 +1250,12 @@ static void __init setup_IO_APIC_irqs(void) if (first_notcon) { apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", - mp_ioapics[apic].mpc_apicid, + mp_ioapics[apic].mp_apicid, pin); first_notcon = 0; } else apic_printk(APIC_VERBOSE, ", %d-%d", - mp_ioapics[apic].mpc_apicid, pin); + mp_ioapics[apic].mp_apicid, pin); continue; } @@ -1357,7 +1357,7 @@ void __init print_IO_APIC(void) printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); /* * We are a bit conservative about what we expect. We have to @@ -1376,7 +1376,7 @@ void __init print_IO_APIC(void) reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); @@ -1749,14 +1749,14 @@ static void __init setup_ioapic_ids_from_mpc(void) reg_00.raw = io_apic_read(apic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); - old_id = mp_ioapics[apic].mpc_apicid; + old_id = mp_ioapics[apic].mp_apicid; - if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mpc_apicid); + apic, mp_ioapics[apic].mp_apicid); printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); - mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; + mp_ioapics[apic].mp_apicid = reg_00.bits.ID; } /* @@ -1765,9 +1765,9 @@ static void __init setup_ioapic_ids_from_mpc(void) * 'stuck on smp_invalidate_needed IPI wait' messages. */ if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mpc_apicid)) { + mp_ioapics[apic].mp_apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mpc_apicid); + apic, mp_ioapics[apic].mp_apicid); for (i = 0; i < get_physical_broadcast(); i++) if (!physid_isset(i, phys_id_present_map)) break; @@ -1776,13 +1776,13 @@ static void __init setup_ioapic_ids_from_mpc(void) printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", i); physid_set(i, phys_id_present_map); - mp_ioapics[apic].mpc_apicid = i; + mp_ioapics[apic].mp_apicid = i; } else { physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", - mp_ioapics[apic].mpc_apicid); + mp_ioapics[apic].mp_apicid); physids_or(phys_id_present_map, phys_id_present_map, tmp); } @@ -1791,11 +1791,11 @@ static void __init setup_ioapic_ids_from_mpc(void) * We need to adjust the IRQ routing table * if the ID changed. */ - if (old_id != mp_ioapics[apic].mpc_apicid) + if (old_id != mp_ioapics[apic].mp_apicid) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].mpc_dstapic == old_id) mp_irqs[i].mpc_dstapic - = mp_ioapics[apic].mpc_apicid; + = mp_ioapics[apic].mp_apicid; /* * Read the right value from the MPC table and @@ -1803,9 +1803,9 @@ static void __init setup_ioapic_ids_from_mpc(void) */ apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mpc_apicid); + mp_ioapics[apic].mp_apicid); - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + reg_00.bits.ID = mp_ioapics[apic].mp_apicid; spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0, reg_00.raw); spin_unlock_irqrestore(&ioapic_lock, flags); @@ -1816,7 +1816,7 @@ static void __init setup_ioapic_ids_from_mpc(void) spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) printk("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); @@ -2355,8 +2355,8 @@ static int ioapic_resume(struct sys_device *dev) spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; io_apic_write(dev->id, 0, reg_00.raw); } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2789,7 +2789,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq, edge_level, active_high_low); ioapic_register_intr(irq, entry.vector, edge_level); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8dfcc529..4555ad8c2070 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -104,7 +104,7 @@ DEFINE_SPINLOCK(vector_lock); int nr_ioapic_registers[MAX_IO_APICS]; /* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; /* MP IRQ source entries */ @@ -140,7 +140,7 @@ struct io_apic { static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) { return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); } static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) @@ -454,7 +454,7 @@ static int find_irq_entry(int apic, int pin, int type) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid || mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && mp_irqs[i].mpc_dstirq == pin) return i; @@ -496,7 +496,7 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic) return apic; } } @@ -524,7 +524,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) int lbus = mp_irqs[i].mpc_srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic || mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; @@ -846,7 +846,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", - apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, irq, trigger, polarity); /* @@ -887,10 +887,10 @@ static void __init setup_IO_APIC_irqs(void) idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); first_notcon = 0; } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); continue; } if (!first_notcon) { @@ -965,7 +965,7 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); /* * We are a bit conservative about what we expect. We have to @@ -983,7 +983,7 @@ void __apicdebuginit print_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); @@ -1841,8 +1841,8 @@ static int ioapic_resume(struct sys_device *dev) spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; io_apic_write(dev->id, 0, reg_00.raw); } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2336,7 +2336,7 @@ void __init ioapic_init_mappings(void) ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; + ioapic_phys = mp_ioapics[i].mp_apicaddr; } else { ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d05b70c329c4..9f1e5bf7f0fb 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -176,7 +176,11 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) if (bad_ioapic(m->mpc_apicaddr)) return; - mp_ioapics[nr_ioapics] = *m; + mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr; + mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid; + mp_ioapics[nr_ioapics].mp_type = m->mpc_type; + mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver; + mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags; nr_ioapics++; } @@ -426,7 +430,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) intsrc.mpc_type = MP_INTSRC; intsrc.mpc_irqflag = 0; /* conforming */ intsrc.mpc_srcbus = 0; - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid; intsrc.mpc_irqtype = mp_INT; diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index 2ef96f0596e2..ade76c0d03ae 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -112,8 +112,16 @@ extern int nr_ioapic_registers[MAX_IO_APICS]; #define MP_MAX_IOAPIC_PIN 127 +struct mp_config_ioapic { + unsigned long mp_apicaddr; + unsigned int mp_apicid; + unsigned char mp_type; + unsigned char mp_apicver; + unsigned char mp_flags; +}; + /* I/O APIC entries */ -extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; +extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; /* # of MP IRQ source entries */ extern int mp_irq_entries; -- cgit v1.2.3 From 2fddb6e28e903a3ab1704cc5aac01be5a59dc05b Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Wed, 14 May 2008 19:03:17 +0400 Subject: x86: make config_irqsrc not MPspec specific Signed-off-by: Alexey Starikovskiy Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 43 +++++++++++++++-------------- arch/x86/kernel/io_apic_32.c | 64 ++++++++++++++++++++++---------------------- arch/x86/kernel/io_apic_64.c | 58 +++++++++++++++++++-------------------- arch/x86/kernel/mpparse.c | 8 +++++- include/asm-x86/io_apic.h | 12 ++++++++- 5 files changed, 100 insertions(+), 85 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a26dd91faf0f..276ec058f683 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -969,14 +969,14 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) if ((bus_irq == 0) && (trigger == 3)) trigger = 1; - mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT; - mp_irqs[mp_irq_entries].mpc_irqflag = (trigger << 2) | polarity; - mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; - mp_irqs[mp_irq_entries].mpc_srcbusirq = bus_irq; /* IRQ */ - mp_irqs[mp_irq_entries].mpc_dstapic = + mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; + mp_irqs[mp_irq_entries].mp_irqflag = (trigger << 2) | polarity; + mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mp_srcbusirq = bus_irq; /* IRQ */ + mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */ - mp_irqs[mp_irq_entries].mpc_dstirq = pin; /* INTIN# */ + mp_irqs[mp_irq_entries].mp_dstirq = pin; /* INTIN# */ if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); @@ -1012,12 +1012,11 @@ void __init mp_config_acpi_legacy_irqs(void) if (ioapic < 0) return; - mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mpc_irqflag = 0; /* Conforming */ - mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; -#ifdef CONFIG_X86_IO_APIC - mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mp_apicid; -#endif + mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ + mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; + /* * Use the default configuration for the IRQs 0-15. Unless * overridden by (MADT) interrupt source override entries. @@ -1026,17 +1025,17 @@ void __init mp_config_acpi_legacy_irqs(void) int idx; for (idx = 0; idx < mp_irq_entries; idx++) { - struct mpc_config_intsrc *irq = mp_irqs + idx; + struct mp_config_intsrc *irq = mp_irqs + idx; /* Do we already have a mapping for this ISA IRQ? */ - if (irq->mpc_srcbus == MP_ISA_BUS - && irq->mpc_srcbusirq == i) + if (irq->mp_srcbus == MP_ISA_BUS + && irq->mp_srcbusirq == i) break; /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mpc_dstapic == - mp_irqs[mp_irq_entries].mpc_dstapic) && - (irq->mpc_dstirq == i)) + if ((irq->mp_dstapic == + mp_irqs[mp_irq_entries].mp_dstapic) && + (irq->mp_dstirq == i)) break; } @@ -1045,9 +1044,9 @@ void __init mp_config_acpi_legacy_irqs(void) continue; /* IRQ already used */ } - mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT; - mp_irqs[mp_irq_entries].mpc_srcbusirq = i; /* Identity mapped */ - mp_irqs[mp_irq_entries].mpc_dstirq = i; + mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; + mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */ + mp_irqs[mp_irq_entries].mp_dstirq = i; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 5af1b717236c..ea68c3e5ba1d 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -76,7 +76,7 @@ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; /* MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; @@ -801,10 +801,10 @@ static int find_irq_entry(int apic, int pin, int type) int i; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && - mp_irqs[i].mpc_dstirq == pin) + if (mp_irqs[i].mp_irqtype == type && + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) && + mp_irqs[i].mp_dstirq == pin) return i; return -1; @@ -818,13 +818,13 @@ static int __init find_isa_irq_pin(int irq, int type) int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) - return mp_irqs[i].mpc_dstirq; + return mp_irqs[i].mp_dstirq; } return -1; } @@ -834,17 +834,17 @@ static int __init find_isa_irq_apic(int irq, int type) int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) break; } if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) return apic; } } @@ -869,23 +869,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return -1; } for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) break; if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mpc_irqtype && + !mp_irqs[i].mp_irqtype && (bus == lbus) && - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); if (!(apic || IO_APIC_IRQ(irq))) continue; - if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + if (pin == (mp_irqs[i].mp_srcbusirq & 3)) return irq; /* * Use the first all-but-pin matching entry as a @@ -952,7 +952,7 @@ static int EISA_ELCR(unsigned int irq) * EISA conforming in the MP table, that means its trigger type must * be read in from the ELCR */ -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) #define default_EISA_polarity(idx) default_ISA_polarity(idx) /* PCI interrupts are always polarity one level triggered, @@ -969,13 +969,13 @@ static int EISA_ELCR(unsigned int irq) static int MPBIOS_polarity(int idx) { - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].mpc_irqflag & 3) + switch (mp_irqs[idx].mp_irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ { @@ -1012,13 +1012,13 @@ static int MPBIOS_polarity(int idx) static int MPBIOS_trigger(int idx) { - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; int trigger; /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ { @@ -1097,16 +1097,16 @@ static inline int irq_trigger(int idx) static int pin_2_irq(int idx, int apic, int pin) { int irq, i; - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; /* * Debugging check, we are in big trouble if this message pops up! */ - if (mp_irqs[idx].mpc_dstirq != pin) + if (mp_irqs[idx].mp_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); if (test_bit(bus, mp_bus_not_pci)) - irq = mp_irqs[idx].mpc_srcbusirq; + irq = mp_irqs[idx].mp_srcbusirq; else { /* * PCI IRQs are mapped in order @@ -1793,8 +1793,8 @@ static void __init setup_ioapic_ids_from_mpc(void) */ if (old_id != mp_ioapics[apic].mp_apicid) for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_dstapic == old_id) - mp_irqs[i].mpc_dstapic + if (mp_irqs[i].mp_dstapic == old_id) + mp_irqs[i].mp_dstapic = mp_ioapics[apic].mp_apicid; /* @@ -2810,8 +2810,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) return -1; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == mp_INT && - mp_irqs[i].mpc_srcbusirq == bus_irq) + if (mp_irqs[i].mp_irqtype == mp_INT && + mp_irqs[i].mp_srcbusirq == bus_irq) break; if (i >= mp_irq_entries) return -1; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 4555ad8c2070..e7f1476ed537 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -108,7 +108,7 @@ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; /* MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; @@ -453,10 +453,10 @@ static int find_irq_entry(int apic, int pin, int type) int i; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && - mp_irqs[i].mpc_dstirq == pin) + if (mp_irqs[i].mp_irqtype == type && + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) && + mp_irqs[i].mp_dstirq == pin) return i; return -1; @@ -470,13 +470,13 @@ static int __init find_isa_irq_pin(int irq, int type) int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) - return mp_irqs[i].mpc_dstirq; + return mp_irqs[i].mp_dstirq; } return -1; } @@ -486,17 +486,17 @@ static int __init find_isa_irq_apic(int irq, int type) int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) break; } if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) return apic; } } @@ -521,23 +521,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return -1; } for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) break; if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mpc_irqtype && + !mp_irqs[i].mp_irqtype && (bus == lbus) && - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); if (!(apic || IO_APIC_IRQ(irq))) continue; - if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + if (pin == (mp_irqs[i].mp_srcbusirq & 3)) return irq; /* * Use the first all-but-pin matching entry as a @@ -565,13 +565,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) static int MPBIOS_polarity(int idx) { - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].mpc_irqflag & 3) + switch (mp_irqs[idx].mp_irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ if (test_bit(bus, mp_bus_not_pci)) @@ -607,13 +607,13 @@ static int MPBIOS_polarity(int idx) static int MPBIOS_trigger(int idx) { - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; int trigger; /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ if (test_bit(bus, mp_bus_not_pci)) @@ -660,16 +660,16 @@ static inline int irq_trigger(int idx) static int pin_2_irq(int idx, int apic, int pin) { int irq, i; - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; /* * Debugging check, we are in big trouble if this message pops up! */ - if (mp_irqs[idx].mpc_dstirq != pin) + if (mp_irqs[idx].mp_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); if (test_bit(bus, mp_bus_not_pci)) { - irq = mp_irqs[idx].mpc_srcbusirq; + irq = mp_irqs[idx].mp_srcbusirq; } else { /* * PCI IRQs are mapped in order @@ -2242,8 +2242,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) return -1; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == mp_INT && - mp_irqs[i].mpc_srcbusirq == bus_irq) + if (mp_irqs[i].mp_irqtype == mp_INT && + mp_irqs[i].mp_srcbusirq == bus_irq) break; if (i >= mp_irq_entries) return -1; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9f1e5bf7f0fb..59f051db236d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -186,12 +186,18 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) static void __init MP_intsrc_info(struct mpc_config_intsrc *m) { - mp_irqs[mp_irq_entries] = *m; printk(KERN_INFO "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + mp_irqs[mp_irq_entries].mp_dstapic = m->mpc_dstapic; + mp_irqs[mp_irq_entries].mp_type = m->mpc_type; + mp_irqs[mp_irq_entries].mp_irqtype = m->mpc_irqtype; + mp_irqs[mp_irq_entries].mp_irqflag = m->mpc_irqflag; + mp_irqs[mp_irq_entries].mp_srcbus = m->mpc_srcbus; + mp_irqs[mp_irq_entries].mp_srcbusirq = m->mpc_srcbusirq; + mp_irqs[mp_irq_entries].mp_dstirq = m->mpc_dstirq; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); } diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index ade76c0d03ae..86d8c3bdcca4 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -120,6 +120,16 @@ struct mp_config_ioapic { unsigned char mp_flags; }; +struct mp_config_intsrc { + unsigned int mp_dstapic; + unsigned char mp_type; + unsigned char mp_irqtype; + unsigned short mp_irqflag; + unsigned char mp_srcbus; + unsigned char mp_srcbusirq; + unsigned char mp_dstirq; +}; + /* I/O APIC entries */ extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; @@ -127,7 +137,7 @@ extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; extern int mp_irq_entries; /* MP IRQ source entries */ -extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; +extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* non-0 if default (table-less) MP configuration */ extern int mpc_default_type; -- cgit v1.2.3 From 59f4519ad7ade61123e90085b0dadb2ea197bd87 Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Date: Sun, 18 May 2008 05:24:41 +0200 Subject: x86: initialize all fields of mp_irqs[mp_irq_entries] Commit "x86: make config_irqsrc not MPspec specific" introduced some uses of uninitialized fields in mp_config_acpi_legacy_irqs(). I need the following patch to get sched-devel/master to boot. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 276ec058f683..6170c6aeaf79 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1012,11 +1012,6 @@ void __init mp_config_acpi_legacy_irqs(void) if (ioapic < 0) return; - mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ - mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; - mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; - /* * Use the default configuration for the IRQs 0-15. Unless * overridden by (MADT) interrupt source override entries. @@ -1044,6 +1039,10 @@ void __init mp_config_acpi_legacy_irqs(void) continue; /* IRQ already used */ } + mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ + mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */ mp_irqs[mp_irq_entries].mp_dstirq = i; -- cgit v1.2.3 From aafbdf71f1d3aeffd679b1a86e1b28f71515856c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Thu, 22 May 2008 12:26:15 +0400 Subject: x86: fix mpparse/acpi interaction Sitsofe Wheeler reported boot problems on linux-next. It looks like the same issue as found by Soeren Sandman in 7575217f656a93, "x86: initialize all fields of mp_irqs[mp_irq_entries]". But his fix is also not complete, as dstapic is used before it assigned. Reported-by: Sitsofe Wheeler Bisected-by: Sitsofe Wheeler Signed-off-by: Alexey Starikovskiy Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6170c6aeaf79..2ddfabae382b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1019,6 +1019,11 @@ void __init mp_config_acpi_legacy_irqs(void) for (i = 0; i < 16; i++) { int idx; + mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ + mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; + for (idx = 0; idx < mp_irq_entries; idx++) { struct mp_config_intsrc *irq = mp_irqs + idx; @@ -1039,10 +1044,6 @@ void __init mp_config_acpi_legacy_irqs(void) continue; /* IRQ already used */ } - mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ - mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; - mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */ mp_irqs[mp_irq_entries].mp_dstirq = i; -- cgit v1.2.3 From bf62f3981c7076714e3b9f5fa6989a806cad02bf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 20 May 2008 20:10:58 -0700 Subject: x86: move e820_mark_nosave_regions to e820.c and make e820_mark_nosave_regions to take limit_pfn to use max_low_pfn for 32bit and end_pfn for 64bit Signed-off-by: Yinghai Lu Cc: Andrew Morton Signed-off-by: Thomas Gleixner --- arch/x86/kernel/e820.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_32.c | 32 -------------------------------- arch/x86/kernel/e820_64.c | 32 -------------------------------- arch/x86/kernel/setup_32.c | 2 +- arch/x86/kernel/setup_64.c | 2 +- include/asm-x86/e820.h | 9 +++++++++ include/asm-x86/e820_32.h | 8 -------- include/asm-x86/e820_64.h | 1 - 8 files changed, 43 insertions(+), 75 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 35da8cdbe5e6..0cd9132c9450 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -495,6 +496,37 @@ __init void e820_setup_gap(void) pci_mem_start, gapstart, gapsize); } +#if defined(CONFIG_X86_64) || \ + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) +/** + * Find the ranges of physical addresses that do not correspond to + * e820 RAM areas and mark the corresponding pages as nosave for + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(unsigned long limit_pfn) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= limit_pfn) + break; + } +} +#endif /* * Early reserved memory areas. diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index dfe25751038b..db760d4706af 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -208,37 +207,6 @@ void __init init_iomem_resources(struct resource *code_resource, } } -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) -/** - * e820_mark_nosave_regions - Find the ranges of physical addresses that do not - * correspond to e820 RAM areas and mark the corresponding pages as nosave for - * hibernation. - * - * This function requires the e820 map to be sorted and without any - * overlapping entries and assumes the first e820 area to be RAM. - */ -void __init e820_mark_nosave_regions(void) -{ - int i; - unsigned long pfn; - - pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); - for (i = 1; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (pfn < PFN_UP(ei->addr)) - register_nosave_region(pfn, PFN_UP(ei->addr)); - - pfn = PFN_DOWN(ei->addr + ei->size); - if (ei->type != E820_RAM) - register_nosave_region(PFN_UP(ei->addr), pfn); - - if (pfn >= max_low_pfn) - break; - } -} -#endif - /* * Find the highest page frame number we have available */ diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 3b2e0b1bb49b..5e063e72b24a 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -93,37 +92,6 @@ void __init e820_reserve_resources(void) } } -/* - * Find the ranges of physical addresses that do not correspond to - * e820 RAM areas and mark the corresponding pages as nosave for software - * suspend and suspend to RAM. - * - * This function requires the e820 map to be sorted and without any - * overlapping entries and assumes the first e820 area to be RAM. - */ -void __init e820_mark_nosave_regions(void) -{ - int i; - unsigned long paddr; - - paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); - for (i = 1; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (paddr < ei->addr) - register_nosave_region(PFN_DOWN(paddr), - PFN_UP(ei->addr)); - - paddr = round_down(ei->addr + ei->size, PAGE_SIZE); - if (ei->type != E820_RAM) - register_nosave_region(PFN_UP(ei->addr), - PFN_DOWN(paddr)); - - if (paddr >= (end_pfn << PAGE_SHIFT)) - break; - } -} - /* * Finds an active region in the address range from start_pfn to last_pfn and * returns its range in ei_startpfn and ei_endpfn for the e820 entry. diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 3c451d143eba..e1173aecf69a 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -820,7 +820,7 @@ void __init setup_arch(char **cmdline_p) #endif e820_setup_gap(); - e820_mark_nosave_regions(); + e820_mark_nosave_regions(max_low_pfn); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff1286ad8a..975a5da46e49 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -516,7 +516,7 @@ void __init setup_arch(char **cmdline_p) * We trust e820 completely. No explicit ROM probing in memory. */ e820_reserve_resources(); - e820_mark_nosave_regions(); + e820_mark_nosave_regions(end_pfn); /* request I/O space for devices used on all i[345]86 PCs */ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 5a58e2bb1d78..4266a2c5f2e8 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -70,6 +70,15 @@ extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, extern void update_e820(void); extern void e820_setup_gap(void); +#if defined(CONFIG_X86_64) || \ + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) +extern void e820_mark_nosave_regions(unsigned long limit_pfn); +#else +static inline void e820_mark_nosave_regions(unsigned long limit_pfn) +{ +} +#endif + extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index 9576b438fbd9..7ace82570a36 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -28,13 +28,5 @@ extern void init_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) -extern void e820_mark_nosave_regions(void); -#else -static inline void e820_mark_nosave_regions(void) -{ -} -#endif - #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 37f176a02bc6..917963ccab69 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -18,7 +18,6 @@ extern void setup_memory_region(void); extern void contig_e820_setup(void); extern unsigned long e820_end_of_ram(void); extern void e820_reserve_resources(void); -extern void e820_mark_nosave_regions(void); extern int e820_any_non_reserved(unsigned long start, unsigned long end); extern int is_memory_any_valid(unsigned long start, unsigned long end); extern int e820_all_non_reserved(unsigned long start, unsigned long end); -- cgit v1.2.3 From b3e2416465bc9140bfd209f247e6a789f68f0d19 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Fri, 23 May 2008 01:54:44 +0400 Subject: x86: Set pic_mode only if local apic code is present --- arch/x86/kernel/mpparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 59f051db236d..e61523a13ecd 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -609,7 +609,7 @@ static void __init __get_smp_config(unsigned early) printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) if (mpf->mpf_feature2 & (1 << 7)) { printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); pic_mode = 1; -- cgit v1.2.3 From f3918352909f839a7b0dbf9b3f81d2e183c46f88 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Fri, 23 May 2008 01:54:51 +0400 Subject: x86: move pic_mode to apic_32.c --- arch/x86/kernel/apic_32.c | 2 ++ arch/x86/kernel/mpparse.c | 2 -- arch/x86/mach-visws/mpparse.c | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index fa1be1d62916..848c603457fe 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -76,6 +76,8 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); */ int apic_verbosity; +int pic_mode; + static unsigned int calibration_result; static int lapic_next_event(unsigned long delta, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e61523a13ecd..b72c04602ad2 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -48,8 +48,6 @@ int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 }; static int mp_current_pci_id; -int pic_mode; - /* * Intel MP BIOS table parsing routines: */ diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c index 57484e91ab90..b7d063e7de1f 100644 --- a/arch/x86/mach-visws/mpparse.c +++ b/arch/x86/mach-visws/mpparse.c @@ -11,8 +11,6 @@ /* Have we found an MP table */ int smp_found_config; -int pic_mode; - extern unsigned int __cpuinitdata maxcpus; /* -- cgit v1.2.3 From bab4b27c00c4880737c18bb91138b1a7dd94164c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Mon, 19 May 2008 19:47:03 +0400 Subject: x86: move smp_found_config --- arch/x86/kernel/apic_32.c | 3 +++ arch/x86/kernel/apic_64.c | 3 +++ arch/x86/kernel/mpparse.c | 8 ++++---- arch/x86/mach-visws/mpparse.c | 5 ++--- arch/x86/mach-voyager/voyager_smp.c | 5 ----- 5 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 848c603457fe..c304759f0834 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -78,6 +78,9 @@ int apic_verbosity; int pic_mode; +/* Have we found an MP table */ +int smp_found_config; + static unsigned int calibration_result; static int lapic_next_event(unsigned long delta, diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 5910020c3f24..54087f920f2f 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); */ int apic_verbosity; +/* Have we found an MP table */ +int smp_found_config; + static struct resource lapic_resource = { .name = "Local APIC", .flags = IORESOURCE_MEM | IORESOURCE_BUSY, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b72c04602ad2..d67cd7600a21 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -32,9 +32,6 @@ #include #endif -/* Have we found an MP table */ -int smp_found_config; - /* * Various Linux-internal data structures created from the * MP-table. @@ -639,7 +636,9 @@ static void __init __get_smp_config(unsigned early) * override the defaults. */ if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { +#ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 0; +#endif printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. " @@ -706,8 +705,9 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, !mpf_checksum((unsigned char *)bp, 16) && ((mpf->mpf_specification == 1) || (mpf->mpf_specification == 4))) { - +#ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; +#endif mpf_found = mpf; #ifdef CONFIG_X86_32 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c index b7d063e7de1f..a2fb78c0d154 100644 --- a/arch/x86/mach-visws/mpparse.c +++ b/arch/x86/mach-visws/mpparse.c @@ -8,9 +8,6 @@ #include "cobalt.h" #include "mach_apic.h" -/* Have we found an MP table */ -int smp_found_config; - extern unsigned int __cpuinitdata maxcpus; /* @@ -74,7 +71,9 @@ void __init find_smp_config(void) if (ncpus > maxcpus) ncpus = maxcpus; +#ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; +#endif while (ncpus--) MP_processor_info(mp++); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 7bbebbfe8c4e..8dedd01e909f 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -59,11 +59,6 @@ __u32 voyager_quad_processors = 0; * activity count. Finally exported by i386_ksyms.c */ static int voyager_extended_cpus = 1; -/* Have we found an SMP box - used by time.c to do the profiling - interrupt for timeslicing; do not set to 1 until the per CPU timer - interrupt is active */ -int smp_found_config = 0; - /* Used for the invalidate map that's also checked in the spinlock */ static volatile unsigned long smp_invalidate_needed; -- cgit v1.2.3 From ce6444d39fdea29dcf145d2d95fe9cdc850aa53c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Mon, 19 May 2008 19:47:09 +0400 Subject: x86: mp_bus_id_to_pci_bus is not needed --- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- arch/x86/kernel/mpparse.c | 10 ---------- include/asm-x86/mpspec.h | 2 -- 4 files changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index ea68c3e5ba1d..17b2e8f862d3 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -864,7 +864,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " "slot:%d, pin:%d.\n", bus, slot, pin); - if (mp_bus_id_to_pci_bus[bus] == -1) { + if (test_bit(bus, mp_bus_not_pci)) { printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index e7f1476ed537..9637675ea555 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -516,7 +516,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", bus, slot, pin); - if (mp_bus_id_to_pci_bus[bus] == -1) { + if (test_bit(bus, mp_bus_not_pci)) { apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d67cd7600a21..83dfd6696c2a 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -41,13 +41,6 @@ int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 }; - -static int mp_current_pci_id; - -/* - * Intel MP BIOS table parsing routines: - */ /* * Checksum an MP configuration block. @@ -101,7 +94,6 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) static void __init MP_bus_info(struct mpc_config_bus *m) { char str[7]; - memcpy(str, m->mpc_bustype, 6); str[6] = 0; @@ -130,8 +122,6 @@ static void __init MP_bus_info(struct mpc_config_bus *m) mpc_oem_pci_bus(m, translation_table[mpc_record]); #endif clear_bit(m->mpc_busid, mp_bus_not_pci); - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; - mp_current_pci_id++; #if defined(CONFIG_EISA) || defined (CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index 57a991b9c053..b785ddd8d761 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -32,8 +32,6 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -extern int mp_bus_id_to_pci_bus[MAX_MP_BUSSES]; - extern unsigned int boot_cpu_physical_apicid; extern int smp_found_config; extern int mpc_default_type; -- cgit v1.2.3 From 8732fc4b237fca3bd3cb0ec87ca8fb90271b0baf Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Mon, 19 May 2008 19:47:16 +0400 Subject: x86: move mp_bus_not_pci from mpparse.c --- arch/x86/kernel/io_apic_32.c | 6 ++++++ arch/x86/kernel/io_apic_64.c | 2 ++ arch/x86/kernel/mpparse.c | 10 ---------- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 17b2e8f862d3..2285b81ad1d5 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -81,6 +81,12 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + static int disable_timer_pin_1 __initdata; /* diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 9637675ea555..339cf6f926dc 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -113,6 +113,8 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 83dfd6696c2a..e8e041ed84d6 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -32,16 +32,6 @@ #include #endif -/* - * Various Linux-internal data structures created from the - * MP-table. - */ -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) -int mp_bus_id_to_type[MAX_MP_BUSSES]; -#endif - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - /* * Checksum an MP configuration block. */ -- cgit v1.2.3 From 136ef671df04dc157afa0d4b96c7bd23ba072c9c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy Date: Tue, 20 May 2008 00:29:59 +0400 Subject: x86: allow MPPARSE to be deselected in SMP configs --- arch/x86/Kconfig.debug | 2 +- arch/x86/kernel/setup_32.c | 2 +- arch/x86/kernel/setup_64.c | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ac1e31ba4795..d5b364b43a02 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -127,7 +127,7 @@ config 4KSTACKS config X86_FIND_SMP_CONFIG def_bool y - depends on X86_LOCAL_APIC || X86_VOYAGER + depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS depends on X86_32 config X86_MPPARSE diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index e1173aecf69a..c04e8c91daf7 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -814,7 +814,7 @@ void __init setup_arch(char **cmdline_p) "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); #endif #endif -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) if (smp_found_config) get_smp_config(); #endif diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 975a5da46e49..89e6cca5d693 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -456,10 +456,12 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_reserve_bootmem(); +#ifdef CONFIG_X86_MPPARSE /* * Find and reserve possible boot-time SMP configuration: */ find_smp_config(); +#endif #ifdef CONFIG_BLK_DEV_INITRD if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; @@ -502,11 +504,13 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); +#ifdef CONFIG_X86_MPPARSE /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); +#endif init_apic_mappings(); ioapic_init_mappings(); -- cgit v1.2.3 From b764a15f679942a7bc9d4f9645299e1defcc5b43 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 22 May 2008 21:22:04 +0100 Subject: x86: Switch apm to unlocked_kernel This pushes the lock a fair way down and the final kill looks like it should be an easy project for someone who wants to have a shot at it. Signed-off-by: Alan Cox Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apm_32.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9290e29013..00e6d1370954 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -228,6 +228,7 @@ #include #include #include +#include #include #include @@ -1149,7 +1150,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender) as->event_tail = 0; } as->events[as->event_head] = event; - if ((!as->suser) || (!as->writer)) + if (!as->suser || !as->writer) continue; switch (event) { case APM_SYS_SUSPEND: @@ -1396,7 +1397,7 @@ static void apm_mainloop(void) static int check_apm_user(struct apm_user *as, const char *func) { - if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { + if (as == NULL || as->magic != APM_BIOS_MAGIC) { printk(KERN_ERR "apm: %s passed bad filp\n", func); return 1; } @@ -1459,18 +1460,19 @@ static unsigned int do_poll(struct file *fp, poll_table *wait) return 0; } -static int do_ioctl(struct inode *inode, struct file *filp, - u_int cmd, u_long arg) +static long do_ioctl(struct file *filp, u_int cmd, u_long arg) { struct apm_user *as; + int ret; as = filp->private_data; if (check_apm_user(as, "ioctl")) return -EIO; - if ((!as->suser) || (!as->writer)) + if (!as->suser || !as->writer) return -EPERM; switch (cmd) { case APM_IOC_STANDBY: + lock_kernel(); if (as->standbys_read > 0) { as->standbys_read--; as->standbys_pending--; @@ -1479,8 +1481,10 @@ static int do_ioctl(struct inode *inode, struct file *filp, queue_event(APM_USER_STANDBY, as); if (standbys_pending <= 0) standby(); + unlock_kernel(); break; case APM_IOC_SUSPEND: + lock_kernel(); if (as->suspends_read > 0) { as->suspends_read--; as->suspends_pending--; @@ -1488,16 +1492,17 @@ static int do_ioctl(struct inode *inode, struct file *filp, } else queue_event(APM_USER_SUSPEND, as); if (suspends_pending <= 0) { - return suspend(1); + ret = suspend(1); } else { as->suspend_wait = 1; wait_event_interruptible(apm_suspend_waitqueue, as->suspend_wait == 0); - return as->suspend_result; + ret = as->suspend_result; } - break; + unlock_kernel(); + return ret; default: - return -EINVAL; + return -ENOTTY; } return 0; } @@ -1860,7 +1865,7 @@ static const struct file_operations apm_bios_fops = { .owner = THIS_MODULE, .read = do_read, .poll = do_poll, - .ioctl = do_ioctl, + .unlocked_ioctl = do_ioctl, .open = do_open, .release = do_release, }; -- cgit v1.2.3 From 85cc35fa7255d113b5383a9c8536c363274bb475 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 May 2008 21:21:36 +0200 Subject: x86: fix mpparse fallout UP builds with LOCAL_APIC=y and IO_APIC=n fail with a missing reference to mp_bus_not_pci. Distangle the mpparse code some more and move the ioapic specific bus check into a separate function. This code needs sume urgent un#ifdef surgery all over the place. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mpparse.c | 73 ++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e8e041ed84d6..9f3792d55044 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -81,6 +81,7 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) generic_processor_info(apicid, m->mpc_apicver); } +#ifdef CONFIG_X86_IO_APIC static void __init MP_bus_info(struct mpc_config_bus *m) { char str[7]; @@ -122,6 +123,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) } else printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } +#endif #ifdef CONFIG_X86_IO_APIC @@ -336,7 +338,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) { struct mpc_config_bus *m = (struct mpc_config_bus *)mpt; +#ifdef CONFIG_X86_IO_APIC MP_bus_info(m); +#endif mpt += sizeof(*m); count += sizeof(*m); break; @@ -472,40 +476,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) MP_intsrc_info(&intsrc); } -#endif -static inline void __init construct_default_ISA_mptable(int mpc_default_type) +static void construct_ioapic_table(int mpc_default_type) { - struct mpc_config_processor processor; - struct mpc_config_bus bus; -#ifdef CONFIG_X86_IO_APIC struct mpc_config_ioapic ioapic; -#endif - struct mpc_config_lintsrc lintsrc; - int linttypes[2] = { mp_ExtINT, mp_NMI }; - int i; - - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* - * 2 CPUs, numbered 0 & 1. - */ - processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; - for (i = 0; i < 2; i++) { - processor.mpc_apicid = i; - MP_processor_info(&processor); - } + struct mpc_config_bus bus; bus.mpc_type = MP_BUS; bus.mpc_busid = 0; @@ -534,7 +509,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) MP_bus_info(&bus); } -#ifdef CONFIG_X86_IO_APIC ioapic.mpc_type = MP_IOAPIC; ioapic.mpc_apicid = 2; ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; @@ -546,7 +520,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) * We set up most of the low 16 IO-APIC pins according to MPS rules. */ construct_default_ioirq_mptable(mpc_default_type); +} +#else +static inline void construct_ioapic_table(int mpc_default_type) { } #endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + construct_ioapic_table(mpc_default_type); + lintsrc.mpc_type = MP_LINTSRC; lintsrc.mpc_irqflag = 0; /* conforming */ lintsrc.mpc_srcbusid = 0; -- cgit v1.2.3 From ddca03c98a7f7ad5ab09967ff52d4ed60358c896 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:31 +0400 Subject: x86: nmi - unify die_nmi() interface By slightly changing 32bit mode die_nmi() we may unify the interface and make it common for both (32/64bit) modes Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_32.c | 7 +++---- arch/x86/kernel/traps_32.c | 8 +++++--- include/asm-x86/nmi.h | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 69bdae555c18..bd04a28f7a5c 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -320,8 +320,6 @@ void touch_nmi_watchdog(void) } EXPORT_SYMBOL(touch_nmi_watchdog); -extern void die_nmi(struct pt_regs *, const char *msg); - notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { @@ -375,7 +373,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + die_nmi("BUG: NMI Watchdog detected LOCKUP", + regs, 0); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -406,7 +405,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) char buf[64]; sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(regs, buf); + die_nmi(buf, regs, 0); return 0; } diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index bde6f63e15d5..f31e6651fa6f 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -755,9 +755,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) static DEFINE_SPINLOCK(nmi_print_lock); -void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; spin_lock(&nmi_print_lock); @@ -766,10 +766,12 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) * to get a message out: */ bust_spinlocks(1); - printk(KERN_EMERG "%s", msg); + printk(KERN_EMERG "%s", str); printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 8455bf36d8df..7cd5b6a88138 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -38,12 +38,12 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev) #ifdef CONFIG_X86_64 extern void default_do_nmi(struct pt_regs *); -extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern void nmi_watchdog_default(void); #else #define nmi_watchdog_default() do {} while (0) #endif +extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); -- cgit v1.2.3 From e56b3a12c45e439169f2d1f7194be397667320a6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:32 +0400 Subject: x86: nmi - die_nmi() output message unification Make 64bit die_nmi() to produce the same message as 32bit mode has Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_64.c | 4 ++-- arch/x86/kernel/traps_64.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 33cb8ecbb6dd..1a98705270a2 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -358,8 +358,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, - panic_on_timeout); + die_nmi("NMI Watchdog detected LOCKUP", + regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index adff76ea97c4..6a048ce3c787 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -614,7 +614,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) * We are in trouble anyway, lets at least try * to get a message out. */ - printk(str, smp_processor_id()); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); -- cgit v1.2.3 From c6425b9f143a75bbcd0a7684b4df40e20d0b2f32 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:33 +0400 Subject: x86: move do_nmi(), stop_nmi() and restart_nmi() to traps_64.c traps_32.c already holds these functions so do the same for traps_64.c Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_64.c | 25 ------------------------- arch/x86/kernel/traps_64.c | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 1a98705270a2..c1ea671525e5 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -30,7 +30,6 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -int panic_on_unrecovered_nmi; static cpumask_t backtrace_mask = CPU_MASK_NONE; @@ -383,30 +382,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) return rc; } -static unsigned ignore_nmis; - -asmlinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - add_pda(__nmi_count,1); - if (!ignore_nmis) - default_do_nmi(regs); - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - #ifdef CONFIG_SYSCTL static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 6a048ce3c787..e4a380797dc1 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -76,7 +76,9 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); +int panic_on_unrecovered_nmi; static unsigned int code_bytes = 64; +static unsigned ignore_nmis; static inline void conditional_sti(struct pt_regs *regs) { @@ -867,6 +869,28 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) io_check_error(reason, regs); } +asmlinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_enter(); + add_pda(__nmi_count, 1); + if (!ignore_nmis) + default_do_nmi(regs); + nmi_exit(); +} + +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + /* runs on IST stack. */ asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) { -- cgit v1.2.3 From d1b946b97d71423f365fa797d1428e1847c0bec1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:34 +0400 Subject: x86: nmi_32.c - add "panic" option Allow to pass "panic" option in 32bit mode Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_32.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index bd04a28f7a5c..4437fe1edabc 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -42,6 +42,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE; * 0: the lapic NMI watchdog is disabled, but can be enabled */ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +static int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; @@ -140,6 +141,14 @@ static int __init setup_nmi_watchdog(char *str) { int nmi; + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + get_option(&str, &nmi); if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) @@ -374,7 +383,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * die_nmi will return ONLY if NOTIFY_STOP happens.. */ die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, 0); + regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); -- cgit v1.2.3 From 4b82b277707a39b97271439c475f186f63ec4692 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:35 +0400 Subject: x86: nmi_32.c - add nmi_watchdog_default helper Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_32.c | 19 +++++++++++++------ include/asm-x86/nmi.h | 4 +--- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 4437fe1edabc..7714e8478213 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -51,6 +51,17 @@ static DEFINE_PER_CPU(short, wd_enabled); static int endflag __initdata = 0; +/* Run after command line and cpu_init init, but before all other checks */ +void nmi_watchdog_default(void) +{ + if (nmi_watchdog != NMI_DEFAULT) + return; + if (lapic_watchdog_ok()) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; +} + #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -437,12 +448,8 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, return -EIO; } - if (nmi_watchdog == NMI_DEFAULT) { - if (lapic_watchdog_ok()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; - } + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 7cd5b6a88138..1e8f34d7ab68 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -38,11 +38,9 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev) #ifdef CONFIG_X86_64 extern void default_do_nmi(struct pt_regs *); -extern void nmi_watchdog_default(void); -#else -#define nmi_watchdog_default() do {} while (0) #endif +extern void nmi_watchdog_default(void); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; -- cgit v1.2.3 From ad63ba169d45ccb6594eb35a6c31c421fe70ff45 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:36 +0400 Subject: x86: nmi_32/64.c - use apic_write_around instead of apic_write apic_write_around will be expanded to apic_write in 64bit mode anyway. Only a few CPUs (well, old CPUs to be precise) requires such an action. In general it should not hurt and could be cleaned up for apic_write (just in case) Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_32.c | 2 +- arch/x86/kernel/nmi_64.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 7714e8478213..ec5842b4dd68 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -248,7 +248,7 @@ void acpi_nmi_enable(void) static void __acpi_nmi_disable(void *__unused) { - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } /* diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index c1ea671525e5..f546e3164a74 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -215,7 +215,7 @@ late_initcall(init_lapic_nmi_sysfs); static void __acpi_nmi_enable(void *__unused) { - apic_write(APIC_LVT0, APIC_DM_NMI); + apic_write_around(APIC_LVT0, APIC_DM_NMI); } /* @@ -229,7 +229,7 @@ void acpi_nmi_enable(void) static void __acpi_nmi_disable(void *__unused) { - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } /* -- cgit v1.2.3 From 6c8decdf14b17d42f6eb817d310642f4d6598a19 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:37 +0400 Subject: x86: nmi_32.c - unknown_nmi_panic_callback should always panic Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index ec5842b4dd68..d0b0684bde9e 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -425,7 +425,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) char buf[64]; sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 0); + die_nmi(buf, regs, 1); /* Always panic here */ return 0; } -- cgit v1.2.3 From 96f9dcb10755e96eae706b9e869c0dc25adb3d74 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:38 +0400 Subject: x86: nmi_64.c - use for_each_possible_cpu helper Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index f546e3164a74..d98b21cd4928 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -98,7 +98,7 @@ int __init check_nmi_watchdog(void) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); #endif - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_possible_cpu(cpu) prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks -- cgit v1.2.3 From 7c2ba83f9a479eee6f302147767a30f3187fbd4b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:39 +0400 Subject: x86: nmi_32.c cleanup - use for_each_online_cpu helper Since cpu_online_map is touched (by for_each_online_cpu) at moment when cpu_callin_map is already filled up we can get rid of its checking at all Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_32.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index d0b0684bde9e..c6d0777aaf90 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -108,13 +108,7 @@ int __init check_nmi_watchdog(void) local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks - for_each_possible_cpu(cpu) { -#ifdef CONFIG_SMP - /* Check cpu_callin_map here because that is set - after the timer is started. */ - if (!cpu_isset(cpu, cpu_callin_map)) - continue; -#endif + for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { -- cgit v1.2.3 From fd5cea02de100197a4c26d9e103508cf09b50a82 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:40 +0400 Subject: x86: nmi_32/64.c - add helper functions to hide arch specific data Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi_32.c | 44 ++++++++++++++++++++++++++++++++------------ arch/x86/kernel/nmi_64.c | 42 +++++++++++++++++++++++++++++++----------- 2 files changed, 63 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index c6d0777aaf90..a94dbedf2b74 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -51,6 +51,26 @@ static DEFINE_PER_CPU(short, wd_enabled); static int endflag __initdata = 0; +static inline unsigned int get_nmi_count(int cpu) +{ + return nmi_count(cpu); +} + +static inline int mce_in_progress(void) +{ + return 0; +} + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; +} + /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { @@ -104,19 +124,19 @@ int __init check_nmi_watchdog(void) #endif for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = nmi_count(cpu); + prev_nmi_count[cpu] = get_nmi_count(cpu); local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { printk(KERN_WARNING "WARNING: CPU#%d: NMI " "appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], - nmi_count(cpu)); + get_nmi_count(cpu)); per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } @@ -355,6 +375,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) touched = 1; } + sum = get_timer_irqs(cpu); + + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } + if (cpu_isset(cpu, backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ @@ -365,16 +392,9 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) cpu_clear(cpu, backtrace_mask); } - /* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ - sum = per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) touched = 1; - } /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && __get_cpu_var(last_irq_sum) == sum) { diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index d98b21cd4928..c6802447262c 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -47,6 +47,30 @@ static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); +static int endflag __initdata = 0; + +static inline unsigned int get_nmi_count(int cpu) +{ + return cpu_pda(cpu)->__nmi_count; +} + +static inline int mce_in_progress(void) +{ +#ifdef CONFIG_X86_MCE + return atomic_read(&mce_entry) > 0; +#endif + return 0; +} + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ + return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); +} + /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { @@ -55,8 +79,6 @@ void nmi_watchdog_default(void) nmi_watchdog = NMI_NONE; } -static int endflag __initdata = 0; - #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -99,19 +121,19 @@ int __init check_nmi_watchdog(void) #endif for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; + prev_nmi_count[cpu] = get_nmi_count(cpu); local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) { + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { printk(KERN_WARNING "WARNING: CPU#%d: NMI " "appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], - cpu_pda(cpu)->__nmi_count); + get_nmi_count(cpu)); per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } @@ -327,7 +349,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) touched = 1; } - sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs); + sum = get_timer_irqs(cpu); + if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -343,12 +366,9 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) cpu_clear(cpu, backtrace_mask); } -#ifdef CONFIG_X86_MCE - /* Could check oops_in_progress here too, but it's safer - not too */ - if (atomic_read(&mce_entry) > 0) + if (mce_in_progress()) touched = 1; -#endif + /* if the apic timer isn't firing, this cpu isn't doing much */ if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* -- cgit v1.2.3 From 1798bc22b2790bf2a956588e6b17c36ef79ceff7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 24 May 2008 19:36:41 +0400 Subject: x86: nmi_32/64.c - merge down nmi_32.c and nmi_64.c to nmi.c Signed-off-by: Cyrill Gorcunov Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/nmi.c | 533 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/nmi_32.c | 506 -------------------------------------------- arch/x86/kernel/nmi_64.c | 477 ------------------------------------------ 4 files changed, 534 insertions(+), 984 deletions(-) create mode 100644 arch/x86/kernel/nmi.c delete mode 100644 arch/x86/kernel/nmi_32.c delete mode 100644 arch/x86/kernel/nmi_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..d5dd6668225f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o +obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c new file mode 100644 index 000000000000..69a839fc1eb0 --- /dev/null +++ b/arch/x86/kernel/nmi.c @@ -0,0 +1,533 @@ +/* + * NMI watchdog support on APIC systems + * + * Started by Ingo Molnar + * + * Fixes: + * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. + * Mikael Pettersson : Power Management for local APIC NMI watchdog. + * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. Disable/enable API. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +int unknown_nmi_panic; +int nmi_watchdog_enabled; + +static cpumask_t backtrace_mask = CPU_MASK_NONE; + +/* nmi_active: + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled + */ +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +static int panic_on_timeout; + +unsigned int nmi_watchdog = NMI_DEFAULT; + +static unsigned int nmi_hz = HZ; +static DEFINE_PER_CPU(short, wd_enabled); +static int endflag __initdata = 0; + +static inline unsigned int get_nmi_count(int cpu) +{ +#ifdef CONFIG_X86_64 + return cpu_pda(cpu)->__nmi_count; +#else + return nmi_count(cpu); +#endif +} + +static inline int mce_in_progress(void) +{ +#if defined(CONFIX_X86_64) && defined(CONFIG_X86_MCE) + return atomic_read(&mce_entry) > 0; +#endif + return 0; +} + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ +#ifdef CONFIG_X86_64 + return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); +#else + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; +#endif +} + +/* Run after command line and cpu_init init, but before all other checks */ +void nmi_watchdog_default(void) +{ + if (nmi_watchdog != NMI_DEFAULT) + return; +#ifdef CONFIG_X86_64 + nmi_watchdog = NMI_NONE; +#else + if (lapic_watchdog_ok()) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; +#endif +} + +#ifdef CONFIG_SMP +/* + * The performance counters used by NMI_LOCAL_APIC don't trigger when + * the CPU is idle. To make sure the NMI watchdog really ticks on all + * CPUs during the test make them busy. + */ +static __init void nmi_cpu_busy(void *data) +{ + local_irq_enable_in_hardirq(); + /* + * Intentionally don't use cpu_relax here. This is + * to make sure that the performance counter really ticks, + * even if there is a simulator or similar that catches the + * pause instruction. On a real HT machine this is fine because + * all other CPUs are busy with "useless" delay loops and don't + * care if they get somewhat less cycles. + */ + while (endflag == 0) + mb(); +} +#endif + +int __init check_nmi_watchdog(void) +{ + unsigned int *prev_nmi_count; + int cpu; + + if (nmi_watchdog == NMI_NONE || nmi_watchdog == NMI_DISABLED) + return 0; + + if (!atomic_read(&nmi_active)) + return 0; + + prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!prev_nmi_count) + goto error; + + printk(KERN_INFO "Testing NMI watchdog ... "); + +#ifdef CONFIG_SMP + if (nmi_watchdog == NMI_LOCAL_APIC) + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); +#endif + + for_each_possible_cpu(cpu) + prev_nmi_count[cpu] = get_nmi_count(cpu); + local_irq_enable(); + mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ + + for_each_online_cpu(cpu) { + if (!per_cpu(wd_enabled, cpu)) + continue; + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { + printk(KERN_WARNING "WARNING: CPU#%d: NMI " + "appears to be stuck (%d->%d)!\n", + cpu, + prev_nmi_count[cpu], + get_nmi_count(cpu)); + per_cpu(wd_enabled, cpu) = 0; + atomic_dec(&nmi_active); + } + } + endflag = 1; + if (!atomic_read(&nmi_active)) { + kfree(prev_nmi_count); + atomic_set(&nmi_active, -1); + goto error; + } + printk("OK.\n"); + + /* + * now that we know it works we can reduce NMI frequency to + * something more reasonable; makes a difference in some configs + */ + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); + + kfree(prev_nmi_count); + return 0; + +error: +#ifdef CONFIG_X86_32 + timer_ack = !cpu_has_tsc; +#endif + return -1; +} + +static int __init setup_nmi_watchdog(char *str) +{ + int nmi; + + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + + get_option(&str, &nmi); + + if (nmi >= NMI_INVALID || nmi < NMI_NONE) + return 0; + + nmi_watchdog = nmi; + return 1; +} +__setup("nmi_watchdog=", setup_nmi_watchdog); + +/* + * Suspend/resume support + */ +#ifdef CONFIG_PM + +static int nmi_pm_active; /* nmi_active before suspend */ + +static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) +{ + /* only CPU0 goes here, other CPUs should be offline */ + nmi_pm_active = atomic_read(&nmi_active); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); + return 0; +} + +static int lapic_nmi_resume(struct sys_device *dev) +{ + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } + return 0; +} + +static struct sysdev_class nmi_sysclass = { + .name = "lapic_nmi", + .resume = lapic_nmi_resume, + .suspend = lapic_nmi_suspend, +}; + +static struct sys_device device_lapic_nmi = { + .id = 0, + .cls = &nmi_sysclass, +}; + +static int __init init_lapic_nmi_sysfs(void) +{ + int error; + + /* + * should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if (atomic_read(&nmi_active) < 0) + return 0; + + error = sysdev_class_register(&nmi_sysclass); + if (!error) + error = sysdev_register(&device_lapic_nmi); + return error; +} + +/* must come after the local APIC's device_initcall() */ +late_initcall(init_lapic_nmi_sysfs); + +#endif /* CONFIG_PM */ + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + +static void __acpi_nmi_disable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +void setup_apic_nmi_watchdog(void *unused) +{ + if (__get_cpu_var(wd_enabled)) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) + return; + + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + /* enable it before to avoid race with handler */ + __get_cpu_var(wd_enabled) = 1; + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; + return; + } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); + } +} + +void stop_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if (nmi_watchdog != NMI_LOCAL_APIC && + nmi_watchdog != NMI_IO_APIC) + return; + if (__get_cpu_var(wd_enabled) == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; + atomic_dec(&nmi_active); +} + +/* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are generated on every CPU, we only + * have to check the current processor. + * + * since NMIs don't listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up any console locks first ... + * [when there will be more tty-related locks, break them up here too!] + */ + +static DEFINE_PER_CPU(unsigned, last_irq_sum); +static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(int, nmi_touch); + +void touch_nmi_watchdog(void) +{ + if (nmi_watchdog > 0) { + unsigned cpu; + + /* + * Tell other CPUs to reset their alert counters. We cannot + * do it ourselves because the alert count increase is not + * atomic. + */ + for_each_present_cpu(cpu) { + if (per_cpu(nmi_touch, cpu) != 1) + per_cpu(nmi_touch, cpu) = 1; + } + } + + /* + * Tickle the softlockup detector too: + */ + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +notrace __kprobes int +nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) +{ + /* + * Since current_thread_info()-> is always on the stack, and we + * always switch the stack NMI-atomically, it's safe to use + * smp_processor_id(). + */ + unsigned int sum; + int touched = 0; + int cpu = smp_processor_id(); + int rc = 0; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + rc = 1; + touched = 1; + } + + sum = get_timer_irqs(cpu); + + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } + + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk("NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + touched = 1; + + /* if the none of the timers isn't firing, this cpu isn't doing much */ + if (!touched && __get_cpu_var(last_irq_sum) == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + local_inc(&__get_cpu_var(alert_counter)); + if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) + /* + * die_nmi will return ONLY if NOTIFY_STOP happens.. + */ + die_nmi("BUG: NMI Watchdog detected LOCKUP", + regs, panic_on_timeout); + } else { + __get_cpu_var(last_irq_sum) = sum; + local_set(&__get_cpu_var(alert_counter), 0); + } + + /* see if the nmi watchdog went off */ + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* + * don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; + } + return rc; +} + +#ifdef CONFIG_SYSCTL + +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +{ + unsigned char reason = get_nmi_reason(); + char buf[64]; + + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf, regs, 1); /* Always panic here */ + return 0; +} + +/* + * proc handler for /proc/sys/kernel/nmi + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { + printk(KERN_WARNING + "NMI watchdog is permanently disabled\n"); + return -EIO; + } + + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); + + if (nmi_watchdog == NMI_LOCAL_APIC) { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else { + printk(KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + +#endif /* CONFIG_SYSCTL */ + +int do_nmi_callback(struct pt_regs *regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; +} + +void __trigger_all_cpu_backtrace(void) +{ + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } +} + +EXPORT_SYMBOL(nmi_active); +EXPORT_SYMBOL(nmi_watchdog); + diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c deleted file mode 100644 index a94dbedf2b74..000000000000 --- a/arch/x86/kernel/nmi_32.c +++ /dev/null @@ -1,506 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mach_traps.h" - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -static cpumask_t backtrace_mask = CPU_MASK_NONE; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -static int panic_on_timeout; - -unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; - -static DEFINE_PER_CPU(short, wd_enabled); - -static int endflag __initdata = 0; - -static inline unsigned int get_nmi_count(int cpu) -{ - return nmi_count(cpu); -} - -static inline int mce_in_progress(void) -{ - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; -} - -/* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) -{ - if (nmi_watchdog != NMI_DEFAULT) - return; - if (lapic_watchdog_ok()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; -} - -#ifdef CONFIG_SMP -/* The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* Intentionally don't use cpu_relax here. This is - to make sure that the performance counter really ticks, - even if there is a simulator or similar that catches the - pause instruction. On a real HT machine this is fine because - all other CPUs are busy with "useless" delay loops and don't - care if they get somewhat less cycles. */ - while (endflag == 0) - mb(); -} -#endif - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) - return 0; - - if (!atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - goto error; - - printk(KERN_INFO "Testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - printk(KERN_WARNING "WARNING: CPU#%d: NMI " - "appears to be stuck (%d->%d)!\n", - cpu, - prev_nmi_count[cpu], - get_nmi_count(cpu)); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - goto error; - } - printk("OK.\n"); - - /* now that we know it works we can reduce NMI frequency to - something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -error: - timer_ack = !cpu_has_tsc; - - return -1; -} - -static int __init setup_nmi_watchdog(char *str) -{ - int nmi; - - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - get_option(&str, &nmi); - - if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) - return 0; - - nmi_watchdog = nmi; - return 1; -} - -__setup("nmi_watchdog=", setup_nmi_watchdog); - - -/* Suspend/resume support */ - -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */ - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - * - * since NMIs don't listen to _any_ locks, we have to be extremely - * careful not to rely on unsafe variables. The printk might lock - * up though, so we have to break up any console locks first ... - * [when there will be more tty-related locks, break them up - * here too!] - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog > 0) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - /* - * Tickle the softlockup detector too: - */ - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - - /* - * Since current_thread_info()-> is always on the stack, and we - * always switch the stack NMI-atomically, it's safe to use - * smp_processor_id(). - */ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - if (cpu_isset(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); - spin_unlock(&lock); - cpu_clear(cpu, backtrace_mask); - } - - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - touched = 1; - - /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); - } - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { - printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - /* if nmi_watchdog is not set yet, then set it */ - nmi_watchdog_default(); - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else { - printk( KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void __trigger_all_cpu_backtrace(void) -{ - int i; - - backtrace_mask = cpu_online_map; - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpus_empty(backtrace_mask)) - break; - mdelay(1); - } -} - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c deleted file mode 100644 index c6802447262c..000000000000 --- a/arch/x86/kernel/nmi_64.c +++ /dev/null @@ -1,477 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -static cpumask_t backtrace_mask = CPU_MASK_NONE; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -static int panic_on_timeout; - -unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; - -static DEFINE_PER_CPU(short, wd_enabled); - -static int endflag __initdata = 0; - -static inline unsigned int get_nmi_count(int cpu) -{ - return cpu_pda(cpu)->__nmi_count; -} - -static inline int mce_in_progress(void) -{ -#ifdef CONFIG_X86_MCE - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); -} - -/* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) -{ - if (nmi_watchdog != NMI_DEFAULT) - return; - nmi_watchdog = NMI_NONE; -} - -#ifdef CONFIG_SMP -/* The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* Intentionally don't use cpu_relax here. This is - to make sure that the performance counter really ticks, - even if there is a simulator or similar that catches the - pause instruction. On a real HT machine this is fine because - all other CPUs are busy with "useless" delay loops and don't - care if they get somewhat less cycles. */ - while (endflag == 0) - mb(); -} -#endif - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) - return 0; - - if (!atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - return -1; - - printk(KERN_INFO "Testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - printk(KERN_WARNING "WARNING: CPU#%d: NMI " - "appears to be stuck (%d->%d)!\n", - cpu, - prev_nmi_count[cpu], - get_nmi_count(cpu)); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - return -1; - } - printk("OK.\n"); - - /* now that we know it works we can reduce NMI frequency to - something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -} - -static int __init setup_nmi_watchdog(char *str) -{ - int nmi; - - if (!strncmp(str,"panic",5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - get_option(&str, &nmi); - - if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) - return 0; - - nmi_watchdog = nmi; - return 1; -} - -__setup("nmi_watchdog=", setup_nmi_watchdog); - -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - __get_cpu_var(wd_enabled) = 1; - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog > 0) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - if (cpu_isset(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); - spin_unlock(&lock); - cpu_clear(cpu, backtrace_mask); - } - - if (mce_in_progress()) - touched = 1; - - /* if the apic timer isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - die_nmi("NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); - } - - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { - printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - /* if nmi_watchdog is not set yet, then set it */ - nmi_watchdog_default(); - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else { - printk( KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void __trigger_all_cpu_backtrace(void) -{ - int i; - - backtrace_mask = cpu_online_map; - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpus_empty(backtrace_mask)) - break; - mdelay(1); - } -} - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); -- cgit v1.2.3 From a15af1c9ea2750a9ff01e51615c45950bad8221b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:06 +0100 Subject: x86/paravirt: add pte_flags to just get pte flags Add pte_flags() to extract the flags from a pte. This is a special case of pte_val() which is only guaranteed to return the pte's flags correctly; the page number may be corrupted or missing. The intent is to allow paravirt implementations to return pte flags without having to do any translation of the page number (most notably, Xen). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 1 + arch/x86/xen/enlighten.c | 1 + drivers/lguest/lg.h | 1 - include/asm-x86/page.h | 1 + include/asm-x86/paravirt.h | 15 +++++++++++++++ include/asm-x86/pgtable.h | 16 ++++++++-------- 6 files changed, 26 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 74f0c5ea2a03..c98d54688180 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -403,6 +403,7 @@ struct pv_mmu_ops pv_mmu_ops = { #endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, + .pte_flags = native_pte_val, .pgd_val = native_pgd_val, .make_pte = native_make_pte, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 4a372b71239d..1b4b5fa498b3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1101,6 +1101,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_pmd = xen_set_pmd, .pte_val = xen_pte_val, + .pte_flags = native_pte_val, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 005bd045d2eb..5faefeaf6790 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -136,7 +136,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); * first step in the migration to the kernel types. pte_pfn is already defined * in the kernel. */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) -#define pte_flags(x) (pte_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h index dc936dddf161..a1e2b9470f25 100644 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@ -160,6 +160,7 @@ static inline pteval_t native_pte_val(pte_t pte) #endif #define pte_val(x) native_pte_val(x) +#define pte_flags(x) native_pte_val(x) #define __pte(x) native_make_pte(x) #endif /* CONFIG_PARAVIRT */ diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 0f13b945e240..5ea37a48eecb 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -239,6 +239,7 @@ struct pv_mmu_ops { unsigned long addr, pte_t *ptep); pteval_t (*pte_val)(pte_t); + pteval_t (*pte_flags)(pte_t); pte_t (*make_pte)(pteval_t pte); pgdval_t (*pgd_val)(pgd_t); @@ -996,6 +997,20 @@ static inline pteval_t pte_val(pte_t pte) return ret; } +static inline pteval_t pte_flags(pte_t pte) +{ + pteval_t ret; + + if (sizeof(pteval_t) > sizeof(long)) + ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags, + pte.pte, (u64)pte.pte >> 32); + else + ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags, + pte.pte); + + return ret; +} + static inline pgd_t __pgd(pgdval_t val) { pgdval_t ret; diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 97c271b2910b..47a852cb8c92 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -164,37 +164,37 @@ extern struct list_head pgd_list; */ static inline int pte_dirty(pte_t pte) { - return pte_val(pte) & _PAGE_DIRTY; + return pte_flags(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { - return pte_val(pte) & _PAGE_ACCESSED; + return pte_flags(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { - return pte_val(pte) & _PAGE_RW; + return pte_flags(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { - return pte_val(pte) & _PAGE_FILE; + return pte_flags(pte) & _PAGE_FILE; } static inline int pte_huge(pte_t pte) { - return pte_val(pte) & _PAGE_PSE; + return pte_flags(pte) & _PAGE_PSE; } static inline int pte_global(pte_t pte) { - return pte_val(pte) & _PAGE_GLOBAL; + return pte_flags(pte) & _PAGE_GLOBAL; } static inline int pte_exec(pte_t pte) { - return !(pte_val(pte) & _PAGE_NX); + return !(pte_flags(pte) & _PAGE_NX); } static inline int pte_special(pte_t pte) @@ -305,7 +305,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) return __pgprot(preservebits | addbits); } -#define pte_pgprot(x) __pgprot(pte_val(x) & ~PTE_MASK) +#define pte_pgprot(x) __pgprot(pte_flags(x) & ~PTE_MASK) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) -- cgit v1.2.3 From 3945e2c9abf8e00c2edc4aa29215ddfad1cd8cf7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 25 May 2008 10:00:09 -0700 Subject: x86: extend e820 ealy_res support 32bit - fix #2 remove extra -1 in reseve_early calling panic if can not find space for new RAMDISK Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/kernel/setup_32.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index c04e8c91daf7..08b47a2f7af0 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -479,7 +479,7 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= end_of_lowmem/2) { - free_early(ramdisk_image, ramdisk_image + ramdisk_size - 1); + free_early(ramdisk_image, ramdisk_end); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -501,9 +501,13 @@ static void __init reserve_initrd(void) end_of_lowmem, ramdisk_size, PAGE_SIZE); + if (ramdisk_here == -1ULL) + panic("Cannot find place for new RAMDISK of size %lld\n", + ramdisk_size); + /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_early(ramdisk_here, ramdisk_here + ramdisk_size - 1, + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -579,15 +583,15 @@ void __init setup_bootmem_allocator(void) PAGE_SIZE); if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); - reserve_early(bootmap, bootmap + bootmap_size - 1, "BOOTMAP"); + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); #ifdef CONFIG_BLK_DEV_INITRD reserve_initrd(); #endif bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); printk(KERN_INFO " low ram: %08lx - %08lx\n", min_low_pfn< Date: Tue, 27 May 2008 18:03:56 +0200 Subject: x86: aperture_64.c: cleanups Some small cleanups for aperture_64.c; they should not really change any code. Signed-off-by: Pavel Machek Cc: Dave Jones Cc: Andi Kleen Signed-off-by: Thomas Gleixner --- arch/x86/kernel/aperture_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 5373f7834d8a..6eea42eb287b 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -111,7 +111,7 @@ static u32 __init allocate_aperture(void) /* Find a PCI capability */ -static __u32 __init find_cap(int bus, int slot, int func, int cap) +static u32 __init find_cap(int bus, int slot, int func, int cap) { int bytes; u8 pos; @@ -137,7 +137,7 @@ static __u32 __init find_cap(int bus, int slot, int func, int cap) } /* Read a standard AGPv3 bridge header */ -static __u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) +static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) { u32 apsize; u32 apsizereg; @@ -202,7 +202,7 @@ static __u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * the AGP bridges should be always an own bus on the HT hierarchy, * but do it here for future safety. */ -static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) +static u32 __init search_agp_bridge(u32 *order, int *valid_agp) { int bus, slot, func; -- cgit v1.2.3 From 19ec673ced067316b9732bc6d1c4ff4052e5f795 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 28 May 2008 23:00:47 +0400 Subject: x86: nmi - fix incorrect NMI watchdog used by default The commit commit 4b82b277707a39b97271439c475f186f63ec4692 Author: Cyrill Gorcunov Date: Sat May 24 19:36:35 2008 +0400 set nmi_watchdog to NMI_IO_APIC as by default. This causes hangs on some machines with buggy watchdogs. Fix it - i.e. restore old behaviour. Thanks to Sitsofe Wheeler and Adrian Bunk for catching the problem and Maciej W. Rozycki for explanation what is going on there. Signed-off-by: Cyrill Gorcunov CC: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 16 +++++++++------- include/asm-x86/nmi.h | 4 +++- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 69a839fc1eb0..3671a9f3564b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -84,20 +84,15 @@ static inline unsigned int get_timer_irqs(int cpu) #endif } +#ifdef CONFIG_X86_64 /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; -#ifdef CONFIG_X86_64 nmi_watchdog = NMI_NONE; -#else - if (lapic_watchdog_ok()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; -#endif } +#endif #ifdef CONFIG_SMP /* @@ -488,8 +483,15 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, return -EIO; } +#ifdef CONFIG_X86_64 /* if nmi_watchdog is not set yet, then set it */ nmi_watchdog_default(); +#else + if (lapic_watchdog_ok()) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; +#endif if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 1e8f34d7ab68..6f4d44fc051f 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -38,9 +38,11 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev) #ifdef CONFIG_X86_64 extern void default_do_nmi(struct pt_regs *); +extern void nmi_watchdog_default(void); +#else +#define nmi_watchdog_default(void) do {} while (0) #endif -extern void nmi_watchdog_default(void); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; -- cgit v1.2.3 From d364319b989967b74e57fef5c8017fd56a16c392 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 May 2008 23:42:01 +0200 Subject: x86: move mmconfig declarations to header arch/x86/kernel/mmconf-fam10h_64.c is missing the prototypes, which are decalred in arch/x86/kernel/setup_64.c. Move the prototypes and the inline stubs to the appropriate header file. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/mmconf-fam10h_64.c | 1 + arch/x86/kernel/setup_64.c | 13 +------------ include/asm-x86/mmconfig.h | 12 ++++++++++++ 3 files changed, 14 insertions(+), 12 deletions(-) create mode 100644 include/asm-x86/mmconfig.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index edc5fbfe85c0..fdfdc550b366 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "../pci/pci.h" diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff1286ad8a..341230db74e1 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #ifdef CONFIG_PARAVIRT @@ -293,18 +294,6 @@ static void __init parse_setup_data(void) } } -#ifdef CONFIG_PCI_MMCONFIG -extern void __cpuinit fam10h_check_enable_mmcfg(void); -extern void __init check_enable_amd_mmconf_dmi(void); -#else -void __cpuinit fam10h_check_enable_mmcfg(void) -{ -} -void __init check_enable_amd_mmconf_dmi(void) -{ -} -#endif - /* * setup_arch - architecture-specific boot-time initializations * diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h new file mode 100644 index 000000000000..95beda07c6fa --- /dev/null +++ b/include/asm-x86/mmconfig.h @@ -0,0 +1,12 @@ +#ifndef _ASM_MMCONFIG_H +#define _ASM_MMCONFIG_H + +#ifdef CONFIG_PCI_MMCONFIG +extern void __cpuinit fam10h_check_enable_mmcfg(void); +extern void __init check_enable_amd_mmconf_dmi(void); +#else +static inline void fam10h_check_enable_mmcfg(void) { } +static inline void check_enable_amd_mmconf_dmi(void) { } +#endif + +#endif -- cgit v1.2.3 From 4d285878564bb46cf64e54be18eeffe33ca583a0 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 22 May 2008 18:48:32 -0400 Subject: x86: Move the AMD64 specific parts out of setup_64.c Create a separate amd_64.c file in the cpu/ dir for the useful parts to live in. Signed-off-by: Dave Jones Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/amd_64.c | 235 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_64.c | 232 ++---------------------------------------- 3 files changed, 242 insertions(+), 226 deletions(-) create mode 100644 arch/x86/kernel/cpu/amd_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a0c6f8190887..ef065c1a2e1a 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -7,6 +7,7 @@ obj-y += proc.o feature_names.o obj-$(CONFIG_X86_32) += common.o bugs.o obj-$(CONFIG_X86_32) += amd.o +obj-$(CONFIG_X86_64) += amd_64.o obj-$(CONFIG_X86_32) += cyrix.o obj-$(CONFIG_X86_32) += centaur.o obj-$(CONFIG_X86_32) += transmeta.o diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c new file mode 100644 index 000000000000..1746f6f95726 --- /dev/null +++ b/arch/x86/kernel/cpu/amd_64.c @@ -0,0 +1,235 @@ +#include +#include + +#include +#include +#include + +#include + +extern int __cpuinit get_model_name(struct cpuinfo_x86 *c); +extern void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c); + +int force_mwait __cpuinitdata; + +#ifdef CONFIG_NUMA +static int __cpuinit nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +/* + * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * Assumes number of cores is a power of two. + */ +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits; +#ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node = 0; + unsigned apicid = hard_smp_processor_id(); +#endif + bits = c->x86_coreid_bits; + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; + +#ifdef CONFIG_NUMA + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + + int ht_nodeid = c->initial_apicid; + + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +#endif +} + +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; + +#endif +} + +#define ENABLE_C1E_MASK 0x18000000 +#define CPUID_PROCESSOR_SIGNATURE 1 +#define CPUID_XFAM 0x0ff00000 +#define CPUID_XFAM_K8 0x00000000 +#define CPUID_XFAM_10H 0x00100000 +#define CPUID_XFAM_11H 0x00200000 +#define CPUID_XMOD 0x000f0000 +#define CPUID_XMOD_REV_F 0x00040000 + +/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ +static __cpuinit int amd_apic_timer_broken(void) +{ + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + + switch (eax & CPUID_XFAM) { + case CPUID_XFAM_K8: + if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) + break; + case CPUID_XFAM_10H: + case CPUID_XFAM_11H: + rdmsr(MSR_K8_ENABLE_C1E, lo, hi); + if (lo & ENABLE_C1E_MASK) + return 1; + break; + default: + /* err on the side of caution */ + return 1; + } + return 0; +} + +void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd_mc(c); + + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ + if (c->x86_power & (1<<8)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + +void __cpuinit init_amd(struct cpuinfo_x86 *c) +{ + unsigned level; + +#ifdef CONFIG_SMP + unsigned long value; + + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + if (c->x86 == 15) { + rdmsrl(MSR_K8_HWCR, value); + value |= 1 << 6; + wrmsrl(MSR_K8_HWCR, value); + } +#endif + + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ + clear_cpu_cap(c, 0*32+31); + + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || + level >= 0x0f58)) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + if (c->x86 == 0x10 || c->x86 == 0x11) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* Enable workaround for FXSAVE leak */ + if (c->x86 >= 6) + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); + + level = get_model_name(c); + if (!level) { + switch (c->x86) { + case 15: + /* Should distinguish Models here, but this is only + a fallback anyways. */ + strcpy(c->x86_model_id, "Hammer"); + break; + } + } + display_cacheinfo(c); + + /* Multi core CPU? */ + if (c->extended_cpuid_level >= 0x80000008) + amd_detect_cmp(c); + + if (c->extended_cpuid_level >= 0x80000006 && + (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + + if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) + set_cpu_cap(c, X86_FEATURE_K8); + + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + + if (c->x86 == 0x10) + fam10h_check_enable_mmcfg(); + + if (amd_apic_timer_broken()) + disable_apic_timer = 1; + + if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && + (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) + set_memory_4k((unsigned long)__va(tseg), 1); + } +} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 341230db74e1..b07b1997ed97 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -96,8 +96,6 @@ int bootloader_type; unsigned long saved_video_mode; -int force_mwait __cpuinitdata; - /* * Early DMI memory */ @@ -526,7 +524,7 @@ void __init setup_arch(char **cmdline_p) check_enable_amd_mmconf_dmi(); } -static int __cpuinit get_model_name(struct cpuinfo_x86 *c) +int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -542,7 +540,7 @@ static int __cpuinit get_model_name(struct cpuinfo_x86 *c) } -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, eax, ebx, ecx, edx; @@ -574,228 +572,6 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) } } -#ifdef CONFIG_NUMA -static int __cpuinit nearby_node(int apicid) -{ - int i, node; - - for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - return first_node(node_online_map); /* Shouldn't happen */ -} -#endif - -/* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. - * Assumes number of cores is a power of two. - */ -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits; -#ifdef CONFIG_NUMA - int cpu = smp_processor_id(); - int node = 0; - unsigned apicid = hard_smp_processor_id(); -#endif - bits = c->x86_coreid_bits; - - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; - -#ifdef CONFIG_NUMA - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - - int ht_nodeid = c->initial_apicid; - - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -#endif -} - -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; - -#endif -} - -#define ENABLE_C1E_MASK 0x18000000 -#define CPUID_PROCESSOR_SIGNATURE 1 -#define CPUID_XFAM 0x0ff00000 -#define CPUID_XFAM_K8 0x00000000 -#define CPUID_XFAM_10H 0x00100000 -#define CPUID_XFAM_11H 0x00200000 -#define CPUID_XMOD 0x000f0000 -#define CPUID_XMOD_REV_F 0x00040000 - -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ -static __cpuinit int amd_apic_timer_broken(void) -{ - u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - - switch (eax & CPUID_XFAM) { - case CPUID_XFAM_K8: - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) - break; - case CPUID_XFAM_10H: - case CPUID_XFAM_11H: - rdmsr(MSR_K8_ENABLE_C1E, lo, hi); - if (lo & ENABLE_C1E_MASK) - return 1; - break; - default: - /* err on the side of caution */ - return 1; - } - return 0; -} - -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) -{ - early_init_amd_mc(c); - - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); -} - -static void __cpuinit init_amd(struct cpuinfo_x86 *c) -{ - unsigned level; - -#ifdef CONFIG_SMP - unsigned long value; - - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 15) { - rdmsrl(MSR_K8_HWCR, value); - value |= 1 << 6; - wrmsrl(MSR_K8_HWCR, value); - } -#endif - - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_cpu_cap(c, 0*32+31); - - /* On C+ stepping K8 rep microcode works well for copy/memset */ - level = cpuid_eax(1); - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || - level >= 0x0f58)) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - if (c->x86 == 0x10 || c->x86 == 0x11) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - - level = get_model_name(c); - if (!level) { - switch (c->x86) { - case 15: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } - display_cacheinfo(c); - - /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) - amd_detect_cmp(c); - - if (c->extended_cpuid_level >= 0x80000006 && - (cpuid_edx(0x80000006) & 0xf000)) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) - set_cpu_cap(c, X86_FEATURE_K8); - - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - - if (c->x86 == 0x10) - fam10h_check_enable_mmcfg(); - - if (amd_apic_timer_broken()) - disable_apic_timer = 1; - - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) - set_memory_4k((unsigned long)__va(tseg), 1); - } -} - void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP @@ -977,6 +753,10 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) c->x86_vendor = X86_VENDOR_UNKNOWN; } +// FIXME: Needs to use cpu_vendor_dev_register +extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c); +extern void __cpuinit init_amd(struct cpuinfo_x86 *c); + /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu below. */ -- cgit v1.2.3 From a82fbe31cb387bb246e2d3b3c177f551bb991135 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 22 May 2008 18:54:32 -0400 Subject: x86: Move the 64-bit Intel specific parts out of setup_64.c Create a separate intel_64.c file in the cpu/ dir for the useful parts to live in. Signed-off-by: Dave Jones Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/intel_64.c | 97 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_64.c | 93 +--------------------------------------- 3 files changed, 100 insertions(+), 91 deletions(-) create mode 100644 arch/x86/kernel/cpu/intel_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index ef065c1a2e1a..b7a11924fed7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_X86_32) += cyrix.o obj-$(CONFIG_X86_32) += centaur.o obj-$(CONFIG_X86_32) += transmeta.o obj-$(CONFIG_X86_32) += intel.o +obj-$(CONFIG_X86_64) += intel_64.o obj-$(CONFIG_X86_32) += umc.o obj-$(CONFIG_X86_MCE) += mcheck/ diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c new file mode 100644 index 000000000000..e5f929f6c3d4 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_64.c @@ -0,0 +1,97 @@ +#include +#include +#include +#include +#include +#include + +void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +{ + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + +/* + * find out the number of processor cores on the die + */ +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) +{ + unsigned int eax, t; + + if (c->cpuid_level < 4) + return 1; + + cpuid_count(4, 0, &eax, &t, &t, &t); + + if (eax & 0x1f) + return ((eax >> 26) + 1); + else + return 1; +} + +static void __cpuinit srat_detect_node(void) +{ +#ifdef CONFIG_NUMA + unsigned node; + int cpu = smp_processor_id(); + int apicid = hard_smp_processor_id(); + + /* Don't do the funky fallback heuristics the AMD version employs + for now. */ + node = apicid_to_node[apicid]; + if (node == NUMA_NO_NODE || !node_online(node)) + node = first_node(node_online_map); + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +} + +void __cpuinit init_intel(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned n; + + init_intel_cacheinfo(c); + if (c->cpuid_level > 9) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (cpu_has_ds) { + unsigned int l1, l2; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_cpu_cap(c, X86_FEATURE_BTS); + if (!(l1 & (1<<12))) + set_cpu_cap(c, X86_FEATURE_PEBS); + } + + + if (cpu_has_bts) + ds_init_intel(c); + + n = c->extended_cpuid_level; + if (n >= 0x80000008) { + unsigned eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + /* CPUID workaround for Intel 0F34 CPU */ + if (c->x86_vendor == X86_VENDOR_INTEL && + c->x86 == 0xF && c->x86_model == 0x3 && + c->x86_mask == 0x4) + c->x86_phys_bits = 36; + } + + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + c->x86_max_cores = intel_num_cpu_cores(c); + + srat_detect_node(); +} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index b07b1997ed97..c4e6a0b6c303 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -622,97 +622,6 @@ out: #endif } -/* - * find out the number of processor cores on the die - */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, t; - - if (c->cpuid_level < 4) - return 1; - - cpuid_count(4, 0, &eax, &t, &t, &t); - - if (eax & 0x1f) - return ((eax >> 26) + 1); - else - return 1; -} - -static void __cpuinit srat_detect_node(void) -{ -#ifdef CONFIG_NUMA - unsigned node; - int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); - - /* Don't do the funky fallback heuristics the AMD version employs - for now. */ - node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE || !node_online(node)) - node = first_node(node_online_map); - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -} - -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) -{ - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); -} - -static void __cpuinit init_intel(struct cpuinfo_x86 *c) -{ - /* Cache sizes */ - unsigned n; - - init_intel_cacheinfo(c); - if (c->cpuid_level > 9) { - unsigned eax = cpuid_eax(10); - /* Check for version and the number of counters */ - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - } - - if (cpu_has_ds) { - unsigned int l1, l2; - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) - set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) - set_cpu_cap(c, X86_FEATURE_PEBS); - } - - - if (cpu_has_bts) - ds_init_intel(c); - - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - /* CPUID workaround for Intel 0F34 CPU */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 0xF && c->x86_model == 0x3 && - c->x86_mask == 0x4) - c->x86_phys_bits = 36; - } - - if (c->x86 == 15) - c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - c->x86_max_cores = intel_num_cpu_cores(c); - - srat_detect_node(); -} - static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) { if (c->x86 == 0x6 && c->x86_model >= 0xf) @@ -756,6 +665,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) // FIXME: Needs to use cpu_vendor_dev_register extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c); extern void __cpuinit init_amd(struct cpuinfo_x86 *c); +extern void __cpuinit early_init_intel(struct cpuinfo_x86 *c); +extern void __cpuinit init_intel(struct cpuinfo_x86 *c); /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu -- cgit v1.2.3 From 7e2191127eb414d7d5a11df6552ab6e3845d17a1 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 22 May 2008 18:55:06 -0400 Subject: x86: Remove workaround for prescott (32bit P4) from 64-bit code. Signed-off-by: Dave Jones Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_64.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index e5f929f6c3d4..b33912199484 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -79,11 +79,6 @@ void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned eax = cpuid_eax(0x80000008); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; - /* CPUID workaround for Intel 0F34 CPU */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 0xF && c->x86_model == 0x3 && - c->x86_mask == 0x4) - c->x86_phys_bits = 36; } if (c->x86 == 15) -- cgit v1.2.3 From 30a713180b3d08fdec5ca572e5a1cd35253c5d8e Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 22 May 2008 18:57:25 -0400 Subject: x86: Move the 64-bit Centaur specific parts out of setup_64.c Create a separate centaur_64.c file in the cpu/ dir for the useful parts to live in. Signed-off-by: Dave Jones Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/centaur_64.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kernel/setup_64.c | 28 ++-------------------------- 3 files changed, 34 insertions(+), 26 deletions(-) create mode 100644 arch/x86/kernel/cpu/centaur_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index b7a11924fed7..c77a1c50d944 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_X86_32) += amd.o obj-$(CONFIG_X86_64) += amd_64.o obj-$(CONFIG_X86_32) += cyrix.o obj-$(CONFIG_X86_32) += centaur.o +obj-$(CONFIG_X86_64) += centaur_64.o obj-$(CONFIG_X86_32) += transmeta.o obj-$(CONFIG_X86_32) += intel.o obj-$(CONFIG_X86_64) += intel_64.o diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c new file mode 100644 index 000000000000..bac96d187d05 --- /dev/null +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -0,0 +1,31 @@ +#include +#include + +#include +#include + +void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + if (c->x86 == 0x6 && c->x86_model >= 0xf) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + +void __cpuinit init_centaur(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned n; + + n = c->extended_cpuid_level; + if (n >= 0x80000008) { + unsigned eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } + + if (c->x86 == 0x6 && c->x86_model >= 0xf) { + c->x86_cache_alignment = c->x86_clflush_size * 2; + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index c4e6a0b6c303..25afdd80a3cf 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -622,32 +622,6 @@ out: #endif } -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) -{ - if (c->x86 == 0x6 && c->x86_model >= 0xf) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); -} - -static void __cpuinit init_centaur(struct cpuinfo_x86 *c) -{ - /* Cache sizes */ - unsigned n; - - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - - if (c->x86 == 0x6 && c->x86_model >= 0xf) { - c->x86_cache_alignment = c->x86_clflush_size * 2; - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - } - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); -} - static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -667,6 +641,8 @@ extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c); extern void __cpuinit init_amd(struct cpuinfo_x86 *c); extern void __cpuinit early_init_intel(struct cpuinfo_x86 *c); extern void __cpuinit init_intel(struct cpuinfo_x86 *c); +extern void __cpuinit early_init_centaur(struct cpuinfo_x86 *c); +extern void __cpuinit init_centaur(struct cpuinfo_x86 *c); /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu -- cgit v1.2.3 From 1c47cd638e8302bc38be1f6d81067950e038ebd3 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 30 May 2008 15:42:45 -0700 Subject: x86: fix overlong line in arch/x86/kernel/cpu/amd_64.c Clean up an overlong line in arch/x86/kernel/cpu/amd_64.c. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 1746f6f95726..c815c2c0484b 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -229,7 +229,8 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) + (tseg >> PMD_SHIFT) < + (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) set_memory_4k((unsigned long)__va(tseg), 1); } } -- cgit v1.2.3 From f0d43100f13be0fa5bf52741d7084bb27f00e621 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 29 May 2008 12:56:36 -0700 Subject: x86: extend e820 early_res support 32bit -fix #3 introduce init_pg_table_start, so xen PV could specify the value. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/head32.c | 3 ++- arch/x86/kernel/head_32.S | 2 ++ arch/x86/kernel/setup_32.c | 7 +++++++ arch/x86/lguest/boot.c | 5 +++-- arch/x86/xen/enlighten.c | 3 ++- include/asm-x86/setup.h | 4 +++- 6 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c216d3c2a991..0b6cb9fba71e 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -76,7 +76,8 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_early(__pa_symbol(&_end), init_pg_tables_end, "INIT_PG_TABLE"); + reserve_early(init_pg_tables_start, init_pg_tables_end, + "INIT_PG_TABLE"); reserve_ebda_region(); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b2cc73768a9d..bef4618feadb 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -194,6 +194,7 @@ default_entry: xorl %ebx,%ebx /* %ebx is kept at zero */ movl $pa(pg0), %edi + movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_pmd), %edx movl $PTE_ATTR, %eax 10: @@ -228,6 +229,7 @@ default_entry: page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(pg0), %edi + movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_dir), %edx movl $PTE_ATTR, %eax 10: diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 08b47a2f7af0..de9c5ee77d07 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -71,6 +71,7 @@ /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* address, and must not be in the .bss segment! */ +unsigned long init_pg_tables_start __initdata = ~0UL; unsigned long init_pg_tables_end __initdata = ~0UL; /* @@ -485,6 +486,10 @@ static void __init reserve_initrd(void) return; } + printk(KERN_INFO "old RAMDISK: %08llx - %08llx\n", ramdisk_image, + ramdisk_end); + + if (ramdisk_end <= end_of_lowmem) { /* All in lowmem, easy case */ /* @@ -511,6 +516,8 @@ static void __init reserve_initrd(void) "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; + printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", + ramdisk_here, ramdisk_here + ramdisk_size); do_relocate_initrd = true; } diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index af65b2da3ba0..bf7c34a3aaeb 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1011,6 +1011,7 @@ __init void lguest_init(void) * clobbered. The Launcher places our initial pagetables somewhere at * the top of our physical memory, so we don't need extra space: set * init_pg_tables_end to the end of the kernel. */ + init_pg_tables_start = __pa(pg0); init_pg_tables_end = __pa(pg0); /* Load the %fs segment register (the per-cpu segment register) with @@ -1064,9 +1065,9 @@ __init void lguest_init(void) pm_power_off = lguest_power_off; machine_ops.restart = lguest_restart; - /* Now we're set up, call start_kernel() in init/main.c and we proceed + /* Now we're set up, call i386_start_kernel() in head32.c and we proceed * to boot as normal. It never returns. */ - start_kernel(); + i386_start_kernel(); } /* * This marks the end of stage II of our journey, The Guest. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e457d61..ccc7b84ddabf 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1211,6 +1211,7 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; + init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; init_mm.pgd = pgd; /* use the Xen pagetables to start */ @@ -1246,5 +1247,5 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("hvc", 0, NULL); /* Start the world */ - start_kernel(); + i386_start_kernel(); } diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index ffa0f540fc7f..64f5f0c11d66 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -59,9 +59,11 @@ int __init copy_e820_map(struct e820entry *biosmap, int nr_map); void __init add_memory_region(unsigned long long start, unsigned long long size, int type); -extern unsigned long init_pg_tables_end; +void __init i386_start_kernel(void); +extern unsigned long init_pg_tables_start; +extern unsigned long init_pg_tables_end; #endif /* __i386__ */ #endif /* _SETUP */ -- cgit v1.2.3 From a5481280b29b6a3db912ec100498bd31eaa6d2db Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 29 May 2008 12:58:37 -0700 Subject: x86: extend e820 early_res support 32bit -fix #5 reserve early numa kva, so it will not clash with new RAMDISK Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_32.c | 1 - arch/x86/mm/discontig_32.c | 12 +++++------- include/asm-x86/mmzone_32.h | 4 ---- 3 files changed, 5 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index de9c5ee77d07..6f40cb560ea9 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -614,7 +614,6 @@ void __init setup_bootmem_allocator(void) */ find_smp_config(); #endif - numa_kva_reserve(); reserve_crashkernel(); reserve_ibft_region(); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 47749727907e..55fdbab6b014 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -357,6 +357,11 @@ unsigned long __init setup_memory(void) printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", kva_start_pfn, max_low_pfn); printk("max_pfn = %ld\n", max_pfn); + + /* avoid clash with initrd */ + reserve_early(kva_start_pfn< system_max_low_pfn) @@ -392,13 +397,6 @@ unsigned long __init setup_memory(void) return max_low_pfn; } -void __init numa_kva_reserve(void) -{ - if (kva_pages) - reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages), - BOOTMEM_DEFAULT); -} - void __init zone_sizes_init(void) { int nid; diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h index cb2cad0b65a7..faef751181b7 100644 --- a/include/asm-x86/mmzone_32.h +++ b/include/asm-x86/mmzone_32.h @@ -38,16 +38,12 @@ static inline void get_memcfg_numa(void) } extern int early_pfn_to_nid(unsigned long pfn); -extern void numa_kva_reserve(void); #else /* !CONFIG_NUMA */ #define get_memcfg_numa get_memcfg_numa_flat #define get_zholes_size(n) (0) -static inline void numa_kva_reserve(void) -{ -} #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM -- cgit v1.2.3 From 9a73aa81ffb993382afed2ed404bc2b330d75427 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 29 May 2008 16:25:56 -0700 Subject: x86: 32bit numa srat fix early_ioremap leak on two node system (16g RAM) with numa config I got this crash: get_memcfg_from_srat: assigning address to rsdp RSD PTR v0 [ACPIAM] ACPI: Too big length in RSDT: 92 failed to get NUMA memory information from SRAT table NUMA - single node, flat memory mode Node: 0, start_pfn: 0, end_pfn: 153 Setting physnode_map array to node 0 for pfns: 0 ... Pid: 0, comm: swapper Not tainted 2.6.26-rc4 #4 [<80b41289>] hlt_loop+0x0/0x3 [<8011efa0>] ? alloc_remap+0x50/0x70 [<8079e32e>] alloc_node_mem_map+0x5e/0xa0 [<8012e77b>] ? printk+0x1b/0x20 [<80b590f6>] free_area_init_node+0xc6/0x470 [<80b588fc>] ? __alloc_bootmem_node+0x2c/0x50 [<80b58ad8>] ? find_min_pfn_for_node+0x38/0x70 [<8012e77b>] ? printk+0x1b/0x20 [<80b597c4>] free_area_init_nodes+0x254/0x2d0 [<80b544d7>] zone_sizes_init+0x97/0xa0 [<80b48a03>] setup_arch+0x383/0x530 [<8012e77b>] ? printk+0x1b/0x20 [<80b41aa4>] start_kernel+0x64/0x350 [<80b412d8>] i386_start_kernel+0x8/0x10 ======================= this patch increases the acpi table limit to 32. Also match early_ioremap() with early_iounmap(). Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/srat_32.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 70e4a374b4e8..88971ee166d4 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -261,7 +261,7 @@ out_fail: struct acpi_static_rsdt { struct acpi_table_rsdt table; - u32 padding[7]; /* Allow for 7 more table entries */ + u32 padding[32]; /* Allow for 32 more table entries */ }; int __init get_memcfg_from_srat(void) @@ -297,7 +297,7 @@ int __init get_memcfg_from_srat(void) } rsdt = (struct acpi_table_rsdt *) - early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); + early_ioremap(rsdp->rsdt_physical_address, sizeof(saved_rsdt)); if (!rsdt) { printk(KERN_WARNING @@ -310,6 +310,7 @@ int __init get_memcfg_from_srat(void) if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) { printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); + early_iounmap(rsdt, sizeof(saved_rsdt)); goto out_err; } @@ -319,37 +320,51 @@ int __init get_memcfg_from_srat(void) * size of RSDT) divided by the size of each entry * (4-byte table pointers). */ - tables = (header->length - sizeof(struct acpi_table_header)) / 4; + tables = (header->length - sizeof(struct acpi_table_header)) / sizeof(u32); if (!tables) goto out_err; memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); - + early_iounmap(rsdt, sizeof(saved_rsdt)); if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) { printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", saved_rsdt.table.header.length); goto out_err; } - printk("Begin SRAT table scan....\n"); + printk("Begin SRAT table scan....%d\n", tables); - for (i = 0; i < tables; i++) { + for (i = 0; i < tables; i++){ + int result; + u32 length; /* Map in header, then map in full table length. */ header = (struct acpi_table_header *) early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); if (!header) break; + + printk(KERN_INFO "ACPI: %4.4s %08lX, %04X\n", + header->signature, + (unsigned long)saved_rsdt.table.table_offset_entry[i], + header->length); + + if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) { + early_iounmap(header, sizeof(struct acpi_table_header)); + continue; + } + + length = header->length; + early_iounmap(header, sizeof(struct acpi_table_header)); header = (struct acpi_table_header *) - early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); + early_ioremap(saved_rsdt.table.table_offset_entry[i], length); if (!header) break; - if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) - continue; - /* we've found the srat table. don't need to look at any more tables */ - return acpi20_parse_srat((struct acpi_table_srat *)header); + result = acpi20_parse_srat((struct acpi_table_srat *)header); + early_iounmap(header, length); + return result; } out_err: remove_all_active_ranges(); -- cgit v1.2.3 From 831d991821daedd4839073dbca55514432ef1768 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 3 Sep 2007 10:17:39 +0200 Subject: x86: add PCI extended config space access for AMD Barcelona This patch implements PCI extended configuration space access for AMD's Barcelona CPUs. It extends the method using CF8/CFC IO addresses. An x86 capability bit has been introduced that is set for CPUs supporting PCI extended config space accesses. Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ++++ arch/x86/kernel/cpu/amd_64.c | 3 +++ arch/x86/kernel/setup.c | 13 +++++++++++++ arch/x86/kernel/setup.h | 26 ++++++++++++++++++++++++++ arch/x86/kernel/setup_64.c | 2 ++ arch/x86/pci/direct.c | 21 +++++++++++++++------ include/asm-x86/cpufeature.h | 2 ++ 7 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 arch/x86/kernel/setup.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 245866828294..99221f9834e4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@ #include #include +#include "../setup.h" #include "cpu.h" /* @@ -308,6 +309,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + + if (c->x86 == 0x10) + amd_enable_pci_ext_cfg(c); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index c815c2c0484b..180097e99219 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -217,6 +217,9 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x10) fam10h_check_enable_mmcfg(); + if (c->x86 == 0x10) + amd_enable_pci_ext_cfg(c); + if (amd_apic_timer_broken()) disable_apic_timer = 1; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6f80b852a196..d8f171234013 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -136,4 +136,17 @@ void __init setup_per_cpu_areas(void) setup_cpumask_of_cpu(); } +#define ENABLE_CF8_EXT_CFG (1ULL << 46) + +void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) +{ + u64 reg; + rdmsrl(MSR_AMD64_NB_CFG, reg); + if (!(reg & ENABLE_CF8_EXT_CFG)) { + reg |= ENABLE_CF8_EXT_CFG; + wrmsrl(MSR_AMD64_NB_CFG, reg); + } + set_cpu_cap(c, X86_FEATURE_PCI_EXT_CFG); +} + #endif diff --git a/arch/x86/kernel/setup.h b/arch/x86/kernel/setup.h new file mode 100644 index 000000000000..66cc2c7f961d --- /dev/null +++ b/arch/x86/kernel/setup.h @@ -0,0 +1,26 @@ +/* + * Internal declarations for shared x86 setup code. + * + * Copyright (c) 2008 Advanced Micro Devices, Inc. + * Contributed by Robert Richter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef _ARCH_X86_KERNEL_SETUP_H + +extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c); + +#endif /* _ARCH_X86_KERNEL_SETUP_H */ diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 25afdd80a3cf..215bd67e5b6c 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -73,6 +73,8 @@ #include #include +#include "setup.h" + #include #ifdef CONFIG_PARAVIRT #include diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 21d1e0e0d535..27d61b63def6 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -8,18 +8,21 @@ #include "pci.h" /* - * Functions for accessing PCI configuration space with type 1 accesses + * Functions for accessing PCI base (first 256 bytes) and extended + * (4096 bytes per PCI function) configuration space with type 1 + * accesses. */ #define PCI_CONF1_ADDRESS(bus, devfn, reg) \ - (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) + (0x80000000 | ((reg & 0xF00) << 16) | (bus << 16) \ + | (devfn << 8) | (reg & 0xFC)) static int pci_conf1_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { unsigned long flags; - if ((bus > 255) || (devfn > 255) || (reg > 255)) { + if ((bus > 255) || (devfn > 255) || (reg > 4095)) { *value = -1; return -EINVAL; } @@ -50,7 +53,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, { unsigned long flags; - if ((bus > 255) || (devfn > 255) || (reg > 255)) + if ((bus > 255) || (devfn > 255) || (reg > 4095)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); @@ -260,10 +263,16 @@ void __init pci_direct_init(int type) return; printk(KERN_INFO "PCI: Using configuration type %d for base access\n", type); - if (type == 1) + if (type == 1) { raw_pci_ops = &pci_direct_conf1; - else + if (!raw_pci_ext_ops && cpu_has_pci_ext_cfg) { + printk(KERN_INFO "PCI: Using configuration type 1 " + "for extended access\n"); + raw_pci_ext_ops = &pci_direct_conf1; + } + } else { raw_pci_ops = &pci_direct_conf2; + } } int __init pci_direct_probe(void) diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 0d609c837a41..40fcbba00f15 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -79,6 +79,7 @@ #define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well on this CPU */ #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */ +#define X86_FEATURE_PCI_EXT_CFG (3*32+19) /* PCI extended cfg access */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ @@ -187,6 +188,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) +#define cpu_has_pci_ext_cfg boot_cpu_has(X86_FEATURE_PCI_EXT_CFG) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 -- cgit v1.2.3 From c46e62f73569d7ef42255bd6f31e35925b7f1492 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 28 May 2008 12:42:57 +0200 Subject: i8259: fix final ugliness Introduce IRQx_VECTOR on 32-bit, so that #ifdef noise is kept down. There should be no object code change. [ mingo@elte.hu: merged to x86/irq not x86/i8259 due to x86/irq having restructured the vector code into asm-x86/irq_vectors.h, which this patch touches. ] Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8259.c | 22 ++++++++-------------- include/asm-x86/irq_vectors.h | 9 ++++++--- 2 files changed, 14 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 7a0fda8f01b5..dc92b49d9204 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -297,34 +297,28 @@ void init_8259A(int auto_eoi) * outb_pic - this has to work on a wide range of PC hardware. */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ -#ifndef CONFIG_X86_64 - outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ -#else /* CONFIG_X86_64 */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64, + to 0x20-0x27 on i386 */ outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ - outb_pic(0x04, PIC_MASTER_IMR); -#endif /* CONFIG_X86_64 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); + if (auto_eoi) /* master does Auto EOI */ outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); else /* master expects normal EOI */ outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ -#ifndef CONFIG_X86_64 - outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ -#else /* CONFIG_X86_64 */ - /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + + /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); -#endif /* CONFIG_X86_64 */ if (auto_eoi) /* * In AEOI mode we just have to mask the interrupt diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h index 3cb6d8c77b39..b58581e2e24e 100644 --- a/include/asm-x86/irq_vectors.h +++ b/include/asm-x86/irq_vectors.h @@ -18,17 +18,20 @@ #endif /* - * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit. - * * Reserve the lowest usable priority level 0x20 - 0x2f for triggering * cleanup after irq migration on 64 bit. */ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR /* - * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit + * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit. + * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit. */ +#ifdef CONFIG_X86_32 +#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR) +#else #define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) +#endif #define IRQ1_VECTOR (IRQ0_VECTOR + 1) #define IRQ2_VECTOR (IRQ0_VECTOR + 2) #define IRQ3_VECTOR (IRQ0_VECTOR + 3) -- cgit v1.2.3 From 1a5726528a70bb239bdd149aef7f2155cd2b1699 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 2 Jun 2008 12:21:36 +0200 Subject: fix build bug in "x86: add PCI extended config space access for AMD Barcelona" --- arch/x86/kernel/cpu/amd.c | 1 + include/asm-x86/mmconfig.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 99221f9834e4..656b40aed647 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include "../setup.h" diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h index 46d6bb135df4..691798fbee10 100644 --- a/include/asm-x86/mmconfig.h +++ b/include/asm-x86/mmconfig.h @@ -9,6 +9,10 @@ static inline void fam10h_check_enable_mmcfg(void) { } static inline void check_enable_amd_mmconf_dmi(void) { } #endif +#if defined(CONFIG_SMP) && defined(CONFIG_X86_64) extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c); +#else +static inline void amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) { } +#endif #endif -- cgit v1.2.3 From 88ff0a474e98f869d8c321e29481f298320100d6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 27 May 2008 18:49:39 -0700 Subject: x86: coding style fixes for nmi.c before total: 1 errors, 6 warnings, 534 lines checked after total: 0 errors, 1 warnings, 532 lines checked Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 3671a9f3564b..bf143f191fb1 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -23,9 +23,8 @@ #include #include #include +#include -#include -#include #include #include @@ -45,13 +44,16 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE; * 0: the lapic NMI watchdog is disabled, but can be enabled */ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -static int panic_on_timeout; +EXPORT_SYMBOL(nmi_active); unsigned int nmi_watchdog = NMI_DEFAULT; +EXPORT_SYMBOL(nmi_watchdog); + +static int panic_on_timeout; static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); -static int endflag __initdata = 0; +static int endflag __initdata; static inline unsigned int get_nmi_count(int cpu) { @@ -404,7 +406,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); - printk("NMI backtrace for cpu %d\n", cpu); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); dump_stack(); spin_unlock(&lock); cpu_clear(cpu, backtrace_mask); @@ -529,7 +531,3 @@ void __trigger_all_cpu_backtrace(void) mdelay(1); } } - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); - -- cgit v1.2.3 From 9f5314fb4d556d3132c784d0df47352b2830ca53 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Wed, 28 May 2008 09:51:18 -0500 Subject: x86, uv: update macros used by UV platform Update the UV address macros to better describe the fields of UV physical addresses. Improve comments in the header files. Add additional MMR definitions. Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 141 +++++++---- include/asm-x86/uv/uv_hub.h | 188 ++++++++++----- include/asm-x86/uv/uv_mmrs.h | 509 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 731 insertions(+), 107 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index ebf13908a743..45e84acca8a9 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. */ #include @@ -55,37 +55,37 @@ static cpumask_t uv_vector_allocation_domain(int cpu) int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) { unsigned long val; - int nasid; + int pnode; - nasid = uv_apicid_to_nasid(phys_apicid); + pnode = uv_apicid_to_pnode(phys_apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; - uv_write_global_mmr64(nasid, UVH_IPI_INT, val); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); mdelay(10); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_STARTUP; - uv_write_global_mmr64(nasid, UVH_IPI_INT, val); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); return 0; } static void uv_send_IPI_one(int cpu, int vector) { unsigned long val, apicid, lapicid; - int nasid; + int pnode; apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ lapicid = apicid & 0x3f; /* ZZZ macro needed */ - nasid = uv_apicid_to_nasid(apicid); + pnode = uv_apicid_to_pnode(apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << UVH_IPI_INT_APIC_ID_SHFT) | (vector << UVH_IPI_INT_VECTOR_SHFT); - uv_write_global_mmr64(nasid, UVH_IPI_INT, val); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } static void uv_send_IPI_mask(cpumask_t mask, int vector) @@ -159,39 +159,81 @@ struct genapic apic_x2apic_uv_x = { .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ }; -static __cpuinit void set_x2apic_extra_bits(int nasid) +static __cpuinit void set_x2apic_extra_bits(int pnode) { - __get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6); + __get_cpu_var(x2apic_extra_bits) = (pnode << 6); } /* * Called on boot cpu. */ +static __init int boot_pnode_to_blade(int pnode) +{ + int blade; + + for (blade = 0; blade < uv_num_possible_blades(); blade++) + if (pnode == uv_blade_info[blade].pnode) + return blade; + BUG(); +} + +struct redir_addr { + unsigned long redirect; + unsigned long alias; +}; + +#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT + +static __initdata struct redir_addr redir_addrs[] = { + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, +}; + +static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) +{ + union uvh_si_alias0_overlay_config_u alias; + union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; + int i; + + for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { + alias.v = uv_read_local_mmr(redir_addrs[i].alias); + if (alias.s.base == 0) { + *size = (1UL << alias.s.m_alias); + redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); + *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; + return; + } + } + BUG(); +} + static __init void uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; - int bytes, nid, cpu, lcpu, nasid, last_nasid, blade; - unsigned long mmr_base; + union uvh_node_id_u node_id; + unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; + int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + unsigned long mmr_base, present; m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); + m_val = m_n_config.s.m_skt; + n_val = m_n_config.s.n_skt; mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); - last_nasid = -1; - for_each_possible_cpu(cpu) { - nid = cpu_to_node(cpu); - nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu)); - if (nasid != last_nasid) - uv_possible_blades++; - last_nasid = nasid; - } + for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) + uv_possible_blades += + hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = alloc_bootmem_pages(bytes); + get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); + bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); uv_node_to_blade = alloc_bootmem_pages(bytes); memset(uv_node_to_blade, 255, bytes); @@ -200,43 +242,56 @@ static __init void uv_system_init(void) uv_cpu_to_blade = alloc_bootmem_pages(bytes); memset(uv_cpu_to_blade, 255, bytes); - last_nasid = -1; - blade = -1; - lcpu = -1; - for_each_possible_cpu(cpu) { - nid = cpu_to_node(cpu); - nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu)); - if (nasid != last_nasid) { - blade++; - lcpu = -1; - uv_blade_info[blade].nr_posible_cpus = 0; + blade = 0; + for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { + present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); + for (j = 0; j < 64; j++) { + if (!test_bit(j, &present)) + continue; + uv_blade_info[blade].pnode = (i * 64 + j); + uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + blade++; } - last_nasid = nasid; - lcpu++; + } - uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt; - uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_upper = (((unsigned long)node_id.s.node_id) & + ~((1 << n_val) - 1)) << m_val; + + for_each_present_cpu(cpu) { + nid = cpu_to_node(cpu); + pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + blade = boot_pnode_to_blade(pnode); + lcpu = uv_blade_info[blade].nr_possible_cpus; + uv_blade_info[blade].nr_possible_cpus++; + + uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; + uv_cpu_hub_info(cpu)->lowmem_remap_top = + lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->m_val = m_val; + uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; - uv_cpu_hub_info(cpu)->local_nasid = nasid; - uv_cpu_hub_info(cpu)->gnode_upper = - nasid & ~((1 << uv_hub_info->n_val) - 1); + uv_cpu_hub_info(cpu)->pnode = pnode; + uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1; + uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; + uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ - uv_blade_info[blade].nasid = nasid; - uv_blade_info[blade].nr_posible_cpus++; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; - printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid); - printk(KERN_DEBUG "UV lcpu %d, blade %d\n", lcpu, blade); + printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, " + "lcpu %d, blade %d\n", + cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, + lcpu, blade); } } /* * Called on each cpu to initialize the per_cpu UV data area. + * ZZZ hotplug not supported yet */ void __cpuinit uv_cpu_init(void) { @@ -246,5 +301,5 @@ void __cpuinit uv_cpu_init(void) uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; if (get_uv_system_type() == UV_NON_UNIQUE_APIC) - set_x2apic_extra_bits(uv_hub_info->local_nasid); + set_x2apic_extra_bits(uv_hub_info->pnode); } diff --git a/include/asm-x86/uv/uv_hub.h b/include/asm-x86/uv/uv_hub.h index 26b9240d1e23..65004881de5f 100644 --- a/include/asm-x86/uv/uv_hub.h +++ b/include/asm-x86/uv/uv_hub.h @@ -5,7 +5,7 @@ * * SGI UV architectural definitions * - * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. */ #ifndef __ASM_X86_UV_HUB_H__ @@ -20,26 +20,49 @@ /* * Addressing Terminology * - * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of - * routers always have low bit of 1, C/MBricks have low bit - * equal to 0. Most addressing macros that target UV hub chips - * right shift the NASID by 1 to exclude the always-zero bit. + * M - The low M bits of a physical address represent the offset + * into the blade local memory. RAM memory on a blade is physically + * contiguous (although various IO spaces may punch holes in + * it).. * - * SNASID - NASID right shifted by 1 bit. + * N - Number of bits in the node portion of a socket physical + * address. + * + * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of + * routers always have low bit of 1, C/MBricks have low bit + * equal to 0. Most addressing macros that target UV hub chips + * right shift the NASID by 1 to exclude the always-zero bit. + * NASIDs contain up to 15 bits. + * + * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead + * of nasids. + * + * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant + * of the nasid for socket usage. + * + * + * NumaLink Global Physical Address Format: + * +--------------------------------+---------------------+ + * |00..000| GNODE | NodeOffset | + * +--------------------------------+---------------------+ + * |<-------53 - M bits --->|<--------M bits -----> + * + * M - number of node offset bits (35 .. 40) * * * Memory/UV-HUB Processor Socket Address Format: - * +--------+---------------+---------------------+ - * |00..0000| SNASID | NodeOffset | - * +--------+---------------+---------------------+ - * <--- N bits --->|<--------M bits -----> + * +----------------+---------------+---------------------+ + * |00..000000000000| PNODE | NodeOffset | + * +----------------+---------------+---------------------+ + * <--- N bits --->|<--------M bits -----> * - * M number of node offset bits (35 .. 40) - * N number of SNASID bits (0 .. 10) + * M - number of node offset bits (35 .. 40) + * N - number of PNODE bits (0 .. 10) * * Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64). * The actual values are configuration dependent and are set at - * boot time + * boot time. M & N values are set by the hardware/BIOS at boot. + * * * APICID format * NOTE!!!!!! This is the current format of the APICID. However, code @@ -48,14 +71,14 @@ * * 1111110000000000 * 5432109876543210 - * nnnnnnnnnnlc0cch + * pppppppppplc0cch * sssssssssss * - * n = snasid bits + * p = pnode bits * l = socket number on board * c = core * h = hyperthread - * s = bits that are in the socket CSR + * s = bits that are in the SOCKET_ID CSR * * Note: Processor only supports 12 bits in the APICID register. The ACPI * tables hold all 16 bits. Software needs to be aware of this. @@ -74,7 +97,7 @@ * This value is also the value of the maximum number of non-router NASIDs * in the numalink fabric. * - * NOTE: a brick may be 1 or 2 OS nodes. Don't get these confused. + * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused. */ #define UV_MAX_NUMALINK_BLADES 16384 @@ -96,8 +119,12 @@ */ struct uv_hub_info_s { unsigned long global_mmr_base; - unsigned short local_nasid; - unsigned short gnode_upper; + unsigned long gpa_mask; + unsigned long gnode_upper; + unsigned long lowmem_remap_top; + unsigned long lowmem_remap_base; + unsigned short pnode; + unsigned short pnode_mask; unsigned short coherency_domain_number; unsigned short numa_blade_id; unsigned char blade_processor_id; @@ -112,83 +139,124 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); * Local & Global MMR space macros. * Note: macros are intended to be used ONLY by inline functions * in this file - not by other kernel code. + * n - NASID (full 15-bit global nasid) + * g - GNODE (full 15-bit global nasid, right shifted 1) + * p - PNODE (local part of nsids, right shifted 1) */ -#define UV_SNASID(n) ((n) >> 1) -#define UV_NASID(n) ((n) << 1) +#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) +#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) #define UV_LOCAL_MMR_BASE 0xf4000000UL #define UV_GLOBAL_MMR32_BASE 0xf8000000UL #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) -#define UV_GLOBAL_MMR32_SNASID_MASK 0x3ff -#define UV_GLOBAL_MMR32_SNASID_SHIFT 15 -#define UV_GLOBAL_MMR64_SNASID_SHIFT 26 +#define UV_GLOBAL_MMR32_PNODE_SHIFT 15 +#define UV_GLOBAL_MMR64_PNODE_SHIFT 26 -#define UV_GLOBAL_MMR32_NASID_BITS(n) \ - (((UV_SNASID(n) & UV_GLOBAL_MMR32_SNASID_MASK)) << \ - (UV_GLOBAL_MMR32_SNASID_SHIFT)) +#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) -#define UV_GLOBAL_MMR64_NASID_BITS(n) \ - ((unsigned long)UV_SNASID(n) << UV_GLOBAL_MMR64_SNASID_SHIFT) +#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ + ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) + +#define UV_APIC_PNODE_SHIFT 6 + +/* + * Macros for converting between kernel virtual addresses, socket local physical + * addresses, and UV global physical addresses. + * Note: use the standard __pa() & __va() macros for converting + * between socket virtual and socket physical addresses. + */ + +/* socket phys RAM --> UV global physical address */ +static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) +{ + if (paddr < uv_hub_info->lowmem_remap_top) + paddr += uv_hub_info->lowmem_remap_base; + return paddr | uv_hub_info->gnode_upper; +} + + +/* socket virtual --> UV global physical address */ +static inline unsigned long uv_gpa(void *v) +{ + return __pa(v) | uv_hub_info->gnode_upper; +} + +/* socket virtual --> UV global physical address */ +static inline void *uv_vgpa(void *v) +{ + return (void *)uv_gpa(v); +} + +/* UV global physical address --> socket virtual */ +static inline void *uv_va(unsigned long gpa) +{ + return __va(gpa & uv_hub_info->gpa_mask); +} + +/* pnode, offset --> socket virtual */ +static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) +{ + return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset); +} -#define UV_APIC_NASID_SHIFT 6 /* - * Extract a NASID from an APICID (full apicid, not processor subset) + * Extract a PNODE from an APICID (full apicid, not processor subset) */ -static inline int uv_apicid_to_nasid(int apicid) +static inline int uv_apicid_to_pnode(int apicid) { - return (UV_NASID(apicid >> UV_APIC_NASID_SHIFT)); + return (apicid >> UV_APIC_PNODE_SHIFT); } /* * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. */ -static inline unsigned long *uv_global_mmr32_address(int nasid, +static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR32_BASE | - UV_GLOBAL_MMR32_NASID_BITS(nasid) | offset); + UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr32(int nasid, unsigned long offset, +static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val) { - *uv_global_mmr32_address(nasid, offset) = val; + *uv_global_mmr32_address(pnode, offset) = val; } -static inline unsigned long uv_read_global_mmr32(int nasid, +static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset) { - return *uv_global_mmr32_address(nasid, offset); + return *uv_global_mmr32_address(pnode, offset); } /* * Access Global MMR space using the MMR space located at the top of physical * memory. */ -static inline unsigned long *uv_global_mmr64_address(int nasid, +static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR64_BASE | - UV_GLOBAL_MMR64_NASID_BITS(nasid) | offset); + UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr64(int nasid, unsigned long offset, +static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val) { - *uv_global_mmr64_address(nasid, offset) = val; + *uv_global_mmr64_address(pnode, offset) = val; } -static inline unsigned long uv_read_global_mmr64(int nasid, +static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset) { - return *uv_global_mmr64_address(nasid, offset); + return *uv_global_mmr64_address(pnode, offset); } /* - * Access node local MMRs. Faster than using global space but only local MMRs + * Access hub local MMRs. Faster than using global space but only local MMRs * are accessible. */ static inline unsigned long *uv_local_mmr_address(unsigned long offset) @@ -207,15 +275,15 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) } /* - * Structures and definitions for converting between cpu, node, and blade + * Structures and definitions for converting between cpu, node, pnode, and blade * numbers. */ struct uv_blade_info { - unsigned short nr_posible_cpus; + unsigned short nr_possible_cpus; unsigned short nr_online_cpus; - unsigned short nasid; + unsigned short pnode; }; -struct uv_blade_info *uv_blade_info; +extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; extern short *uv_cpu_to_blade; extern short uv_possible_blades; @@ -244,16 +312,16 @@ static inline int uv_node_to_blade_id(int nid) return uv_node_to_blade[nid]; } -/* Convert a blade id to the NASID of the blade */ -static inline int uv_blade_to_nasid(int bid) +/* Convert a blade id to the PNODE of the blade */ +static inline int uv_blade_to_pnode(int bid) { - return uv_blade_info[bid].nasid; + return uv_blade_info[bid].pnode; } /* Determine the number of possible cpus on a blade */ static inline int uv_blade_nr_possible_cpus(int bid) { - return uv_blade_info[bid].nr_posible_cpus; + return uv_blade_info[bid].nr_possible_cpus; } /* Determine the number of online cpus on a blade */ @@ -262,16 +330,16 @@ static inline int uv_blade_nr_online_cpus(int bid) return uv_blade_info[bid].nr_online_cpus; } -/* Convert a cpu id to the NASID of the blade containing the cpu */ -static inline int uv_cpu_to_nasid(int cpu) +/* Convert a cpu id to the PNODE of the blade containing the cpu */ +static inline int uv_cpu_to_pnode(int cpu) { - return uv_blade_info[uv_cpu_to_blade_id(cpu)].nasid; + return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode; } -/* Convert a node number to the NASID of the blade */ -static inline int uv_node_to_nasid(int nid) +/* Convert a linux node number to the PNODE of the blade */ +static inline int uv_node_to_pnode(int nid) { - return uv_blade_info[uv_node_to_blade_id(nid)].nasid; + return uv_blade_info[uv_node_to_blade_id(nid)].pnode; } /* Maximum possible number of blades */ diff --git a/include/asm-x86/uv/uv_mmrs.h b/include/asm-x86/uv/uv_mmrs.h index 3b69fe6b6376..ac9846076521 100644 --- a/include/asm-x86/uv/uv_mmrs.h +++ b/include/asm-x86/uv/uv_mmrs.h @@ -11,11 +11,46 @@ #ifndef __ASM_X86_UV_MMRS__ #define __ASM_X86_UV_MMRS__ -/* - * AUTO GENERATED - Do not edit - */ +#define UV_MMR_ENABLE (1UL << 63) - #define UV_MMR_ENABLE (1UL << 63) +/* ========================================================================= */ +/* UVH_BAU_DATA_CONFIG */ +/* ========================================================================= */ +#define UVH_BAU_DATA_CONFIG 0x61680UL +#define UVH_BAU_DATA_CONFIG_32 0x0450 + +#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 +#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_BAU_DATA_CONFIG_DM_SHFT 8 +#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11 +#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12 +#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_BAU_DATA_CONFIG_P_SHFT 13 +#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_BAU_DATA_CONFIG_T_SHFT 15 +#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_BAU_DATA_CONFIG_M_SHFT 16 +#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32 +#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_bau_data_config_u { + unsigned long v; + struct uvh_bau_data_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; /* ========================================================================= */ /* UVH_IPI_INT */ @@ -109,6 +144,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u { /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0aa0 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL @@ -169,6 +205,7 @@ union uvh_lb_bau_intd_software_acknowledge_u { /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0aa8 /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ @@ -247,6 +284,331 @@ union uvh_lb_bau_sb_descriptor_base_u { } s; }; +/* ========================================================================= */ +/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */ +/* ========================================================================= */ +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL + +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_TIMEOUT_SHFT 22 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_TIMEOUT_MASK 0x0000000000400000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 23 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000000800000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 24 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000001000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 25 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000002000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 26 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000004000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 27 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000008000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 28 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000010000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 29 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000020000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 30 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000040000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 31 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000080000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 32 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000100000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 33 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000200000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 34 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000400000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 35 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000000800000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 36 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000001000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 37 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000002000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 38 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000004000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 39 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000008000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 40 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000010000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 41 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000020000000000UL + +union uvh_lb_mcast_aoerr0_rpt_enable_u { + unsigned long v; + struct uvh_lb_mcast_aoerr0_rpt_enable_s { + unsigned long mcast_obese_msg : 1; /* RW */ + unsigned long mcast_data_sb_err : 1; /* RW */ + unsigned long mcast_nack_buff_parity : 1; /* RW */ + unsigned long mcast_timeout : 1; /* RW */ + unsigned long mcast_inactive_reply : 1; /* RW */ + unsigned long mcast_upgrade_error : 1; /* RW */ + unsigned long mcast_reg_count_underflow : 1; /* RW */ + unsigned long mcast_rep_obese_msg : 1; /* RW */ + unsigned long ucache_req_runt_msg : 1; /* RW */ + unsigned long ucache_req_obese_msg : 1; /* RW */ + unsigned long ucache_req_data_sb_err : 1; /* RW */ + unsigned long ucache_rep_runt_msg : 1; /* RW */ + unsigned long ucache_rep_obese_msg : 1; /* RW */ + unsigned long ucache_rep_data_sb_err : 1; /* RW */ + unsigned long ucache_rep_command_err : 1; /* RW */ + unsigned long ucache_pend_timeout : 1; /* RW */ + unsigned long macc_req_runt_msg : 1; /* RW */ + unsigned long macc_req_obese_msg : 1; /* RW */ + unsigned long macc_req_data_sb_err : 1; /* RW */ + unsigned long macc_rep_runt_msg : 1; /* RW */ + unsigned long macc_rep_obese_msg : 1; /* RW */ + unsigned long macc_rep_data_sb_err : 1; /* RW */ + unsigned long macc_timeout : 1; /* RW */ + unsigned long macc_spurious_event : 1; /* RW */ + unsigned long ioh_destination_table_parity : 1; /* RW */ + unsigned long get_had_error_reply : 1; /* RW */ + unsigned long get_timeout : 1; /* RW */ + unsigned long lock_manager_had_error_reply : 1; /* RW */ + unsigned long put_had_error_reply : 1; /* RW */ + unsigned long put_timeout : 1; /* RW */ + unsigned long sb_activation_overrun : 1; /* RW */ + unsigned long completed_gb_activation_had_error_reply : 1; /* RW */ + unsigned long completed_gb_activation_timeout : 1; /* RW */ + unsigned long descriptor_buffer_0_parity : 1; /* RW */ + unsigned long descriptor_buffer_1_parity : 1; /* RW */ + unsigned long socket_destination_table_parity : 1; /* RW */ + unsigned long bau_reply_payload_corruption : 1; /* RW */ + unsigned long io_port_destination_table_parity : 1; /* RW */ + unsigned long intd_soft_ack_timeout : 1; /* RW */ + unsigned long int_rep_obese_msg : 1; /* RW */ + unsigned long int_rep_command_err : 1; /* RW */ + unsigned long int_timeout : 1; /* RW */ + unsigned long rsvd_42_63 : 22; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LOCAL_INT0_CONFIG */ +/* ========================================================================= */ +#define UVH_LOCAL_INT0_CONFIG 0x61000UL + +#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0 +#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8 +#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11 +#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12 +#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13 +#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15 +#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16 +#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32 +#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_local_int0_config_u { + unsigned long v; + struct uvh_local_int0_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LOCAL_INT0_ENABLE */ +/* ========================================================================= */ +#define UVH_LOCAL_INT0_ENABLE 0x65000UL + +#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0 +#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL +#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1 +#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL +#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2 +#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL +#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3 +#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL +#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4 +#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL +#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5 +#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL +#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6 +#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7 +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8 +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9 +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10 +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11 +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12 +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13 +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14 +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15 +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16 +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17 +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18 +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19 +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20 +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL +#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21 +#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL +#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22 +#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL +#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39 +#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL +#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40 +#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL +#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41 +#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL +#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42 +#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL +#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43 +#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL +#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44 +#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL + +union uvh_local_int0_enable_u { + unsigned long v; + struct uvh_local_int0_enable_s { + unsigned long lb_hcerr : 1; /* RW */ + unsigned long gr0_hcerr : 1; /* RW */ + unsigned long gr1_hcerr : 1; /* RW */ + unsigned long lh_hcerr : 1; /* RW */ + unsigned long rh_hcerr : 1; /* RW */ + unsigned long xn_hcerr : 1; /* RW */ + unsigned long si_hcerr : 1; /* RW */ + unsigned long lb_aoerr0 : 1; /* RW */ + unsigned long gr0_aoerr0 : 1; /* RW */ + unsigned long gr1_aoerr0 : 1; /* RW */ + unsigned long lh_aoerr0 : 1; /* RW */ + unsigned long rh_aoerr0 : 1; /* RW */ + unsigned long xn_aoerr0 : 1; /* RW */ + unsigned long si_aoerr0 : 1; /* RW */ + unsigned long lb_aoerr1 : 1; /* RW */ + unsigned long gr0_aoerr1 : 1; /* RW */ + unsigned long gr1_aoerr1 : 1; /* RW */ + unsigned long lh_aoerr1 : 1; /* RW */ + unsigned long rh_aoerr1 : 1; /* RW */ + unsigned long xn_aoerr1 : 1; /* RW */ + unsigned long si_aoerr1 : 1; /* RW */ + unsigned long rh_vpi_int : 1; /* RW */ + unsigned long system_shutdown_int : 1; /* RW */ + unsigned long lb_irq_int_0 : 1; /* RW */ + unsigned long lb_irq_int_1 : 1; /* RW */ + unsigned long lb_irq_int_2 : 1; /* RW */ + unsigned long lb_irq_int_3 : 1; /* RW */ + unsigned long lb_irq_int_4 : 1; /* RW */ + unsigned long lb_irq_int_5 : 1; /* RW */ + unsigned long lb_irq_int_6 : 1; /* RW */ + unsigned long lb_irq_int_7 : 1; /* RW */ + unsigned long lb_irq_int_8 : 1; /* RW */ + unsigned long lb_irq_int_9 : 1; /* RW */ + unsigned long lb_irq_int_10 : 1; /* RW */ + unsigned long lb_irq_int_11 : 1; /* RW */ + unsigned long lb_irq_int_12 : 1; /* RW */ + unsigned long lb_irq_int_13 : 1; /* RW */ + unsigned long lb_irq_int_14 : 1; /* RW */ + unsigned long lb_irq_int_15 : 1; /* RW */ + unsigned long l1_nmi_int : 1; /* RW */ + unsigned long stop_clock : 1; /* RW */ + unsigned long asic_to_l1 : 1; /* RW */ + unsigned long l1_to_asic : 1; /* RW */ + unsigned long ltc_int : 1; /* RW */ + unsigned long la_seq_trigger : 1; /* RW */ + unsigned long rsvd_45_63 : 19; /* */ + } s; +}; + /* ========================================================================= */ /* UVH_NODE_ID */ /* ========================================================================= */ @@ -283,6 +645,73 @@ union uvh_node_id_u { } s; }; +/* ========================================================================= */ +/* UVH_NODE_PRESENT_TABLE */ +/* ========================================================================= */ +#define UVH_NODE_PRESENT_TABLE 0x1400UL +#define UVH_NODE_PRESENT_TABLE_DEPTH 16 + +#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 +#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL + +union uvh_node_present_table_u { + unsigned long v; + struct uvh_node_present_table_s { + unsigned long nodes : 64; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +union uvh_rh_gam_alias210_redirect_config_0_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23 : 24; /* */ + unsigned long dest_base : 22; /* RW */ + unsigned long rsvd_46_63: 18; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +union uvh_rh_gam_alias210_redirect_config_1_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23 : 24; /* */ + unsigned long dest_base : 22; /* RW */ + unsigned long rsvd_46_63: 18; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +union uvh_rh_gam_alias210_redirect_config_2_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23 : 24; /* */ + unsigned long dest_base : 22; /* RW */ + unsigned long rsvd_46_63: 18; /* */ + } s; +}; + /* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ @@ -369,5 +798,77 @@ union uvh_si_addr_map_config_u { } s; }; +/* ========================================================================= */ +/* UVH_SI_ALIAS0_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL + +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +union uvh_si_alias0_overlay_config_u { + unsigned long v; + struct uvh_si_alias0_overlay_config_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_SI_ALIAS1_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL + +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +union uvh_si_alias1_overlay_config_u { + unsigned long v; + struct uvh_si_alias1_overlay_config_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_SI_ALIAS2_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL + +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +union uvh_si_alias2_overlay_config_u { + unsigned long v; + struct uvh_si_alias2_overlay_config_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + #endif /* __ASM_X86_UV_MMRS__ */ -- cgit v1.2.3 From 3d1ba1da2b4ff4ace7801e99fb9a3095b182d847 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 2 Jun 2008 13:50:10 +0200 Subject: x86: fix nmi.c build bug apic.h needs to be included for the apic_write_around() definition. --- arch/x86/kernel/nmi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index bf143f191fb1..cbd4fa3c475b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -11,6 +11,8 @@ * Mikael Pettersson : PM converted to driver model. Disable/enable API. */ +#include + #include #include #include -- cgit v1.2.3 From f3294690979634ee10398bb0beadfe1d4edb881d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Jun 2008 10:16:10 +0200 Subject: x86, numaq: add pci_acpi_scan_root() stub allow 32-bit numaq build to succeed with ACPI enabled. --- arch/x86/kernel/numaq_32.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index e65281b1634b..992f53cb79b6 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -87,3 +87,14 @@ static int __init numaq_tsc_disable(void) return 0; } arch_initcall(numaq_tsc_disable); + +#ifdef CONFIG_ACPI +/* + * Dummy implementation: + */ +struct pci_bus * __devinit +pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) +{ + return NULL; +} +#endif -- cgit v1.2.3 From 0596152388e234efebce464355186ad9e16c8cb6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 31 May 2008 22:52:47 -0700 Subject: x86, 32-bit: change propagate_e820_map() back to find_max_pfn() we don't need to call memory_present that early. numa and sparse will call memory_present later and might even fail, it will call memory_present for the full range. also for sparse it will call alloc_bootmem ... before we set up bootmem. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820_32.c | 5 ++--- arch/x86/kernel/setup_32.c | 4 ++-- arch/x86/mm/discontig_32.c | 2 +- include/asm-x86/e820_32.h | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index db760d4706af..0c025d04fc2e 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -210,7 +210,7 @@ void __init init_iomem_resources(struct resource *code_resource, /* * Find the highest page frame number we have available */ -void __init propagate_e820_map(void) +void __init find_max_pfn(void) { int i; @@ -227,7 +227,6 @@ void __init propagate_e820_map(void) continue; if (end > max_pfn) max_pfn = end; - memory_present(0, start, end); } } @@ -361,7 +360,7 @@ static int __init parse_memmap(char *arg) * size before original memory map is * reset. */ - propagate_e820_map(); + find_max_pfn(); saved_max_pfn = max_pfn; #endif e820.nr_map = 0; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 6f40cb560ea9..29010420458d 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -730,10 +730,10 @@ void __init setup_arch(char **cmdline_p) efi_init(); /* update e820 for memory not covered by WB MTRRs */ - propagate_e820_map(); + find_max_pfn(); mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - propagate_e820_map(); + find_max_pfn(); max_low_pfn = setup_memory(); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 98b099eeab3a..ebbbba338150 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void) printk("NUMA - single node, flat memory mode\n"); /* Run the memory configuration and find the top of memory. */ - propagate_e820_map(); + find_max_pfn(); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memory_present(0, 0, max_pfn); diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index 7ace82570a36..00fbc60b9d30 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -21,7 +21,7 @@ extern void setup_memory_map(void); extern void finish_e820_parsing(void); -extern void propagate_e820_map(void); +extern void find_max_pfn(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void limit_regions(unsigned long long size); extern void init_iomem_resources(struct resource *code_resource, -- cgit v1.2.3 From 2944e16b25e7fb8b5ee0dd9dc7197a0f9e523cfd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 Jun 2008 13:17:38 -0700 Subject: x86: update mptable make mptable to be consistent with acpi routing, so we could: 1. kexec kernel with acpi=off 2. work around BIOSes where acpi routing is working, but mptable is not right, so can use kernel/kexec to start other OSes that don't have good acpi support. command line: update_mptable Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 22 +++ arch/x86/kernel/e820.c | 25 +++ arch/x86/kernel/mpparse.c | 401 ++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/setup_64.c | 4 + drivers/acpi/pci_irq.c | 5 + include/asm-x86/e820.h | 1 + include/asm-x86/mpspec.h | 4 + 7 files changed, 445 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2ddfabae382b..f226bdc19f69 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1154,6 +1154,28 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } +int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, + u32 gsi, int triggering, int polarity) +{ + struct mpc_config_intsrc intsrc; + int ioapic; + + /* print the entry should happen on mptable identically */ + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + intsrc.mpc_srcbus = number; + intsrc.mpc_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + intsrc.mpc_dstapic = mp_ioapic_routing[ioapic].apic_id; + intsrc.mpc_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; + + MP_intsrc_info(&intsrc); + + return 0; +} + /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0cd9132c9450..cd2b99e27d43 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -739,3 +739,28 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) return -1UL; } + +/* + * pre allocated 4k and reserved it in e820 + */ +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) +{ + u64 size = 0; + u64 addr; + u64 start; + + start = startt; + while (size < sizet) + start = find_e820_area_size(start, &size, align); + + if (size < sizet) + return 0; + + addr = round_down(start + size - sizet, align); + update_memory_range(addr, sizet, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820 for early_reserve_e820\n"); + update_e820(); + + return addr; +} + diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9f3792d55044..8898aa49079d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #ifdef CONFIG_X86_32 @@ -161,20 +163,81 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) nr_ioapics++; } -static void __init MP_intsrc_info(struct mpc_config_intsrc *m) +static void print_MP_intsrc_info(struct mpc_config_intsrc *m) { - printk(KERN_INFO "Int: type %d, pol %d, trig %d, bus %02x," + printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - mp_irqs[mp_irq_entries].mp_dstapic = m->mpc_dstapic; - mp_irqs[mp_irq_entries].mp_type = m->mpc_type; - mp_irqs[mp_irq_entries].mp_irqtype = m->mpc_irqtype; - mp_irqs[mp_irq_entries].mp_irqflag = m->mpc_irqflag; - mp_irqs[mp_irq_entries].mp_srcbus = m->mpc_srcbus; - mp_irqs[mp_irq_entries].mp_srcbusirq = m->mpc_srcbusirq; - mp_irqs[mp_irq_entries].mp_dstirq = m->mpc_dstirq; +} + +static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) +{ + printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, + (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, + mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); +} + +static void assign_to_mp_irq(struct mpc_config_intsrc *m, + struct mp_config_intsrc *mp_irq) +{ + mp_irq->mp_dstapic = m->mpc_dstapic; + mp_irq->mp_type = m->mpc_type; + mp_irq->mp_irqtype = m->mpc_irqtype; + mp_irq->mp_irqflag = m->mpc_irqflag; + mp_irq->mp_srcbus = m->mpc_srcbus; + mp_irq->mp_srcbusirq = m->mpc_srcbusirq; + mp_irq->mp_dstirq = m->mpc_dstirq; +} + +static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, + struct mpc_config_intsrc *m) +{ + m->mpc_dstapic = mp_irq->mp_dstapic; + m->mpc_type = mp_irq->mp_type; + m->mpc_irqtype = mp_irq->mp_irqtype; + m->mpc_irqflag = mp_irq->mp_irqflag; + m->mpc_srcbus = mp_irq->mp_srcbus; + m->mpc_srcbusirq = mp_irq->mp_srcbusirq; + m->mpc_dstirq = mp_irq->mp_dstirq; +} + +static int mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, + struct mpc_config_intsrc *m) +{ + if (mp_irq->mp_dstapic != m->mpc_dstapic) + return 1; + if (mp_irq->mp_type != m->mpc_type) + return 2; + if (mp_irq->mp_irqtype != m->mpc_irqtype) + return 3; + if (mp_irq->mp_irqflag != m->mpc_irqflag) + return 4; + if (mp_irq->mp_srcbus != m->mpc_srcbus) + return 5; + if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq) + return 6; + if (mp_irq->mp_dstirq != m->mpc_dstirq) + return 7; + + return 0; +} + +void MP_intsrc_info(struct mpc_config_intsrc *m) +{ + int i; + + print_MP_intsrc_info(m); + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); } @@ -268,12 +331,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, * Read/parse the MPC */ -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) +static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem, + char *str) { - char str[16]; - char oem[10]; - int count = sizeof(*mpc); - unsigned char *mpt = ((unsigned char *)mpc) + count; if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", @@ -301,13 +361,28 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) memcpy(str, mpc->mpc_productid, 12); str[12] = 0; -#ifdef CONFIG_X86_32 - mps_oem_check(mpc, oem, str); -#endif printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); + return 1; +} + +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + +#ifdef CONFIG_X86_32 + mps_oem_check(mpc, oem, str); +#endif + /* save the local APIC address, it might be non-default */ if (!acpi_lapic) mp_lapic_addr = mpc->mpc_lapic; @@ -785,3 +860,295 @@ void __init find_smp_config(void) { __find_smp_config(1); } + +#ifdef CONFIG_X86_IO_APIC +static u8 __initdata irq_used[MAX_IRQ_SOURCES]; + +static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m) +{ + int i; + + if (m->mpc_irqtype != mp_INT) + return 0; + + if (m->mpc_irqflag != 0x0f) + return 0; + + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { + if (mp_irqs[i].mp_irqtype != mp_INT) + continue; + + if (mp_irqs[i].mp_irqflag != 0x0f) + continue; + + if (mp_irqs[i].mp_srcbus != m->mpc_srcbus) + continue; + if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ + return -2; + } + irq_used[i] = 1; + return i; + } + + /* not found */ + return -1; +} + +#define SPARE_SLOT_NUM 20 + +static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; +#endif + +static int __init replace_intsrc_all(struct mp_config_table *mpc, + unsigned long mpc_new_phys, + unsigned long mpc_new_length) +{ +#ifdef CONFIG_X86_IO_APIC + int i; + int nr_m_spare = 0; +#endif + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length); + while (count < mpc->mpc_length) { + switch (*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m = + (struct mpc_config_processor *)mpt; + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m = + (struct mpc_config_bus *)mpt; + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + mpt += sizeof(struct mpc_config_ioapic); + count += sizeof(struct mpc_config_ioapic); + break; + } + case MP_INTSRC: + { +#ifdef CONFIG_X86_IO_APIC + struct mpc_config_intsrc *m = + (struct mpc_config_intsrc *)mpt; + + printk(KERN_INFO "OLD "); + print_MP_intsrc_info(m); + i = get_MP_intsrc_index(m); + if (i > 0) { + assign_to_mpc_intsrc(&mp_irqs[i], m); + printk(KERN_INFO "NEW "); + print_mp_irq_info(&mp_irqs[i]); + } else if (!i) { + /* legacy, do nothing */ + } else if (nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) + * are invalid entries, + * we need to use the slot later + */ + m_spare[nr_m_spare] = m; + nr_m_spare++; + } +#endif + mpt += sizeof(struct mpc_config_intsrc); + count += sizeof(struct mpc_config_intsrc); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m = + (struct mpc_config_lintsrc *)mpt; + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + default: + /* wrong mptable */ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); + printk(KERN_ERR "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->mpc_length, 1); + goto out; + } + } + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < mp_irq_entries; i++) { + if (irq_used[i]) + continue; + + if (mp_irqs[i].mp_irqtype != mp_INT) + continue; + + if (mp_irqs[i].mp_irqflag != 0x0f) + continue; + + if (nr_m_spare > 0) { + printk(KERN_INFO "*NEW* found "); + nr_m_spare--; + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_config_intsrc *m = + (struct mpc_config_intsrc *)mpt; + count += sizeof(struct mpc_config_intsrc); + if (!mpc_new_phys) { + printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); + } else { + if (count <= mpc_new_length) + printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count); + else { + printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length); + goto out; + } + } + assign_to_mpc_intsrc(&mp_irqs[i], m); + mpc->mpc_length = count; + mpt += sizeof(struct mpc_config_intsrc); + } + print_mp_irq_info(&mp_irqs[i]); + } +#endif +out: + /* update checksum */ + mpc->mpc_checksum = 0; + mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc, + mpc->mpc_length); + + return 0; +} + +int __initdata enable_update_mptable; + +static int __init update_mptable_setup(char *str) +{ + enable_update_mptable = 1; + return 0; +} +early_param("update_mptable", update_mptable_setup); + +static unsigned long __initdata mpc_new_phys; +static unsigned long mpc_new_length __initdata = 4096; + +/* alloc_mptable or alloc_mptable=4k */ +static int __initdata alloc_mptable; +static int __init parse_alloc_mptable_opt(char *p) +{ + enable_update_mptable = 1; + alloc_mptable = 1; + if (!p) + return 0; + mpc_new_length = memparse(p, &p); + return 0; +} +early_param("alloc_mptable", parse_alloc_mptable_opt); + +void __init early_reserve_e820_mpc_new(void) +{ + if (enable_update_mptable && alloc_mptable) { + u64 startt = 0; +#ifdef CONFIG_X86_TRAMPOLINE + startt = TRAMPOLINE_BASE; +#endif + mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); + } +} + +static int __init update_mp_table(void) +{ + char str[16]; + char oem[10]; + struct intel_mp_floating *mpf; + struct mp_config_table *mpc; + struct mp_config_table *mpc_new; + + if (!enable_update_mptable) + return 0; + + mpf = mpf_found; + if (!mpf) + return 0; + + /* + * Now see if we need to go further. + */ + if (mpf->mpf_feature1 != 0) + return 0; + + if (!mpf->mpf_physptr) + return 0; + + mpc = phys_to_virt(mpf->mpf_physptr); + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + + printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); + printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); + + if (mpc_new_phys && mpc->mpc_length > mpc_new_length) { + mpc_new_phys = 0; + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); + } + + if (!mpc_new_phys) { + unsigned char old, new; + /* check if we can change the postion */ + mpc->mpc_checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); + mpc->mpc_checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); + if (old == new) { + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); + return 0; + } + printk(KERN_INFO "use in-positon replacing\n"); + } else { + mpf->mpf_physptr = mpc_new_phys; + mpc_new = phys_to_virt(mpc_new_phys); + memcpy(mpc_new, mpc, mpc->mpc_length); + mpc = mpc_new; + /* check if we can modify that */ + if (mpc_new_phys - mpf->mpf_physptr) { + struct intel_mp_floating *mpf_new; + /* steal 16 bytes from [0, 1k) */ + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); + mpf_new = phys_to_virt(0x400 - 16); + memcpy(mpf_new, mpf, 16); + mpf = mpf_new; + mpf->mpf_physptr = mpc_new_phys; + } + mpf->mpf_checksum = 0; + mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); + printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); + } + + /* + * only replace the one with mp_INT and + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW, + * already in mp_irqs , stored by ... and mp_config_acpi_gsi, + * may need pci=routeirq for all coverage + */ + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); + + return 0; +} + +late_initcall(update_mp_table); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 89e6cca5d693..978a0d637f3f 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -56,6 +56,7 @@ #include #include