diff options
author | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-02-14 09:46:06 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-02-14 09:46:06 -0800 |
commit | 414f827c46973ba39320cfb43feb55a0eeb9b4e8 (patch) | |
tree | 45e860974ef698e71370a0ebdddcff4f14fbdf9e /arch/x86_64 | |
parent | 86a71dbd3e81e8870d0f0e56b87875f57e58222b (diff) | |
parent | 126b1922367fbe5513daa675a2abd13ed3917f4e (diff) |
Merge branch 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6
* 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6: (94 commits)
[PATCH] x86-64: Remove mk_pte_phys()
[PATCH] i386: Fix broken CONFIG_COMPAT_VDSO on i386
[PATCH] i386: fix 32-bit ioctls on x64_32
[PATCH] x86: Unify pcspeaker platform device code between i386/x86-64
[PATCH] i386: Remove extern declaration from mm/discontig.c, put in header.
[PATCH] i386: Rename cpu_gdt_descr and remove extern declaration from smpboot.c
[PATCH] i386: Move mce_disabled to asm/mce.h
[PATCH] i386: paravirt unhandled fallthrough
[PATCH] x86_64: Wire up compat epoll_pwait
[PATCH] x86: Don't require the vDSO for handling a.out signals
[PATCH] i386: Fix Cyrix MediaGX detection
[PATCH] i386: Fix warning in cpu initialization
[PATCH] i386: Fix warning in microcode.c
[PATCH] x86: Enable NMI watchdog for AMD Family 0x10 CPUs
[PATCH] x86: Add new CPUID bits for AMD Family 10 CPUs in /proc/cpuinfo
[PATCH] i386: Remove fastcall in paravirt.[ch]
[PATCH] x86-64: Fix wrong gcc check in bitops.h
[PATCH] x86-64: survive having no irq mapping for a vector
[PATCH] i386: geode configuration fixes
[PATCH] i386: add option to show more code in oops reports
...
Diffstat (limited to 'arch/x86_64')
30 files changed, 761 insertions, 416 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 02dd39457bcf..7982cbc3bc94 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -152,18 +152,18 @@ config MPSC Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs with Intel Extended Memory 64 Technology(EM64T). For details see <http://www.intel.com/technology/64bitextensions/>. - Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the - Netburst core and shouldn't use this option. You can distingush them + Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the + Netburst core and shouldn't use this option. You can distinguish them using the cpu family field - in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one - (this rule only applies to system that support EM64T) + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one + (this rule only applies to systems that support EM64T) config MCORE2 bool "Intel Core2 / newer Xeon" help Optimize for Intel Core2 and newer Xeons (51xx) - You can distingush the newer Xeons from the older ones using - the cpu family field in /proc/cpuinfo. 15 is a older Xeon + You can distinguish the newer Xeons from the older ones using + the cpu family field in /proc/cpuinfo. 15 is an older Xeon (use CONFIG_MPSC then), 6 is a newer one. This rule only applies to CPUs that support EM64T. @@ -458,8 +458,8 @@ config IOMMU on systems with more than 3GB. This is usually needed for USB, sound, many IDE/SATA chipsets and some other devices. Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART - based IOMMU and a software bounce buffer based IOMMU used on Intel - systems and as fallback. + based hardware IOMMU and a software bounce buffer based IOMMU used + on Intel systems and as fallback. The code is only active when needed (enough memory and limited device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified too. @@ -496,6 +496,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT # need this always selected by IOMMU for the VIA workaround config SWIOTLB bool + help + Support for software bounce buffers used on x86-64 systems + which don't have a hardware IOMMU (e.g. the current generation + of Intel's x86-64 CPUs). Using this PCI devices which can only + access 32-bits of memory can be used on systems with more than + 3 GB of memory. If unsure, say Y. config X86_MCE bool "Machine check support" if EMBEDDED diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 69584c295305..293a4a4c609e 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.20-rc3 -# Fri Jan 5 11:54:41 2007 +# Linux kernel version: 2.6.20-git8 +# Tue Feb 13 11:25:16 2007 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -11,6 +11,7 @@ CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y +CONFIG_ZONE_DMA=y CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y @@ -153,6 +154,7 @@ CONFIG_NEED_MULTIPLE_NODES=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_RESOURCES_64BIT=y +CONFIG_ZONE_DMA_FLAG=1 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y CONFIG_NR_CPUS=32 @@ -201,13 +203,14 @@ CONFIG_ACPI=y CONFIG_ACPI_SLEEP=y CONFIG_ACPI_SLEEP_PROC_FS=y CONFIG_ACPI_SLEEP_PROC_SLEEP=y +CONFIG_ACPI_PROCFS=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y -# CONFIG_ACPI_VIDEO is not set # CONFIG_ACPI_HOTKEY is not set CONFIG_ACPI_FAN=y # CONFIG_ACPI_DOCK is not set +# CONFIG_ACPI_BAY is not set CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=y @@ -263,7 +266,6 @@ CONFIG_PCI_MMCONFIG=y CONFIG_PCIEPORTBUS=y CONFIG_PCIEAER=y CONFIG_PCI_MSI=y -# CONFIG_PCI_MULTITHREAD_PROBE is not set # CONFIG_PCI_DEBUG is not set # CONFIG_HT_IRQ is not set @@ -398,6 +400,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set # @@ -466,6 +469,7 @@ CONFIG_BLK_DEV_IDECD=y # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set # CONFIG_BLK_DEV_IDESCSI is not set +CONFIG_BLK_DEV_IDEACPI=y # CONFIG_IDE_TASK_IOCTL is not set # @@ -497,6 +501,7 @@ CONFIG_BLK_DEV_ATIIXP=y # CONFIG_BLK_DEV_JMICRON is not set # CONFIG_BLK_DEV_SC1200 is not set CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_IT8213 is not set # CONFIG_BLK_DEV_IT821X is not set # CONFIG_BLK_DEV_NS87415 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set @@ -507,6 +512,7 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y # CONFIG_BLK_DEV_SLC90E66 is not set # CONFIG_BLK_DEV_TRM290 is not set # CONFIG_BLK_DEV_VIA82CXXX is not set +# CONFIG_BLK_DEV_TC86C001 is not set # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set @@ -599,6 +605,7 @@ CONFIG_MEGARAID_SAS=y # Serial ATA (prod) and Parallel ATA (experimental) drivers # CONFIG_ATA=y +# CONFIG_ATA_NONSTANDARD is not set CONFIG_SATA_AHCI=y CONFIG_SATA_SVW=y CONFIG_ATA_PIIX=y @@ -614,6 +621,7 @@ CONFIG_SATA_SIL=y # CONFIG_SATA_ULI is not set CONFIG_SATA_VIA=y # CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set @@ -630,6 +638,7 @@ CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_HPT3X2N is not set # CONFIG_PATA_HPT3X3 is not set # CONFIG_PATA_IT821X is not set +# CONFIG_PATA_IT8213 is not set # CONFIG_PATA_JMICRON is not set # CONFIG_PATA_TRIFLEX is not set # CONFIG_PATA_MARVELL is not set @@ -682,9 +691,7 @@ CONFIG_IEEE1394=y # Subsystem Options # # CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_OUI_DB is not set # CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set -# CONFIG_IEEE1394_EXPORT_FULL_API is not set # # Device Drivers @@ -707,6 +714,11 @@ CONFIG_IEEE1394_RAWIO=y # CONFIG_I2O is not set # +# Macintosh device drivers +# +# CONFIG_MAC_EMUMOUSEBTN is not set + +# # Network device support # CONFIG_NETDEVICES=y @@ -774,6 +786,7 @@ CONFIG_8139TOO=y # CONFIG_EPIC100 is not set # CONFIG_SUNDANCE is not set # CONFIG_VIA_RHINE is not set +# CONFIG_SC92031 is not set # # Ethernet (1000 Mbit) @@ -795,11 +808,13 @@ CONFIG_E1000=y CONFIG_TIGON3=y CONFIG_BNX2=y # CONFIG_QLA3XXX is not set +# CONFIG_ATL1 is not set # # Ethernet (10000 Mbit) # # CONFIG_CHELSIO_T1 is not set +# CONFIG_CHELSIO_T3 is not set # CONFIG_IXGB is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set @@ -1115,6 +1130,7 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y +CONFIG_OBSOLETE_OSS=y # CONFIG_SOUND_BT878 is not set # CONFIG_SOUND_ES1371 is not set CONFIG_SOUND_ICH=y @@ -1128,6 +1144,7 @@ CONFIG_SOUND_ICH=y # HID Devices # CONFIG_HID=y +# CONFIG_HID_DEBUG is not set # # USB support @@ -1142,10 +1159,8 @@ CONFIG_USB=y # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y -# CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set -# CONFIG_USB_MULTITHREAD_PROBE is not set # CONFIG_USB_OTG is not set # @@ -1155,9 +1170,11 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set +# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y -# CONFIG_USB_OHCI_BIG_ENDIAN is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set @@ -1208,6 +1225,7 @@ CONFIG_USB_HID=y # CONFIG_USB_ATI_REMOTE2 is not set # CONFIG_USB_KEYSPAN_REMOTE is not set # CONFIG_USB_APPLETOUCH is not set +# CONFIG_USB_GTCO is not set # # USB Imaging devices @@ -1313,6 +1331,10 @@ CONFIG_USB_MON=y # # +# Auxiliary Display support +# + +# # Virtualization # # CONFIG_KVM is not set @@ -1512,6 +1534,7 @@ CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_FS=y # CONFIG_HEADERS_CHECK is not set CONFIG_DEBUG_KERNEL=y +# CONFIG_DEBUG_SHIRQ is not set CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set @@ -1520,7 +1543,6 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_RWSEMS is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set @@ -1560,4 +1582,5 @@ CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y -CONFIG_IOMAP_COPY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index ff499ef2a1ba..359eacc38509 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -21,6 +21,7 @@ #include <linux/stddef.h> #include <linux/personality.h> #include <linux/compat.h> +#include <linux/binfmts.h> #include <asm/ucontext.h> #include <asm/uaccess.h> #include <asm/i387.h> @@ -449,7 +450,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, /* Return stub is in 32bit vsyscall page */ { - void __user *restorer = VSYSCALL32_SIGRETURN; + void __user *restorer; + if (current->binfmt->hasvdso) + restorer = VSYSCALL32_SIGRETURN; + else + restorer = (void *)&frame->retcode; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); @@ -495,7 +500,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif @@ -601,7 +606,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5f32cf4de5fb..eda7a0d4dc15 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -718,4 +718,5 @@ ia32_sys_call_table: .quad compat_sys_vmsplice .quad compat_sys_move_pages .quad sys_getcpu + .quad sys_epoll_pwait ia32_syscall_end: diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 3c7cbff04d3d..ae399458024b 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o +obj-y += pcspeaker.o CFLAGS_vsyscall.o := $(PROFILING) -g0 @@ -56,3 +57,4 @@ quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o alternative-y += ../../i386/kernel/alternative.o +pcspeaker-y += ../../i386/kernel/pcspeaker.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 5ebf62c7a3d2..23178ce6c783 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -58,7 +58,7 @@ unsigned long acpi_wakeup_address = 0; unsigned long acpi_video_flags; extern char wakeup_start, wakeup_end; -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); +extern unsigned long acpi_copy_wakeup_routine(unsigned long); static pgd_t low_ptr; diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 6fe191c58084..4651fd22b213 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 1; } +#ifdef CONFIG_NUMA + /* NUMA memory to node map */ + if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { + *addrp = nodemap_addr + nodemap_size; + return 1; + } +#endif /* XXX ramdisk image here? */ return 0; } @@ -184,6 +191,37 @@ unsigned long __init e820_end_of_ram(void) } /* + * Find the hole size in the range. + */ +unsigned long __init e820_hole_size(unsigned long start, unsigned long end) +{ + unsigned long ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long last, addr; + + if (ei->type != E820_RAM || + ei->addr+ei->size <= start || + ei->addr >= end) + continue; + + addr = round_up(ei->addr, PAGE_SIZE); + if (addr < start) + addr = start; + + last = round_down(ei->addr + ei->size, PAGE_SIZE); + if (last >= end) + last = end; + + if (last > addr) + ram += last - addr; + } + return ((end - start) - ram); +} + +/* * Mark e820 reserved areas as busy for the resource manager. */ void __init e820_reserve_resources(void) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 1e6f80870679..598a4d0351fc 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -163,6 +163,20 @@ startup_64: */ lgdt cpu_gdt_descr + /* set up data segments. actually 0 would do too */ + movl $__KERNEL_DS,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* + * We don't really need to load %fs or %gs, but load them anyway + * to kill any stale realmode selectors. This allows execution + * under VT hardware. + */ + movl %eax,%fs + movl %eax,%gs + /* * Setup up a dummy PDA. this is just for some early bootup code * that does in_interrupt() @@ -173,12 +187,6 @@ startup_64: shrq $32,%rdx wrmsr - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - movl %eax,%ss - movl %eax,%es - /* esi is pointer to real mode structure with interesting info. pass it to C */ movl %esi, %edi diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 6be6730acb5c..566e64d966c4 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -831,7 +831,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); @@ -839,7 +839,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) if (irq_trigger(idx)) { entry.trigger = 1; entry.mask = 1; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); } if (!apic && !IO_APIC_IRQ(irq)) @@ -851,7 +851,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) if (vector < 0) return; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.dest = cpu_mask_to_apicid(mask); entry.vector = vector; ioapic_register_intr(irq, vector, IOAPIC_AUTO); @@ -920,7 +920,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in */ entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* unmask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; entry.trigger = 0; @@ -1020,18 +1020,17 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, i); - printk(KERN_DEBUG " %02x %03X %02X ", + printk(KERN_DEBUG " %02x %03X ", i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest + entry.dest ); printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", @@ -1293,8 +1292,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest.physical.physical_dest = - GET_APIC_ID(apic_read(APIC_ID)); + entry.dest = GET_APIC_ID(apic_read(APIC_ID)); /* * Add it to the IO-APIC irq-routing table: @@ -1556,7 +1554,7 @@ static inline void unlock_ExtINT_logic(void) entry1.dest_mode = 0; /* physical delivery */ entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; entry1.trigger = 0; @@ -2131,7 +2129,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.dest = cpu_mask_to_apicid(mask); entry.trigger = triggering; entry.polarity = polarity; entry.mask = 1; /* Disabled (masked) */ diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index fe063d3cfe42..745b1f0f494e 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -114,6 +114,6 @@ asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); + regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); return 0; } diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 0c06af6c13bc..3bc30d2c13d3 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -18,6 +18,7 @@ #include <asm/uaccess.h> #include <asm/io_apic.h> #include <asm/idle.h> +#include <asm/smp.h> atomic_t irq_err_count; @@ -120,9 +121,14 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) if (likely(irq < NR_IRQS)) generic_handle_irq(irq); - else if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", - __func__, smp_processor_id(), vector); + else { + if (!disable_apic) + ack_APIC_irq(); + + if (printk_ratelimit()) + printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", + __func__, smp_processor_id(), vector); + } irq_exit(); diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index bdb54a2c9f18..8011a8e1c7d4 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -19,6 +19,7 @@ #include <linux/cpu.h> #include <linux/percpu.h> #include <linux/ctype.h> +#include <linux/kmod.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -42,6 +43,10 @@ static unsigned long console_logged; static int notify_user; static int rip_msr; static int mce_bootlog = 1; +static atomic_t mce_events; + +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; /* * Lockless MCE logging infrastructure. @@ -57,6 +62,7 @@ struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { @@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +static void do_mce_trigger(void) +{ + static atomic_t mce_logged; + int events = atomic_read(&mce_events); + if (events != atomic_read(&mce_logged) && trigger[0]) { + /* Small race window, but should be harmless. */ + atomic_set(&mce_logged, events); + call_usermodehelper(trigger, trigger_argv, NULL, -1); + } +} + /* * The actual machine check handler */ @@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) } /* Never do anything final in the polling timer */ - if (!regs) + if (!regs) { + /* Normal interrupt context here. Call trigger for any new + events. */ + do_mce_trigger(); goto out; + } /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ @@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce); } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); +/* TBD should generate these dynamically based on number of available banks */ ACCESSOR(bank0ctl,bank[0],mce_restart()) ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static struct sysdev_attribute * bank_attributes[NR_BANKS] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; + +static ssize_t show_trigger(struct sys_device *s, char *buf) +{ + strcpy(buf, trigger); + strcat(buf, "\n"); + return strlen(trigger) + 1; +} + +static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +{ + char *p; + int len; + strncpy(trigger, buf, sizeof(trigger)); + trigger[sizeof(trigger)-1] = 0; + len = strlen(trigger); + p = strchr(trigger, '\n'); + if (*p) *p = 0; + return len; +} + +static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) +static struct sysdev_attribute *mce_attributes[] = { + &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, + &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, + &attr_tolerant, &attr_check_interval, &attr_trigger, + NULL +}; /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ static __cpuinit int mce_create_device(unsigned int cpu) @@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu) err = sysdev_register(&per_cpu(device_mce,cpu)); if (!err) { - for (i = 0; i < banks; i++) + for (i = 0; mce_attributes[i]; i++) sysdev_create_file(&per_cpu(device_mce,cpu), - bank_attributes[i]); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + mce_attributes[i]); } return err; } @@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu) { int i; - for (i = 0; i < banks; i++) + for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), - bank_attributes[i]); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + mce_attributes[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); } diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 93c707257637..d0bd5d66e103 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -37,6 +37,8 @@ #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 #define MASK_LVTOFF_HI 0x00F00000 #define MASK_COUNT_EN_HI 0x00080000 #define MASK_INT_TYPE_HI 0x00060000 @@ -122,14 +124,17 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) - address = MCG_XBLK_ADDR - + ((low & MASK_BLKPTR_LO) >> 21); + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else ++address; if (rdmsr_safe(address, &low, &high)) - continue; + break; if (!(high & MASK_VALID_HI)) { if (block) @@ -138,8 +143,8 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) break; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) continue; if (!block) @@ -187,17 +192,22 @@ asmlinkage void mce_threshold_interrupt(void) /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + continue; for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) - address = MCG_XBLK_ADDR - + ((low & MASK_BLKPTR_LO) >> 21); + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else ++address; if (rdmsr_safe(address, &low, &high)) - continue; + break; if (!(high & MASK_VALID_HI)) { if (block) @@ -206,10 +216,14 @@ asmlinkage void mce_threshold_interrupt(void) break; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) continue; + /* Log the machine check that caused the threshold + event. */ + do_machine_check(NULL, 0); + if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, @@ -385,7 +399,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, return 0; if (rdmsr_safe(address, &low, &high)) - goto recurse; + return 0; if (!(high & MASK_VALID_HI)) { if (block) @@ -394,8 +408,8 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, return 0; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) goto recurse; b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 9cb42ecb7f89..486f4c61a948 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -172,7 +172,7 @@ static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return boot_cpu_data.x86 == 15; + return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return 1; @@ -214,6 +214,23 @@ static __init void nmi_cpu_busy(void *data) } #endif +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + unsigned int retval = hz; + + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) { + retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; + } + return retval; +} + int __init check_nmi_watchdog (void) { int *counts; @@ -268,17 +285,8 @@ int __init check_nmi_watchdog (void) struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); nmi_hz = 1; - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && - ((u64)cpu_khz * 1000) > 0x7fffffffULL) { - nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; - } + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) + nmi_hz = adjust_for_32bit_ctr(nmi_hz); } kfree(counts); @@ -360,6 +368,33 @@ void enable_timer_nmi_watchdog(void) } } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ @@ -634,7 +669,9 @@ static int setup_intel_arch_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -855,15 +892,23 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, + -((u64)cpu_khz * 1000 / nmi_hz)); } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { /* * ArchPerfom/Core Duo needs to re-unmask * the apic vector */ apic_write(APIC_LVTPC, APIC_DM_NMI); + /* ARCH_PERFMON has 32 bit counter writes */ + wrmsr(wd->perfctr_msr, + (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); + } else { + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, + -((u64)cpu_khz * 1000 / nmi_hz)); } - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); rc = 1; } else if (nmi_watchdog == NMI_IO_APIC) { /* don't know how to accurately check for this. diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 3d65b1d4c2b3..04480c3b68f5 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -138,6 +138,8 @@ static const unsigned long phb_debug_offsets[] = { #define PHB_DEBUG_STUFF_OFFSET 0x0020 +#define EMERGENCY_PAGES 32 /* = 128KB */ + unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; @@ -296,6 +298,16 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, { unsigned long entry; unsigned long badbit; + unsigned long badend; + + /* were we called with bad_dma_address? */ + badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " + "address 0x%Lx\n", dma_addr); + WARN_ON(1); + return; + } entry = dma_addr >> PAGE_SHIFT; @@ -656,8 +668,8 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) u64 start; struct iommu_table *tbl = dev->sysdata; - /* reserve bad_dma_address in case it's a legal address */ - iommu_range_reserve(tbl, bad_dma_address, 1); + /* reserve EMERGENCY_PAGES from bad_dma_address and up */ + iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ start = (640 * 1024); @@ -1176,6 +1188,7 @@ int __init calgary_iommu_init(void) } force_iommu = 1; + bad_dma_address = 0x0; dma_ops = &calgary_dma_ops; return 0; diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 683b7a5c1ab3..651ccfb06697 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] - [,forcesac][,fullflush][,nomerge][,biomerge] - size set size of iommu (in bytes) - noagp don't initialize the AGP driver and use full aperture. - off don't use the IOMMU - leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) - memaper[=order] allocate an own aperture over RAM with size 32MB^order. - noforce don't force IOMMU usage. Default. - force Force IOMMU. - merge Do lazy merging. This may improve performance on some block devices. - Implies force (experimental) - biomerge Do merging at the BIO layer. This is more efficient than merge, - but should be only done with very big IOMMUs. Implies merge,force. - nomerge Don't do SG merging. - forcesac For SAC mode for masks <40bits (experimental) - fullflush Flush IOMMU on each allocation (default) - nofullflush Don't use IOMMU fullflush - allowed overwrite iommu off workarounds for specific chipsets. - soft Use software bounce buffering (default for Intel machines) - noaperture Don't touch the aperture for AGP. - allowdac Allow DMA >4GB - nodac Forbid DMA >4GB - panic Force panic when IOMMU overflows -*/ +/* + * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter + * documentation. + */ __init int iommu_setup(char *p) { iommu_merge = 1; diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index fc1960f1f243..030eb3753358 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -185,7 +185,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { u64 mask = *dev->dma_mask; - int high = addr + size >= mask; + int high = addr + size > mask; int mmu = high; if (force_iommu) mmu = 1; @@ -195,7 +195,7 @@ static inline int need_iommu(struct device *dev, unsigned long addr, size_t size static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { u64 mask = *dev->dma_mask; - int high = addr + size >= mask; + int high = addr + size > mask; int mmu = high; return mmu; } diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index addc14af0c56..4326a690a509 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -536,8 +536,12 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) } ret = 0; for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __get_user(tmp, (unsigned long __user *) data); - putreg(child, ui, tmp); + ret = __get_user(tmp, (unsigned long __user *) data); + if (ret) + break; + ret = putreg(child, ui, tmp); + if (ret) + break; data += sizeof(long); } break; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 60477244d1a3..3d98b696881d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -138,128 +138,6 @@ struct resource code_resource = { .flags = IORESOURCE_RAM, }; -#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource adapter_rom_resources[] = { - { .name = "Adapter ROM", .start = 0xc8000, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM } -}; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_ROM, -}; - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_RAM, -}; - -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; - start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed @@ -444,6 +322,11 @@ void __init setup_arch(char **cmdline_p) /* reserve ebda region */ if (ebda_addr) reserve_bootmem_generic(ebda_addr, ebda_size); +#ifdef CONFIG_NUMA + /* reserve nodemap region */ + if (nodemap_addr) + reserve_bootmem_generic(nodemap_addr, nodemap_size); +#endif #ifdef CONFIG_SMP /* @@ -519,15 +402,11 @@ void __init setup_arch(char **cmdline_p) init_apic_mappings(); /* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. - */ - probe_roms(); + * We trust e820 completely. No explicit ROM probing in memory. + */ e820_reserve_resources(); e820_mark_nosave_regions(); - request_resource(&iomem_resource, &video_ram_resource); - { unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ @@ -1063,7 +942,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", + "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -1081,7 +961,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -1091,8 +971,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy", + "altmovcr8", "abm", "sse4a", + "misalignsse", "3dnowprefetch", + "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -1103,6 +985,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) "ttp", /* thermal trip */ "tm", "stc", + "100mhzsteps", + "hwpstate", + NULL, /* tsc invariant mapped to constant_tsc */ NULL, /* nothing */ /* constant_tsc - moved to flags */ }; @@ -1219,23 +1104,3 @@ struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; - -#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE) -#include <linux/platform_device.h> -static __init int add_pcspkr(void) -{ - struct platform_device *pd; - int ret; - - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; - - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; -} -device_initcall(add_pcspkr); -#endif diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 8c4b80fe71a1..6a70b55f719d 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -37,7 +37,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL(__supported_pte_mask); static int do_not_nx __cpuinitdata = 0; /* noexec=on|off diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 6026b31d037e..65ac2c6b34a6 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -32,7 +32,7 @@ static void save_stack_address(void *data, unsigned long addr) trace->skip--; return; } - if (trace->nr_entries < trace->max_entries - 1) + if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = addr; } @@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_ops = { void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { dump_trace(task, NULL, NULL, &save_stack_ops, trace); - trace->entries[trace->nr_entries++] = ULONG_MAX; + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL(save_stack_trace); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 335cc91c49b7..3cc6886f1fb7 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -657,6 +657,7 @@ core_initcall(cpufreq_tsc); #define TICK_COUNT 100000000 #define TICK_MIN 5000 +#define MAX_READ_RETRIES 5 /* * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none @@ -664,13 +665,17 @@ core_initcall(cpufreq_tsc); */ static void __init read_hpet_tsc(int *hpet, int *tsc) { - int tsc1, tsc2, hpet1; + int tsc1, tsc2, hpet1, retries = 0; + static int msg; do { tsc1 = get_cycles_sync(); hpet1 = hpet_readl(HPET_COUNTER); tsc2 = get_cycles_sync(); - } while (tsc2 - tsc1 > TICK_MIN); + } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES); + if (retries >= MAX_READ_RETRIES && !msg++) + printk(KERN_WARNING + "hpet.c: exceeded max retries to read HPET & TSC\n"); *hpet = hpet1; *tsc = tsc2; } @@ -1221,8 +1226,9 @@ static void hpet_rtc_timer_reinit(void) if (PIE_on) PIE_count += lost_ints; - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); } } diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 6d77e4797a47..0dffae69f4ad 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); +EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); EXPORT_SYMBOL(__copy_from_user_inatomic); @@ -34,8 +35,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); #ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); +extern void __write_lock_failed(rwlock_t *rw); +extern void __read_lock_failed(rwlock_t *rw); EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); #endif diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index b78d4170fce2..8d5f835af481 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ thunk.o clear_page.o copy_page.o bitstr.o bitops.o -lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o +lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S new file mode 100644 index 000000000000..4620efb12f13 --- /dev/null +++ b/arch/x86_64/lib/copy_user_nocache.S @@ -0,0 +1,217 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v2. + * + * Functions to copy from and to user space. + */ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +#define FIX_ALIGNMENT 1 + +#include <asm/current.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/cpufeature.h> + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. + * + * Input: + * rdi destination + * rsi source + * rdx count + * rcx zero flag when 1 zero on exception + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(__copy_user_nocache) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + pushq %rcx /* save zero flag */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 + + xorl %eax,%eax /* zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movnti %r11,(%rdi) +.Ld2: movnti %r8,1*8(%rdi) +.Ld3: movnti %r9,2*8(%rdi) +.Ld4: movnti %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movnti %r11,4*8(%rdi) +.Ld6: movnti %r8,5*8(%rdi) +.Ld7: movnti %r9,6*8(%rdi) +.Ld8: movnti %r10,7*8(%rdi) + + dec %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movnti %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + + CFI_REMEMBER_STATE +.Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE %rcx + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret + CFI_RESTORE_STATE + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + cmpl $0,(%rsp) /* zero flag set? */ + jz .Le_zero + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + CFI_ENDPROC +ENDPROC(__copy_user_nocache) + + diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 49e8cf2e06f8..6ada7231f3ab 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) +static inline int notify_page_fault(struct pt_regs *regs, long err) { struct die_args args = { .regs = regs, - .str = str, + .str = "page fault", .err = err, - .trapnr = trap, - .signr = sig + .trapnr = 14, + .signr = SIGSEGV }; - return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); + return atomic_notifier_call_chain(¬ify_page_fault_chain, + DIE_PAGE_FAULT, &args); } /* Sometimes the CPU reports invalid exceptions on prefetch. @@ -355,8 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (vmalloc_fault(address) >= 0) return; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; /* * Don't take the mm semaphore here. If we fixup a prefetch @@ -365,8 +364,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; if (likely(regs->eflags & X86_EFLAGS_IF)) diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 2ee2e003606c..41b8fb069924 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; +unsigned long __initdata nodemap_addr; +unsigned long __initdata nodemap_size; /* @@ -52,34 +54,88 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) int res = -1; unsigned long addr, end; - if (shift >= 64) - return -1; - memset(memnodemap, 0xff, sizeof(memnodemap)); + memset(memnodemap, 0xff, memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; if (addr >= end) continue; - if ((end >> shift) >= NODEMAPSIZE) + if ((end >> shift) >= memnodemapsize) return 0; do { if (memnodemap[addr >> shift] != 0xff) return -1; memnodemap[addr >> shift] = i; - addr += (1UL << shift); + addr += (1UL << shift); } while (addr < end); res = 1; } return res; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +static int __init allocate_cachealigned_memnodemap(void) { - int shift = 20; + unsigned long pad, pad_addr; + + memnodemap = memnode.embedded_map; + if (memnodemapsize <= 48) + return 0; + + pad = L1_CACHE_BYTES - 1; + pad_addr = 0x8000; + nodemap_size = pad + memnodemapsize; + nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, + nodemap_size); + if (nodemap_addr == -1UL) { + printk(KERN_ERR + "NUMA: Unable to allocate Memory to Node hash map\n"); + nodemap_addr = nodemap_size = 0; + return -1; + } + pad_addr = (nodemap_addr + pad) & ~pad; + memnodemap = phys_to_virt(pad_addr); + + printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", + nodemap_addr, nodemap_addr + nodemap_size); + return 0; +} - while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) - shift++; +/* + * The LSB of all start and end addresses in the node map is the value of the + * maximum possible shift. + */ +static int __init +extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) +{ + int i, nodes_used = 0; + unsigned long start, end; + unsigned long bitfield = 0, memtop = 0; + + for (i = 0; i < numnodes; i++) { + start = nodes[i].start; + end = nodes[i].end; + if (start >= end) + continue; + bitfield |= start; + nodes_used++; + if (end > memtop) + memtop = end; + } + if (nodes_used <= 1) + i = 63; + else + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + memnodemapsize = (memtop >> i)+1; + return i; +} + +int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +{ + int shift; + shift = extract_lsb_from_nodes(nodes, numnodes); + if (allocate_cachealigned_memnodemap()) + return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); @@ -216,31 +272,113 @@ void __init numa_init_array(void) } #ifdef CONFIG_NUMA_EMU +/* Numa emulation */ int numa_fake __initdata = 0; -/* Numa emulation */ +/* + * This function is used to find out if the start and end correspond to + * different zones. + */ +int zone_cross_over(unsigned long start, unsigned long end) +{ + if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && + (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) + return 1; + return 0; +} + static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - int i; + int i, big; struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; + unsigned long sz, old_sz; + unsigned long hole_size; + unsigned long start, end; + unsigned long max_addr = (end_pfn << PAGE_SHIFT); + + start = (start_pfn << PAGE_SHIFT); + hole_size = e820_hole_size(start, max_addr); + sz = (max_addr - start - hole_size) / numa_fake; /* Kludge needed for the hash function */ - if (hweight64(sz) > 1) { - unsigned long x = 1; - while ((x << 1) < sz) - x <<= 1; - if (x < sz/2) - printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); - sz = x; - } + old_sz = sz; + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + */ + sz &= FAKE_NODE_MIN_HASH_MASK; + + /* + * We ensure that each node is at least 64MB big. Smaller than this + * size can cause VM hiccups. + */ + if (sz == 0) { + printk(KERN_INFO "Not enough memory for %d nodes. Reducing " + "the number of nodes\n", numa_fake); + numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Number of fake nodes will be = %d\n", + numa_fake); + sz = FAKE_NODE_MIN_SIZE; + } + /* + * Find out how many nodes can get an extra NODE_MIN_SIZE granule. + * This logic ensures the extra memory gets distributed among as many + * nodes as possible (as compared to one single node getting all that + * extra memory. + */ + big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " + "%d\n", + (sz >> 20), (hole_size >> 20), big); memset(&nodes,0,sizeof(nodes)); + end = start; for (i = 0; i < numa_fake; i++) { - nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; + /* + * In case we are not able to allocate enough memory for all + * the nodes, we reduce the number of fake nodes. + */ + if (end >= max_addr) { + numa_fake = i - 1; + break; + } + start = nodes[i].start = end; + /* + * Final node can have all the remaining memory. + */ if (i == numa_fake-1) - sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; - nodes[i].end = nodes[i].start + sz; + sz = max_addr - start; + end = nodes[i].start + sz; + /* + * Fir "big" number of nodes get extra granule. + */ + if (i < big) + end += FAKE_NODE_MIN_SIZE; + /* + * Iterate over the range to ensure that this node gets at + * least sz amount of RAM (excluding holes) + */ + while ((end - start - e820_hole_size(start, end)) < sz) { + end += FAKE_NODE_MIN_SIZE; + if (end >= max_addr) + break; + } + /* + * Look at the next node to make sure there is some real memory + * to map. Bad things happen when the only memory present + * in a zone on a fake node is IO hole. + */ + while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { + if (zone_cross_over(start, end + sz)) { + end = (MAX_DMA32_PFN << PAGE_SHIFT); + break; + } + if (end >= max_addr) + break; + end += FAKE_NODE_MIN_SIZE; + } + if (end > max_addr) + end = max_addr; + nodes[i].end = end; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", i, nodes[i].start, nodes[i].end, @@ -290,6 +428,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ memnode_shift = 63; + memnodemap = memnode.embedded_map; memnodemap[0] = 0; nodes_clear(node_online_map); node_set_online(0); @@ -321,20 +460,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -#ifdef CONFIG_SPARSEMEM -static void __init arch_sparse_init(void) -{ - int i; - - for_each_online_node(i) - memory_present(i, node_start_pfn(i), node_end_pfn(i)); - - sparse_init(); -} -#else -#define arch_sparse_init() do {} while (0) -#endif - void __init paging_init(void) { int i; @@ -344,7 +469,8 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = end_pfn; - arch_sparse_init(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); for_each_online_node(i) { setup_node_zones(i); diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index ccb91dd996a9..65c5eaa59905 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -107,6 +107,7 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) pud_t *pud; pmd_t *pmd; pte_t large_pte; + unsigned long pfn; pgd = pgd_offset_k(address); BUG_ON(pgd_none(*pgd)); @@ -114,7 +115,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; + large_pte = pfn_pte(pfn, ref_prot); large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); } diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile index 149aba05a5b8..c9eddc8859c0 100644 --- a/arch/x86_64/pci/Makefile +++ b/arch/x86_64/pci/Makefile @@ -11,7 +11,7 @@ obj-y += fixup.o init.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o common.o early.o # mmconfig has a 64bit special -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o obj-$(CONFIG_NUMA) += k8-bus.o @@ -24,3 +24,4 @@ fixup-y += ../../i386/pci/fixup.o i386-y += ../../i386/pci/i386.o init-y += ../../i386/pci/init.o early-y += ../../i386/pci/early.o +mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index faabb6e87f12..65d82736987e 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -13,16 +13,6 @@ #include "pci.h" -/* aperture is up to 256MB but BIOS may reserve less */ -#define MMCONFIG_APER_MIN (2 * 1024*1024) -#define MMCONFIG_APER_MAX (256 * 1024*1024) - -/* Verify the first 16 busses. We assume that systems with more busses - get MCFG right. */ -#define MAX_CHECK_BUS 16 - -static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS); - /* Static virtual mapping of the MMCONFIG aperture */ struct mmcfg_virt { struct acpi_mcfg_allocation *cfg; @@ -32,30 +22,17 @@ static struct mmcfg_virt *pci_mmcfg_virt; static char __iomem *get_virt(unsigned int seg, unsigned bus) { - int cfg_num = -1; struct acpi_mcfg_allocation *cfg; + int cfg_num; - while (1) { - ++cfg_num; - if (cfg_num >= pci_mmcfg_config_num) - break; + for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = pci_mmcfg_virt[cfg_num].cfg; - if (cfg->pci_segment != seg) - continue; - if ((cfg->start_bus_number <= bus) && + if (cfg->pci_segment == seg && + (cfg->start_bus_number <= bus) && (cfg->end_bus_number >= bus)) return pci_mmcfg_virt[cfg_num].virt; } - /* Handle more broken MCFG tables on Asus etc. - They only contain a single entry for bus 0-0. Assume - this applies to all busses. */ - cfg = &pci_mmcfg_config[0]; - if (pci_mmcfg_config_num == 1 && - cfg->pci_segment == 0 && - (cfg->start_bus_number | cfg->end_bus_number) == 0) - return pci_mmcfg_virt[0].virt; - /* Fall back to type 0 */ return NULL; } @@ -63,8 +40,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) { char __iomem *addr; - if (seg == 0 && bus < MAX_CHECK_BUS && - test_bit(32*bus + PCI_SLOT(devfn), fallback_slots)) + if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && + test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots)) return NULL; addr = get_virt(seg, bus); if (!addr) @@ -135,79 +112,46 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -/* K8 systems have some devices (typically in the builtin northbridge) - that are only accessible using type1 - Normally this can be expressed in the MCFG by not listing them - and assigning suitable _SEGs, but this isn't implemented in some BIOS. - Instead try to discover all devices on bus 0 that are unreachable using MM - and fallback for them. */ -static __init void unreachable_devices(void) +static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) { - int i, k; - /* Use the max bus number from ACPI here? */ - for (k = 0; k < MAX_CHECK_BUS; k++) { - for (i = 0; i < 32; i++) { - u32 val1; - char __iomem *addr; - - pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1); - if (val1 == 0xffffffff) - continue; - addr = pci_dev_base(0, k, PCI_DEVFN(i, 0)); - if (addr == NULL|| readl(addr) != val1) { - set_bit(i + 32*k, fallback_slots); - printk(KERN_NOTICE "PCI: No mmconfig possible" - " on device %02x:%02x\n", k, i); - } - } + void __iomem *addr; + u32 size; + + size = (cfg->end_bus_number + 1) << 20; + addr = ioremap_nocache(cfg->address, size); + if (addr) { + printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", + cfg->address, cfg->address + size - 1); } + return addr; } -void __init pci_mmcfg_init(int type) +int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, + unsigned int devfn) { - int i; - - if ((pci_probe & PCI_PROBE_MMCONF) == 0) - return; - - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) - return; - - /* Only do this check when type 1 works. If it doesn't work - assume we run on a Mac and always use MCFG */ - if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address, - pci_mmcfg_config[0].address + MMCONFIG_APER_MIN, - E820_RESERVED)) { - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n", - (unsigned long)pci_mmcfg_config[0].address); - printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); - return; - } + return pci_dev_base(seg, bus, devfn) != NULL; +} - pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); +int __init pci_mmcfg_arch_init(void) +{ + int i; + pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * + pci_mmcfg_config_num, GFP_KERNEL); if (pci_mmcfg_virt == NULL) { printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); - return; + return 0; } + for (i = 0; i < pci_mmcfg_config_num; ++i) { pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; - pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address, - MMCONFIG_APER_MAX); + pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); if (!pci_mmcfg_virt[i].virt) { printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " "segment %d\n", pci_mmcfg_config[i].pci_segment); - return; + return 0; } - printk(KERN_INFO "PCI: Using MMCONFIG at %lx\n", - (unsigned long)pci_mmcfg_config[i].address); } - - unreachable_devices(); - raw_pci_ops = &pci_mmcfg; - pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + return 1; } |