29 files changed, 213 insertions, 205 deletions
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 07c4318c0629..fd8f6bb5face 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -8,20 +8,22 @@
 # for "archclean" and "archdep" for cleaning up and making dependencies for
 # this architecture
 
-ifeq ($(CROSS_COMPILE),)
 # If building with TILERA_ROOT set (i.e. using the Tilera Multicore
 # Development Environment) we can set CROSS_COMPILE based on that.
-ifdef TILERA_ROOT
-CROSS_COMPILE	= $(TILERA_ROOT)/bin/tile-
-endif
-endif
-
 # If we're not cross-compiling, make sure we're on the right architecture.
+# Only bother to test for a few common targets, to avoid useless errors.
 ifeq ($(CROSS_COMPILE),)
-HOST_ARCH = $(shell uname -m)
-ifneq ($(HOST_ARCH),$(ARCH))
+  ifdef TILERA_ROOT
+    CROSS_COMPILE := $(TILERA_ROOT)/bin/tile-
+  else
+    goals := $(if $(MAKECMDGOALS), $(MAKECMDGOALS), all)
+    ifneq ($(strip $(filter vmlinux modules all,$(goals))),)
+      HOST_ARCH := $(shell uname -m)
+      ifneq ($(HOST_ARCH),$(ARCH))
 $(error Set TILERA_ROOT or CROSS_COMPILE when building $(ARCH) on $(HOST_ARCH))
-endif
+      endif
+    endif
+  endif
 endif
 
 
diff --git a/arch/tile/include/arch/abi.h b/arch/tile/include/arch/abi.h
index da8df5b9d914..8affc76f771a 100644
--- a/arch/tile/include/arch/abi.h
+++ b/arch/tile/include/arch/abi.h
@@ -59,9 +59,7 @@
  * The ABI requires callers to allocate a caller state save area of
  * this many bytes at the bottom of each stack frame.
  */
-#ifdef __tile__
-#define C_ABI_SAVE_AREA_SIZE (2 * __SIZEOF_POINTER__)
-#endif
+#define C_ABI_SAVE_AREA_SIZE (2 * (CHIP_WORD_SIZE() / 8))
 
 /**
  * The operand to an 'info' opcode directing the backtracer to not
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index 40a5a3a876d9..ed359aee8837 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -255,43 +255,6 @@ static inline void atomic64_set(atomic64_t *v, u64 n)
 #define smp_mb__after_atomic_dec()	do { } while (0)
 #define smp_mb__after_atomic_inc()	do { } while (0)
 
-
-/*
- * Support "tns" atomic integers.  These are atomic integers that can
- * hold any value but "1".  They are more efficient than regular atomic
- * operations because the "lock" (aka acquire) step is a single "tns"
- * in the uncontended case, and the "unlock" (aka release) step is a
- * single "store" without an mf.  (However, note that on tilepro the
- * "tns" will evict the local cache line, so it's not all upside.)
- *
- * Note that you can ONLY observe the value stored in the pointer
- * using these operations; a direct read of the value may confusingly
- * return the special value "1".
- */
-
-int __tns_atomic_acquire(atomic_t *);
-void __tns_atomic_release(atomic_t *p, int v);
-
-static inline void tns_atomic_set(atomic_t *v, int i)
-{
-	__tns_atomic_acquire(v);
-	__tns_atomic_release(v, i);
-}
-
-static inline int tns_atomic_cmpxchg(atomic_t *v, int o, int n)
-{
-	int ret = __tns_atomic_acquire(v);
-	__tns_atomic_release(v, (ret == o) ? n : ret);
-	return ret;
-}
-
-static inline int tns_atomic_xchg(atomic_t *v, int n)
-{
-	int ret = __tns_atomic_acquire(v);
-	__tns_atomic_release(v, n);
-	return ret;
-}
-
 #endif /* !__ASSEMBLY__ */
 
 /*
diff --git a/arch/tile/include/asm/backtrace.h b/arch/tile/include/asm/backtrace.h
index 6970bfcad549..758ca4619d50 100644
--- a/arch/tile/include/asm/backtrace.h
+++ b/arch/tile/include/asm/backtrace.h
@@ -21,7 +21,9 @@
 
 #include <arch/chip.h>
 
-#if CHIP_VA_WIDTH() > 32
+#if defined(__tile__)
+typedef unsigned long VirtualAddress;
+#elif CHIP_VA_WIDTH() > 32
 typedef unsigned long long VirtualAddress;
 #else
 typedef unsigned int VirtualAddress;
diff --git a/arch/tile/include/asm/bitops.h b/arch/tile/include/asm/bitops.h
index 84600f3514da..6832b4be8990 100644
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@@ -98,26 +98,27 @@ static inline int fls64(__u64 w)
 	return (sizeof(__u64) * 8) - __builtin_clzll(w);
 }
 
-static inline unsigned int hweight32(unsigned int w)
+static inline unsigned int __arch_hweight32(unsigned int w)
 {
 	return __builtin_popcount(w);
 }
 
-static inline unsigned int hweight16(unsigned int w)
+static inline unsigned int __arch_hweight16(unsigned int w)
 {
 	return __builtin_popcount(w & 0xffff);
 }
 
-static inline unsigned int hweight8(unsigned int w)
+static inline unsigned int __arch_hweight8(unsigned int w)
 {
 	return __builtin_popcount(w & 0xff);
 }
 
-static inline unsigned long hweight64(__u64 w)
+static inline unsigned long __arch_hweight64(__u64 w)
 {
 	return __builtin_popcountll(w);
 }
 
+#include <asm-generic/bitops/const_hweight.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/ext2-non-atomic.h>
diff --git a/arch/tile/include/asm/cache.h b/arch/tile/include/asm/cache.h
index f6101840c9e7..08a2815b5e4e 100644
--- a/arch/tile/include/asm/cache.h
+++ b/arch/tile/include/asm/cache.h
@@ -27,11 +27,10 @@
 #define L2_CACHE_ALIGN(x)	(((x)+(L2_CACHE_BYTES-1)) & -L2_CACHE_BYTES)
 
 /*
- * TILE-Gx is fully coherents so we don't need to define
- * ARCH_KMALLOC_MINALIGN.
+ * TILE-Gx is fully coherent so we don't need to define ARCH_DMA_MINALIGN.
  */
 #ifndef __tilegx__
-#define ARCH_KMALLOC_MINALIGN	L2_CACHE_BYTES
+#define ARCH_DMA_MINALIGN	L2_CACHE_BYTES
 #endif
 
 /* use the cache line size for the L2, which is where it counts */
diff --git a/arch/tile/include/asm/highmem.h b/arch/tile/include/asm/highmem.h
index efdd12e91020..d155db6fa9bd 100644
--- a/arch/tile/include/asm/highmem.h
+++ b/arch/tile/include/asm/highmem.h
@@ -60,7 +60,7 @@ void *kmap_fix_kpte(struct page *page, int finished);
 /* This macro is used only in map_new_virtual() to map "page". */
 #define kmap_prot page_to_kpgprot(page)
 
-void kunmap_atomic(void *kvaddr, enum km_type type);
+void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
 void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 struct page *kmap_atomic_to_page(void *ptr);
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index f894a9016da6..7d90641cf18d 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -129,6 +129,11 @@ static inline u64 pmd_val(pmd_t pmd)
 
 #endif
 
+static inline __attribute_const__ int get_order(unsigned long size)
+{
+	return BITS_PER_LONG - __builtin_clzl((size - 1) >> PAGE_SHIFT);
+}
+
 #endif /* !__ASSEMBLY__ */
 
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
@@ -332,7 +337,6 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
 	(VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
 #include <asm-generic/memory_model.h>
-#include <asm-generic/getorder.h>
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/tile/include/asm/scatterlist.h b/arch/tile/include/asm/scatterlist.h
index c5604242c0d5..35d786fe93ae 100644
--- a/arch/tile/include/asm/scatterlist.h
+++ b/arch/tile/include/asm/scatterlist.h
@@ -1,22 +1 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#ifndef _ASM_TILE_SCATTERLIST_H
-#define _ASM_TILE_SCATTERLIST_H
-
-#define ISA_DMA_THRESHOLD	(~0UL)
-
 #include <asm-generic/scatterlist.h>
-
-#endif /* _ASM_TILE_SCATTERLIST_H */
diff --git a/arch/tile/include/asm/setup.h b/arch/tile/include/asm/setup.h
index 823ddd47ff6e..7caf0f36b030 100644
--- a/arch/tile/include/asm/setup.h
+++ b/arch/tile/include/asm/setup.h
@@ -15,6 +15,10 @@
 #ifndef _ASM_TILE_SETUP_H
 #define _ASM_TILE_SETUP_H
 
+#define COMMAND_LINE_SIZE	2048
+
+#ifdef __KERNEL__
+
 #include <linux/pfn.h>
 #include <linux/init.h>
 
@@ -23,10 +27,10 @@
  */
 #define MAXMEM_PFN	PFN_DOWN(MAXMEM)
 
-#define COMMAND_LINE_SIZE	2048
-
 void early_panic(const char *fmt, ...);
 void warn_early_printk(void);
 void __init disable_early_printk(void);
 
+#endif /* __KERNEL__ */
+
 #endif /* _ASM_TILE_SETUP_H */
diff --git a/arch/tile/include/asm/siginfo.h b/arch/tile/include/asm/siginfo.h
index 0c12d1b9ddf2..56d661bb010b 100644
--- a/arch/tile/include/asm/siginfo.h
+++ b/arch/tile/include/asm/siginfo.h
@@ -17,6 +17,10 @@
 
 #define __ARCH_SI_TRAPNO
 
+#ifdef __LP64__
+# define __ARCH_SI_PREAMBLE_SIZE	(4 * sizeof(int))
+#endif
+
 #include <asm-generic/siginfo.h>
 
 /*
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index ed17a80ec0ed..ef34d2caa5b1 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -389,14 +389,14 @@ static inline unsigned long __must_check copy_from_user(void *to,
  * Returns number of bytes that could not be copied.
  * On success, this will be zero.
  */
-extern unsigned long __copy_in_user_asm(
+extern unsigned long __copy_in_user_inatomic(
 	void __user *to, const void __user *from, unsigned long n);
 
 static inline unsigned long __must_check
 __copy_in_user(void __user *to, const void __user *from, unsigned long n)
 {
 	might_sleep();
-	return __copy_in_user_asm(to, from, n);
+	return __copy_in_user_inatomic(to, from, n);
 }
 
 static inline unsigned long __must_check
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index 59b46dc53994..9bd303a141b2 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -532,11 +532,11 @@ void hv_disable_intr(HV_IntrMask disab_mask);
  */
 void hv_clear_intr(HV_IntrMask clear_mask);
 
-/** Assert a set of device interrupts.
+/** Raise a set of device interrupts.
  *
- * @param assert_mask Bitmap of interrupts to clear.
+ * @param raise_mask Bitmap of interrupts to raise.
  */
-void hv_assert_intr(HV_IntrMask assert_mask);
+void hv_raise_intr(HV_IntrMask raise_mask);
 
 /** Trigger a one-shot interrupt on some tile
  *
@@ -1712,7 +1712,7 @@ typedef struct
  * @param cache_control This argument allows you to specify a length of
  *        physical address space to flush (maximum HV_FLUSH_MAX_CACHE_LEN).
  *        You can "or" in HV_FLUSH_EVICT_L2 to flush the whole L2 cache.
- *        You can "or" in HV_FLUSH_EVICT_LI1 to flush the whole LII cache.
+ *        You can "or" in HV_FLUSH_EVICT_L1I to flush the whole L1I cache.
  *        HV_FLUSH_ALL flushes all caches.
  * @param cache_cpumask Bitmask (in row-major order, supervisor-relative) of
  *        tile indices to perform cache flush on.  The low bit of the first
diff --git a/arch/tile/kernel/backtrace.c b/arch/tile/kernel/backtrace.c
index 77265f3b58d6..d3c41c1ff6bd 100644
--- a/arch/tile/kernel/backtrace.c
+++ b/arch/tile/kernel/backtrace.c
@@ -19,9 +19,6 @@
 
 #include <arch/chip.h>
 
-#if TILE_CHIP < 10
-
-
 #include <asm/opcode-tile.h>
 
 
@@ -29,6 +26,27 @@
 #define TREG_LR 55
 
 
+#if TILE_CHIP >= 10
+#define tile_bundle_bits tilegx_bundle_bits
+#define TILE_MAX_INSTRUCTIONS_PER_BUNDLE TILEGX_MAX_INSTRUCTIONS_PER_BUNDLE
+#define TILE_BUNDLE_ALIGNMENT_IN_BYTES TILEGX_BUNDLE_ALIGNMENT_IN_BYTES
+#define tile_decoded_instruction tilegx_decoded_instruction
+#define tile_mnemonic tilegx_mnemonic
+#define parse_insn_tile parse_insn_tilegx
+#define TILE_OPC_IRET TILEGX_OPC_IRET
+#define TILE_OPC_ADDI TILEGX_OPC_ADDI
+#define TILE_OPC_ADDLI TILEGX_OPC_ADDLI
+#define TILE_OPC_INFO TILEGX_OPC_INFO
+#define TILE_OPC_INFOL TILEGX_OPC_INFOL
+#define TILE_OPC_JRP TILEGX_OPC_JRP
+#define TILE_OPC_MOVE TILEGX_OPC_MOVE
+#define OPCODE_STORE TILEGX_OPC_ST
+typedef long long bt_int_reg_t;
+#else
+#define OPCODE_STORE TILE_OPC_SW
+typedef int bt_int_reg_t;
+#endif
+
 /** A decoded bundle used for backtracer analysis. */
 struct BacktraceBundle {
 	tile_bundle_bits bits;
@@ -41,7 +59,7 @@ struct BacktraceBundle {
 /* This implementation only makes sense for native tools. */
 /** Default function to read memory. */
 static bool bt_read_memory(void *result, VirtualAddress addr,
-			   size_t size, void *extra)
+			   unsigned int size, void *extra)
 {
 	/* FIXME: this should do some horrible signal stuff to catch
 	 * SEGV cleanly and fail.
@@ -106,6 +124,12 @@ static bool bt_has_addi_sp(const struct BacktraceBundle *bundle, int *adjust)
 		find_matching_insn(bundle, TILE_OPC_ADDI, vals, 2);
 	if (insn == NULL)
 		insn = find_matching_insn(bundle, TILE_OPC_ADDLI, vals, 2);
+#if TILE_CHIP >= 10
+	if (insn == NULL)
+		insn = find_matching_insn(bundle, TILEGX_OPC_ADDXLI, vals, 2);
+	if (insn == NULL)
+		insn = find_matching_insn(bundle, TILEGX_OPC_ADDXI, vals, 2);
+#endif
 	if (insn == NULL)
 		return false;
 
@@ -190,13 +214,52 @@ static inline bool bt_has_move_r52_sp(const struct BacktraceBundle *bundle)
 	return find_matching_insn(bundle, TILE_OPC_MOVE, vals, 2) != NULL;
 }
 
-/** Does this bundle contain the instruction 'sw sp, lr'? */
+/** Does this bundle contain a store of lr to sp? */
 static inline bool bt_has_sw_sp_lr(const struct BacktraceBundle *bundle)
 {
 	static const int vals[2] = { TREG_SP, TREG_LR };
-	return find_matching_insn(bundle, TILE_OPC_SW, vals, 2) != NULL;
+	return find_matching_insn(bundle, OPCODE_STORE, vals, 2) != NULL;
+}
+
+#if TILE_CHIP >= 10
+/** Track moveli values placed into registers. */
+static inline void bt_update_moveli(const struct BacktraceBundle *bundle,
+				    int moveli_args[])
+{
+	int i;
+	for (i = 0; i < bundle->num_insns; i++) {
+		const struct tile_decoded_instruction *insn =
+			&bundle->insns[i];
+
+		if (insn->opcode->mnemonic == TILEGX_OPC_MOVELI) {
+			int reg = insn->operand_values[0];
+			moveli_args[reg] = insn->operand_values[1];
+		}
+	}
 }
 
+/** Does this bundle contain an 'add sp, sp, reg' instruction
+ * from a register that we saw a moveli into, and if so, what
+ * is the value in the register?
+ */
+static bool bt_has_add_sp(const struct BacktraceBundle *bundle, int *adjust,
+			  int moveli_args[])
+{
+	static const int vals[2] = { TREG_SP, TREG_SP };
+
+	const struct tile_decoded_instruction *insn =
+		find_matching_insn(bundle, TILEGX_OPC_ADDX, vals, 2);
+	if (insn) {
+		int reg = insn->operand_values[2];
+		if (moveli_args[reg]) {
+			*adjust = moveli_args[reg];
+			return true;
+		}
+	}
+	return false;
+}
+#endif
+
 /** Locates the caller's PC and SP for a program starting at the
  * given address.
  */
@@ -227,6 +290,11 @@ static void find_caller_pc_and_caller_sp(CallerLocation *location,
 	int next_bundle = 0;
 	VirtualAddress pc;
 
+#if TILE_CHIP >= 10
+	/* Naively try to track moveli values to support addx for -m32. */
+	int moveli_args[TILEGX_NUM_REGISTERS] = { 0 };
+#endif
+
 	/* Default to assuming that the caller's sp is the current sp.
 	 * This is necessary to handle the case where we start backtracing
 	 * right at the end of the epilog.
@@ -380,7 +448,11 @@ static void find_caller_pc_and_caller_sp(CallerLocation *location,
 
 		if (!sp_determined) {
 			int adjust;
-			if (bt_has_addi_sp(&bundle, &adjust)) {
+			if (bt_has_addi_sp(&bundle, &adjust)
+#if TILE_CHIP >= 10
+			    || bt_has_add_sp(&bundle, &adjust, moveli_args)
+#endif
+				) {
 				location->sp_location = SP_LOC_OFFSET;
 
 				if (adjust <= 0) {
@@ -427,6 +499,11 @@ static void find_caller_pc_and_caller_sp(CallerLocation *location,
 					sp_determined = true;
 				}
 			}
+
+#if TILE_CHIP >= 10
+			/* Track moveli arguments for -m32 mode. */
+			bt_update_moveli(&bundle, moveli_args);
+#endif
 		}
 
 		if (bt_has_iret(&bundle)) {
@@ -502,11 +579,10 @@ void backtrace_init(BacktraceIterator *state,
 		break;
 	}
 
-	/* The frame pointer should theoretically be aligned mod 8. If
-	 * it's not even aligned mod 4 then something terrible happened
-	 * and we should mark it as invalid.
+	/* If the frame pointer is not aligned to the basic word size
+	 * something terrible happened and we should mark it as invalid.
 	 */
-	if (fp % 4 != 0)
+	if (fp % sizeof(bt_int_reg_t) != 0)
 		fp = -1;
 
 	/* -1 means "don't know initial_frame_caller_pc". */
@@ -547,9 +623,16 @@ void backtrace_init(BacktraceIterator *state,
 	state->read_memory_func_extra = read_memory_func_extra;
 }
 
+/* Handle the case where the register holds more bits than the VA. */
+static bool valid_addr_reg(bt_int_reg_t reg)
+{
+	return ((VirtualAddress)reg == reg);
+}
+
 bool backtrace_next(BacktraceIterator *state)
 {
-	VirtualAddress next_fp, next_pc, next_frame[2];
+	VirtualAddress next_fp, next_pc;
+	bt_int_reg_t next_frame[2];
 
 	if (state->fp == -1) {
 		/* No parent frame. */
@@ -563,11 +646,9 @@ bool backtrace_next(BacktraceIterator *state)
 	}
 
 	next_fp = next_frame[1];
-	if (next_fp % 4 != 0) {
-		/* Caller's frame pointer is suspect, so give up.
-		 * Technically it should be aligned mod 8, but we will
-		 * be forgiving here.
-		 */
+	if (!valid_addr_reg(next_frame[1]) ||
+	    next_fp % sizeof(bt_int_reg_t) != 0) {
+		/* Caller's frame pointer is suspect, so give up. */
 		return false;
 	}
 
@@ -585,7 +666,7 @@ bool backtrace_next(BacktraceIterator *state)
 	} else {
 		/* Get the caller PC from the frame linkage area. */
 		next_pc = next_frame[0];
-		if (next_pc == 0 ||
+		if (!valid_addr_reg(next_frame[0]) || next_pc == 0 ||
 		    next_pc % TILE_BUNDLE_ALIGNMENT_IN_BYTES != 0) {
 			/* The PC is suspect, so give up. */
 			return false;
@@ -599,23 +680,3 @@ bool backtrace_next(BacktraceIterator *state)
 
 	return true;
 }
-
-#else /* TILE_CHIP < 10 */
-
-void backtrace_init(BacktraceIterator *state,
-		    BacktraceMemoryReader read_memory_func,
-		    void *read_memory_func_extra,
-		    VirtualAddress pc, VirtualAddress lr,
-		    VirtualAddress sp, VirtualAddress r52)
-{
-	state->pc = pc;
-	state->sp = sp;
-	state->fp = -1;
-	state->initial_frame_caller_pc = -1;
-	state->read_memory_func = read_memory_func;
-	state->read_memory_func_extra = read_memory_func_extra;
-}
-
-bool backtrace_next(BacktraceIterator *state) { return false; }
-
-#endif /* TILE_CHIP < 10 */
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index d5efb215dd5f..9c710db43f13 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -56,13 +56,15 @@ struct compat_ucontext {
 	sigset_t	  uc_sigmask;	/* mask last for extensibility */
 };
 
+#define COMPAT_SI_PAD_SIZE	((SI_MAX_SIZE - 3 * sizeof(int)) / sizeof(int))
+
 struct compat_siginfo {
 	int si_signo;
 	int si_errno;
 	int si_code;
 
 	union {
-		int _pad[SI_PAD_SIZE];
+		int _pad[COMPAT_SI_PAD_SIZE];
 
 		/* kill() */
 		struct {
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index 3404c75f8e64..84f296ca9e63 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -952,7 +952,7 @@ STD_ENTRY(interrupt_return)
 	 * able to safely read all the remaining words on those cache
 	 * lines without waiting for the memory subsystem.
 	 */
-	pop_reg_zero r0, r1, sp, PTREGS_OFFSET_REG(30) - PTREGS_OFFSET_REG(0)
+	pop_reg_zero r0, r28, sp, PTREGS_OFFSET_REG(30) - PTREGS_OFFSET_REG(0)
 	pop_reg_zero r30, r2, sp, PTREGS_OFFSET_PC - PTREGS_OFFSET_REG(30)
 	pop_reg_zero r21, r3, sp, PTREGS_OFFSET_EX1 - PTREGS_OFFSET_PC
 	pop_reg_zero lr, r4, sp, PTREGS_OFFSET_REG(52) - PTREGS_OFFSET_EX1
@@ -1017,7 +1017,17 @@ STD_ENTRY(interrupt_return)
 	{ move r22, zero; move r23, zero }
 	{ move r24, zero; move r25, zero }
 	{ move r26, zero; move r27, zero }
-	{ move r28, zero; move r29, zero }
+
+	/* Set r1 to errno if we are returning an error, otherwise zero. */
+	{
+	 moveli r29, 1024
+	 sub    r1, zero, r0
+	}
+	slt_u   r29, r1, r29
+	{
+	 mnz    r1, r29, r1
+	 move   r29, zero
+	}
 	iret
 
 	/*
diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c
index 92ef925d2f8d..2e02c41ddf3b 100644
--- a/arch/tile/kernel/proc.c
+++ b/arch/tile/kernel/proc.c
@@ -23,7 +23,6 @@
 #include <linux/sysctl.h>
 #include <linux/hardirq.h>
 #include <linux/mman.h>
-#include <linux/smp.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/sections.h>
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 4dd21c1e6d5e..e7d54c73d5c1 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -953,7 +953,7 @@ static void __init load_hv_initrd(void)
 	if (rc != stat.size) {
 		pr_err("Error reading %d bytes from hvfs file '%s': %d\n",
 		       stat.size, initramfs_file, rc);
-		free_bootmem((unsigned long) initrd, stat.size);
+		free_initrd_mem((unsigned long) initrd, stat.size);
 		return;
 	}
 	initrd_start = (unsigned long) initrd;
@@ -962,7 +962,7 @@ static void __init load_hv_initrd(void)
 
 void __init free_initrd_mem(unsigned long begin, unsigned long end)
 {
-	free_bootmem(begin, end - begin);
+	free_bootmem(__pa(begin), end - begin);
 }
 
 static void __init validate_hv(void)
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index b6268d3ae869..38a68b0b4581 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -108,7 +108,6 @@ static bool read_memory_func(void *result, VirtualAddress address,
 /* Return a pt_regs pointer for a valid fault handler frame */
 static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 {
-#ifndef __tilegx__
 	const char *fault = NULL;  /* happy compiler */
 	char fault_buf[64];
 	VirtualAddress sp = kbt->it.sp;
@@ -146,7 +145,6 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 	}
 	if (!kbt->profile || (INT_MASK(p->faultnum) & QUEUED_INTERRUPTS) == 0)
 		return p;
-#endif
 	return NULL;
 }
 
@@ -351,12 +349,6 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 		       kbt->task->pid, kbt->task->tgid, kbt->task->comm,
 		       smp_processor_id(), get_cycles());
 	}
-#ifdef __tilegx__
-	if (kbt->is_current) {
-		__insn_mtspr(SPR_SIM_CONTROL,
-			     SIM_DUMP_SPR_ARG(SIM_DUMP_BACKTRACE));
-	}
-#endif
 	kbt->verbose = 1;
 	i = 0;
 	for (; !KBacktraceIterator_end(kbt); KBacktraceIterator_next(kbt)) {
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index b9ab25a889b5..6bed820e1421 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -36,16 +36,6 @@
 /* How many cycles per second we are running at. */
 static cycles_t cycles_per_sec __write_once;
 
-/*
- * We set up shift and multiply values with a minsec of five seconds,
- * since our timer counter counts down 31 bits at a frequency of
- * no less than 500 MHz.  See @minsec for clocks_calc_mult_shift().
- * We could use a different value for the 64-bit free-running
- * cycle counter, but we use the same one for consistency, and since
- * we will be reasonably precise with this value anyway.
- */
-#define TILE_MINSEC 5
-
 cycles_t get_clock_rate(void)
 {
 	return cycles_per_sec;
@@ -68,6 +58,14 @@ cycles_t get_cycles(void)
 }
 #endif
 
+/*
+ * We use a relatively small shift value so that sched_clock()
+ * won't wrap around very often.
+ */
+#define SCHED_CLOCK_SHIFT 10
+
+static unsigned long sched_clock_mult __write_once;
+
 static cycles_t clocksource_get_cycles(struct clocksource *cs)
 {
 	return get_cycles();
@@ -78,6 +76,7 @@ static struct clocksource cycle_counter_cs = {
 	.rating = 300,
 	.read = clocksource_get_cycles,
 	.mask = CLOCKSOURCE_MASK(64),
+	.shift = 22,   /* typical value, e.g. x86 tsc uses this */
 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -88,8 +87,10 @@ static struct clocksource cycle_counter_cs = {
 void __init setup_clock(void)
 {
 	cycles_per_sec = hv_sysconf(HV_SYSCONF_CPU_SPEED);
-	clocksource_calc_mult_shift(&cycle_counter_cs, cycles_per_sec,
-				    TILE_MINSEC);
+	sched_clock_mult =
+		clocksource_hz2mult(cycles_per_sec, SCHED_CLOCK_SHIFT);
+	cycle_counter_cs.mult =
+		clocksource_hz2mult(cycles_per_sec, cycle_counter_cs.shift);
 }
 
 void __init calibrate_delay(void)
@@ -117,9 +118,14 @@ void __init time_init(void)
  * counter, plus bit 31, which signifies that the counter has wrapped
  * from zero to (2**31) - 1.  The INT_TILE_TIMER interrupt will be
  * raised as long as bit 31 is set.
+ *
+ * The TILE_MINSEC value represents the largest range of real-time
+ * we can possibly cover with the timer, based on MAX_TICK combined
+ * with the slowest reasonable clock rate we might run at.
  */
 
 #define MAX_TICK 0x7fffffff   /* we have 31 bits of countdown timer */
+#define TILE_MINSEC 5         /* timer covers no more than 5 seconds */
 
 static int tile_timer_set_next_event(unsigned long ticks,
 				     struct clock_event_device *evt)
@@ -211,8 +217,7 @@ void do_timer_interrupt(struct pt_regs *regs, int fault_num)
 unsigned long long sched_clock(void)
 {
 	return clocksource_cyc2ns(get_cycles(),
-				  cycle_counter_cs.mult,
-				  cycle_counter_cs.shift);
+				  sched_clock_mult, SCHED_CLOCK_SHIFT);
 }
 
 int setup_profiling_timer(unsigned int multiplier)
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index 3870abbeeaa2..0f362dc2c57f 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -128,7 +128,9 @@ static int special_ill(bundle_bits bundle, int *sigp, int *codep)
 #ifdef __tilegx__
 	if ((bundle & TILEGX_BUNDLE_MODE_MASK) != 0)
 		return 0;
-	if (get_Opcode_X1(bundle) != UNARY_OPCODE_X1)
+	if (get_Opcode_X1(bundle) != RRR_0_OPCODE_X1)
+		return 0;
+	if (get_RRROpcodeExtension_X1(bundle) != UNARY_RRR_0_OPCODE_X1)
 		return 0;
 	if (get_UnaryOpcodeExtension_X1(bundle) != ILL_UNARY_OPCODE_X1)
 		return 0;
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 438af38bc9eb..746dc81ed3c4 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -7,7 +7,9 @@ lib-y = cacheflush.o checksum.o cpumask.o delay.o \
 	memcpy_$(BITS).o memchr_$(BITS).o memmove_$(BITS).o memset_$(BITS).o \
 	strchr_$(BITS).o strlen_$(BITS).o
 
-ifneq ($(CONFIG_TILEGX),y)
+ifeq ($(CONFIG_TILEGX),y)
+lib-y += memcpy_user_64.o
+else
 lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o
 endif
 
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 6bc7b52b4aa0..ce5dbf56578f 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -36,21 +36,29 @@ EXPORT_SYMBOL(clear_user_asm);
 EXPORT_SYMBOL(current_text_addr);
 EXPORT_SYMBOL(dump_stack);
 
-/* arch/tile/lib/__memcpy.S */
-/* NOTE: on TILE64, these symbols appear in arch/tile/lib/memcpy_tile64.c */
+/* arch/tile/lib/, various memcpy files */
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__copy_to_user_inatomic);
 EXPORT_SYMBOL(__copy_from_user_inatomic);
 EXPORT_SYMBOL(__copy_from_user_zeroing);
+#ifdef __tilegx__
+EXPORT_SYMBOL(__copy_in_user_inatomic);
+#endif
 
 /* hypervisor glue */
 #include <hv/hypervisor.h>
 EXPORT_SYMBOL(hv_dev_open);
 EXPORT_SYMBOL(hv_dev_pread);
 EXPORT_SYMBOL(hv_dev_pwrite);
+EXPORT_SYMBOL(hv_dev_preada);
+EXPORT_SYMBOL(hv_dev_pwritea);
+EXPORT_SYMBOL(hv_dev_poll);
+EXPORT_SYMBOL(hv_dev_poll_cancel);
 EXPORT_SYMBOL(hv_dev_close);
+EXPORT_SYMBOL(hv_sysconf);
+EXPORT_SYMBOL(hv_confstr);
 
-/* -ltile-cc */
+/* libgcc.a */
 uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
 EXPORT_SYMBOL(__udivsi3);
 int32_t __divsi3(int32_t dividend, int32_t divisor);
@@ -70,8 +78,6 @@ EXPORT_SYMBOL(__moddi3);
 #ifndef __tilegx__
 uint64_t __ll_mul(uint64_t n0, uint64_t n1);
 EXPORT_SYMBOL(__ll_mul);
-#endif
-#ifndef __tilegx__
 int64_t __muldi3(int64_t, int64_t);
 EXPORT_SYMBOL(__muldi3);
 uint64_t __lshrdi3(uint64_t, unsigned int);
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index f92984bf60ec..30c3b7ebb55d 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -17,10 +17,6 @@
 
 #include <arch/chip.h>
 
-#if CHIP_HAS_WH64() || defined(MEMCPY_TEST_WH64)
-#define MEMCPY_USE_WH64
-#endif
-
 
 #include <linux/linkage.h>
 
@@ -160,7 +156,7 @@ EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
 
 	{ addi r3, r1, 60; andi r9, r9, -64 }
 
-#ifdef MEMCPY_USE_WH64
+#if CHIP_HAS_WH64()
         /* No need to prefetch dst, we'll just do the wh64
          * right before we copy a line.
 	 */
@@ -173,7 +169,7 @@ EX:	{ lw r6, r3; addi r3, r3, 64 }
         /* Intentionally stall for a few cycles to leave L2 cache alone. */
         { bnzt zero, . }
 EX:	{ lw r7, r3; addi r3, r3, 64 }
-#ifndef MEMCPY_USE_WH64
+#if !CHIP_HAS_WH64()
         /* Prefetch the dest */
         /* Intentionally stall for a few cycles to leave L2 cache alone. */
         { bnzt zero, . }
@@ -288,15 +284,7 @@ EX:	{ lw r7, r3; addi r3, r3, 64 }
         /* Fill second L1D line. */
 EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
 
-#ifdef MEMCPY_TEST_WH64
-        /* Issue a fake wh64 that clobbers the destination words
-         * with random garbage, for testing.
-         */
-	{ movei r19, 64; crc32_32 r10, r2, r9 }
-.Lwh64_test_loop:
-EX:	{ sw r9, r10; addi r9, r9, 4; addi r19, r19, -4 }
-        { bnzt r19, .Lwh64_test_loop; crc32_32 r10, r10, r19 }
-#elif CHIP_HAS_WH64()
+#if CHIP_HAS_WH64()
         /* Prepare destination line for writing. */
 EX:	{ wh64 r9; addi r9, r9, 64 }
 #else
@@ -340,7 +328,7 @@ EX:	{ lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */
 EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
 EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
 EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
-#ifdef MEMCPY_USE_WH64
+#if CHIP_HAS_WH64()
 EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
 #else
         /* Back up the r9 to a cache line we are already storing to
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index bfde5d864df1..d014c1fbcbc2 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -141,7 +141,6 @@ void *memset(void *s, int c, size_t n)
 			 */
 			__insn_prefetch(&out32[ahead32]);
 
-#if 1
 #if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
 #error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
 #endif
@@ -157,30 +156,6 @@ void *memset(void *s, int c, size_t n)
 				*out32++ = v32;
 				*out32++ = v32;
 			}
-#else
-			/* Unfortunately, due to a code generator flaw this
-			 * allocates a separate register for each of these
-			 * stores, which requires a large number of spills,
-			 * which makes this procedure enormously bigger
-			 * (something like 70%)
-			 */
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			*out32++ = v32;
-			n32 -= 16;
-#endif
 
 			/* To save compiled code size, reuse this loop even
 			 * when we run out of prefetching to do by dropping
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 0011f06b4fe2..704f3e8a4385 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -567,6 +567,14 @@ do_sigbus:
  * since that might indicate we have not yet squirreled the SPR
  * contents away and can thus safely take a recursive interrupt.
  * Accordingly, the hypervisor passes us the PC via SYSTEM_SAVE_1_2.
+ *
+ * Note that this routine is called before homecache_tlb_defer_enter(),
+ * which means that we can properly unlock any atomics that might
+ * be used there (good), but also means we must be very sensitive
+ * to not touch any data structures that might be located in memory
+ * that could migrate, as we could be entering the kernel on a dataplane
+ * cpu that has been deferring kernel TLB updates.  This means, for
+ * example, that we can't migrate init_mm or its pgd.
  */
 struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,
 				      unsigned long address,
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index ff1cdff5114d..12ab137e7d4f 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -276,7 +276,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
 }
 EXPORT_SYMBOL(kmap_atomic);
 
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type)
 {
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
 	enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
@@ -300,7 +300,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
-EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_notypecheck);
 
 /*
  * This API is supposed to allow us to map memory without a "struct page".
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 97c478e7be27..fb3b4a55cec4 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -29,6 +29,7 @@
 #include <linux/timex.h>
 #include <linux/cache.h>
 #include <linux/smp.h>
+#include <linux/module.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -348,6 +349,7 @@ pte_t pte_set_home(pte_t pte, int home)
 
 	return pte;
 }
+EXPORT_SYMBOL(pte_set_home);
 
 /*
  * The routines in this section are the "static" versions of the normal
@@ -403,6 +405,7 @@ struct page *homecache_alloc_pages(gfp_t gfp_mask,
 		homecache_change_page_home(page, order, home);
 	return page;
 }
+EXPORT_SYMBOL(homecache_alloc_pages);
 
 struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,
 					unsigned int order, int home)
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 28c23140c947..335c24621c41 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -17,7 +17,6 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
-#include <linux/smp.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>