From 9859f0c7e05b424a7a42032e639a8c44dd537328 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 14 Dec 2016 10:09:45 +0530
Subject: powerpc/mm: Remove the debug hugepd_ok check

We don't do this for other page table entries. So let's keep this simple
and always return false for the hugepd check on a 64K page size config.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/pgtable-64k.h |  5 -----
 arch/powerpc/mm/hugetlbpage-hash64.c             | 21 ---------------------
 2 files changed, 26 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
index 0d2845b44763..198aff33c380 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -35,10 +35,6 @@ static inline int pgd_huge(pgd_t pgd)
 }
 #define pgd_huge pgd_huge
 
-#ifdef CONFIG_DEBUG_VM
-extern int hugepd_ok(hugepd_t hpd);
-#define is_hugepd(hpd)               (hugepd_ok(hpd))
-#else
 /*
  * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
  * need to setup hugepage directory for them. Our pte and page directory format
@@ -49,7 +45,6 @@ static inline int hugepd_ok(hugepd_t hpd)
 	return 0;
 }
 #define is_hugepd(pdep)			0
-#endif /* CONFIG_DEBUG_VM */
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 37b5f91e381b..a84bb44497f9 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -116,24 +116,3 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
 }
-
-#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM)
-/*
- * This enables us to catch the wrong page directory format
- * Moved here so that we can use WARN() in the call.
- */
-int hugepd_ok(hugepd_t hpd)
-{
-	bool is_hugepd;
-	unsigned long hpdval;
-
-	hpdval = hpd_val(hpd);
-
-	/*
-	 * We should not find this format in page directory, warn otherwise.
-	 */
-	is_hugepd = (((hpdval & 0x3) == 0x0) && ((hpdval & HUGEPD_SHIFT_MASK) != 0));
-	WARN(is_hugepd, "Found wrong page directory format\n");
-	return 0;
-}
-#endif
-- 
cgit v1.2.3


From 8ad43336b5c1ad9ac945148cb5e26a1200ccd45c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 4 Jan 2017 08:19:12 +0530
Subject: powerpc/mm/4k: don't allocate larger pmd page table for 4k

We now support THP with both 64K and 4K page size configurations
for radix (hash only supports THP with 64K page size). Hence we
will have CONFIG_TRANSPARENT_HUGEPAGE enabled for both PPC_64K
and PPC_4K configs. Since we only need the large pmd page table
with the hash configuration (to store the slot information
in the second half of the table), restrict the large pmd page table
to THP and 64K configs.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 4c935f7504f7..f7b721bbf918 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -33,9 +33,9 @@
 				 H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT)
 #define H_PGTABLE_RANGE		(ASM_CONST(1) << H_PGTABLE_EADDR_SIZE)
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&  defined(CONFIG_PPC_64K_PAGES)
 /*
- * only with hash we need to use the second half of pmd page table
+ * only with hash 64k we need to use the second half of pmd page table
  * to store pointer to deposited pgtable_t
  */
 #define H_PMD_CACHE_INDEX	(H_PMD_INDEX_SIZE + 1)
-- 
cgit v1.2.3


From 14a41d6b7572026cf8fc88ee72e81b6b40db2ec0 Mon Sep 17 00:00:00 2001
From: Joel Stanley <joel@jms.id.au>
Date: Fri, 13 Jan 2017 14:23:49 +1030
Subject: powerpc/powernv: Report size of OPAL memcons log

The OPAL memory console is reported to be size zero, as we do not
initialise the struct attr with any size information due to the size
being variable. This leads users to think that the console is empty.

Instead report the maximum size.

Signed-off-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-msglog.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index 39d6ff9e5630..7a9cde0cfbd1 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -123,6 +123,10 @@ void __init opal_msglog_init(void)
 		return;
 	}
 
+	/* Report maximum size */
+	opal_msglog_attr.size =  be32_to_cpu(mc->ibuf_size) +
+		be32_to_cpu(mc->obuf_size);
+
 	opal_memcons = mc;
 }
 
-- 
cgit v1.2.3


From b492f7e4e07a28e706db26cf4943bb0911435426 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 3 Nov 2016 16:10:55 +1100
Subject: powerpc/64: Fix checksum folding in csum_tcpudp_nofold and
 ip_fast_csum_nofold

These functions compute an IP checksum by computing a 64-bit sum and
folding it to 32 bits (the "nofold" in their names refers to folding
down to 16 bits).  However, doing (u32) (s + (s >> 32)) is not
sufficient to fold a 64-bit sum to 32 bits correctly.  The addition
can produce a carry out from bit 31, which needs to be added in to
the sum to produce the correct result.

To fix this, we copy the from64to32() function from lib/checksum.c
and use that.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/checksum.h | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 1e8fceb308a5..5b1a6e39afa7 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -53,17 +53,25 @@ static inline __sum16 csum_fold(__wsum sum)
 	return (__force __sum16)(~((__force u32)sum + tmp) >> 16);
 }
 
+static inline u32 from64to32(u64 x)
+{
+	/* add up 32-bit and 32-bit for 32+c bit */
+	x = (x & 0xffffffff) + (x >> 32);
+	/* add up carry.. */
+	x = (x & 0xffffffff) + (x >> 32);
+	return (u32)x;
+}
+
 static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
 					__u8 proto, __wsum sum)
 {
 #ifdef __powerpc64__
-	unsigned long s = (__force u32)sum;
+	u64 s = (__force u32)sum;
 
 	s += (__force u32)saddr;
 	s += (__force u32)daddr;
 	s += proto + len;
-	s += (s >> 32);
-	return (__force __wsum) s;
+	return (__force __wsum) from64to32(s);
 #else
     __asm__("\n\
 	addc %0,%0,%1 \n\
@@ -123,8 +131,7 @@ static inline __wsum ip_fast_csum_nofold(const void *iph, unsigned int ihl)
 
 	for (i = 0; i < ihl - 1; i++, ptr++)
 		s += *ptr;
-	s += (s >> 32);
-	return (__force __wsum)s;
+	return (__force __wsum)from64to32(s);
 #else
 	__wsum sum, tmp;
 
-- 
cgit v1.2.3


From d4fde568a34a93897dfb9ae64cfe9dda9d5c908c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 3 Nov 2016 16:15:42 +1100
Subject: powerpc/64: Use optimized checksum routines on little-endian

Currently we have optimized hand-coded assembly checksum routines for
big-endian 64-bit systems, but for little-endian we use the generic C
routines. This modifies the optimized routines to work for
little-endian. With this, we no longer need to enable
CONFIG_GENERIC_CSUM. This also fixes a couple of comments in
checksum_64.S so they accurately reflect what the associated instruction
does.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
[mpe: Use the more common __BIG_ENDIAN__]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig                |  2 +-
 arch/powerpc/include/asm/checksum.h |  4 ++++
 arch/powerpc/lib/Makefile           |  2 --
 arch/powerpc/lib/checksum_64.S      | 12 ++++++++++--
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a8ee573fe610..e022859340b7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -167,7 +167,7 @@ config PPC
 	select HAVE_CC_STACKPROTECTOR
 
 config GENERIC_CSUM
-	def_bool CPU_LITTLE_ENDIAN
+	def_bool n
 
 config EARLY_PRINTK
 	bool
diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 5b1a6e39afa7..4e63787dc3be 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -70,7 +70,11 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
 
 	s += (__force u32)saddr;
 	s += (__force u32)daddr;
+#ifdef __BIG_ENDIAN__
 	s += proto + len;
+#else
+	s += (proto + len) << 8;
+#endif
 	return (__force __wsum) from64to32(s);
 #else
     __asm__("\n\
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 309361e86523..0e649d72fe8d 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -21,9 +21,7 @@ obj64-y	+= copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \
 obj64-$(CONFIG_SMP)	+= locks.o
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
 
-ifeq ($(CONFIG_GENERIC_CSUM),)
 obj-y			+= checksum_$(BITS).o checksum_wrappers.o
-endif
 
 obj-$(CONFIG_PPC_EMULATE_SSTEP)	+= sstep.o ldstfp.o
 
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index d0d311e108ff..d7f1a966136e 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -36,7 +36,7 @@ _GLOBAL(__csum_partial)
 	 * work to calculate the correct checksum, we ignore that case
 	 * and take the potential slowdown of unaligned loads.
 	 */
-	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
 	beq	.Lcsum_aligned
 
 	li	r7,4
@@ -168,8 +168,12 @@ _GLOBAL(__csum_partial)
 	beq	.Lcsum_finish
 
 	lbz	r6,0(r3)
+#ifdef __BIG_ENDIAN__
 	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
 	adde	r0,r0,r9
+#else
+	adde	r0,r0,r6
+#endif
 
 .Lcsum_finish:
 	addze	r0,r0			/* add in final carry */
@@ -224,7 +228,7 @@ _GLOBAL(csum_partial_copy_generic)
 	 * If the source and destination are relatively unaligned we only
 	 * align the source. This keeps things simple.
 	 */
-	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
 	beq	.Lcopy_aligned
 
 	li	r9,4
@@ -386,8 +390,12 @@ dstnr;	sth	r6,0(r4)
 	beq	.Lcopy_finish
 
 srcnr;	lbz	r6,0(r3)
+#ifdef __BIG_ENDIAN__
 	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
 	adde	r0,r0,r9
+#else
+	adde	r0,r0,r6
+#endif
 dstnr;	stb	r6,0(r4)
 
 .Lcopy_finish:
-- 
cgit v1.2.3


From 4ab2537c4204b976e4ca350bbdc193b4649cad28 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 8 Dec 2016 09:12:13 +0530
Subject: powerpc/mm: Fixup wrong LPCR_VRMASD value

In commit a4b349540a26af ("powerpc/mm: Cleanup LPCR defines") we updated
LPCR_VRMASD wrongly as below.

-#define   LPCR_VRMASD  (0x1ful << (63-16))
+#define   LPCR_VRMASD_SH       47
+#define   LPCR_VRMASD          (ASM_CONST(1) << LPCR_VRMASD_SH)

We initialize the VRMA bits in LPCR to 0x00 in kvm. Hence using a
different mask value as above while updating lpcr should not have any
impact.

This patch updates it to the correct value.

Fixes: a4b349540a26 ("powerpc/mm: Cleanup LPCR defines")
Reported-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Jia He <hejianet@gmail.com>
Acked-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/reg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 0d4531aa2052..818c4e878e60 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -338,7 +338,7 @@
 #define   LPCR_DPFD_SH		52
 #define   LPCR_DPFD		(ASM_CONST(7) << LPCR_DPFD_SH)
 #define   LPCR_VRMASD_SH	47
-#define   LPCR_VRMASD		(ASM_CONST(1) << LPCR_VRMASD_SH)
+#define   LPCR_VRMASD		(ASM_CONST(0x1f) << LPCR_VRMASD_SH)
 #define   LPCR_VRMA_L		ASM_CONST(0x0008000000000000)
 #define   LPCR_VRMA_LP0		ASM_CONST(0x0001000000000000)
 #define   LPCR_VRMA_LP1		ASM_CONST(0x0000800000000000)
-- 
cgit v1.2.3


From fb37e12896c1ba0407012fe8cdc0b054da063b6f Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 24 Aug 2016 22:26:37 +0200
Subject: powerpc/powernv/pci: Use kmalloc_array() in two functions

Use kmalloc_array(), which checks for overflow of the multiplication,
rather than doing it by hand.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b07680cd2518..939de0c2e00f 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1326,7 +1326,9 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	else
 		m64_bars = 1;
 
-	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
+	pdn->m64_map = kmalloc_array(m64_bars,
+				     sizeof(*pdn->m64_map),
+				     GFP_KERNEL);
 	if (!pdn->m64_map)
 		return -ENOMEM;
 	/* Initialize the m64_map to IODA_INVALID_M64 */
@@ -1593,8 +1595,9 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 
 		/* Allocating pe_num_map */
 		if (pdn->m64_single_mode)
-			pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map) * num_vfs,
-					GFP_KERNEL);
+			pdn->pe_num_map = kmalloc_array(num_vfs,
+							sizeof(*pdn->pe_num_map),
+							GFP_KERNEL);
 		else
 			pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
 
-- 
cgit v1.2.3


From dbecd5093043faa9da83c720ed0e08ec1a5b410e Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Tue, 24 Jan 2017 09:49:52 +1100
Subject: powerpc/kernel: Remove nested if statements in rtas_initialize()

This removes the unnecessary nested if statements in function
rtas_initialize(), to simplify the code. No functional changes
introduced.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/rtas.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 112cc3b2ee1a..9759dcbd055d 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1145,31 +1145,30 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
 void __init rtas_initialize(void)
 {
 	unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
+	const __be32 *basep, *entryp, *sizep;
 
 	/* Get RTAS dev node and fill up our "rtas" structure with infos
 	 * about it.
 	 */
 	rtas.dev = of_find_node_by_name(NULL, "rtas");
-	if (rtas.dev) {
-		const __be32 *basep, *entryp, *sizep;
-
-		basep = of_get_property(rtas.dev, "linux,rtas-base", NULL);
-		sizep = of_get_property(rtas.dev, "rtas-size", NULL);
-		if (basep != NULL && sizep != NULL) {
-			rtas.base = __be32_to_cpu(*basep);
-			rtas.size = __be32_to_cpu(*sizep);
-			entryp = of_get_property(rtas.dev,
-					"linux,rtas-entry", NULL);
-			if (entryp == NULL) /* Ugh */
-				rtas.entry = rtas.base;
-			else
-				rtas.entry = __be32_to_cpu(*entryp);
-		} else
-			rtas.dev = NULL;
-	}
 	if (!rtas.dev)
 		return;
 
+	basep = of_get_property(rtas.dev, "linux,rtas-base", NULL);
+	sizep = of_get_property(rtas.dev, "rtas-size", NULL);
+	if (basep == NULL || sizep == NULL) {
+		rtas.dev = NULL;
+		return;
+	}
+
+	rtas.base = __be32_to_cpu(*basep);
+	rtas.size = __be32_to_cpu(*sizep);
+	entryp = of_get_property(rtas.dev, "linux,rtas-entry", NULL);
+	if (entryp == NULL) /* Ugh */
+		rtas.entry = rtas.base;
+	else
+		rtas.entry = __be32_to_cpu(*entryp);
+
 	/* If RTAS was found, allocate the RMO buffer for it and look for
 	 * the stop-self token if any
 	 */
-- 
cgit v1.2.3


From de6d2d1b7bf2b3a8d5e57ebffad9f2688fe00a7a Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Tue, 24 Jan 2017 09:49:53 +1100
Subject: powerpc/kernel: Use of_property_read_u32() in rtas_initialize()

This uses of_property_read_u32() in rtas_initialize() so that we
needn't explicitly care about the CPU's endianness.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/rtas.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 9759dcbd055d..ba5a4cc0e5b6 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1145,7 +1145,8 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
 void __init rtas_initialize(void)
 {
 	unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
-	const __be32 *basep, *entryp, *sizep;
+	u32 base, size, entry;
+	int no_base, no_size, no_entry;
 
 	/* Get RTAS dev node and fill up our "rtas" structure with infos
 	 * about it.
@@ -1154,20 +1155,17 @@ void __init rtas_initialize(void)
 	if (!rtas.dev)
 		return;
 
-	basep = of_get_property(rtas.dev, "linux,rtas-base", NULL);
-	sizep = of_get_property(rtas.dev, "rtas-size", NULL);
-	if (basep == NULL || sizep == NULL) {
+	no_base = of_property_read_u32(rtas.dev, "linux,rtas-base", &base);
+	no_size = of_property_read_u32(rtas.dev, "rtas-size", &size);
+	if (no_base || no_size) {
 		rtas.dev = NULL;
 		return;
 	}
 
-	rtas.base = __be32_to_cpu(*basep);
-	rtas.size = __be32_to_cpu(*sizep);
-	entryp = of_get_property(rtas.dev, "linux,rtas-entry", NULL);
-	if (entryp == NULL) /* Ugh */
-		rtas.entry = rtas.base;
-	else
-		rtas.entry = __be32_to_cpu(*entryp);
+	rtas.base = base;
+	rtas.size = size;
+	no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", &entry);
+	rtas.entry = no_entry ? rtas.base : entry;
 
 	/* If RTAS was found, allocate the RMO buffer for it and look for
 	 * the stop-self token if any
-- 
cgit v1.2.3


From 8b2577832100706124fd6fe09f887992c8d7c0c6 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Tue, 24 Jan 2017 09:49:54 +1100
Subject: powerpc/kernel: Fix unbalanced refcount on RTAS device node

The RTAS device-tree node's refcount is increased by one by the
call to of_find_node_by_name(), but it is not decreased again in
the error path. This leads to an unbalanced refcount on the RTAS
device-tree node.

Fix this by dropping the RTAS device-tree node's refcount in the
error path.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/rtas.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index ba5a4cc0e5b6..b8a4987f58cf 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1158,6 +1158,7 @@ void __init rtas_initialize(void)
 	no_base = of_property_read_u32(rtas.dev, "linux,rtas-base", &base);
 	no_size = of_property_read_u32(rtas.dev, "rtas-size", &size);
 	if (no_base || no_size) {
+		of_node_put(rtas.dev);
 		rtas.dev = NULL;
 		return;
 	}
-- 
cgit v1.2.3


From 3c4b66a6d0d2b9f2418f9a09d528e42e2dc18acf Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sat, 21 Jan 2017 15:30:15 +0100
Subject: powerpc/sstep: Return directly after a failed address_ok() in
 emulate_step()

Setting err and going to ldst_done just returns 0, without using err, so
just return 0 directly. We already do that for other call sites in this
function.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
[mpe: Rewrite change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/lib/sstep.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 06c7e9b88408..846dba2c6360 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1803,9 +1803,8 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 			return 0;
 		if (op.ea & (size - 1))
 			break;		/* can't handle misaligned */
-		err = -EFAULT;
 		if (!address_ok(regs, op.ea, size))
-			goto ldst_done;
+			return 0;
 		err = 0;
 		switch (size) {
 		case 4:
@@ -1828,9 +1827,8 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 			return 0;
 		if (op.ea & (size - 1))
 			break;		/* can't handle misaligned */
-		err = -EFAULT;
 		if (!address_ok(regs, op.ea, size))
-			goto ldst_done;
+			return 0;
 		err = 0;
 		switch (size) {
 		case 4:
-- 
cgit v1.2.3


From a967f161abc7c4a6936ceb15737f75c52bfd07f2 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sat, 21 Jan 2017 16:10:50 +0100
Subject: powerpc/mm: Return directly after a failed __copy_from_user() in
 sys_subpage_prot()

This function already has multiple exit points, so there's no harm
adding another. Although it looks odd to return directly in a function
which takes a lock, we've actually just dropped the mmap_sem in this
code, so there's really no reason to go via a label. And it means we can
drop the unhelpfully named out2 label.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
[mpe: Rewrite change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/subpage-prot.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 5c096c01e8bd..94210940112f 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -248,9 +248,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 			nw = (next - addr) >> PAGE_SHIFT;
 
 		up_write(&mm->mmap_sem);
-		err = -EFAULT;
 		if (__copy_from_user(spp, map, nw * sizeof(u32)))
-			goto out2;
+			return -EFAULT;
 		map += nw;
 		down_write(&mm->mmap_sem);
 
@@ -262,6 +261,5 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 	err = 0;
  out:
 	up_write(&mm->mmap_sem);
- out2:
 	return err;
 }
-- 
cgit v1.2.3


From 0e17166d377e74164e4067ae162ed19226352a97 Mon Sep 17 00:00:00 2001
From: Greg Kurz <groug@kaod.org>
Date: Thu, 19 Jan 2017 11:50:10 +0100
Subject: cxl: Drop unused header asm/pnv-pci.h

The kernel API does not use anything from this header file.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Acked-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/api.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index 1b35e33d2434..bcc030eacab7 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -11,7 +11,6 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <misc/cxl.h>
-#include <asm/pnv-pci.h>
 #include <linux/msi.h>
 #include <linux/module.h>
 #include <linux/mount.h>
-- 
cgit v1.2.3


From d7b1946c7925a270062b2e0718aa57b42ba619c0 Mon Sep 17 00:00:00 2001
From: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Date: Wed, 4 Jan 2017 11:48:52 +0530
Subject: cxl: Force psl data-cache flush during device shutdown

This change adds a forced PSL data-cache flush to the device shutdown
callback. This should reduce the possibility of the PSL holding a dirty
cache line while the CAPP is being reinitialized, which may result in
a UE [load/store] machine check error.

Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Acked-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 80a87ab25b83..73432e7d925d 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1610,6 +1610,9 @@ static void cxl_pci_remove_adapter(struct cxl *adapter)
 	cxl_sysfs_adapter_remove(adapter);
 	cxl_debugfs_adapter_remove(adapter);
 
+	/* Flush adapter datacache as its about to be removed */
+	cxl_data_cache_flush(adapter);
+
 	cxl_deconfigure_adapter(adapter);
 
 	device_unregister(&adapter->dev);
-- 
cgit v1.2.3


From 14a3ae34bfd0bcb1cc12d55b06a8584c11fac6fc Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Fri, 9 Dec 2016 17:18:50 +1100
Subject: cxl: Prevent read/write to AFU config space while AFU not configured

During EEH recovery, we deconfigure all AFUs whilst leaving the
corresponding vPHB and virtual PCI device in place.

If something attempts to interact with the AFU's PCI config space (e.g.
running lspci) after the AFU has been deconfigured and before it's
reconfigured, cxl_pcie_{read,write}_config() will read invalid values from
the deconfigured struct cxl_afu and proceed to Oops when they try to
dereference pointers that have been set to NULL during deconfiguration.

Add a rwsem to struct cxl_afu so we can prevent interaction with config
space while the AFU is deconfigured.

Reported-by: Pradipta Ghosh <pradghos@in.ibm.com>
Suggested-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Cc: stable@vger.kernel.org # v4.9+
Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/cxl.h  |  2 ++
 drivers/misc/cxl/main.c |  3 ++-
 drivers/misc/cxl/pci.c  |  2 ++
 drivers/misc/cxl/vphb.c | 51 ++++++++++++++++++++++++++++---------------------
 4 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index b24d76723fb0..b4a43fd14b99 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -418,6 +418,8 @@ struct cxl_afu {
 	struct dentry *debugfs;
 	struct mutex contexts_lock;
 	spinlock_t afu_cntl_lock;
+	/* Used to block access to AFU config space while deconfigured */
+	struct rw_semaphore configured_rwsem;
 
 	/* AFU error buffer fields and bin attribute for sysfs */
 	u64 eb_len, eb_offset;
diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
index 62e0dfb5f15b..2a6bf1d0a3a4 100644
--- a/drivers/misc/cxl/main.c
+++ b/drivers/misc/cxl/main.c
@@ -268,7 +268,8 @@ struct cxl_afu *cxl_alloc_afu(struct cxl *adapter, int slice)
 	idr_init(&afu->contexts_idr);
 	mutex_init(&afu->contexts_lock);
 	spin_lock_init(&afu->afu_cntl_lock);
-
+	init_rwsem(&afu->configured_rwsem);
+	down_write(&afu->configured_rwsem);
 	afu->prefault_mode = CXL_PREFAULT_NONE;
 	afu->irqs_max = afu->adapter->user_irqs;
 
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 73432e7d925d..cca938845ffd 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1129,6 +1129,7 @@ static int pci_configure_afu(struct cxl_afu *afu, struct cxl *adapter, struct pc
 	if ((rc = cxl_native_register_psl_irq(afu)))
 		goto err2;
 
+	up_write(&afu->configured_rwsem);
 	return 0;
 
 err2:
@@ -1141,6 +1142,7 @@ err1:
 
 static void pci_deconfigure_afu(struct cxl_afu *afu)
 {
+	down_write(&afu->configured_rwsem);
 	cxl_native_release_psl_irq(afu);
 	if (afu->adapter->native->sl_ops->release_serr_irq)
 		afu->adapter->native->sl_ops->release_serr_irq(afu);
diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
index 3519acebfdab..639a343b7836 100644
--- a/drivers/misc/cxl/vphb.c
+++ b/drivers/misc/cxl/vphb.c
@@ -76,23 +76,22 @@ static int cxl_pcie_cfg_record(u8 bus, u8 devfn)
 	return (bus << 8) + devfn;
 }
 
-static int cxl_pcie_config_info(struct pci_bus *bus, unsigned int devfn,
-				struct cxl_afu **_afu, int *_record)
+static inline struct cxl_afu *pci_bus_to_afu(struct pci_bus *bus)
 {
-	struct pci_controller *phb;
-	struct cxl_afu *afu;
-	int record;
+	struct pci_controller *phb = bus ? pci_bus_to_host(bus) : NULL;
 
-	phb = pci_bus_to_host(bus);
-	if (phb == NULL)
-		return PCIBIOS_DEVICE_NOT_FOUND;
+	return phb ? phb->private_data : NULL;
+}
+
+static inline int cxl_pcie_config_info(struct pci_bus *bus, unsigned int devfn,
+				       struct cxl_afu *afu, int *_record)
+{
+	int record;
 
-	afu = (struct cxl_afu *)phb->private_data;
 	record = cxl_pcie_cfg_record(bus->number, devfn);
 	if (record > afu->crs_num)
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
-	*_afu = afu;
 	*_record = record;
 	return 0;
 }
@@ -106,9 +105,14 @@ static int cxl_pcie_read_config(struct pci_bus *bus, unsigned int devfn,
 	u16 val16;
 	u32 val32;
 
-	rc = cxl_pcie_config_info(bus, devfn, &afu, &record);
+	afu = pci_bus_to_afu(bus);
+	/* Grab a reader lock on afu. */
+	if (afu == NULL || !down_read_trylock(&afu->configured_rwsem))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	rc = cxl_pcie_config_info(bus, devfn, afu, &record);
 	if (rc)
-		return rc;
+		goto out;
 
 	switch (len) {
 	case 1:
@@ -127,10 +131,9 @@ static int cxl_pcie_read_config(struct pci_bus *bus, unsigned int devfn,
 		WARN_ON(1);
 	}
 
-	if (rc)
-		return PCIBIOS_DEVICE_NOT_FOUND;
-
-	return PCIBIOS_SUCCESSFUL;
+out:
+	up_read(&afu->configured_rwsem);
+	return rc ? PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL;
 }
 
 static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn,
@@ -139,9 +142,14 @@ static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn,
 	int rc, record;
 	struct cxl_afu *afu;
 
-	rc = cxl_pcie_config_info(bus, devfn, &afu, &record);
+	afu = pci_bus_to_afu(bus);
+	/* Grab a reader lock on afu. */
+	if (afu == NULL || !down_read_trylock(&afu->configured_rwsem))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	rc = cxl_pcie_config_info(bus, devfn, afu, &record);
 	if (rc)
-		return rc;
+		goto out;
 
 	switch (len) {
 	case 1:
@@ -157,10 +165,9 @@ static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn,
 		WARN_ON(1);
 	}
 
-	if (rc)
-		return PCIBIOS_SET_FAILED;
-
-	return PCIBIOS_SUCCESSFUL;
+out:
+	up_read(&afu->configured_rwsem);
+	return rc ? PCIBIOS_SET_FAILED : PCIBIOS_SUCCESSFUL;
 }
 
 static struct pci_ops cxl_pcie_pci_ops =
-- 
cgit v1.2.3


From 052de33ca4f840bf35587eacdf78b3bf8d347bb8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 Jan 2017 22:40:00 +0530
Subject: powerpc/bpf: Remove redundant check for non-null image

We have a check earlier to ensure we don't proceed if image is NULL. As
such, the redundant check can be removed.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
[Added similar changes for classic BPF JIT]
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/net/bpf_jit_comp.c   | 17 +++++++++--------
 arch/powerpc/net/bpf_jit_comp64.c | 16 ++++++++--------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 7e706f36e364..f9941b3b5770 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -662,16 +662,17 @@ void bpf_jit_compile(struct bpf_prog *fp)
 		 */
 		bpf_jit_dump(flen, proglen, pass, code_base);
 
-	if (image) {
-		bpf_flush_icache(code_base, code_base + (proglen/4));
+	bpf_flush_icache(code_base, code_base + (proglen/4));
+
 #ifdef CONFIG_PPC64
-		/* Function descriptor nastiness: Address + TOC */
-		((u64 *)image)[0] = (u64)code_base;
-		((u64 *)image)[1] = local_paca->kernel_toc;
+	/* Function descriptor nastiness: Address + TOC */
+	((u64 *)image)[0] = (u64)code_base;
+	((u64 *)image)[1] = local_paca->kernel_toc;
 #endif
-		fp->bpf_func = (void *)image;
-		fp->jited = 1;
-	}
+
+	fp->bpf_func = (void *)image;
+	fp->jited = 1;
+
 out:
 	kfree(addrs);
 	return;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 73a5cf18fd84..935a10f77cfe 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -1046,16 +1046,16 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		 */
 		bpf_jit_dump(flen, proglen, pass, code_base);
 
-	if (image) {
-		bpf_flush_icache(bpf_hdr, image + alloclen);
+	bpf_flush_icache(bpf_hdr, image + alloclen);
+
 #ifdef PPC64_ELF_ABI_v1
-		/* Function descriptor nastiness: Address + TOC */
-		((u64 *)image)[0] = (u64)code_base;
-		((u64 *)image)[1] = local_paca->kernel_toc;
+	/* Function descriptor nastiness: Address + TOC */
+	((u64 *)image)[0] = (u64)code_base;
+	((u64 *)image)[1] = local_paca->kernel_toc;
 #endif
-		fp->bpf_func = (void *)image;
-		fp->jited = 1;
-	}
+
+	fp->bpf_func = (void *)image;
+	fp->jited = 1;
 
 out:
 	kfree(addrs);
-- 
cgit v1.2.3


From 10528b9c45cfb9e8f45217ef2f5ef8b876bbd3f5 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 13 Jan 2017 22:40:01 +0530
Subject: powerpc/bpf: Flush the entire JIT buffer

With bpf_jit_binary_alloc(), we allocate at a page granularity and fill
the rest of the space with illegal instructions to mitigate BPF spraying
attacks, while having the actual JIT'ed BPF program at a random location
within the allocated space. Under this scenario, it would be better to
flush the entire allocated buffer rather than just the part containing
the actual program. We already flush the buffer from start to the end of
the BPF program. Extend this to include the illegal instructions after
the BPF program.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/net/bpf_jit_comp64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 935a10f77cfe..d4ed7a0872b1 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -1046,8 +1046,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		 */
 		bpf_jit_dump(flen, proglen, pass, code_base);
 
-	bpf_flush_icache(bpf_hdr, image + alloclen);
-
 #ifdef PPC64_ELF_ABI_v1
 	/* Function descriptor nastiness: Address + TOC */
 	((u64 *)image)[0] = (u64)code_base;
@@ -1057,6 +1055,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	fp->bpf_func = (void *)image;
 	fp->jited = 1;
 
+	bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));
+
 out:
 	kfree(addrs);
 
-- 
cgit v1.2.3


From d3918e7fd4a27564f93ec46d0359a9739c5deb8d Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 22 Dec 2016 04:29:25 +1000
Subject: KVM: PPC: Book3S: Change interrupt call to reduce scratch space use
 on HV

Change the calling convention to put the trap number together with
CR in two halves of r12, which frees up HSTATE_SCRATCH2 in the HV
handler.

The 64-bit PR handler entry translates the calling convention back
to match the previous call convention (i.e., shared with 32-bit), for
simplicity.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 24 +++++++++++-------------
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 16 +++++++++-------
 arch/powerpc/kvm/book3s_segment.S        | 25 ++++++++++++++++++-------
 3 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 9a3eee661297..a02a268bde6b 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -233,7 +233,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #endif
 
-#define __KVM_HANDLER_PROLOG(area, n)					\
+#define __KVM_HANDLER(area, h, n)					\
 	BEGIN_FTR_SECTION_NESTED(947)					\
 	ld	r10,area+EX_CFAR(r13);					\
 	std	r10,HSTATE_CFAR(r13);					\
@@ -243,30 +243,28 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	std	r10,HSTATE_PPR(r13);					\
 	END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948);	\
 	ld	r10,area+EX_R10(r13);					\
-	stw	r9,HSTATE_SCRATCH1(r13);				\
-	ld	r9,area+EX_R9(r13);					\
 	std	r12,HSTATE_SCRATCH0(r13);				\
-
-#define __KVM_HANDLER(area, h, n)					\
-	__KVM_HANDLER_PROLOG(area, n)					\
-	li	r12,n;							\
+	sldi	r12,r9,32;						\
+	ori	r12,r12,(n);						\
+	ld	r9,area+EX_R9(r13);					\
 	b	kvmppc_interrupt
 
 #define __KVM_HANDLER_SKIP(area, h, n)					\
 	cmpwi	r10,KVM_GUEST_MODE_SKIP;				\
-	ld	r10,area+EX_R10(r13);					\
 	beq	89f;							\
-	stw	r9,HSTATE_SCRATCH1(r13);				\
 	BEGIN_FTR_SECTION_NESTED(948)					\
-	ld	r9,area+EX_PPR(r13);					\
-	std	r9,HSTATE_PPR(r13);					\
+	ld	r10,area+EX_PPR(r13);					\
+	std	r10,HSTATE_PPR(r13);					\
 	END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948);	\
-	ld	r9,area+EX_R9(r13);					\
+	ld	r10,area+EX_R10(r13);					\
 	std	r12,HSTATE_SCRATCH0(r13);				\
-	li	r12,n;							\
+	sldi	r12,r9,32;						\
+	ori	r12,r12,(n);						\
+	ld	r9,area+EX_R9(r13);					\
 	b	kvmppc_interrupt;					\
 89:	mtocrf	0x80,r9;						\
 	ld	r9,area+EX_R9(r13);					\
+	ld	r10,area+EX_R10(r13);					\
 	b	kvmppc_skip_##h##interrupt
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9338a818e05c..11882aac8216 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1057,19 +1057,18 @@ hdec_soon:
 kvmppc_interrupt_hv:
 	/*
 	 * Register contents:
-	 * R12		= interrupt vector
+	 * R12		= (guest CR << 32) | interrupt vector
 	 * R13		= PACA
-	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+	 * guest R12 saved in shadow VCPU SCRATCH0
 	 * guest R13 saved in SPRN_SCRATCH0
 	 */
-	std	r9, HSTATE_SCRATCH2(r13)
-
+	std	r9, HSTATE_SCRATCH1(r13)
 	lbz	r9, HSTATE_IN_GUEST(r13)
 	cmpwi	r9, KVM_GUEST_MODE_HOST_HV
 	beq	kvmppc_bad_host_intr
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	cmpwi	r9, KVM_GUEST_MODE_GUEST
-	ld	r9, HSTATE_SCRATCH2(r13)
+	ld	r9, HSTATE_SCRATCH1(r13)
 	beq	kvmppc_interrupt_pr
 #endif
 	/* We're now back in the host but in guest MMU context */
@@ -1089,13 +1088,14 @@ kvmppc_interrupt_hv:
 	std	r6, VCPU_GPR(R6)(r9)
 	std	r7, VCPU_GPR(R7)(r9)
 	std	r8, VCPU_GPR(R8)(r9)
-	ld	r0, HSTATE_SCRATCH2(r13)
+	ld	r0, HSTATE_SCRATCH1(r13)
 	std	r0, VCPU_GPR(R9)(r9)
 	std	r10, VCPU_GPR(R10)(r9)
 	std	r11, VCPU_GPR(R11)(r9)
 	ld	r3, HSTATE_SCRATCH0(r13)
-	lwz	r4, HSTATE_SCRATCH1(r13)
 	std	r3, VCPU_GPR(R12)(r9)
+	/* CR is in the high half of r12 */
+	srdi	r4, r12, 32
 	stw	r4, VCPU_CR(r9)
 BEGIN_FTR_SECTION
 	ld	r3, HSTATE_CFAR(r13)
@@ -1114,6 +1114,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	mfspr	r11, SPRN_SRR1
 	std	r10, VCPU_SRR0(r9)
 	std	r11, VCPU_SRR1(r9)
+	/* trap is in the low half of r12, clear CR from the high half */
+	clrldi	r12, r12, 32
 	andi.	r0, r12, 2		/* need to read HSRR0/1? */
 	beq	1f
 	mfspr	r10, SPRN_HSRR0
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index ca8f174289bb..68e45080cf93 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -167,20 +167,31 @@ kvmppc_handler_trampoline_enter_end:
  *                                                                            *
  *****************************************************************************/
 
-.global kvmppc_handler_trampoline_exit
-kvmppc_handler_trampoline_exit:
-
 .global kvmppc_interrupt_pr
 kvmppc_interrupt_pr:
+	/* 64-bit entry. Register usage at this point:
+	 *
+	 * SPRG_SCRATCH0   = guest R13
+	 * R12             = (guest CR << 32) | exit handler id
+	 * R13             = PACA
+	 * HSTATE.SCRATCH0 = guest R12
+	 */
+#ifdef CONFIG_PPC64
+	/* Match 32-bit entry */
+	rotldi	r12, r12, 32		  /* Flip R12 halves for stw */
+	stw	r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */
+	srdi	r12, r12, 32		  /* shift trap into low half */
+#endif
 
+.global kvmppc_handler_trampoline_exit
+kvmppc_handler_trampoline_exit:
 	/* Register usage at this point:
 	 *
-	 * SPRG_SCRATCH0  = guest R13
-	 * R12            = exit handler id
-	 * R13            = shadow vcpu (32-bit) or PACA (64-bit)
+	 * SPRG_SCRATCH0   = guest R13
+	 * R12             = exit handler id
+	 * R13             = shadow vcpu (32-bit) or PACA (64-bit)
 	 * HSTATE.SCRATCH0 = guest R12
 	 * HSTATE.SCRATCH1 = guest CR
-	 *
 	 */
 
 	/* Save registers */
-- 
cgit v1.2.3


From 7ede531773ea69fa56b02a873ed83ce3507eb8d5 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 22 Dec 2016 04:29:26 +1000
Subject: KVM: PPC: Book3S: Move 64-bit KVM interrupt handler out from alt
 section

A subsequent patch to make KVM handlers relocation-safe makes them
unusable from within alt section "else" cases (due to the way fixed
addresses are taken from within fixed section head code).

Stop open-coding the KVM handlers, and add them both as normal. A more
optimal fix may be to allow some level of alternate feature patching in
the exception macros themselves, but for now this will do.

The TRAMP_KVM handlers must be moved to the "virt" fixed section area
(name is arbitrary) in order to be closer to .text and avoid the dreaded
"relocation truncated to fit" error.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/head-64.h   | 2 +-
 arch/powerpc/kernel/exceptions-64s.S | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index fca7033839a9..9bd81619d090 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -218,7 +218,7 @@ name:
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 #define TRAMP_KVM_BEGIN(name)						\
-	TRAMP_REAL_BEGIN(name)
+	TRAMP_VIRT_BEGIN(name)
 #else
 #define TRAMP_KVM_BEGIN(name)
 #endif
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d39d6118c6e9..89b4f122aec6 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -717,13 +717,9 @@ hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common,
 					    EXC_HV, SOFTEN_TEST_HV)
-do_kvm_H0x500:
-		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
 	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common,
 					    EXC_STD, SOFTEN_TEST_PR)
-do_kvm_0x500:
-		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 EXC_REAL_END(hardware_interrupt, 0x500, 0x600)
 
@@ -737,6 +733,8 @@ hardware_interrupt_relon_hv:
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x4600)
 
+TRAMP_KVM(PACA_EXGEN, 0x500)
+TRAMP_KVM_HV(PACA_EXGEN, 0x500)
 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
 
-- 
cgit v1.2.3


From 79270e0a3fd124388a0407f9edbd6ace75eacb69 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 28 Jan 2017 21:18:40 +0530
Subject: powerpc/mm/hash: Properly mask the ESID bits when building proto VSID

The proto VSID is built using both the MMU context id and effective
segment ID (ESID). We should not have overlapping bits between those.
That could result in us having a VSID collision. With the current code
we missed masking the top bits of the ESID. This implies that for kernel
addresses we ended up using the top 4 bits of the ESID as part of the
proto VSID, which is wrong.

The current code uses the top 4 context values (0x7fffc - 0x7ffff) for
the kernel. With those context IDs used for the kernel, we don't run
into VSID collisions because we get the same proto VSID irrespective of
whether we mask the ESID bits or not. eg:

  ea         = 0xf000000000000000
  context    = 0x7ffff

  w/out masking:
  proto_vsid = (0x7ffff << 6 | 0xf000000000000000 >> 40)
	     = (0x1ffffc0 | 0xf00000)
	     =  0x1ffffc0

  with masking:
  proto_vsid = (0x7ffff << 6 | ((0xf000000000000000 >> 40) & 0x3f))
	     = (0x1ffffc0 | (0xf00000 & 0x3f))
	     = (0x1ffffc0 | 0)
	     =  0x1ffffc0

So although there is no bug, the code is still overly subtle, so fix it
to save ourselves pain in future.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 2e6a823fa502..823015cff149 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -525,6 +525,9 @@ extern void slb_set_size(u16 size);
 #define ESID_BITS		18
 #define ESID_BITS_1T		6
 
+#define ESID_BITS_MASK		((1 << ESID_BITS) - 1)
+#define ESID_BITS_1T_MASK	((1 << ESID_BITS_1T) - 1)
+
 /*
  * 256MB segment
  * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
@@ -660,9 +663,9 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
 
 	if (ssize == MMU_SEGSIZE_256M)
 		return vsid_scramble((context << ESID_BITS)
-				     | (ea >> SID_SHIFT), 256M);
+				     | ((ea >> SID_SHIFT) & ESID_BITS_MASK), 256M);
 	return vsid_scramble((context << ESID_BITS_1T)
-			     | (ea >> SID_SHIFT_1T), 1T);
+			     | ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK), 1T);
 }
 
 /*
-- 
cgit v1.2.3


From f2a5e8f0023eba847ad2adb145b2f631934bb12b Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Mon, 24 Oct 2016 23:51:51 +0530
Subject: powerpc/fadump: Fix the race in crash_fadump().

Multiple CPUs can call crash_fadump() simultaneously and start duplicating
the same info in the vmcoreinfo ELF note section. This causes makedumpfile
to fail during kdump capture. One example is triggering dumprestart from
the HMC, which sends a system reset to all the CPUs at once.

makedumpfile --dump-dmesg /proc/vmcore
read_vmcoreinfo_basic_info: Invalid data in /tmp/vmcoreinfoyjgxlL: CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971
makedumpfile Failed.
Running makedumpfile --dump-dmesg /proc/vmcore failed (1).

makedumpfile  -d 31 -l /proc/vmcore
read_vmcoreinfo_basic_info: Invalid data in /tmp/vmcoreinfo1mmVdO: CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971CRASHTIME=1475605971
makedumpfile Failed.
Running makedumpfile  -d 31 -l /proc/vmcore failed (1).

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/fadump.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8f0c7c5d93f2..8ff0dd4e77a7 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -406,12 +406,35 @@ static void register_fw_dump(struct fadump_mem_struct *fdm)
 void crash_fadump(struct pt_regs *regs, const char *str)
 {
 	struct fadump_crash_info_header *fdh = NULL;
+	int old_cpu, this_cpu;
 
 	if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
 		return;
 
+	/*
+	 * old_cpu == -1 means this is the first CPU which has come here,
+	 * go ahead and trigger fadump.
+	 *
+	 * old_cpu != -1 means some other CPU is already on its way
+	 * to trigger fadump, just keep looping here.
+	 */
+	this_cpu = smp_processor_id();
+	old_cpu = cmpxchg(&crashing_cpu, -1, this_cpu);
+
+	if (old_cpu != -1) {
+		/*
+		 * We can't loop here indefinitely. Wait as long as fadump
+		 * is in force. If we race with fadump un-registration this
+		 * loop will break and then we go down to normal panic path
+		 * and reboot. If fadump is in force the first crashing
+		 * cpu will definitely trigger fadump.
+		 */
+		while (fw_dump.dump_registered)
+			cpu_relax();
+		return;
+	}
+
 	fdh = __va(fw_dump.fadumphdr_addr);
-	crashing_cpu = smp_processor_id();
 	fdh->crashing_cpu = crashing_cpu;
 	crash_save_vmcoreinfo();
 
-- 
cgit v1.2.3


From 7656cd8e8e23ac4b059f4d96939cb73eb3121ae9 Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.vnet.ibm.com>
Date: Thu, 13 Oct 2016 13:45:30 -0500
Subject: powerpc/mm: Simplify loop control in parse_numa_properties()

The flow of the main loop in parse_numa_properties() is overly
complicated. Simplify it to be less confusing and easier to read.
No functional change.

The end of the main loop in parse_numa_properties() looks like this:

	for_each_node_by_type(...) {
		...
		if (!condition) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		statement();

		if (--ranges)
			goto new_range;
		/* else
		 *	continue; <- implicit, this is the end of the loop
		 */
	}

The only effect of !condition is to skip execution of statement(). This
can be rewritten in a simpler way:

	for_each_node_by_type(...) {
		...
		if (condition)
			statement();

		if (--ranges)
			goto new_range;
	}

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/numa.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b1099cb2f393..51fe1c5b6d71 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -786,14 +786,9 @@ new_range:
 		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
 		node_set_online(nid);
 
-		if (!(size = numa_enforce_memory_limit(start, size))) {
-			if (--ranges)
-				goto new_range;
-			else
-				continue;
-		}
-
-		memblock_set_node(start, size, &memblock.memory, nid);
+		size = numa_enforce_memory_limit(start, size);
+		if (size)
+			memblock_set_node(start, size, &memblock.memory, nid);
 
 		if (--ranges)
 			goto new_range;
-- 
cgit v1.2.3


From 2a8628d41602dc9f988af051a657eef648eec5c0 Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.vnet.ibm.com>
Date: Wed, 16 Nov 2016 10:45:03 -0600
Subject: powerpc/mm: Allow memory hotplug into an offline node

Relax the check preventing us from hotplugging into an offline node.

This limitation was added in commit 482ec7c403d2 ("[PATCH] powerpc numa:
Support sparse online node map") to prevent adding resources to an
uninitialized node.

These days, there is no harm in doing so. The addition will actually
cause the node to be initialized and onlined; add_memory_resource()
calls hotadd_new_pgdat() (if necessary) and node_set_online().

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/numa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 51fe1c5b6d71..16267ff8c86c 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1093,7 +1093,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
 		nid = hot_add_node_scn_to_nid(scn_addr);
 	}
 
-	if (nid < 0 || !node_online(nid))
+	if (nid < 0 || !node_possible(nid))
 		nid = first_online_node;
 
 	return nid;
-- 
cgit v1.2.3


From 1d0761d2557d1540727723e4f05395d53321d555 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Wed, 14 Dec 2016 13:36:51 +1100
Subject: powerpc/powernv: Initialise nest mmu

POWER9 contains an off core mmu called the nest mmu (NMMU). This is
used by other hardware units on the chip to translate virtual
addresses into real addresses. The unit attempting an address
translation provides the majority of the context required for the
translation request except for the base address of the partition table
(i.e. the PTCR) which needs to be programmed into the NMMU.

This patch adds a call to OPAL to set the PTCR for the nest mmu in
opal_init().

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/opal-api.h            |  3 ++-
 arch/powerpc/include/asm/opal.h                |  1 +
 arch/powerpc/include/asm/powernv.h             | 19 +++++++++++++++++++
 arch/powerpc/mm/pgtable-radix.c                |  2 ++
 arch/powerpc/mm/pgtable_64.c                   |  6 +++++-
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 arch/powerpc/platforms/powernv/opal.c          | 11 +++++++++++
 7 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/include/asm/powernv.h

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 0e2e57bcab50..a0aa285869b5 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -167,7 +167,8 @@
 #define OPAL_INT_EOI				124
 #define OPAL_INT_SET_MFRR			125
 #define OPAL_PCI_TCE_KILL			126
-#define OPAL_LAST				126
+#define OPAL_NMMU_SET_PTCR			127
+#define OPAL_LAST				127
 
 /* Device tree flags */
 
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 5c7db0f1a708..08ddea966601 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -232,6 +232,7 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
 int64_t opal_rm_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
 			     uint32_t pe_num, uint32_t tce_size,
 			     uint64_t dma_addr, uint32_t npages);
+int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h
new file mode 100644
index 000000000000..0e9c2402dd20
--- /dev/null
+++ b/arch/powerpc/include/asm/powernv.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2017 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ASM_POWERNV_H
+#define _ASM_POWERNV_H
+
+#ifdef CONFIG_PPC_POWERNV
+extern void powernv_set_nmmu_ptcr(unsigned long ptcr);
+#else
+static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { }
+#endif
+
+#endif /* _ASM_POWERNV_H */
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index cfa53ccc8baf..086522b7c60f 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -18,6 +18,7 @@
 #include <asm/machdep.h>
 #include <asm/mmu.h>
 #include <asm/firmware.h>
+#include <asm/powernv.h>
 
 #include <trace/events/thp.h>
 
@@ -438,6 +439,7 @@ void radix__mmu_cleanup_all(void)
 		lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
 		mtspr(SPRN_PTCR, 0);
+		powernv_set_nmmu_ptcr(0);
 		radix__flush_tlb_all();
 	}
 }
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 8bca7f58afc4..4ee9c9d18760 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -52,6 +52,7 @@
 #include <asm/sections.h>
 #include <asm/firmware.h>
 #include <asm/dma.h>
+#include <asm/powernv.h>
 
 #include "mmu_decl.h"
 
@@ -436,6 +437,7 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 void __init mmu_partition_table_init(void)
 {
 	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+	unsigned long ptcr;
 
 	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
 	partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
@@ -448,7 +450,9 @@ void __init mmu_partition_table_init(void)
 	 * update partition table control register,
 	 * 64 K size.
 	 */
-	mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+	ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
+	mtspr(SPRN_PTCR, ptcr);
+	powernv_set_nmmu_ptcr(ptcr);
 }
 
 void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 3aa40f1b20f5..f7c19c9c57ed 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -311,4 +311,5 @@ OPAL_CALL_REAL(opal_rm_int_eoi,			OPAL_INT_EOI);
 OPAL_CALL(opal_int_set_mfrr,			OPAL_INT_SET_MFRR);
 OPAL_CALL_REAL(opal_rm_int_set_mfrr,		OPAL_INT_SET_MFRR);
 OPAL_CALL(opal_pci_tce_kill,			OPAL_PCI_TCE_KILL);
+OPAL_CALL(opal_nmmu_set_ptcr,			OPAL_NMMU_SET_PTCR);
 OPAL_CALL_REAL(opal_rm_pci_tce_kill,		OPAL_PCI_TCE_KILL);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 282293572dc8..86d9fde93c17 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -875,6 +875,17 @@ int opal_error_code(int rc)
 	}
 }
 
+void powernv_set_nmmu_ptcr(unsigned long ptcr)
+{
+	int rc;
+
+	if (firmware_has_feature(FW_FEATURE_OPAL)) {
+		rc = opal_nmmu_set_ptcr(-1UL, ptcr);
+		if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
+			pr_warn("%s: Unable to set nest mmu ptcr\n", __func__);
+	}
+}
+
 EXPORT_SYMBOL_GPL(opal_poll_events);
 EXPORT_SYMBOL_GPL(opal_rtc_read);
 EXPORT_SYMBOL_GPL(opal_rtc_write);
-- 
cgit v1.2.3


From 616badd2fb499320d3ac3b54462f55dededd0e0f Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Tue, 10 Jan 2017 15:41:44 +1100
Subject: powerpc/powernv: Use OPAL call for TCE kill on NVLink2

Add detection of NPU2 PHBs. NPU2/NVLink2 has a different register
layout for the TCE kill register, so TCE invalidation should be done
via the OPAL call rather than by using the register directly, as is
done for PHB3 and NVLink1. This changes TCE invalidation to use the
OPAL call in the case of an NPU2 PHB model.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 9 ++++++++-
 arch/powerpc/platforms/powernv/pci.c      | 7 +++++++
 arch/powerpc/platforms/powernv/pci.h      | 1 +
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 939de0c2e00f..ddfa069c5e7e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1953,7 +1953,12 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 		struct pnv_phb *phb = pe->phb;
 		unsigned int shift = tbl->it_page_shift;
 
-		if (phb->type == PNV_PHB_NPU) {
+		/*
+		 * NVLink1 can use the TCE kill register directly as
+		 * it's the same as PHB3. NVLink2 is different and
+		 * should go via the OPAL call.
+		 */
+		if (phb->model == PNV_PHB_MODEL_NPU) {
 			/*
 			 * The NVLink hardware does not support TCE kill
 			 * per TCE entry so we have to invalidate
@@ -3674,6 +3679,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 		phb->model = PNV_PHB_MODEL_PHB3;
 	else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
 		phb->model = PNV_PHB_MODEL_NPU;
+	else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
+		phb->model = PNV_PHB_MODEL_NPU2;
 	else
 		phb->model = PNV_PHB_MODEL_UNKNOWN;
 
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index c6d554fe585c..eb835e977e33 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -940,6 +940,13 @@ void __init pnv_pci_init(void)
 	for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb")
 		pnv_pci_init_npu_phb(np);
 
+	/*
+	 * Look for NPU2 PHBs which we treat mostly as NPU PHBs with
+	 * the exception of TCE kill which requires an OPAL call.
+	 */
+	for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb")
+		pnv_pci_init_npu_phb(np);
+
 	/* Configure IOMMU DMA hooks */
 	set_pci_dma_ops(&dma_iommu_ops);
 }
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index e64df7894d6e..e1d3e5526b54 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -19,6 +19,7 @@ enum pnv_phb_model {
 	PNV_PHB_MODEL_P7IOC,
 	PNV_PHB_MODEL_PHB3,
 	PNV_PHB_MODEL_NPU,
+	PNV_PHB_MODEL_NPU2,
 };
 
 #define PNV_PCI_DIAG_BUF_SIZE	8192
-- 
cgit v1.2.3


From 823b7bd5156a93872d9561b3f033dfe5cb80204e Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 25 Jan 2017 14:06:25 +0530
Subject: powernv:idle: Add IDLE_STATE_ENTER_SEQ_NORET macro

Currently all the low-power idle states are expected to wake up
at reset vector 0x100, which is why the macro IDLE_STATE_ENTER_SEQ
puts the CPU into an idle state and never returns.

On ISA v3.0, when the ESL and EC bits in the PSSCR are zero, the CPU
is expected to wake up at the instruction following the idle
instruction.

This patch adds a new macro named IDLE_STATE_ENTER_SEQ_NORET for the
no-return variant and reuses the name IDLE_STATE_ENTER_SEQ
for a variant that allows resuming operation at the instruction next
to the idle-instruction.

Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cpuidle.h   |  5 ++++-
 arch/powerpc/kernel/exceptions-64s.S |  6 +++---
 arch/powerpc/kernel/idle_book3s.S    | 10 +++++-----
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 3919332965af..0a3255b12587 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -21,7 +21,7 @@ extern u64 pnv_first_deep_stop_state;
 
 /* Idle state entry routines */
 #ifdef	CONFIG_PPC_P7_NAP
-#define	IDLE_STATE_ENTER_SEQ(IDLE_INST)				\
+#define IDLE_STATE_ENTER_SEQ(IDLE_INST)                         \
 	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
 	std	r0,0(r1);					\
 	ptesync;						\
@@ -29,6 +29,9 @@ extern u64 pnv_first_deep_stop_state;
 1:	cmpd	cr0,r0,r0;					\
 	bne	1b;						\
 	IDLE_INST;						\
+
+#define	IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST)			\
+	IDLE_STATE_ENTER_SEQ(IDLE_INST)                         \
 	b	.
 #endif /* CONFIG_PPC_P7_NAP */
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d39d6118c6e9..069aac8af909 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -381,12 +381,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	lbz	r3,PACA_THREAD_IDLE_STATE(r13)
 	cmpwi	r3,PNV_THREAD_NAP
 	bgt	10f
-	IDLE_STATE_ENTER_SEQ(PPC_NAP)
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
 	/* No return */
 10:
 	cmpwi	r3,PNV_THREAD_SLEEP
 	bgt	2f
-	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
 	/* No return */
 
 2:
@@ -400,7 +400,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	 */
 	ori	r13,r13,1
 	SET_PACA(r13)
-	IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
 	/* No return */
 4:
 #endif
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 72dac0b58061..be90e2f62bba 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -205,7 +205,7 @@ pnv_enter_arch207_idle_mode:
 	stb	r3,PACA_THREAD_IDLE_STATE(r13)
 	cmpwi	cr3,r3,PNV_THREAD_SLEEP
 	bge	cr3,2f
-	IDLE_STATE_ENTER_SEQ(PPC_NAP)
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
 	/* No return */
 2:
 	/* Sleep or winkle */
@@ -239,7 +239,7 @@ pnv_fastsleep_workaround_at_entry:
 
 common_enter: /* common code for all the threads entering sleep or winkle */
 	bgt	cr3,enter_winkle
-	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
 
 fastsleep_workaround_at_entry:
 	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
@@ -261,7 +261,7 @@ fastsleep_workaround_at_entry:
 enter_winkle:
 	bl	save_sprs_to_stack
 
-	IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
 
 /*
  * r3 - requested stop state
@@ -280,7 +280,7 @@ power_enter_stop:
 	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 	cmpd	r3,r4
 	bge	2f
-	IDLE_STATE_ENTER_SEQ(PPC_STOP)
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP)
 2:
 /*
  * Entering deep idle state.
@@ -302,7 +302,7 @@ lwarx_loop_stop:
 
 	bl	save_sprs_to_stack
 
-	IDLE_STATE_ENTER_SEQ(PPC_STOP)
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP)
 
 _GLOBAL(power7_idle)
 	/* Now check if user or arch enabled NAP mode */
-- 
cgit v1.2.3


From dd34c74c97b6c3ed1ac7caec0b46267142659aff Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 25 Jan 2017 14:06:26 +0530
Subject: powernv:stop: Rename pnv_arch300_idle_init to pnv_power9_idle_init

Balbir pointed out that the name of the function pnv_arch300_idle_init
was inconsistent with the names of the variables and functions
pertaining to POWER9 features in book3s_idle.S.

This patch renames pnv_arch300_idle_init to pnv_power9_idle_init.

This patch does not change any behaviour.

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/idle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 479c25601612..57bec031291b 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -298,7 +298,7 @@ u64 pnv_deepest_stop_state;
  * @dt_idle_states: Number of idle state entries
  * Returns 0 on success
  */
-static int __init pnv_arch300_idle_init(struct device_node *np, u32 *flags,
+static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
 					int dt_idle_states)
 {
 	u64 *psscr_val = NULL;
@@ -373,7 +373,7 @@ static void __init pnv_probe_idle_states(void)
 	}
 
 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-		if (pnv_arch300_idle_init(np, flags, dt_idle_states))
+		if (pnv_power9_idle_init(np, flags, dt_idle_states))
 			goto out;
 	}
 
-- 
cgit v1.2.3


From 9e9fc6f00a54f7064dc681ac187be6498d566a4f Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 25 Jan 2017 14:06:27 +0530
Subject: cpuidle:powernv: Add helper function to populate powernv idle states.

In the current code for powernv_add_idle_states, there is a lot of code
duplication while initializing an idle state in powernv_states table.

Add an inline helper function to populate the powernv_states[] table
for a given idle state. Invoke this for populating the "Nap",
"Fastsleep" and the stop states in powernv_add_idle_states.

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/cpuidle/cpuidle-powernv.c | 89 +++++++++++++++++++++++----------------
 include/linux/cpuidle.h           |  1 +
 2 files changed, 54 insertions(+), 36 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 0835a37a5f3a..6871b7f34dc8 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -20,6 +20,10 @@
 #include <asm/opal.h>
 #include <asm/runlatch.h>
 
+/*
+ * Expose only those Hardware idle states via the cpuidle framework
+ * that have latency value below POWERNV_THRESHOLD_LATENCY_NS.
+ */
 #define POWERNV_THRESHOLD_LATENCY_NS 200000
 
 static struct cpuidle_driver powernv_idle_driver = {
@@ -167,6 +171,24 @@ static int powernv_cpuidle_driver_init(void)
 	return 0;
 }
 
+static inline void add_powernv_state(int index, const char *name,
+				     unsigned int flags,
+				     int (*idle_fn)(struct cpuidle_device *,
+						    struct cpuidle_driver *,
+						    int),
+				     unsigned int target_residency,
+				     unsigned int exit_latency,
+				     u64 psscr_val)
+{
+	strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
+	strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
+	powernv_states[index].flags = flags;
+	powernv_states[index].target_residency = target_residency;
+	powernv_states[index].exit_latency = exit_latency;
+	powernv_states[index].enter = idle_fn;
+	stop_psscr_table[index] = psscr_val;
+}
+
 static int powernv_add_idle_states(void)
 {
 	struct device_node *power_mgt;
@@ -236,6 +258,7 @@ static int powernv_add_idle_states(void)
 		"ibm,cpu-idle-state-residency-ns", residency_ns, dt_idle_states);
 
 	for (i = 0; i < dt_idle_states; i++) {
+		unsigned int exit_latency, target_residency;
 		/*
 		 * If an idle state has exit latency beyond
 		 * POWERNV_THRESHOLD_LATENCY_NS then don't use it
@@ -243,28 +266,33 @@ static int powernv_add_idle_states(void)
 		 */
 		if (latency_ns[i] > POWERNV_THRESHOLD_LATENCY_NS)
 			continue;
+		/*
+		 * Firmware passes residency and latency values in ns.
+		 * cpuidle expects it in us.
+		 */
+		exit_latency = latency_ns[i] / 1000;
+		if (!rc)
+			target_residency = residency_ns[i] / 1000;
+		else
+			target_residency = 0;
 
 		/*
-		 * Cpuidle accepts exit_latency and target_residency in us.
-		 * Use default target_residency values if f/w does not expose it.
+		 * For nap and fastsleep, use default target_residency
+		 * values if f/w does not expose it.
 		 */
 		if (flags[i] & OPAL_PM_NAP_ENABLED) {
+			if (!rc)
+				target_residency = 100;
 			/* Add NAP state */
-			strcpy(powernv_states[nr_idle_states].name, "Nap");
-			strcpy(powernv_states[nr_idle_states].desc, "Nap");
-			powernv_states[nr_idle_states].flags = 0;
-			powernv_states[nr_idle_states].target_residency = 100;
-			powernv_states[nr_idle_states].enter = nap_loop;
+			add_powernv_state(nr_idle_states, "Nap",
+					  CPUIDLE_FLAG_NONE, nap_loop,
+					  target_residency, exit_latency, 0);
 		} else if ((flags[i] & OPAL_PM_STOP_INST_FAST) &&
 				!(flags[i] & OPAL_PM_TIMEBASE_STOP)) {
-			strncpy(powernv_states[nr_idle_states].name,
-				names[i], CPUIDLE_NAME_LEN);
-			strncpy(powernv_states[nr_idle_states].desc,
-				names[i], CPUIDLE_NAME_LEN);
-			powernv_states[nr_idle_states].flags = 0;
-
-			powernv_states[nr_idle_states].enter = stop_loop;
-			stop_psscr_table[nr_idle_states] = psscr_val[i];
+			add_powernv_state(nr_idle_states, names[i],
+					  CPUIDLE_FLAG_NONE, stop_loop,
+					  target_residency, exit_latency,
+					  psscr_val[i]);
 		}
 
 		/*
@@ -274,32 +302,21 @@ static int powernv_add_idle_states(void)
 #ifdef CONFIG_TICK_ONESHOT
 		if (flags[i] & OPAL_PM_SLEEP_ENABLED ||
 			flags[i] & OPAL_PM_SLEEP_ENABLED_ER1) {
+			if (!rc)
+				target_residency = 300000;
 			/* Add FASTSLEEP state */
-			strcpy(powernv_states[nr_idle_states].name, "FastSleep");
-			strcpy(powernv_states[nr_idle_states].desc, "FastSleep");
-			powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIMER_STOP;
-			powernv_states[nr_idle_states].target_residency = 300000;
-			powernv_states[nr_idle_states].enter = fastsleep_loop;
+			add_powernv_state(nr_idle_states, "FastSleep",
+					  CPUIDLE_FLAG_TIMER_STOP,
+					  fastsleep_loop,
+					  target_residency, exit_latency, 0);
 		} else if ((flags[i] & OPAL_PM_STOP_INST_DEEP) &&
 				(flags[i] & OPAL_PM_TIMEBASE_STOP)) {
-			strncpy(powernv_states[nr_idle_states].name,
-				names[i], CPUIDLE_NAME_LEN);
-			strncpy(powernv_states[nr_idle_states].desc,
-				names[i], CPUIDLE_NAME_LEN);
-
-			powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIMER_STOP;
-			powernv_states[nr_idle_states].enter = stop_loop;
-			stop_psscr_table[nr_idle_states] = psscr_val[i];
+			add_powernv_state(nr_idle_states, names[i],
+					  CPUIDLE_FLAG_TIMER_STOP, stop_loop,
+					  target_residency, exit_latency,
+					  psscr_val[i]);
 		}
 #endif
-		powernv_states[nr_idle_states].exit_latency =
-				((unsigned int)latency_ns[i]) / 1000;
-
-		if (!rc) {
-			powernv_states[nr_idle_states].target_residency =
-				((unsigned int)residency_ns[i]) / 1000;
-		}
-
 		nr_idle_states++;
 	}
 out:
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index da346f2817a8..fc1e5d7fc1c7 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -62,6 +62,7 @@ struct cpuidle_state {
 };
 
 /* Idle State Flags */
+#define CPUIDLE_FLAG_NONE       (0x00)
 #define CPUIDLE_FLAG_COUPLED	(0x02) /* state applies to multiple cpus */
 #define CPUIDLE_FLAG_TIMER_STOP (0x04)  /* timer is stopped on this state */
 
-- 
cgit v1.2.3


From 09206b600c76f20984e80d99f3b5343c79332a97 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 25 Jan 2017 14:06:28 +0530
Subject: powernv: Pass PSSCR value and mask to power9_idle_stop

The power9_idle_stop method currently takes only the requested stop
level as a parameter and picks up the rest of the PSSCR bits from a
hand-coded macro. This is not a very flexible design, especially when
the firmware has the capability to communicate the psscr value and the
mask associated with a particular stop state via device tree.

This patch modifies the power9_idle_stop API to take as parameters the
PSSCR value and the PSSCR mask corresponding to the stop state that
needs to be set. These PSSCR value and mask are respectively obtained
by parsing the "ibm,cpu-idle-state-psscr" and
"ibm,cpu-idle-state-psscr-mask" fields from the device tree.

In addition to this, the patch adds support for handling stop states
for which ESL and EC bits in the PSSCR are zero. As per the
architecture, a wakeup from these stop states resumes execution from
the subsequent instruction as opposed to waking up at the System
Vector.

The older firmware sets only the Requested Level (RL) field in the
psscr and psscr-mask exposed in the device tree. For older firmware
where psscr-mask=0xf, this patch will set sane default values for the
remaining PSSCR fields (i.e. PSLL, MTL, ESL, EC, and
TR). For the new firmware, the patch will validate that the invariants
required by the ISA for the psscr values are maintained by the
firmware.

This skiboot patch that exports fully populated PSSCR values and the
mask for all the stop states can be found here:
https://lists.ozlabs.org/pipermail/skiboot/2016-September/004869.html

[Optimize the number of instructions before entering STOP with
ESL=EC=0, and validate that the PSSCR values provided by the firmware
maintain the invariants required as per the ISA, as suggested by Balbir
Singh]

Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cpuidle.h       |  44 ++++++++++
 arch/powerpc/include/asm/processor.h     |   3 +-
 arch/powerpc/kernel/idle_book3s.S        |  30 ++++---
 arch/powerpc/platforms/powernv/idle.c    | 138 ++++++++++++++++++++++++++++---
 arch/powerpc/platforms/powernv/powernv.h |   3 +-
 arch/powerpc/platforms/powernv/smp.c     |  14 ++--
 drivers/cpuidle/cpuidle-powernv.c        |  52 +++++++++---
 7 files changed, 241 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 0a3255b12587..fd321eb423cb 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -10,11 +10,55 @@
 #define PNV_CORE_IDLE_LOCK_BIT          0x100
 #define PNV_CORE_IDLE_THREAD_BITS       0x0FF
 
+/*
+ * ============================ NOTE =================================
+ * The older firmware populates only the RL field in the psscr_val and
+ * sets the psscr_mask to 0xf. On such a firmware, the kernel sets the
+ * remaining PSSCR fields to default values as follows:
+ *
+ * - ESL and EC bits are set to 1. So wakeup from any stop state will be
+ *   at vector 0x100.
+ *
+ * - MTL and PSLL are set to the maximum allowed value as per the ISA,
+ *    i.e. 15.
+ *
+ * - The Transition Rate, TR is set to the Maximum value 3.
+ */
+#define PSSCR_HV_DEFAULT_VAL    (PSSCR_ESL | PSSCR_EC |		    \
+				PSSCR_PSLL_MASK | PSSCR_TR_MASK |   \
+				PSSCR_MTL_MASK)
+
+#define PSSCR_HV_DEFAULT_MASK   (PSSCR_ESL | PSSCR_EC |		    \
+				PSSCR_PSLL_MASK | PSSCR_TR_MASK |   \
+				PSSCR_MTL_MASK | PSSCR_RL_MASK)
+#define PSSCR_EC_SHIFT    20
+#define PSSCR_ESL_SHIFT   21
+#define GET_PSSCR_EC(x)   (((x) & PSSCR_EC) >> PSSCR_EC_SHIFT)
+#define GET_PSSCR_ESL(x)  (((x) & PSSCR_ESL) >> PSSCR_ESL_SHIFT)
+#define GET_PSSCR_RL(x)   ((x) & PSSCR_RL_MASK)
+
+#define ERR_EC_ESL_MISMATCH		-1
+#define ERR_DEEP_STATE_ESL_MISMATCH	-2
+
 #ifndef __ASSEMBLY__
 extern u32 pnv_fastsleep_workaround_at_entry[];
 extern u32 pnv_fastsleep_workaround_at_exit[];
 
 extern u64 pnv_first_deep_stop_state;
+
+int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags);
+static inline void report_invalid_psscr_val(u64 psscr_val, int err)
+{
+	switch (err) {
+	case ERR_EC_ESL_MISMATCH:
+		pr_warn("Invalid psscr 0x%016llx : ESL,EC bits unequal",
+			psscr_val);
+		break;
+	case ERR_DEEP_STATE_ESL_MISMATCH:
+		pr_warn("Invalid psscr 0x%016llx : ESL cleared for deep stop-state",
+			psscr_val);
+	}
+}
 #endif
 
 #endif
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 1ba814436c73..21e0b52685b5 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -454,7 +454,8 @@ extern int powersave_nap;	/* set if nap mode can be used in idle loop */
 extern unsigned long power7_nap(int check_irq);
 extern unsigned long power7_sleep(void);
 extern unsigned long power7_winkle(void);
-extern unsigned long power9_idle_stop(unsigned long stop_level);
+extern unsigned long power9_idle_stop(unsigned long stop_psscr_val,
+				      unsigned long stop_psscr_mask);
 
 extern void flush_instruction_cache(void);
 extern void hard_reset_now(void);
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index be90e2f62bba..4f6cf5596235 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -40,9 +40,7 @@
 #define _WORC	GPR11
 #define _PTCR	GPR12
 
-#define PSSCR_HV_TEMPLATE	PSSCR_ESL | PSSCR_EC | \
-				PSSCR_PSLL_MASK | PSSCR_TR_MASK | \
-				PSSCR_MTL_MASK
+#define PSSCR_EC_ESL_MASK_SHIFTED          (PSSCR_EC | PSSCR_ESL) >> 16
 
 	.text
 
@@ -264,7 +262,7 @@ enter_winkle:
 	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
 
 /*
- * r3 - requested stop state
+ * r3 - PSSCR value corresponding to the requested stop state.
  */
 power_enter_stop:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -273,10 +271,19 @@ power_enter_stop:
 	/* DO THIS IN REAL MODE!  See comment above. */
 	stb	r4,HSTATE_HWTHREAD_STATE(r13)
 #endif
+/*
+ * Check if we are executing the lite variant with ESL=EC=0
+ */
+	andis.   r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
+	clrldi   r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */
+	bne	 1f
+	IDLE_STATE_ENTER_SEQ(PPC_STOP)
+	li	r3,0  /* Since we didn't lose state, return 0 */
+	b 	pnv_wakeup_noloss
 /*
  * Check if the requested state is a deep idle state.
  */
-	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
+1:	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
 	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 	cmpd	r3,r4
 	bge	2f
@@ -353,16 +360,17 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
 	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
 20:	nop;
 
-
 /*
- * r3 - requested stop state
+ * r3 - The PSSCR value corresponding to the stop state.
+ * r4 - The PSSCR mask corresponding to the stop state.
  */
 _GLOBAL(power9_idle_stop)
-	LOAD_REG_IMMEDIATE(r4, PSSCR_HV_TEMPLATE)
-	or	r4,r4,r3
-	mtspr	SPRN_PSSCR, r4
-	li	r4, 1
+	mfspr   r5,SPRN_PSSCR
+	andc    r5,r5,r4
+	or      r3,r3,r5
+	mtspr 	SPRN_PSSCR,r3
 	LOAD_REG_ADDR(r5,power_enter_stop)
+	li	r4,1
 	b	pnv_powersave_common
 	/* No return */
 /*
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 57bec031291b..4ee837e6391a 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -237,15 +237,21 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
 			show_fastsleep_workaround_applyonce,
 			store_fastsleep_workaround_applyonce);
 
+/*
+ * The default stop state that will be used by ppc_md.power_save
+ * function on platforms that support stop instruction.
+ */
+u64 pnv_default_stop_val;
+u64 pnv_default_stop_mask;
 
 /*
  * Used for ppc_md.power_save which needs a function with no parameters
  */
 static void power9_idle(void)
 {
-	/* Requesting stop state 0 */
-	power9_idle_stop(0);
+	power9_idle_stop(pnv_default_stop_val, pnv_default_stop_mask);
 }
+
 /*
  * First deep stop state. Used to figure out when to save/restore
  * hypervisor context.
@@ -253,9 +259,11 @@ static void power9_idle(void)
 u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
 
 /*
- * Deepest stop idle state. Used when a cpu is offlined
+ * psscr value and mask of the deepest stop idle state.
+ * Used when a cpu is offlined.
  */
-u64 pnv_deepest_stop_state;
+u64 pnv_deepest_stop_psscr_val;
+u64 pnv_deepest_stop_psscr_mask;
 
 /*
  * Power ISA 3.0 idle initialization.
@@ -292,6 +300,44 @@ u64 pnv_deepest_stop_state;
  *	Bits 60:63 - Requested Level
  *	Used to specify which power-saving level must be entered on executing
  *	stop instruction
+ */
+
+int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
+{
+	int err = 0;
+
+	/*
+	 * psscr_mask == 0xf indicates an older firmware.
+	 * Set remaining fields of psscr to the default values.
+	 * See NOTE above definition of PSSCR_HV_DEFAULT_VAL
+	 */
+	if (*psscr_mask == 0xf) {
+		*psscr_val = *psscr_val | PSSCR_HV_DEFAULT_VAL;
+		*psscr_mask = PSSCR_HV_DEFAULT_MASK;
+		return err;
+	}
+
+	/*
+	 * New firmware is expected to set the psscr_val bits correctly.
+	 * Validate that the following invariants are correctly maintained by
+	 * the new firmware.
+	 * - ESL bit value matches the EC bit value.
+	 * - ESL bit is set for all the deep stop states.
+	 */
+	if (GET_PSSCR_ESL(*psscr_val) != GET_PSSCR_EC(*psscr_val)) {
+		err = ERR_EC_ESL_MISMATCH;
+	} else if ((flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
+		GET_PSSCR_ESL(*psscr_val) == 0) {
+		err = ERR_DEEP_STATE_ESL_MISMATCH;
+	}
+
+	return err;
+}
+
+/*
+ * pnv_power9_idle_init: Initializes the default idle state, first
+ *                       deep idle state and deepest idle state on
+ *                       ISA 3.0 CPUs.
  *
  * @np: /ibm,opal/power-mgt device node
  * @flags: cpu-idle-state-flags array
@@ -302,43 +348,109 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
 					int dt_idle_states)
 {
 	u64 *psscr_val = NULL;
+	u64 *psscr_mask = NULL;
+	u32 *residency_ns = NULL;
+	u64 max_residency_ns = 0;
 	int rc = 0, i;
+	bool default_stop_found = false, deepest_stop_found = false;
 
-	psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val),
-				GFP_KERNEL);
-	if (!psscr_val) {
+	psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
+	psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
+	residency_ns = kcalloc(dt_idle_states, sizeof(*residency_ns),
+			       GFP_KERNEL);
+
+	if (!psscr_val || !psscr_mask || !residency_ns) {
 		rc = -1;
 		goto out;
 	}
+
 	if (of_property_read_u64_array(np,
 		"ibm,cpu-idle-state-psscr",
 		psscr_val, dt_idle_states)) {
-		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-states-psscr in DT\n");
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n");
+		rc = -1;
+		goto out;
+	}
+
+	if (of_property_read_u64_array(np,
+				       "ibm,cpu-idle-state-psscr-mask",
+				       psscr_mask, dt_idle_states)) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask in DT\n");
+		rc = -1;
+		goto out;
+	}
+
+	if (of_property_read_u32_array(np,
+				       "ibm,cpu-idle-state-residency-ns",
+					residency_ns, dt_idle_states)) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n");
 		rc = -1;
 		goto out;
 	}
 
 	/*
-	 * Set pnv_first_deep_stop_state and pnv_deepest_stop_state.
+	 * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
+	 * and the pnv_default_stop_{val,mask}.
+	 *
 	 * pnv_first_deep_stop_state should be set to the first stop
 	 * level to cause hypervisor state loss.
-	 * pnv_deepest_stop_state should be set to the deepest stop
-	 * stop state.
+	 *
+	 * pnv_deepest_stop_{val,mask} should be set to values corresponding to
+	 * the deepest stop state.
+	 *
+	 * pnv_default_stop_{val,mask} should be set to values corresponding to
+	 * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
 	 */
 	pnv_first_deep_stop_state = MAX_STOP_STATE;
 	for (i = 0; i < dt_idle_states; i++) {
+		int err;
 		u64 psscr_rl = psscr_val[i] & PSSCR_RL_MASK;
 
 		if ((flags[i] & OPAL_PM_LOSE_FULL_CONTEXT) &&
 		     (pnv_first_deep_stop_state > psscr_rl))
 			pnv_first_deep_stop_state = psscr_rl;
 
-		if (pnv_deepest_stop_state < psscr_rl)
-			pnv_deepest_stop_state = psscr_rl;
+		err = validate_psscr_val_mask(&psscr_val[i], &psscr_mask[i],
+					      flags[i]);
+		if (err) {
+			report_invalid_psscr_val(psscr_val[i], err);
+			continue;
+		}
+
+		if (max_residency_ns < residency_ns[i]) {
+			max_residency_ns = residency_ns[i];
+			pnv_deepest_stop_psscr_val = psscr_val[i];
+			pnv_deepest_stop_psscr_mask = psscr_mask[i];
+			deepest_stop_found = true;
+		}
+
+		if (!default_stop_found &&
+		    (flags[i] & OPAL_PM_STOP_INST_FAST)) {
+			pnv_default_stop_val = psscr_val[i];
+			pnv_default_stop_mask = psscr_mask[i];
+			default_stop_found = true;
+		}
+	}
+
+	if (!default_stop_found) {
+		pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL;
+		pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK;
+		pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+			pnv_default_stop_val, pnv_default_stop_mask);
+	}
+
+	if (!deepest_stop_found) {
+		pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL;
+		pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK;
+		pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+			pnv_deepest_stop_psscr_val,
+			pnv_deepest_stop_psscr_mask);
 	}
 
 out:
 	kfree(psscr_val);
+	kfree(psscr_mask);
+	kfree(residency_ns);
 	return rc;
 }
 
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index da7c843ac7f1..613052232475 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -18,7 +18,8 @@ static inline void pnv_pci_shutdown(void) { }
 #endif
 
 extern u32 pnv_get_supported_cpuidle_states(void);
-extern u64 pnv_deepest_stop_state;
+extern u64 pnv_deepest_stop_psscr_val;
+extern u64 pnv_deepest_stop_psscr_mask;
 
 extern void pnv_lpc_init(void);
 
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c789258ae1e1..1c6405fb769a 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -182,15 +182,17 @@ static void pnv_smp_cpu_kill_self(void)
 
 		ppc64_runlatch_off();
 
-		if (cpu_has_feature(CPU_FTR_ARCH_300))
-			srr1 = power9_idle_stop(pnv_deepest_stop_state);
-		else if (idle_states & OPAL_PM_WINKLE_ENABLED)
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
+						pnv_deepest_stop_psscr_mask);
+		} else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
 			srr1 = power7_winkle();
-		else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
-				(idle_states & OPAL_PM_SLEEP_ENABLED_ER1))
+		} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
+			   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
 			srr1 = power7_sleep();
-		else
+		} else {
 			srr1 = power7_nap(1);
+		}
 
 		ppc64_runlatch_on();
 
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 6871b7f34dc8..370593006f5f 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -19,6 +19,7 @@
 #include <asm/firmware.h>
 #include <asm/opal.h>
 #include <asm/runlatch.h>
+#include <asm/cpuidle.h>
 
 /*
  * Expose only those Hardware idle states via the cpuidle framework
@@ -34,7 +35,12 @@ static struct cpuidle_driver powernv_idle_driver = {
 static int max_idle_state;
 static struct cpuidle_state *cpuidle_state_table;
 
-static u64 stop_psscr_table[CPUIDLE_STATE_MAX];
+struct stop_psscr_table {
+	u64 val;
+	u64 mask;
+};
+
+static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX];
 
 static u64 snooze_timeout;
 static bool snooze_timeout_en;
@@ -106,7 +112,8 @@ static int stop_loop(struct cpuidle_device *dev,
 		     int index)
 {
 	ppc64_runlatch_off();
-	power9_idle_stop(stop_psscr_table[index]);
+	power9_idle_stop(stop_psscr_table[index].val,
+			 stop_psscr_table[index].mask);
 	ppc64_runlatch_on();
 	return index;
 }
@@ -178,7 +185,7 @@ static inline void add_powernv_state(int index, const char *name,
 						    int),
 				     unsigned int target_residency,
 				     unsigned int exit_latency,
-				     u64 psscr_val)
+				     u64 psscr_val, u64 psscr_mask)
 {
 	strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
 	strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
@@ -186,7 +193,8 @@ static inline void add_powernv_state(int index, const char *name,
 	powernv_states[index].target_residency = target_residency;
 	powernv_states[index].exit_latency = exit_latency;
 	powernv_states[index].enter = idle_fn;
-	stop_psscr_table[index] = psscr_val;
+	stop_psscr_table[index].val = psscr_val;
+	stop_psscr_table[index].mask = psscr_mask;
 }
 
 static int powernv_add_idle_states(void)
@@ -198,7 +206,9 @@ static int powernv_add_idle_states(void)
 	u32 residency_ns[CPUIDLE_STATE_MAX];
 	u32 flags[CPUIDLE_STATE_MAX];
 	u64 psscr_val[CPUIDLE_STATE_MAX];
+	u64 psscr_mask[CPUIDLE_STATE_MAX];
 	const char *names[CPUIDLE_STATE_MAX];
+	u32 has_stop_states = 0;
 	int i, rc;
 
 	/* Currently we have snooze statically defined */
@@ -245,15 +255,25 @@ static int powernv_add_idle_states(void)
 
 	/*
 	 * If the idle states use stop instruction, probe for psscr values
-	 * which are necessary to specify required stop level.
+	 * and psscr mask which are necessary to specify required stop level.
 	 */
-	if (flags[0] & (OPAL_PM_STOP_INST_FAST | OPAL_PM_STOP_INST_DEEP))
+	has_stop_states = (flags[0] &
+			   (OPAL_PM_STOP_INST_FAST | OPAL_PM_STOP_INST_DEEP));
+	if (has_stop_states) {
 		if (of_property_read_u64_array(power_mgt,
 		    "ibm,cpu-idle-state-psscr", psscr_val, dt_idle_states)) {
-			pr_warn("cpuidle-powernv: missing ibm,cpu-idle-states-psscr in DT\n");
+			pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n");
 			goto out;
 		}
 
+		if (of_property_read_u64_array(power_mgt,
+					       "ibm,cpu-idle-state-psscr-mask",
+						psscr_mask, dt_idle_states)) {
+			pr_warn("cpuidle-powernv:Missing ibm,cpu-idle-state-psscr-mask in DT\n");
+			goto out;
+		}
+	}
+
 	rc = of_property_read_u32_array(power_mgt,
 		"ibm,cpu-idle-state-residency-ns", residency_ns, dt_idle_states);
 
@@ -276,6 +296,16 @@ static int powernv_add_idle_states(void)
 		else
 			target_residency = 0;
 
+		if (has_stop_states) {
+			int err = validate_psscr_val_mask(&psscr_val[i],
+							  &psscr_mask[i],
+							  flags[i]);
+			if (err) {
+				report_invalid_psscr_val(psscr_val[i], err);
+				continue;
+			}
+		}
+
 		/*
 		 * For nap and fastsleep, use default target_residency
 		 * values if f/w does not expose it.
@@ -286,13 +316,13 @@ static int powernv_add_idle_states(void)
 			/* Add NAP state */
 			add_powernv_state(nr_idle_states, "Nap",
 					  CPUIDLE_FLAG_NONE, nap_loop,
-					  target_residency, exit_latency, 0);
+					  target_residency, exit_latency, 0, 0);
 		} else if ((flags[i] & OPAL_PM_STOP_INST_FAST) &&
 				!(flags[i] & OPAL_PM_TIMEBASE_STOP)) {
 			add_powernv_state(nr_idle_states, names[i],
 					  CPUIDLE_FLAG_NONE, stop_loop,
 					  target_residency, exit_latency,
-					  psscr_val[i]);
+					  psscr_val[i], psscr_mask[i]);
 		}
 
 		/*
@@ -308,13 +338,13 @@ static int powernv_add_idle_states(void)
 			add_powernv_state(nr_idle_states, "FastSleep",
 					  CPUIDLE_FLAG_TIMER_STOP,
 					  fastsleep_loop,
-					  target_residency, exit_latency, 0);
+					  target_residency, exit_latency, 0, 0);
 		} else if ((flags[i] & OPAL_PM_STOP_INST_DEEP) &&
 				(flags[i] & OPAL_PM_TIMEBASE_STOP)) {
 			add_powernv_state(nr_idle_states, names[i],
 					  CPUIDLE_FLAG_TIMER_STOP, stop_loop,
 					  target_residency, exit_latency,
-					  psscr_val[i]);
+					  psscr_val[i], psscr_mask[i]);
 		}
 #endif
 		nr_idle_states++;
-- 
cgit v1.2.3


From b48ff52043f489d594b989b318c120ca340a2e41 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Wed, 25 Jan 2017 14:06:29 +0530
Subject: Documentation:powerpc: Add device-tree bindings for power-mgt

Document the device-tree bindings defining the properties under
the @power-mgt node in the device tree that describe the idle states
for Linux running on baremetal POWER servers.

These bindings are documented separately instead of using the
common idle state bindings since the idle-states on POWER servers
are exposed as property arrays, whereas the common idle state bindings
expect idle-states to be described as nodes.

Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../devicetree/bindings/powerpc/opal/power-mgt.txt | 118 +++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/powerpc/opal/power-mgt.txt

diff --git a/Documentation/devicetree/bindings/powerpc/opal/power-mgt.txt b/Documentation/devicetree/bindings/powerpc/opal/power-mgt.txt
new file mode 100644
index 000000000000..9d619e955576
--- /dev/null
+++ b/Documentation/devicetree/bindings/powerpc/opal/power-mgt.txt
@@ -0,0 +1,118 @@
+IBM Power-Management Bindings
+=============================
+
+Linux running on baremetal POWER machines has access to the processor
+idle states. The description of these idle states is exposed via the
+node @power-mgt in the device-tree by the firmware.
+
+Definitions:
+----------------
+Typically each idle state has the following associated properties:
+
+- name: The name of the idle state as defined by the firmware.
+
+- flags: Indicates some aspects of this idle state, such as the
+         extent of state-loss, whether the timebase is stopped in this
+         idle state, and so on. See ibm,cpu-idle-state-flags below.
+
+- exit-latency: The latency involved in transitioning the state of the
+		CPU from idle to running.
+
+- target-residency: The minimum time that the CPU needs to reside in
+		    this idle state in order to accrue power-savings
+		    benefit.
+
+Properties
+----------------
+The following properties provide details about the idle states. These
+properties are exposed as arrays. Each entry in the property array
+provides the value of that property for the idle state associated with
+the array index of that entry.
+
+If idle-states are defined, then the properties
+"ibm,cpu-idle-state-names" and "ibm,cpu-idle-state-flags" are
+required. The other properties are required unless mentioned
+otherwise. The length of all the property arrays must be the same.
+
+- ibm,cpu-idle-state-names:
+	Array of strings containing the names of the idle states.
+
+- ibm,cpu-idle-state-flags:
+	Array of unsigned 32-bit values containing the values of the
+	flags associated with the aforementioned idle-states. The
+	flag bits are as follows:
+		0x00000001 /* Decrementer would stop */
+		0x00000002 /* Needs timebase restore */
+		0x00001000 /* Restore GPRs like nap */
+		0x00002000 /* Restore hypervisor resource from PACA pointer */
+		0x00004000 /* Program PORE to restore PACA pointer */
+		0x00010000 /* This is a nap state (POWER7,POWER8) */
+		0x00020000 /* This is a fast-sleep state (POWER8)*/
+		0x00040000 /* This is a winkle state (POWER8) */
+		0x00080000 /* This is a fast-sleep state which requires a */
+			   /* software workaround for restoring the */
+			   /* timebase (POWER8) */
+		0x00800000 /* This state uses SPR PMICR instruction */
+			   /* (POWER8)*/
+		0x00100000 /* This is a fast stop state (POWER9) */
+		0x00200000 /* This is a deep-stop state (POWER9) */
+
+- ibm,cpu-idle-state-latencies-ns:
+	Array of unsigned 32-bit values containing the values of the
+	exit-latencies (in ns) for the idle states in
+	ibm,cpu-idle-state-names.
+
+- ibm,cpu-idle-state-residency-ns:
+	Array of unsigned 32-bit values containing the values of the
+	target-residency (in ns) for the idle states in
+	ibm,cpu-idle-state-names. On POWER8 this is an optional
+	property. If the property is absent, the target residencies
+	for "Nap" and "FastSleep" are set to 10000 and 300000000
+	respectively by the kernel. On POWER9 this property is required.
+
+- ibm,cpu-idle-state-psscr:
+	Array of unsigned 64-bit values containing the values for the
+	PSSCR for each of the idle states in ibm,cpu-idle-state-names.
+	This property is required on POWER9 and absent on POWER8.
+
+- ibm,cpu-idle-state-psscr-mask:
+	Array of unsigned 64-bit values containing the masks
+	indicating which psscr fields are set in the corresponding
+	entries of ibm,cpu-idle-state-psscr. This property is
+	required on POWER9 and absent on POWER8.
+
+	Whenever the firmware sets an entry in
+	ibm,cpu-idle-state-psscr-mask to 0xf, it implies that
+	only the Requested Level (RL) field of the corresponding entry
+	in ibm,cpu-idle-state-psscr should be considered by the
+	kernel. For such idle states, the kernel would set the
+	remaining fields of the psscr to the following sane-default
+	values.
+
+		- ESL and EC bits are set to 1. So wakeup from any stop
+		  state will be at vector 0x100.
+
+		- MTL and PSLL are set to the maximum allowed value as
+		  per the ISA, i.e. 15.
+
+		- The Transition Rate (TR) is set to the maximum value,
+		  3.
+
+	For all the other values of the entry in
+	ibm,cpu-idle-state-psscr-mask, the kernel expects all the
+	psscr fields of the corresponding entry in
+	ibm,cpu-idle-state-psscr to be correctly set by the firmware.
+
+- ibm,cpu-idle-state-pmicr:
+	Array of unsigned 64-bit values containing the pmicr values
+	for the idle states in ibm,cpu-idle-state-names. This 64-bit
+	register value is to be set in pmicr for the corresponding
+	state if the flag indicates that pmicr SPR should be set. This
+	is an optional property on POWER8 and is absent on
+	POWER9.
+
+- ibm,cpu-idle-state-pmicr-mask:
+	Array of unsigned 64-bit values containing the mask indicating
+	which of the fields of the PMICR are set in the corresponding
+	entries in ibm,cpu-idle-state-pmicr. This is an optional
+	property on POWER8 and is absent on POWER9.
-- 
cgit v1.2.3


From 38e9d36bc149932964dcd25818c7f7283239592d Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 30 Jan 2017 18:11:55 +1100
Subject: powerpc: Move isa bridge definitions to separate include

We'll be adding non-PCI ISA bridge support, so let's not
have all the definitions in pci-bridge.h.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/isa-bridge.h     | 28 ++++++++++++++++++++++++++++
 arch/powerpc/include/asm/pci-bridge.h     | 18 ------------------
 arch/powerpc/kernel/iomap.c               |  1 +
 arch/powerpc/kernel/isa-bridge.c          |  1 +
 arch/powerpc/platforms/maple/pci.c        |  1 +
 arch/powerpc/platforms/powernv/opal-lpc.c |  1 +
 arch/powerpc/platforms/pseries/setup.c    |  1 +
 7 files changed, 33 insertions(+), 18 deletions(-)
 create mode 100644 arch/powerpc/include/asm/isa-bridge.h

diff --git a/arch/powerpc/include/asm/isa-bridge.h b/arch/powerpc/include/asm/isa-bridge.h
new file mode 100644
index 000000000000..94d9c1c9706d
--- /dev/null
+++ b/arch/powerpc/include/asm/isa-bridge.h
@@ -0,0 +1,28 @@
+#ifndef __ISA_BRIDGE_H
+#define __ISA_BRIDGE_H
+
+#ifdef CONFIG_PPC64
+
+extern void isa_bridge_find_early(struct pci_controller *hose);
+
+static inline int isa_vaddr_is_ioport(void __iomem *address)
+{
+	/* Check if address hits the reserved legacy IO range */
+	unsigned long ea = (unsigned long)address;
+	return ea >= ISA_IO_BASE && ea < ISA_IO_END;
+}
+
+#else
+
+static inline int isa_vaddr_is_ioport(void __iomem *address)
+{
+	/* No specific ISA handling on ppc32 at this stage, it
+	 * all goes through PCI
+	 */
+	return 0;
+}
+
+#endif
+
+#endif /* __ISA_BRIDGE_H */
+
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index c0309c59bed8..56c67d3f0108 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -174,14 +174,6 @@ extern int pci_device_from_OF_node(struct device_node *node,
 				   u8 *bus, u8 *devfn);
 extern void pci_create_OF_bus_map(void);
 
-static inline int isa_vaddr_is_ioport(void __iomem *address)
-{
-	/* No specific ISA handling on ppc32 at this stage, it
-	 * all goes through PCI
-	 */
-	return 0;
-}
-
 #else	/* CONFIG_PPC64 */
 
 /*
@@ -269,16 +261,6 @@ extern void pci_hp_remove_devices(struct pci_bus *bus);
 /** Discover new pci devices under this bus, and add them */
 extern void pci_hp_add_devices(struct pci_bus *bus);
 
-
-extern void isa_bridge_find_early(struct pci_controller *hose);
-
-static inline int isa_vaddr_is_ioport(void __iomem *address)
-{
-	/* Check if address hits the reserved legacy IO range */
-	unsigned long ea = (unsigned long)address;
-	return ea >= ISA_IO_BASE && ea < ISA_IO_END;
-}
-
 extern int pcibios_unmap_io_space(struct pci_bus *bus);
 extern int pcibios_map_io_space(struct pci_bus *bus);
 
diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
index 3963f0b68d52..a1854d1ded8b 100644
--- a/arch/powerpc/kernel/iomap.c
+++ b/arch/powerpc/kernel/iomap.c
@@ -8,6 +8,7 @@
 #include <linux/export.h>
 #include <asm/io.h>
 #include <asm/pci-bridge.h>
+#include <asm/isa-bridge.h>
 
 /*
  * Here comes the ppc64 implementation of the IOMAP 
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index ae1316106e2b..c898ff0cafec 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -29,6 +29,7 @@
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
 #include <asm/ppc-pci.h>
+#include <asm/isa-bridge.h>
 
 unsigned long isa_io_base;	/* NULL if no ISA bus */
 EXPORT_SYMBOL(isa_io_base);
diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c
index a0589aac4163..69794d9389c2 100644
--- a/arch/powerpc/platforms/maple/pci.c
+++ b/arch/powerpc/platforms/maple/pci.c
@@ -24,6 +24,7 @@
 #include <asm/machdep.h>
 #include <asm/iommu.h>
 #include <asm/ppc-pci.h>
+#include <asm/isa-bridge.h>
 
 #include "maple.h"
 
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index 4886eb8b6381..2048ce7912f4 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -23,6 +23,7 @@
 #include <asm/prom.h>
 #include <linux/uaccess.h>
 #include <asm/debug.h>
+#include <asm/isa-bridge.h>
 
 static int opal_lpc_chip_id = -1;
 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 7736352f7279..b4d362ed03a1 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -66,6 +66,7 @@
 #include <asm/reg.h>
 #include <asm/plpar_wrappers.h>
 #include <asm/kexec.h>
+#include <asm/isa-bridge.h>
 
 #include "pseries.h"
 
-- 
cgit v1.2.3


From b3c711a9e19eb2b300132f1712ff3d5afb4b02dc Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 30 Jan 2017 18:11:56 +1100
Subject: powerpc: Add support for non-PCI ISA bridges

The POWER9 chip supports an LPC bus that isn't hanging
off a PCI bus, so let's add support for that, mapping it
to the reserved space at ISA_IO_BASE

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/isa-bridge.h |  1 +
 arch/powerpc/kernel/isa-bridge.c      | 91 +++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/arch/powerpc/include/asm/isa-bridge.h b/arch/powerpc/include/asm/isa-bridge.h
index 94d9c1c9706d..a3a7c1d63a7c 100644
--- a/arch/powerpc/include/asm/isa-bridge.h
+++ b/arch/powerpc/include/asm/isa-bridge.h
@@ -4,6 +4,7 @@
 #ifdef CONFIG_PPC64
 
 extern void isa_bridge_find_early(struct pci_controller *hose);
+extern void isa_bridge_init_non_pci(struct device_node *np);
 
 static inline int isa_vaddr_is_ioport(void __iomem *address)
 {
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index c898ff0cafec..bb6f8993412e 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -167,6 +167,97 @@ void __init isa_bridge_find_early(struct pci_controller *hose)
 	pr_debug("ISA bridge (early) is %s\n", np->full_name);
 }
 
+/**
+ * isa_bridge_init_non_pci - Find and map the ISA IO space for an ISA
+ *                           bridge that does not hang off a PCI bus,
+ *                           using the IO range described by the
+ *                           "ranges" property of @np
+ */
+void __init isa_bridge_init_non_pci(struct device_node *np)
+{
+	const __be32 *ranges, *pbasep = NULL;
+	int rlen, i, rs;
+	u32 na, ns, pna;
+	u64 cbase, pbase, size = 0;
+
+	/* If we already have an ISA bridge, bail off */
+	if (isa_bridge_devnode != NULL)
+		return;
+
+	pna = of_n_addr_cells(np);
+	if (of_property_read_u32(np, "#address-cells", &na) ||
+	    of_property_read_u32(np, "#size-cells", &ns)) {
+		pr_warn("ISA: Non-PCI bridge %s is missing address format\n",
+			np->full_name);
+		return;
+	}
+
+	/* Check it's a supported address format */
+	if (na != 2 || ns != 1) {
+		pr_warn("ISA: Non-PCI bridge %s has unsupported address format\n",
+			np->full_name);
+		return;
+	}
+	rs = na + ns + pna;
+
+	/* Grab the ranges property */
+	ranges = of_get_property(np, "ranges", &rlen);
+	if (ranges == NULL || rlen < rs) {
+		pr_warn("ISA: Non-PCI bridge %s has absent or invalid ranges\n",
+			np->full_name);
+		return;
+	}
+
+	/* Parse it. We are only looking for IO space */
+	for (i = 0; (i + rs - 1) < rlen; i += rs) {
+		if (be32_to_cpup(ranges + i) != 1)
+			continue;
+		cbase = be32_to_cpup(ranges + i + 1);
+		size = of_read_number(ranges + i + na + pna, ns);
+		pbasep = ranges + i + na;
+		break;
+	}
+
+	/* Got something ? */
+	if (!size || !pbasep) {
+		pr_warn("ISA: Non-PCI bridge %s has no usable IO range\n",
+			np->full_name);
+		return;
+	}
+
+	/* Align size and make sure it's cropped to 64K */
+	size = PAGE_ALIGN(size);
+	if (size > 0x10000)
+		size = 0x10000;
+
+	/* Map pbase */
+	pbase = of_translate_address(np, pbasep);
+	if (pbase == OF_BAD_ADDR) {
+		pr_warn("ISA: Non-PCI bridge %s failed to translate IO base\n",
+			np->full_name);
+		return;
+	}
+
+	/* We need page alignment */
+	if ((cbase & ~PAGE_MASK) || (pbase & ~PAGE_MASK)) {
+		pr_warn("ISA: Non-PCI bridge %s has non aligned IO range\n",
+			np->full_name);
+		return;
+	}
+
+	/* Got it */
+	isa_bridge_devnode = np;
+
+	/* Set the global ISA io base to indicate we have an ISA bridge
+	 * and map it
+	 */
+	isa_io_base = ISA_IO_BASE;
+	__ioremap_at(pbase, (void *)ISA_IO_BASE,
+		     size, pgprot_val(pgprot_noncached(__pgprot(0))));
+
+	pr_debug("ISA: Non-PCI bridge is %s\n", np->full_name);
+}
+
 /**
  * isa_bridge_find_late - Find and map the ISA IO space upon discovery of
  *                        a new ISA bridge
-- 
cgit v1.2.3


From 023b13a50183d9cfc4fc5a66cb1f773ace22024c Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 30 Jan 2017 18:11:57 +1100
Subject: powerpc/powernv: Add support for direct mapped LPC on POWER9

Use the new non-PCI ISA bridge support to expose the POWER9
LPC bus as direct mapped via the ISA IO port range. This
enables direct access via drivers such as 8250

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/legacy_serial.c       |  3 ++-
 arch/powerpc/platforms/powernv/opal-lpc.c | 18 +++++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c
index bc525ea0dc09..0694d20f85b6 100644
--- a/arch/powerpc/kernel/legacy_serial.c
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -233,7 +233,8 @@ static int __init add_legacy_isa_port(struct device_node *np,
 	 *
 	 * Note: Don't even try on P8 lpc, we know it's not directly mapped
 	 */
-	if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc")) {
+	if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc") ||
+	    of_get_property(isa_brg, "ranges", NULL)) {
 		taddr = of_translate_address(np, reg);
 		if (taddr == OF_BAD_ADDR)
 			taddr = 0;
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index 2048ce7912f4..1a8cd54c1e74 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -407,9 +407,17 @@ void opal_lpc_init(void)
 	if (opal_lpc_chip_id < 0)
 		return;
 
-	/* Setup special IO ops */
-	ppc_pci_io = opal_lpc_io;
-	isa_io_special = true;
-
-	pr_info("OPAL: Power8 LPC bus found, chip ID %d\n", opal_lpc_chip_id);
+	/* Does it support direct mapping ? */
+	if (of_get_property(np, "ranges", NULL)) {
+		pr_info("OPAL: Found memory mapped LPC bus on chip %d\n",
+			opal_lpc_chip_id);
+		isa_bridge_init_non_pci(np);
+	} else {
+		pr_info("OPAL: Found non-mapped LPC bus on chip %d\n",
+			opal_lpc_chip_id);
+
+		/* Setup special IO ops */
+		ppc_pci_io = opal_lpc_io;
+		isa_io_special = true;
+	}
 }
-- 
cgit v1.2.3


From b5200ec9edf038459619fce9988842efa751a2c5 Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.vnet.ibm.com>
Date: Mon, 16 Jan 2017 13:07:43 -0600
Subject: powerpc/mm: refactor radix physical page mapping

Move the page mapping code in radix_init_pgtable() into a separate
function that will also be used for memory hotplug.

The current goto loop progressively decreases its mapping size as it
covers the tail of a range whose end is unaligned. Change this to a for
loop which can do the same for both ends of the range.

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/pgtable-radix.c | 88 +++++++++++++++++++++++------------------
 1 file changed, 50 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 086522b7c60f..c0365eca7f81 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -108,54 +108,66 @@ set_the_pte:
 	return 0;
 }
 
+static inline void __meminit print_mapping(unsigned long start,
+					   unsigned long end,
+					   unsigned long size)
+{
+	if (end <= start)
+		return;
+
+	pr_info("Mapped range 0x%lx - 0x%lx with 0x%lx\n", start, end, size);
+}
+
+static int __meminit create_physical_mapping(unsigned long start,
+					     unsigned long end)
+{
+	unsigned long addr, mapping_size = 0;
+
+	start = _ALIGN_UP(start, PAGE_SIZE);
+	for (addr = start; addr < end; addr += mapping_size) {
+		unsigned long gap, previous_size;
+		int rc;
+
+		gap = end - addr;
+		previous_size = mapping_size;
+
+		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
+		    mmu_psize_defs[MMU_PAGE_1G].shift)
+			mapping_size = PUD_SIZE;
+		else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
+			 mmu_psize_defs[MMU_PAGE_2M].shift)
+			mapping_size = PMD_SIZE;
+		else
+			mapping_size = PAGE_SIZE;
+
+		if (mapping_size != previous_size) {
+			print_mapping(start, addr, previous_size);
+			start = addr;
+		}
+
+		rc = radix__map_kernel_page((unsigned long)__va(addr), addr,
+					    PAGE_KERNEL_X, mapping_size);
+		if (rc)
+			return rc;
+	}
+
+	print_mapping(start, addr, mapping_size);
+	return 0;
+}
+
 static void __init radix_init_pgtable(void)
 {
-	int loop_count;
-	u64 base, end, start_addr;
 	unsigned long rts_field;
 	struct memblock_region *reg;
-	unsigned long linear_page_size;
 
 	/* We don't support slb for radix */
 	mmu_slb_size = 0;
 	/*
 	 * Create the linear mapping, using standard page size for now
 	 */
-	loop_count = 0;
-	for_each_memblock(memory, reg) {
-
-		start_addr = reg->base;
-
-redo:
-		if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift)
-			linear_page_size = PUD_SIZE;
-		else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift)
-			linear_page_size = PMD_SIZE;
-		else
-			linear_page_size = PAGE_SIZE;
-
-		base = _ALIGN_UP(start_addr, linear_page_size);
-		end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size);
-
-		pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n",
-			(unsigned long)base, (unsigned long)end,
-			linear_page_size);
-
-		while (base < end) {
-			radix__map_kernel_page((unsigned long)__va(base),
-					      base, PAGE_KERNEL_X,
-					      linear_page_size);
-			base += linear_page_size;
-		}
-		/*
-		 * map the rest using lower page size
-		 */
-		if (end < reg->base + reg->size) {
-			start_addr = end;
-			loop_count++;
-			goto redo;
-		}
-	}
+	for_each_memblock(memory, reg)
+		WARN_ON(create_physical_mapping(reg->base,
+						reg->base + reg->size));
 	/*
 	 * Allocate Partition table and process table for the
 	 * host.
-- 
cgit v1.2.3


From 6cc27341b21a81052f36c137bc44cf55f83e46ff Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.vnet.ibm.com>
Date: Mon, 16 Jan 2017 13:07:44 -0600
Subject: powerpc/mm: add radix__create_section_mapping()

Wire up memory hotplug page mapping for radix. Share the mapping
function already used by radix_init_pgtable().

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/radix.h | 4 ++++
 arch/powerpc/mm/pgtable-book3s64.c         | 2 +-
 arch/powerpc/mm/pgtable-radix.c            | 7 +++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index b4d1302387a3..43c25718de61 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -291,5 +291,9 @@ static inline unsigned long radix__get_tree_size(void)
 	}
 	return rts_field;
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int radix__create_section_mapping(unsigned long start, unsigned long end);
+#endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 653ff6c74ebe..2b13f6b87e25 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -131,7 +131,7 @@ void mmu_cleanup_all(void)
 int create_section_mapping(unsigned long start, unsigned long end)
 {
 	if (radix_enabled())
-		return -ENODEV;
+		return radix__create_section_mapping(start, end);
 
 	return hash__create_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index c0365eca7f81..7829e09ab4dd 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -481,6 +481,13 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
 	memblock_set_current_limit(first_memblock_base + first_memblock_size);
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+int __ref radix__create_section_mapping(unsigned long start, unsigned long end)
+{
+	return create_physical_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 int __meminit radix__vmemmap_create_mapping(unsigned long start,
 				      unsigned long page_size,
-- 
cgit v1.2.3


From 4b5d62ca17a1cd2ffc8399e1d1c3ebbabf16e78f Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.vnet.ibm.com>
Date: Mon, 16 Jan 2017 13:07:45 -0600
Subject: powerpc/mm: add radix__remove_section_mapping()

Tear down and free the four-level page tables of physical mappings
during memory hotremove.

Borrow the basic structure of remove_pagetable() and friends from the
identically-named x86 functions. Reduce the frequency of tlb flushes and
page_table_lock spinlocks by only doing them in the outermost function.
There was some question as to whether the locking is needed at all.
Leave it for now, but we could consider dropping it.

Memory must be offline to be removed, thus not in use. So there
shouldn't be the sort of concurrent page walking activity here that
might prompt us to use RCU.

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/radix.h |   1 +
 arch/powerpc/mm/pgtable-book3s64.c         |   2 +-
 arch/powerpc/mm/pgtable-radix.c            | 133 +++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 43c25718de61..0032b662284c 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int radix__create_section_mapping(unsigned long start, unsigned long end);
+int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 2b13f6b87e25..b798ff674fab 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned long end)
 int remove_section_mapping(unsigned long start, unsigned long end)
 {
 	if (radix_enabled())
-		return -ENODEV;
+		return radix__remove_section_mapping(start, end);
 
 	return hash__remove_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 7829e09ab4dd..aef9d49f70ce 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -482,10 +482,143 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (!pte_none(*pte))
+			return;
+	}
+
+	pte_free_kernel(&init_mm, pte_start);
+	pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (!pmd_none(*pmd))
+			return;
+	}
+
+	pmd_free(&init_mm, pmd_start);
+	pud_clear(pud);
+}
+
+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+			     unsigned long end)
+{
+	unsigned long next;
+	pte_t *pte;
+
+	pte = pte_start + pte_index(addr);
+	for (; addr < end; addr = next, pte++) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		if (!pte_present(*pte))
+			continue;
+
+		pte_clear(&init_mm, addr, pte);
+	}
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+			     unsigned long end)
+{
+	unsigned long next;
+	pte_t *pte_base;
+	pmd_t *pmd;
+
+	pmd = pmd_start + pmd_index(addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(*pmd))
+			continue;
+
+		if (pmd_huge(*pmd)) {
+			pte_clear(&init_mm, addr, (pte_t *)pmd);
+			continue;
+		}
+
+		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+		remove_pte_table(pte_base, addr, next);
+		free_pte_table(pte_base, pmd);
+	}
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+			     unsigned long end)
+{
+	unsigned long next;
+	pmd_t *pmd_base;
+	pud_t *pud;
+
+	pud = pud_start + pud_index(addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+
+		if (!pud_present(*pud))
+			continue;
+
+		if (pud_huge(*pud)) {
+			pte_clear(&init_mm, addr, (pte_t *)pud);
+			continue;
+		}
+
+		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+		remove_pmd_table(pmd_base, addr, next);
+		free_pmd_table(pmd_base, pud);
+	}
+}
+
+static void remove_pagetable(unsigned long start, unsigned long end)
+{
+	unsigned long addr, next;
+	pud_t *pud_base;
+	pgd_t *pgd;
+
+	spin_lock(&init_mm.page_table_lock);
+
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+
+		pgd = pgd_offset_k(addr);
+		if (!pgd_present(*pgd))
+			continue;
+
+		if (pgd_huge(*pgd)) {
+			pte_clear(&init_mm, addr, (pte_t *)pgd);
+			continue;
+		}
+
+		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+		remove_pud_table(pud_base, addr, next);
+	}
+
+	spin_unlock(&init_mm.page_table_lock);
+	radix__flush_tlb_kernel_range(start, end);
+}
+
 int __ref radix__create_section_mapping(unsigned long start, unsigned long end)
 {
 	return create_physical_mapping(start, end);
 }
+
+int radix__remove_section_mapping(unsigned long start, unsigned long end)
+{
+	remove_pagetable(start, end);
+	return 0;
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-- 
cgit v1.2.3


From 0d0a4bc2a6f7de19cb0256a55891955961d70b1c Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.vnet.ibm.com>
Date: Mon, 16 Jan 2017 13:07:46 -0600
Subject: powerpc/mm: unstub radix__vmemmap_remove_mapping()

Use remove_pagetable() and friends for radix vmemmap removal.

We do not require the special-case handling of vmemmap done in the x86
versions of these functions. This is because vmemmap_free() has already
freed the mapped pages, and calls us with an aligned address range.

So, add a few failsafe WARNs, but otherwise the code to remove physical
mappings is already sufficient for vmemmap.

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/pgtable-radix.c | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index aef9d49f70ce..30374586e01d 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -527,6 +527,15 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
 		if (!pte_present(*pte))
 			continue;
 
+		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
+			/*
+			 * The vmemmap_free() and remove_section_mapping()
+			 * codepaths call us with aligned addresses.
+			 */
+			WARN_ONCE(1, "%s: unaligned range\n", __func__);
+			continue;
+		}
+
 		pte_clear(&init_mm, addr, pte);
 	}
 }
@@ -546,6 +555,12 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 			continue;
 
 		if (pmd_huge(*pmd)) {
+			if (!IS_ALIGNED(addr, PMD_SIZE) ||
+			    !IS_ALIGNED(next, PMD_SIZE)) {
+				WARN_ONCE(1, "%s: unaligned range\n", __func__);
+				continue;
+			}
+
 			pte_clear(&init_mm, addr, (pte_t *)pmd);
 			continue;
 		}
@@ -571,6 +586,12 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr,
 			continue;
 
 		if (pud_huge(*pud)) {
+			if (!IS_ALIGNED(addr, PUD_SIZE) ||
+			    !IS_ALIGNED(next, PUD_SIZE)) {
+				WARN_ONCE(1, "%s: unaligned range\n", __func__);
+				continue;
+			}
+
 			pte_clear(&init_mm, addr, (pte_t *)pud);
 			continue;
 		}
@@ -597,6 +618,12 @@ static void remove_pagetable(unsigned long start, unsigned long end)
 			continue;
 
 		if (pgd_huge(*pgd)) {
+			if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
+			    !IS_ALIGNED(next, PGDIR_SIZE)) {
+				WARN_ONCE(1, "%s: unaligned range\n", __func__);
+				continue;
+			}
+
 			pte_clear(&init_mm, addr, (pte_t *)pgd);
 			continue;
 		}
@@ -636,7 +663,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
 #ifdef CONFIG_MEMORY_HOTPLUG
 void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
 {
-	/* FIXME!! intel does more. We should free page tables mapping vmemmap ? */
+	remove_pagetable(start, start + page_size);
 }
 #endif
 #endif
-- 
cgit v1.2.3


From a97a65d53d9f53b6897dc1b2aed381bc1707136b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 27 Jan 2017 14:00:34 +1000
Subject: KVM: PPC: Book3S: 64-bit CONFIG_RELOCATABLE support for interrupts

64-bit Book3S exception handlers must find the dynamic kernel base
to add to the target address when branching beyond __end_interrupts,
in order to support kernel running at non-0 physical address.

Support this in KVM by branching with CTR, similarly to regular
interrupt handlers. The guest CTR is saved in HSTATE_SCRATCH1 and
restored after the branch.

Without this, the host kernel hangs and crashes randomly when it is
running at a non-0 address and a KVM guest is started.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 45 +++++++++++++++++++++++++++++---
 arch/powerpc/kernel/exceptions-64s.S     |  2 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 12 ++++++---
 arch/powerpc/kvm/book3s_segment.S        |  7 +++++
 4 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index a02a268bde6b..9a5dbfb2d9f2 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -97,6 +97,15 @@
 	ld	reg,PACAKBASE(r13);					\
 	ori	reg,reg,(ABS_ADDR(label))@l;
 
+/*
+ * Branches from unrelocated code (e.g., interrupts) to labels outside
+ * head-y require >64K offsets.
+ */
+#define __LOAD_FAR_HANDLER(reg, label)					\
+	ld	reg,PACAKBASE(r13);					\
+	ori	reg,reg,(ABS_ADDR(label))@l;				\
+	addis	reg,reg,(ABS_ADDR(label))@h;
+
 /* Exception register prefixes */
 #define EXC_HV	H
 #define EXC_STD
@@ -227,12 +236,40 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtctr	reg;							\
 	bctr
 
+/*
+ * KVM requires __LOAD_FAR_HANDLER.
+ *
+ * __BRANCH_TO_KVM_EXIT branches are also a special case because they
+ * explicitly use r9 then reload it from PACA before branching. Hence
+ * the double-underscore.
+ */
+#define __BRANCH_TO_KVM_EXIT(area, label)				\
+	mfctr	r9;							\
+	std	r9,HSTATE_SCRATCH1(r13);				\
+	__LOAD_FAR_HANDLER(r9, label);					\
+	mtctr	r9;							\
+	ld	r9,area+EX_R9(r13);					\
+	bctr
+
+#define BRANCH_TO_KVM(reg, label)					\
+	__LOAD_FAR_HANDLER(reg, label);					\
+	mtctr	reg;							\
+	bctr
+
 #else
 #define BRANCH_TO_COMMON(reg, label)					\
 	b	label
 
+#define BRANCH_TO_KVM(reg, label)					\
+	b	label
+
+#define __BRANCH_TO_KVM_EXIT(area, label)				\
+	ld	r9,area+EX_R9(r13);					\
+	b	label
+
 #endif
 
+
 #define __KVM_HANDLER(area, h, n)					\
 	BEGIN_FTR_SECTION_NESTED(947)					\
 	ld	r10,area+EX_CFAR(r13);					\
@@ -246,8 +283,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	std	r12,HSTATE_SCRATCH0(r13);				\
 	sldi	r12,r9,32;						\
 	ori	r12,r12,(n);						\
-	ld	r9,area+EX_R9(r13);					\
-	b	kvmppc_interrupt
+	/* This reloads r9 before branching to kvmppc_interrupt */	\
+	__BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt)
 
 #define __KVM_HANDLER_SKIP(area, h, n)					\
 	cmpwi	r10,KVM_GUEST_MODE_SKIP;				\
@@ -260,8 +297,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	std	r12,HSTATE_SCRATCH0(r13);				\
 	sldi	r12,r9,32;						\
 	ori	r12,r12,(n);						\
-	ld	r9,area+EX_R9(r13);					\
-	b	kvmppc_interrupt;					\
+	/* This reloads r9 before branching to kvmppc_interrupt */	\
+	__BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt);			\
 89:	mtocrf	0x80,r9;						\
 	ld	r9,area+EX_R9(r13);					\
 	ld	r10,area+EX_R10(r13);					\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 89b4f122aec6..65a2559eeb7f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -142,7 +142,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
 	cmpwi	r0,0
 	beq	1f
-	b	kvm_start_guest
+	BRANCH_TO_KVM(r10, kvm_start_guest)
 1:
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 11882aac8216..264ac9ad4585 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1060,15 +1060,16 @@ kvmppc_interrupt_hv:
 	 * R12		= (guest CR << 32) | interrupt vector
 	 * R13		= PACA
 	 * guest R12 saved in shadow VCPU SCRATCH0
+	 * guest CTR saved in shadow VCPU SCRATCH1 if RELOCATABLE
 	 * guest R13 saved in SPRN_SCRATCH0
 	 */
-	std	r9, HSTATE_SCRATCH1(r13)
+	std	r9, HSTATE_SCRATCH2(r13)
 	lbz	r9, HSTATE_IN_GUEST(r13)
 	cmpwi	r9, KVM_GUEST_MODE_HOST_HV
 	beq	kvmppc_bad_host_intr
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	cmpwi	r9, KVM_GUEST_MODE_GUEST
-	ld	r9, HSTATE_SCRATCH1(r13)
+	ld	r9, HSTATE_SCRATCH2(r13)
 	beq	kvmppc_interrupt_pr
 #endif
 	/* We're now back in the host but in guest MMU context */
@@ -1088,7 +1089,7 @@ kvmppc_interrupt_hv:
 	std	r6, VCPU_GPR(R6)(r9)
 	std	r7, VCPU_GPR(R7)(r9)
 	std	r8, VCPU_GPR(R8)(r9)
-	ld	r0, HSTATE_SCRATCH1(r13)
+	ld	r0, HSTATE_SCRATCH2(r13)
 	std	r0, VCPU_GPR(R9)(r9)
 	std	r10, VCPU_GPR(R10)(r9)
 	std	r11, VCPU_GPR(R11)(r9)
@@ -1151,7 +1152,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 11:	stw	r3,VCPU_HEIR(r9)
 
 	/* these are volatile across C function calls */
+#ifdef CONFIG_RELOCATABLE
+	ld	r3, HSTATE_SCRATCH1(r13)
+	mtctr	r3
+#else
 	mfctr	r3
+#endif
 	mfxer	r4
 	std	r3, VCPU_CTR(r9)
 	std	r4, VCPU_XER(r9)
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 68e45080cf93..2a2b96d53999 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -175,9 +175,16 @@ kvmppc_interrupt_pr:
 	 * R12             = (guest CR << 32) | exit handler id
 	 * R13             = PACA
 	 * HSTATE.SCRATCH0 = guest R12
+	 * HSTATE.SCRATCH1 = guest CTR if RELOCATABLE
 	 */
 #ifdef CONFIG_PPC64
 	/* Match 32-bit entry */
+#ifdef CONFIG_RELOCATABLE
+	std	r9, HSTATE_SCRATCH2(r13)
+	ld	r9, HSTATE_SCRATCH1(r13)
+	mtctr	r9
+	ld	r9, HSTATE_SCRATCH2(r13)
+#endif
 	rotldi	r12, r12, 32		  /* Flip R12 halves for stw */
 	stw	r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */
 	srdi	r12, r12, 32		  /* shift trap into low half */
-- 
cgit v1.2.3


From 18569c1f134e1c5c88228f043c09678ae6052b7c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:34 +1100
Subject: powerpc/64: Don't try to use radix MMU under a hypervisor

Currently, if the kernel is running on a POWER9 processor under a
hypervisor, it will try to use the radix MMU even though it doesn't have
the necessary code to use radix under a hypervisor (it doesn't negotiate
use of radix, and it doesn't do the H_REGISTER_PROC_TBL hcall). The
result is that the guest kernel will crash when it tries to turn on the
MMU.

This fixes it by looking for the /chosen/ibm,architecture-vec-5
property, and if it exists, clears the radix MMU feature bit, before we
decide whether to initialize for radix or HPT. This property is created
by the hypervisor as a result of the guest calling the
ibm,client-architecture-support method to indicate its capabilities, so
it will indicate whether the hypervisor agreed to us using radix.

Systems without a hypervisor may have this property also (for example,
skiboot creates it), so we check the HV bit in the MSR to see whether we
are running as a guest or not. If we are in hypervisor mode, then we can
do whatever we like including using the radix MMU.

The reason for using this property is that in future, when we have
support for using radix under a hypervisor, we will need to check this
property to see whether the hypervisor agreed to us using radix.

Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early init routines")
Cc: stable@vger.kernel.org # v4.7+
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/init_64.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 93abf8a9813d..4d9481ec2468 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -42,6 +42,8 @@
 #include <linux/memblock.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -344,12 +346,43 @@ static int __init parse_disable_radix(char *p)
 }
 early_param("disable_radix", parse_disable_radix);
 
+/*
+ * If we're running under a hypervisor, we currently can't do radix
+ * since we don't have the code to do the H_REGISTER_PROC_TBL hcall.
+ * We tell that we're running under a hypervisor by looking for the
+ * /chosen/ibm,architecture-vec-5 property.
+ */
+static void early_check_vec5(void)
+{
+	unsigned long root, chosen;
+	int size;
+	const u8 *vec5;
+
+	root = of_get_flat_dt_root();
+	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
+	if (chosen == -FDT_ERR_NOTFOUND)
+		return;
+	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
+	if (!vec5)
+		return;
+	cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+}
+
 void __init mmu_early_init_devtree(void)
 {
 	/* Disable radix mode based on kernel command line. */
 	if (disable_radix)
 		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
 
+	/*
+	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
+	 * When running bare-metal, we can use radix if we like
+	 * even though the ibm,architecture-vec-5 property created by
+	 * skiboot doesn't have the necessary bits set.
+	 */
+	if (early_radix_enabled() && !(mfmsr() & MSR_HV))
+		early_check_vec5();
+
 	if (early_radix_enabled())
 		radix__early_init_devtree();
 	else
-- 
cgit v1.2.3


From 3f4ab2f83b4e443c66549206eb88a9fa5a85d647 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:35 +1100
Subject: powerpc/pseries: Fixes for the "ibm,architecture-vec-5" options

This fixes the byte index values for some of the option bits in
the "ibm,architecture-vec-5" property. The "platform facilities options"
bits are in byte 17 not byte 14, so the upper 8 bits of their
definitions need to be 0x11 not 0x0E. The "sub processor support" option
is in byte 21 not byte 15.

Note none of these options are actually looked up in
"ibm,architecture-vec-5" at this time, so there is no bug.

When checking whether option bits are set, we should check that
the offset of the byte being checked is less than the vector
length that we got from the hypervisor.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/prom.h           | 8 ++++----
 arch/powerpc/platforms/pseries/firmware.c | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 5e57705b4759..e6d83d0fada7 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -151,10 +151,10 @@ struct of_drconf_cell {
 #define OV5_XCMO		0x0440	/* Page Coalescing */
 #define OV5_TYPE1_AFFINITY	0x0580	/* Type 1 NUMA affinity */
 #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
-#define OV5_PFO_HW_RNG		0x0E80	/* PFO Random Number Generator */
-#define OV5_PFO_HW_842		0x0E40	/* PFO Compression Accelerator */
-#define OV5_PFO_HW_ENCR		0x0E20	/* PFO Encryption Accelerator */
-#define OV5_SUB_PROCESSORS	0x0F01	/* 1,2,or 4 Sub-Processors supported */
+#define OV5_PFO_HW_RNG		0x1180	/* PFO Random Number Generator */
+#define OV5_PFO_HW_842		0x1140	/* PFO Compression Accelerator */
+#define OV5_PFO_HW_ENCR		0x1120	/* PFO Encryption Accelerator */
+#define OV5_SUB_PROCESSORS	0x1501	/* 1,2,or 4 Sub-Processors supported */
 
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX		0x02	/* Linux is our OS */
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index ea7f09bd73b1..7d67623203b8 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -126,7 +126,7 @@ static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)
 		index = OV5_INDX(vec5_fw_features_table[i].feature);
 		feat = OV5_FEAT(vec5_fw_features_table[i].feature);
 
-		if (vec5[index] & feat)
+		if (index < len && (vec5[index] & feat))
 			powerpc_firmware_features |=
 				vec5_fw_features_table[i].val;
 	}
-- 
cgit v1.2.3


From cc3d2940133d24000e2866b21e03ce32adfead0a Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:36 +1100
Subject: powerpc/64: Enable use of radix MMU under hypervisor on POWER9

To use radix as a guest, we first need to tell the hypervisor via
the ibm,client-architecture-support call that we support POWER9 and
architecture v3.00, and that we can do either radix or hash and
that we would like to choose later using an hcall (the
H_REGISTER_PROC_TBL hcall).

Then we need to check whether the hypervisor agreed to us using
radix.  We need to do this very early on in the kernel boot process
before any of the MMU initialization is done.  If the hypervisor
doesn't agree, we can't use radix and therefore clear the radix
MMU feature bit.

Later, when we have set up our process table, which points to the
radix tree for each process, we need to install that using the
H_REGISTER_PROC_TBL hcall.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu.h |  6 ++++++
 arch/powerpc/include/asm/hvcall.h        | 11 +++++++++++
 arch/powerpc/include/asm/prom.h          |  9 +++++++++
 arch/powerpc/kernel/prom_init.c          | 18 +++++++++++++++++-
 arch/powerpc/mm/init_64.c                | 12 +++++++-----
 arch/powerpc/mm/pgtable-radix.c          |  2 ++
 arch/powerpc/platforms/pseries/lpar.c    | 29 +++++++++++++++++++++++++++++
 7 files changed, 81 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 8afb0e00f7d9..cea522c3bcae 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -138,5 +138,11 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 extern int (*register_process_table)(unsigned long base, unsigned long page_size,
 				     unsigned long tbl_size);
 
+#ifdef CONFIG_PPC_PSERIES
+extern void radix_init_pseries(void);
+#else
+static inline void radix_init_pseries(void) { };
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 77ff1ba99d1f..54d11b3a6bf7 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -276,6 +276,7 @@
 #define H_GET_MPP_X		0x314
 #define H_SET_MODE		0x31C
 #define H_CLEAR_HPT		0x358
+#define H_REGISTER_PROC_TBL	0x37C
 #define H_SIGNAL_SYS_RESET	0x380
 #define MAX_HCALL_OPCODE	H_SIGNAL_SYS_RESET
 
@@ -313,6 +314,16 @@
 #define H_SIGNAL_SYS_RESET_ALL_OTHERS		-2
 /* >= 0 values are CPU number */
 
+/* Flag values used in H_REGISTER_PROC_TBL hcall */
+#define PROC_TABLE_OP_MASK	0x18
+#define PROC_TABLE_DEREG	0x10
+#define PROC_TABLE_NEW		0x18
+#define PROC_TABLE_TYPE_MASK	0x06
+#define PROC_TABLE_HPT_SLB	0x00
+#define PROC_TABLE_HPT_PT	0x02
+#define PROC_TABLE_RADIX	0x04
+#define PROC_TABLE_GTSE		0x01
+
 #ifndef __ASSEMBLY__
 
 /**
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index e6d83d0fada7..8af2546ea593 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -121,6 +121,8 @@ struct of_drconf_cell {
 #define OV1_PPC_2_06		0x02	/* set if we support PowerPC 2.06 */
 #define OV1_PPC_2_07		0x01	/* set if we support PowerPC 2.07 */
 
+#define OV1_PPC_3_00		0x80	/* set if we support PowerPC 3.00 */
+
 /* Option vector 2: Open Firmware options supported */
 #define OV2_REAL_MODE		0x20	/* set if we want OF in real mode */
 
@@ -155,6 +157,13 @@ struct of_drconf_cell {
 #define OV5_PFO_HW_842		0x1140	/* PFO Compression Accelerator */
 #define OV5_PFO_HW_ENCR		0x1120	/* PFO Encryption Accelerator */
 #define OV5_SUB_PROCESSORS	0x1501	/* 1,2,or 4 Sub-Processors supported */
+#define OV5_XIVE_EXPLOIT	0x1701	/* XIVE exploitation supported */
+#define OV5_MMU_RADIX_300	0x1880	/* ISA v3.00 radix MMU supported */
+#define OV5_MMU_HASH_300	0x1840	/* ISA v3.00 hash MMU supported */
+#define OV5_MMU_SEGM_RADIX	0x1820	/* radix mode (no segmentation) */
+#define OV5_MMU_PROC_TBL	0x1810	/* hcall selects SLB or proc table */
+#define OV5_MMU_SLB		0x1800	/* always use SLB */
+#define OV5_MMU_GTSE		0x1808	/* Guest translation shootdown */
 
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX		0x02	/* Linux is our OS */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index ec47a939cbdd..358d43f8f84f 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -649,6 +649,7 @@ static void __init early_cmdline_parse(void)
 struct option_vector1 {
 	u8 byte1;
 	u8 arch_versions;
+	u8 arch_versions3;
 } __packed;
 
 struct option_vector2 {
@@ -691,6 +692,9 @@ struct option_vector5 {
 	u8 reserved2;
 	__be16 reserved3;
 	u8 subprocessors;
+	u8 byte22;
+	u8 intarch;
+	u8 mmu;
 } __packed;
 
 struct option_vector6 {
@@ -700,7 +704,7 @@ struct option_vector6 {
 } __packed;
 
 struct ibm_arch_vec {
-	struct { u32 mask, val; } pvrs[10];
+	struct { u32 mask, val; } pvrs[12];
 
 	u8 num_vectors;
 
@@ -749,6 +753,14 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 			.mask = cpu_to_be32(0xffff0000), /* POWER8 */
 			.val  = cpu_to_be32(0x004d0000),
 		},
+		{
+			.mask = cpu_to_be32(0xffff0000), /* POWER9 */
+			.val  = cpu_to_be32(0x004e0000),
+		},
+		{
+			.mask = cpu_to_be32(0xffffffff), /* all 3.00-compliant */
+			.val  = cpu_to_be32(0x0f000005),
+		},
 		{
 			.mask = cpu_to_be32(0xffffffff), /* all 2.07-compliant */
 			.val  = cpu_to_be32(0x0f000004),
@@ -774,6 +786,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		.byte1 = 0,
 		.arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 |
 				 OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07,
+		.arch_versions3 = OV1_PPC_3_00,
 	},
 
 	.vec2_len = VECTOR_LENGTH(sizeof(struct option_vector2)),
@@ -836,6 +849,9 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		.reserved2 = 0,
 		.reserved3 = 0,
 		.subprocessors = 1,
+		.intarch = 0,
+		.mmu = OV5_FEAT(OV5_MMU_RADIX_300) | OV5_FEAT(OV5_MMU_HASH_300) |
+			OV5_FEAT(OV5_MMU_PROC_TBL) | OV5_FEAT(OV5_MMU_GTSE),
 	},
 
 	/* option vector 6: IBM PAPR hints */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 4d9481ec2468..10c9a545a646 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -347,10 +347,9 @@ static int __init parse_disable_radix(char *p)
 early_param("disable_radix", parse_disable_radix);
 
 /*
- * If we're running under a hypervisor, we currently can't do radix
- * since we don't have the code to do the H_REGISTER_PROC_TBL hcall.
- * We tell that we're running under a hypervisor by looking for the
- * /chosen/ibm,architecture-vec-5 property.
+ * If we're running under a hypervisor, we need to check the contents of
+ * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
+ * radix.  If not, we clear the radix feature bit so we fall back to hash.
  */
 static void early_check_vec5(void)
 {
@@ -365,7 +364,10 @@ static void early_check_vec5(void)
 	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
 	if (!vec5)
 		return;
-	cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+	if (size <= OV5_INDX(OV5_MMU_RADIX_300) ||
+	    !(vec5[OV5_INDX(OV5_MMU_RADIX_300)] & OV5_FEAT(OV5_MMU_RADIX_300)))
+		/* Hypervisor doesn't support radix */
+		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
 }
 
 void __init mmu_early_init_devtree(void)
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index cfa53ccc8baf..94323c4ececc 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -401,6 +401,8 @@ void __init radix__early_init_mmu(void)
 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 		radix_init_partition_table();
 		radix_init_amor();
+	} else {
+		radix_init_pseries();
 	}
 
 	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 5dc1c3c6e716..0587655aea69 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -609,6 +609,29 @@ static int __init disable_bulk_remove(char *str)
 
 __setup("bulk_remove=", disable_bulk_remove);
 
+/* Actually only used for radix, so far */
+static int pseries_lpar_register_process_table(unsigned long base,
+			unsigned long page_size, unsigned long table_size)
+{
+	long rc;
+	unsigned long flags = PROC_TABLE_NEW;
+
+	if (radix_enabled())
+		flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE;
+	for (;;) {
+		rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
+					page_size, table_size);
+		if (!H_IS_LONG_BUSY(rc))
+			break;
+		mdelay(get_longbusy_msecs(rc));
+	}
+	if (rc != H_SUCCESS) {
+		pr_err("Failed to register process table (rc=%ld)\n", rc);
+		BUG();
+	}
+	return rc;
+}
+
 void __init hpte_init_pseries(void)
 {
 	mmu_hash_ops.hpte_invalidate	 = pSeries_lpar_hpte_invalidate;
@@ -622,6 +645,12 @@ void __init hpte_init_pseries(void)
 	mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
 }
 
+void radix_init_pseries(void)
+{
+	pr_info("Using radix MMU under hypervisor\n");
+	register_process_table = pseries_lpar_register_process_table;
+}
+
 #ifdef CONFIG_PPC_SMLPAR
 #define CMO_FREE_HINT_DEFAULT 1
 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
-- 
cgit v1.2.3


From dbcbfee0c81c7938e40d7d6bc659a5191f490b50 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:37 +1100
Subject: powerpc/64: More definitions for POWER9

This adds definitions for bits in the DSISR register which are used
by POWER9 for various translation-related exception conditions, and
for some more bits in the partition table entry that will be needed
by KVM.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 12 +++++++++++-
 arch/powerpc/include/asm/reg.h           |  4 ++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index cea522c3bcae..d73e9dfa5237 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -44,10 +44,20 @@ struct patb_entry {
 };
 extern struct patb_entry *partition_tb;
 
+/* Bits in patb0 field */
 #define PATB_HR		(1UL << 63)
-#define PATB_GR		(1UL << 63)
 #define RPDB_MASK	0x0ffffffffffff00fUL
 #define RPDB_SHIFT	(1UL << 8)
+#define RTS1_SHIFT	61		/* top 2 bits of radix tree size */
+#define RTS1_MASK	(3UL << RTS1_SHIFT)
+#define RTS2_SHIFT	5		/* bottom 3 bits of radix tree size */
+#define RTS2_MASK	(7UL << RTS2_SHIFT)
+#define RPDS_MASK	0x1f		/* root page dir. size field */
+
+/* Bits in patb1 field */
+#define PATB_GR		(1UL << 63)	/* guest uses radix; must match HR */
+#define PRTS_MASK	0x1f		/* process table size field */
+
 /*
  * Limit process table to PAGE_SIZE table. This
  * also limit the max pid we can support.
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 0d4531aa2052..aa44a83ad3ec 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -274,10 +274,14 @@
 #define SPRN_DSISR	0x012	/* Data Storage Interrupt Status Register */
 #define   DSISR_NOHPTE		0x40000000	/* no translation found */
 #define   DSISR_PROTFAULT	0x08000000	/* protection fault */
+#define   DSISR_BADACCESS	0x04000000	/* bad access to CI or G */
 #define   DSISR_ISSTORE		0x02000000	/* access was a store */
 #define   DSISR_DABRMATCH	0x00400000	/* hit data breakpoint */
 #define   DSISR_NOSEGMENT	0x00200000	/* SLB miss */
 #define   DSISR_KEYFAULT	0x00200000	/* Key fault */
+#define   DSISR_UNSUPP_MMU	0x00080000	/* Unsupported MMU config */
+#define   DSISR_SET_RC		0x00040000	/* Failed setting of R/C bits */
+#define   DSISR_PGDIRFAULT      0x00020000      /* Fault on page directory */
 #define SPRN_TBRL	0x10C	/* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU	0x10D	/* Time Base Read Upper Register (user, R/O) */
 #define SPRN_CIR	0x11B	/* Chip Information Register (hyper, R/0) */
-- 
cgit v1.2.3


From ba9b399aee6fb70cbe988f0750d6dd9f6677293b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:38 +1100
Subject: powerpc/64: Export pgtable_cache and pgtable_cache_add for KVM

This exports the pgtable_cache array and the pgtable_cache_add
function so that HV KVM can use them for allocating radix page
tables for guests.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/init-common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index a175cd82ae8c..2be5dc242832 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -41,6 +41,7 @@ static void pmd_ctor(void *addr)
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+EXPORT_SYMBOL_GPL(pgtable_cache);	/* used by kvm_hv module */
 
 /*
  * Create a kmem_cache() for pagetables.  This is not used for PTE
@@ -82,7 +83,7 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 	pgtable_cache[shift - 1] = new;
 	pr_debug("Allocated pgtable cache for order %d\n", shift);
 }
-
+EXPORT_SYMBOL_GPL(pgtable_cache_add);	/* used by kvm_hv module */
 
 void pgtable_cache_init(void)
 {
-- 
cgit v1.2.3


From 16ed141677c5a1a796408e74ccd0a6f6554c3f21 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:39 +1100
Subject: powerpc/64: Make type of partition table flush depend on partition
 type

When changing a partition table entry on POWER9, we do a particular
form of the tlbie instruction which flushes all TLBs and caches of
the partition table for a given logical partition ID (LPID).
This instruction has a field in the instruction word, labelled R
(radix), which should be 1 if the partition was previously a radix
partition and 0 if it was a HPT partition.  This implements that
logic.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/pgtable_64.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 8bca7f58afc4..d6b5e5cde412 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -454,13 +454,23 @@ void __init mmu_partition_table_init(void)
 void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
 				   unsigned long dw1)
 {
+	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
 	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
 	partition_tb[lpid].patb1 = cpu_to_be64(dw1);
 
-	/* Global flush of TLBs and partition table caches for this lpid */
+	/*
+	 * Global flush of TLBs and partition table caches for this lpid.
+	 * The type of flush (hash or radix) depends on what the previous
+	 * use of this partition ID was, not the new use.
+	 */
 	asm volatile("ptesync" : : : "memory");
-	asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
-		     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+	if (old & PATB_HR)
+		asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+	else
+		asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 }
 EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
-- 
cgit v1.2.3


From bc3551257af837fc603d295e59f9e32953525b98 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:40 +1100
Subject: powerpc/64: Allow for relocation-on interrupts from guest to host

With host and guest both using radix translation, it is feasible
for the host to take interrupts that come from the guest with
relocation on, and that is in fact what the POWER9 hardware will
do when LPCR[AIL] = 3.  All such interrupts use HSRR0/1 not SRR0/1
except for system call with LEV=1 (hcall).

Therefore this adds the KVM tests to the _HV variants of the
relocation-on interrupt handlers, and adds the KVM test to the
relocation-on system call entry point.

We also instantiate the relocation-on versions of the hypervisor
data storage and instruction interrupt handlers, since these can
occur with relocation on in radix guests.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 10 +++---
 arch/powerpc/kernel/exceptions-64s.S     | 53 +++++++++++++++++---------------
 2 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 9a5dbfb2d9f2..8fa09fa500f0 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -428,12 +428,12 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_STD)
 
 #define STD_RELON_EXCEPTION_HV(loc, vec, label)		\
-	/* No guest interrupts come through here */	\
 	SET_SCRATCH0(r13);	/* save r13 */		\
-	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, EXC_HV, NOTEST, vec);
+	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label,	\
+				       EXC_HV, KVMTEST_HV, vec);
 
 #define STD_RELON_EXCEPTION_HV_OOL(vec, label)			\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec);		\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, vec);	\
 	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)
 
 /* This associate vector numbers with bits in paca->irq_happened */
@@ -510,10 +510,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define MASKABLE_RELON_EXCEPTION_HV(loc, vec, label)			\
 	_MASKABLE_RELON_EXCEPTION_PSERIES(vec, label,			\
-					  EXC_HV, SOFTEN_NOTEST_HV)
+					  EXC_HV, SOFTEN_TEST_HV)
 
 #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label)			\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_NOTEST_HV, vec);		\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec);		\
 	EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
 
 /*
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 65a2559eeb7f..34a04a5fa468 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -830,6 +830,31 @@ EXC_VIRT(trap_0b, 0x4b00, 0x4c00, 0xb00)
 TRAMP_KVM(PACA_EXGEN, 0xb00)
 EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	 /*
+	  * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
+	  * that support it) before changing to HMT_MEDIUM. That allows the KVM
+	  * code to save that value into the guest state (it is the guest's PPR
+	  * value). Otherwise just change to HMT_MEDIUM as userspace has
+	  * already saved the PPR.
+	  */
+#define SYSCALL_KVMTEST							\
+	SET_SCRATCH0(r13);						\
+	GET_PACA(r13);							\
+	std	r9,PACA_EXGEN+EX_R9(r13);				\
+	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);			\
+	HMT_MEDIUM;							\
+	std	r10,PACA_EXGEN+EX_R10(r13);				\
+	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR);	\
+	mfcr	r9;							\
+	KVMTEST_PR(0xc00);						\
+	GET_SCRATCH0(r13)
+
+#else
+#define SYSCALL_KVMTEST							\
+	HMT_MEDIUM
+#endif
+	
 #define LOAD_SYSCALL_HANDLER(reg)					\
 	__LOAD_HANDLER(reg, system_call_common)
 
@@ -883,34 +908,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 #endif
 
 EXC_REAL_BEGIN(system_call, 0xc00, 0xd00)
-	 /*
-	  * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
-	  * that support it) before changing to HMT_MEDIUM. That allows the KVM
-	  * code to save that value into the guest state (it is the guest's PPR
-	  * value). Otherwise just change to HMT_MEDIUM as userspace has
-	  * already saved the PPR.
-	  */
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	std	r9,PACA_EXGEN+EX_R9(r13)
-	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);
-	HMT_MEDIUM;
-	std	r10,PACA_EXGEN+EX_R10(r13)
-	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR);
-	mfcr	r9
-	KVMTEST_PR(0xc00)
-	GET_SCRATCH0(r13)
-#else
-	HMT_MEDIUM;
-#endif
+	SYSCALL_KVMTEST
 	SYSCALL_PSERIES_1
 	SYSCALL_PSERIES_2_RFID
 	SYSCALL_PSERIES_3
 EXC_REAL_END(system_call, 0xc00, 0xd00)
 
 EXC_VIRT_BEGIN(system_call, 0x4c00, 0x4d00)
-	HMT_MEDIUM
+	SYSCALL_KVMTEST
 	SYSCALL_PSERIES_1
 	SYSCALL_PSERIES_2_DIRECT
 	SYSCALL_PSERIES_3
@@ -925,7 +930,7 @@ TRAMP_KVM(PACA_EXGEN, 0xd00)
 EXC_COMMON(single_step_common, 0xd00, single_step_exception)
 
 EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0xe20)
-EXC_VIRT_NONE(0x4e00, 0x4e20)
+EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x4e20, 0xe00)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
 EXC_COMMON_BEGIN(h_data_storage_common)
 	mfspr   r10,SPRN_HDAR
@@ -941,7 +946,7 @@ EXC_COMMON_BEGIN(h_data_storage_common)
 
 
 EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0xe40)
-EXC_VIRT_NONE(0x4e20, 0x4e40)
+EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x4e40, 0xe20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe20)
 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
 
-- 
cgit v1.2.3


From c92701322711682de89b2bd0f32affad040b6e86 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:41 +1100
Subject: KVM: PPC: Book3S HV: Add userspace interfaces for POWER9 MMU

This adds two capabilities and two ioctls to allow userspace to
find out about and configure the POWER9 MMU in a guest.  The two
capabilities tell userspace whether KVM can support a guest using
the radix MMU, or using the hashed page table (HPT) MMU with a
process table and segment tables.  (Note that the MMUs in the
POWER9 processor cores do not use the process and segment tables
when in HPT mode, but the nest MMU does).

The KVM_PPC_CONFIGURE_V3_MMU ioctl allows userspace to specify
whether a guest will use the radix MMU or the HPT MMU, and to
specify the size and location (in guest space) of the process
table.

The KVM_PPC_GET_RMMU_INFO ioctl gives userspace information about
the radix MMU.  It returns a list of supported radix tree geometries
(base page size and number of bits indexed at each level of the
radix tree) and the encoding used to specify the various page
sizes for the TLB invalidate entry instruction.

Initially, both capabilities return 0 and the ioctls return -EINVAL,
until the necessary infrastructure for them to operate correctly
is added.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 Documentation/virtual/kvm/api.txt   | 83 +++++++++++++++++++++++++++++++++++++
 arch/powerpc/include/asm/kvm_ppc.h  |  2 +
 arch/powerpc/include/uapi/asm/kvm.h | 20 +++++++++
 arch/powerpc/kvm/book3s_hv.c        | 13 ++++++
 arch/powerpc/kvm/powerpc.c          | 32 ++++++++++++++
 include/uapi/linux/kvm.h            |  6 +++
 6 files changed, 156 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 03145b7cafaa..4470671b0c26 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3201,6 +3201,71 @@ struct kvm_reinject_control {
 pit_reinject = 0 (!reinject mode) is recommended, unless running an old
 operating system that uses the PIT for timing (e.g. Linux 2.4.x).
 
+4.99 KVM_PPC_CONFIGURE_V3_MMU
+
+Capability: KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_mmuv3_cfg (in)
+Returns: 0 on success,
+         -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read,
+         -EINVAL if the configuration is invalid
+
+This ioctl controls whether the guest will use radix or HPT (hashed
+page table) translation, and sets the pointer to the process table for
+the guest.
+
+struct kvm_ppc_mmuv3_cfg {
+	__u64	flags;
+	__u64	process_table;
+};
+
+There are two bits that can be set in flags: KVM_PPC_MMUV3_RADIX and
+KVM_PPC_MMUV3_GTSE.  KVM_PPC_MMUV3_RADIX, if set, configures the guest
+to use radix tree translation, and if clear, to use HPT translation.
+KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest
+to be able to use the global TLB and SLB invalidation instructions;
+if clear, the guest may not use these instructions.
+
+The process_table field specifies the address and size of the guest
+process table, which is in the guest's space.  This field is formatted
+as the second doubleword of the partition table entry, as defined in
+the Power ISA V3.00, Book III section 5.7.6.1.
+
+4.100 KVM_PPC_GET_RMMU_INFO
+
+Capability: KVM_CAP_PPC_MMU_RADIX
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_ppc_rmmu_info (out)
+Returns: 0 on success,
+	 -EFAULT if struct kvm_ppc_rmmu_info cannot be written,
+	 -EINVAL if no useful information can be returned
+
+This ioctl returns a structure containing two things: (a) a list
+containing supported radix tree geometries, and (b) a list that maps
+page sizes to the values to put in the "AP" (actual page size) field
+for the tlbie (TLB invalidate entry) instruction.
+
+struct kvm_ppc_rmmu_info {
+	struct kvm_ppc_radix_geom {
+		__u8	page_shift;
+		__u8	level_bits[4];
+		__u8	pad[3];
+	}	geometries[8];
+	__u32	ap_encodings[8];
+};
+
+The geometries[] field gives up to 8 supported geometries for the
+radix page table, in terms of the log base 2 of the smallest page
+size, and the number of bits indexed at each level of the tree, from
+the PTE level up to the PGD level in that order.  Any unused entries
+will have 0 in the page_shift field.
+
+The ap_encodings[] field gives the supported page sizes and their AP field
+encodings, encoded with the AP value in the top 3 bits and the log
+base 2 of the page size in the bottom 6 bits.
+
 5. The kvm_run structure
 ------------------------
 
@@ -3942,3 +4007,21 @@ In order to use SynIC, it has to be activated by setting this
 capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this
 will disable the use of APIC hardware virtualization even if supported
 by the CPU, as it's incompatible with SynIC auto-EOI behavior.
+
+8.3 KVM_CAP_PPC_MMU_RADIX
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that the kernel can support guests using the
+radix MMU defined in Power ISA V3.00 (as implemented in the POWER9
+processor).
+
+8.4 KVM_CAP_PPC_MMU_HASH_V3
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that the kernel can support guests using the
+hashed page table MMU defined in Power ISA V3.00 (as implemented in
+the POWER9 processor), including in-memory segment tables.
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2da67bf1f2ec..48c760f89590 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -291,6 +291,8 @@ struct kvmppc_ops {
 				       struct irq_bypass_producer *);
 	void (*irq_bypass_del_producer)(struct irq_bypass_consumer *,
 					struct irq_bypass_producer *);
+	int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg);
+	int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 3603b6f51b11..cc0908b6c2a0 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -413,6 +413,26 @@ struct kvm_get_htab_header {
 	__u16	n_invalid;
 };
 
+/* For KVM_PPC_CONFIGURE_V3_MMU */
+struct kvm_ppc_mmuv3_cfg {
+	__u64	flags;
+	__u64	process_table;	/* second doubleword of partition table entry */
+};
+
+/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */
+#define KVM_PPC_MMUV3_RADIX	1	/* 1 = radix mode, 0 = HPT */
+#define KVM_PPC_MMUV3_GTSE	2	/* global translation shootdown enb. */
+
+/* For KVM_PPC_GET_RMMU_INFO */
+struct kvm_ppc_rmmu_info {
+	struct kvm_ppc_radix_geom {
+		__u8	page_shift;
+		__u8	level_bits[4];
+		__u8	pad[3];
+	}	geometries[8];
+	__u32	ap_encodings[8];
+};
+
 /* Per-vcpu XICS interrupt controller state */
 #define KVM_REG_PPC_ICP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ec34e39471a7..5f08ed070ae5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3657,6 +3657,17 @@ static void init_default_hcalls(void)
 	}
 }
 
+/* dummy implementations for now */
+static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
+{
+	return -EINVAL;
+}
+
+static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
+{
+	return -EINVAL;
+}
+
 static struct kvmppc_ops kvm_ops_hv = {
 	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
 	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -3694,6 +3705,8 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
 	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
 #endif
+	.configure_mmu = kvmhv_configure_mmu,
+	.get_rmmu_info = kvmhv_get_rmmu_info,
 };
 
 static int kvm_init_subcore_bitmap(void)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cd892dec7cb6..38c0d154c01e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -565,6 +565,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PPC_HWRNG:
 		r = kvmppc_hwrng_present();
 		break;
+	case KVM_CAP_PPC_MMU_RADIX:
+		r = !!(0 && hv_enabled && radix_enabled());
+		break;
+	case KVM_CAP_PPC_MMU_HASH_V3:
+		r = !!(0 && hv_enabled && !radix_enabled() &&
+		       cpu_has_feature(CPU_FTR_ARCH_300));
+		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -1468,6 +1475,31 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
 		break;
 	}
+	case KVM_PPC_CONFIGURE_V3_MMU: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_ppc_mmuv3_cfg cfg;
+
+		r = -EINVAL;
+		if (!kvm->arch.kvm_ops->configure_mmu)
+			goto out;
+		r = -EFAULT;
+		if (copy_from_user(&cfg, argp, sizeof(cfg)))
+			goto out;
+		r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg);
+		break;
+	}
+	case KVM_PPC_GET_RMMU_INFO: {
+		struct kvm *kvm = filp->private_data;
+		struct kvm_ppc_rmmu_info info;
+
+		r = -EINVAL;
+		if (!kvm->arch.kvm_ops->get_rmmu_info)
+			goto out;
+		r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info);
+		if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
+			r = -EFAULT;
+		break;
+	}
 	default: {
 		struct kvm *kvm = filp->private_data;
 		r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index cac48eda1075..e0035808c814 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -871,6 +871,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_USER_INSTR0 130
 #define KVM_CAP_MSI_DEVID 131
 #define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_PPC_MMU_RADIX 134
+#define KVM_CAP_PPC_MMU_HASH_V3 135
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1187,6 +1189,10 @@ struct kvm_s390_ucas_mapping {
 #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
 /* Available with KVM_CAP_PPC_RTAS */
 #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
+/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */
+#define KVM_PPC_CONFIGURE_V3_MMU  _IOW(KVMIO,  0xaf, struct kvm_ppc_mmuv3_cfg)
+/* Available with KVM_CAP_PPC_MMU_RADIX */
+#define KVM_PPC_GET_RMMU_INFO	  _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
-- 
cgit v1.2.3


From 468808bd35c4aa3cf7d9fde0ebb010270038734b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:42 +1100
Subject: KVM: PPC: Book3S HV: Set process table for HPT guests on POWER9

This adds the implementation of the KVM_PPC_CONFIGURE_V3_MMU ioctl
for HPT guests on POWER9.  With this, we can return 1 for the
KVM_CAP_PPC_MMU_HASH_V3 capability.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/kvm/book3s_hv.c        | 35 +++++++++++++++++++++++++++++++----
 arch/powerpc/kvm/powerpc.c          |  2 +-
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e59b172666cd..944532dc4a57 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -264,6 +264,7 @@ struct kvm_arch {
 	atomic_t hpte_mod_interest;
 	cpumask_t need_tlb_flush;
 	int hpt_cma_alloc;
+	u64 process_table;
 	struct dentry *debugfs_dir;
 	struct dentry *htab_dentry;
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 5f08ed070ae5..bb2854314ba4 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3091,8 +3091,8 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
 	/* HTABSIZE and HTABORG fields */
 	dw0 |= kvm->arch.sdr1;
 
-	/* Second dword has GR=0; other fields are unused since UPRT=0 */
-	dw1 = 0;
+	/* Second dword as set by userspace */
+	dw1 = kvm->arch.process_table;
 
 	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
 }
@@ -3657,10 +3657,37 @@ static void init_default_hcalls(void)
 	}
 }
 
-/* dummy implementations for now */
 static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 {
-	return -EINVAL;
+	unsigned long lpcr;
+
+	/* If not on a POWER9, reject it */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return -ENODEV;
+
+	/* If any unknown flags set, reject it */
+	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
+		return -EINVAL;
+
+	/* We can't do radix yet */
+	if (cfg->flags & KVM_PPC_MMUV3_RADIX)
+		return -EINVAL;
+
+	/* GR (guest radix) bit in process_table field must match */
+	if (cfg->process_table & PATB_GR)
+		return -EINVAL;
+
+	/* Process table size field must be reasonable, i.e. <= 24 */
+	if ((cfg->process_table & PRTS_MASK) > 24)
+		return -EINVAL;
+
+	kvm->arch.process_table = cfg->process_table;
+	kvmppc_setup_partition_table(kvm);
+
+	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
+	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
+
+	return 0;
 }
 
 static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 38c0d154c01e..1476a480745e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -569,7 +569,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = !!(0 && hv_enabled && radix_enabled());
 		break;
 	case KVM_CAP_PPC_MMU_HASH_V3:
-		r = !!(0 && hv_enabled && !radix_enabled() &&
+		r = !!(hv_enabled && !radix_enabled() &&
 		       cpu_has_feature(CPU_FTR_ARCH_300));
 		break;
 #endif
-- 
cgit v1.2.3


From ef8c640cb9cc865a461827b698fcc55b0ecaa600 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:43 +1100
Subject: KVM: PPC: Book3S HV: Use ASDR for HPT guests on POWER9

POWER9 adds a register called ASDR (Access Segment Descriptor
Register), which is set by hypervisor data/instruction storage
interrupts to contain the segment descriptor for the address
being accessed, assuming the guest is using HPT translation.
(For radix guests, it contains the guest real address of the
access.)

Thus, for HPT guests on POWER9, we can use this register rather
than looking up the SLB with the slbfee. instruction.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 264ac9ad4585..01f4392a284d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1715,6 +1715,10 @@ kvmppc_hdsi:
 	/* HPTE not found fault or protection fault? */
 	andis.	r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
 	beq	1f			/* if not, send it to the guest */
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_ASDR		/* on POWER9, use ASDR to get VSID */
+	b	4f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	andi.	r0, r11, MSR_DR		/* data relocation enabled? */
 	beq	3f
 	clrrdi	r0, r4, 28
@@ -1791,6 +1795,10 @@ fast_interrupt_c_return:
 kvmppc_hisi:
 	andis.	r0, r11, SRR1_ISI_NOPT@h
 	beq	1f
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_ASDR		/* on POWER9, use ASDR to get VSID */
+	b	4f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	andi.	r0, r11, MSR_IR		/* instruction relocation enabled? */
 	beq	3f
 	clrrdi	r0, r10, 28
-- 
cgit v1.2.3


From 9e04ba69beec372ddf857c700ff922e95f50b0d0 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:44 +1100
Subject: KVM: PPC: Book3S HV: Add basic infrastructure for radix guests

This adds a field in struct kvm_arch and an inline helper to
indicate whether a guest is a radix guest or not, plus a new file
to contain the radix MMU code, which currently contains just a
translate function which knows how to traverse the guest page
tables to translate an address.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_book3s.h    |   3 +
 arch/powerpc/include/asm/kvm_book3s_64.h |   6 ++
 arch/powerpc/include/asm/kvm_host.h      |   2 +
 arch/powerpc/kvm/Makefile                |   3 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  10 ++-
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 139 +++++++++++++++++++++++++++++++
 6 files changed, 160 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu_radix.c

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5cf306ae0ac3..7adfcc03a35f 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -182,6 +182,9 @@ extern void kvmppc_mmu_hpte_sysexit(void);
 extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
 
+extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+			struct kvmppc_pte *gpte, bool data, bool iswrite);
+
 /* XXX remove this export when load_last_inst() is generic */
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 848292176908..0db010cc4e65 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -36,6 +36,12 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+
+static inline bool kvm_is_radix(struct kvm *kvm)
+{
+	return kvm->arch.radix;
+}
+
 #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
 #endif
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 944532dc4a57..fb73518bd03b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -264,6 +264,8 @@ struct kvm_arch {
 	atomic_t hpte_mod_interest;
 	cpumask_t need_tlb_flush;
 	int hpt_cma_alloc;
+	u8 radix;
+	pgd_t *pgtable;
 	u64 process_table;
 	struct dentry *debugfs_dir;
 	struct dentry *htab_dentry;
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 7dd89b79d038..b87ccde2137a 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -70,7 +70,8 @@ endif
 kvm-hv-y += \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
-	book3s_64_mmu_hv.o
+	book3s_64_mmu_hv.o \
+	book3s_64_mmu_radix.o
 
 kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
 	book3s_hv_rm_xics.o
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index b795dd1ac2ef..c208bf3b252f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -119,6 +119,9 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 	long err = -EBUSY;
 	long order;
 
+	if (kvm_is_radix(kvm))
+		return -EINVAL;
+
 	mutex_lock(&kvm->lock);
 	if (kvm->arch.hpte_setup_done) {
 		kvm->arch.hpte_setup_done = 0;
@@ -157,7 +160,7 @@ void kvmppc_free_hpt(struct kvm *kvm)
 	if (kvm->arch.hpt_cma_alloc)
 		kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
 				1 << (kvm->arch.hpt_order - PAGE_SHIFT));
-	else
+	else if (kvm->arch.hpt_virt)
 		free_pages(kvm->arch.hpt_virt,
 			   kvm->arch.hpt_order - PAGE_SHIFT);
 }
@@ -1675,7 +1678,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */
 
-	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	if (kvm_is_radix(vcpu->kvm))
+		mmu->xlate = kvmppc_mmu_radix_xlate;
+	else
+		mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
 	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
 
 	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
new file mode 100644
index 000000000000..9091407fbfd4
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -0,0 +1,139 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+/*
+ * Supported radix tree geometry.
+ * Like p9, we support either 5 or 9 bits at the first (lowest) level,
+ * for a page size of 64k or 4k.
+ */
+static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
+
+/*
+ * Translate eaddr by a software walk of the guest's radix page tables.
+ */
+int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+			   struct kvmppc_pte *gpte, bool data, bool iswrite)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int ret, level, ps;
+	__be64 prte, rpte;
+	unsigned long root, pte, index, rts, bits, offset;
+	unsigned long gpa, pid, ptbl, proc_tbl_size;
+
+	/* Work out effective PID */
+	switch (eaddr >> 62) {
+	case 0:
+		pid = vcpu->arch.pid;
+		break;
+	case 3:
+		pid = 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+	proc_tbl_size = 1ul << ((kvm->arch.process_table & PRTS_MASK) + 12);
+	if (pid * 16 >= proc_tbl_size)
+		return -EINVAL;
+
+	/* Use only the base (PRTB) field to locate the process table entry */
+	ptbl = (kvm->arch.process_table & PRTB_MASK) + pid * 16;
+	ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
+	if (ret)
+		return ret;
+
+	root = be64_to_cpu(prte);
+	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
+		((root & RTS2_MASK) >> RTS2_SHIFT);
+	bits = root & RPDS_MASK;
+	root = root & RPDB_MASK;
+
+	/* P9 DD1 interprets RTS (radix tree size) differently */
+	offset = rts + 31;
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+		offset -= 3;
+
+	/* current implementations only support 52-bit space */
+	if (offset != 52)
+		return -EINVAL;
+
+	for (level = 3; level >= 0; --level) {
+		if (level && bits != p9_supported_radix_bits[level])
+			return -EINVAL;
+		if (level == 0 && !(bits == 5 || bits == 9))
+			return -EINVAL;
+		offset -= bits;
+		index = (eaddr >> offset) & ((1UL << bits) - 1);
+		/* check that low bits of page table base are zero */
+		if (root & ((1UL << (bits + 3)) - 1))
+			return -EINVAL;
+		ret = kvm_read_guest(kvm, root + index * 8,
+				     &rpte, sizeof(rpte));
+		if (ret)
+			return ret;
+		pte = __be64_to_cpu(rpte);
+		if (!(pte & _PAGE_PRESENT))
+			return -ENOENT;
+		if (pte & _PAGE_PTE)
+			break;
+		bits = pte & 0x1f;
+		root = pte & 0x0fffffffffffff00ul;
+	}
+	/* need a leaf at lowest level; 512GB pages not supported */
+	if (level < 0 || level == 3)
+		return -EINVAL;
+
+	/* offset is now log base 2 of the page size */
+	gpa = pte & 0x01fffffffffff000ul;
+	if (gpa & ((1ul << offset) - 1))
+		return -EINVAL;
+	gpa += eaddr & ((1ul << offset) - 1);
+	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
+		if (offset == mmu_psize_defs[ps].shift)
+			break;
+	gpte->page_size = ps;
+
+	gpte->eaddr = eaddr;
+	gpte->raddr = gpa;
+
+	/* Work out permissions */
+	gpte->may_read = !!(pte & _PAGE_READ);
+	gpte->may_write = !!(pte & _PAGE_WRITE);
+	gpte->may_execute = !!(pte & _PAGE_EXEC);
+	if (kvmppc_get_msr(vcpu) & MSR_PR) {
+		if (pte & _PAGE_PRIVILEGED) {
+			gpte->may_read = 0;
+			gpte->may_write = 0;
+			gpte->may_execute = 0;
+		}
+	} else {
+		if (!(pte & _PAGE_PRIVILEGED)) {
+			/* Check AMR/IAMR to see if strict mode is in force */
+			if (vcpu->arch.amr & (1ul << 62))
+				gpte->may_read = 0;
+			if (vcpu->arch.amr & (1ul << 63))
+				gpte->may_write = 0;
+			if (vcpu->arch.iamr & (1ul << 62))
+				gpte->may_execute = 0;
+		}
+	}
+
+	return 0;
+}
+
-- 
cgit v1.2.3


From f4c51f841d2ac7d36cacb84efbc383190861f87c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:45 +1100
Subject: KVM: PPC: Book3S HV: Modify guest entry/exit paths to handle radix
 guests

This adds code to branch around the parts that radix guests don't
need - clearing and loading the SLB with the guest SLB contents,
saving the guest SLB contents on exit, and restoring the host SLB
contents.

Since the host is now using radix, we need to save and restore the
host value for the PID register.

On hypervisor data/instruction storage interrupts, we don't do the
guest HPT lookup on radix, but just save the guest physical address
for the fault (from the ASDR register) in the vcpu struct.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_host.h     |  1 +
 arch/powerpc/kernel/asm-offsets.c       |  2 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 57 ++++++++++++++++++++++++++-------
 3 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index fb73518bd03b..da1421a4d6f2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -606,6 +606,7 @@ struct kvm_vcpu_arch {
 	ulong fault_dar;
 	u32 fault_dsisr;
 	unsigned long intr_msr;
+	ulong fault_gpa;	/* guest real address of page fault (POWER9) */
 #endif
 
 #ifdef CONFIG_BOOKE
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 0601e6a7297c..3afa0ad9837f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -498,6 +498,7 @@ int main(void)
 	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
 	DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls));
 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
+	DEFINE(KVM_RADIX, offsetof(struct kvm, arch.radix));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
@@ -537,6 +538,7 @@ int main(void)
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+	DEFINE(VCPU_FAULT_GPA, offsetof(struct kvm_vcpu, arch.fault_gpa));
 	DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 01f4392a284d..7fc7a9221509 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -518,6 +518,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 /* Stack frame offsets */
 #define STACK_SLOT_TID		(112-16)
 #define STACK_SLOT_PSSCR	(112-24)
+#define STACK_SLOT_PID		(112-32)
 
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
@@ -530,6 +531,7 @@ kvmppc_hv_entry:
 	 * R1 = host R1
 	 * R2 = TOC
 	 * all other volatile GPRS = free
+	 * Does not preserve non-volatile GPRs or CR fields
 	 */
 	mflr	r0
 	std	r0, PPC_LR_STKOFF(r1)
@@ -549,32 +551,38 @@ kvmppc_hv_entry:
 	bl	kvmhv_start_timing
 1:
 #endif
-	/* Clear out SLB */
+
+	/* Use cr7 as an indication of radix mode */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r9, VCORE_KVM(r5)	/* pointer to struct kvm */
+	lbz	r0, KVM_RADIX(r9)
+	cmpwi	cr7, r0, 0
+
+	/* Clear out SLB if hash */
+	bne	cr7, 2f
 	li	r6,0
 	slbmte	r6,r6
 	slbia
 	ptesync
-
+2:
 	/*
 	 * POWER7/POWER8 host -> guest partition switch code.
 	 * We don't have to lock against concurrent tlbies,
 	 * but we do have to coordinate across hardware threads.
 	 */
 	/* Set bit in entry map iff exit map is zero. */
-	ld	r5, HSTATE_KVM_VCORE(r13)
 	li	r7, 1
 	lbz	r6, HSTATE_PTID(r13)
 	sld	r7, r7, r6
-	addi	r9, r5, VCORE_ENTRY_EXIT
-21:	lwarx	r3, 0, r9
+	addi	r8, r5, VCORE_ENTRY_EXIT
+21:	lwarx	r3, 0, r8
 	cmpwi	r3, 0x100		/* any threads starting to exit? */
 	bge	secondary_too_late	/* if so we're too late to the party */
 	or	r3, r3, r7
-	stwcx.	r3, 0, r9
+	stwcx.	r3, 0, r8
 	bne	21b
 
 	/* Primary thread switches to guest partition. */
-	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */
 	cmpwi	r6,0
 	bne	10f
 	lwz	r7,KVM_LPID(r9)
@@ -658,7 +666,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	beq	kvmppc_primary_no_guest
 kvmppc_got_guest:
 
-	/* Load up guest SLB entries */
+	/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
 	lwz	r5,VCPU_SLB_MAX(r4)
 	cmpwi	r5,0
 	beq	9f
@@ -696,8 +704,10 @@ kvmppc_got_guest:
 BEGIN_FTR_SECTION
 	mfspr	r5, SPRN_TIDR
 	mfspr	r6, SPRN_PSSCR
+	mfspr	r7, SPRN_PID
 	std	r5, STACK_SLOT_TID(r1)
 	std	r6, STACK_SLOT_PSSCR(r1)
+	std	r7, STACK_SLOT_PID(r1)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
 BEGIN_FTR_SECTION
@@ -1293,11 +1303,15 @@ mc_cont:
 	mtspr	SPRN_CTRLT,r6
 4:
 	/* Read the guest SLB and save it away */
+	ld	r5, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r5)
+	cmpwi	r0, 0
+	li	r5, 0
+	bne	3f			/* for radix, save 0 entries */
 	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
 	mtctr	r0
 	li	r6,0
 	addi	r7,r9,VCPU_SLB
-	li	r5,0
 1:	slbmfee	r8,r6
 	andis.	r0,r8,SLB_ESID_V@h
 	beq	2f
@@ -1309,7 +1323,7 @@ mc_cont:
 	addi	r5,r5,1
 2:	addi	r6,r6,1
 	bdnz	1b
-	stw	r5,VCPU_SLB_MAX(r9)
+3:	stw	r5,VCPU_SLB_MAX(r9)
 
 	/*
 	 * Save the guest PURR/SPURR
@@ -1558,8 +1572,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 BEGIN_FTR_SECTION
 	ld	r5, STACK_SLOT_TID(r1)
 	ld	r6, STACK_SLOT_PSSCR(r1)
+	ld	r7, STACK_SLOT_PID(r1)
 	mtspr	SPRN_TIDR, r5
 	mtspr	SPRN_PSSCR, r6
+	mtspr	SPRN_PID, r7
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
 	/*
@@ -1671,6 +1687,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	isync
 
 	/* load host SLB entries */
+BEGIN_MMU_FTR_SECTION
+	b	0f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
 	ld	r8,PACA_SLBSHADOWPTR(r13)
 
 	.rept	SLB_NUM_BOLTED
@@ -1683,7 +1702,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	slbmte	r6,r5
 1:	addi	r8,r8,16
 	.endr
-
+0:
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
 	/* Finish timing, if we have a vcpu */
 	ld	r4, HSTATE_KVM_VCPU(r13)
@@ -1710,8 +1729,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  * reflect the HDSI to the guest as a DSI.
  */
 kvmppc_hdsi:
+	ld	r3, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r3)
+	cmpwi	r0, 0
 	mfspr	r4, SPRN_HDAR
 	mfspr	r6, SPRN_HDSISR
+	bne	.Lradix_hdsi		/* on radix, just save DAR/DSISR/ASDR */
 	/* HPTE not found fault or protection fault? */
 	andis.	r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
 	beq	1f			/* if not, send it to the guest */
@@ -1788,11 +1811,23 @@ fast_interrupt_c_return:
 	stb	r0, HSTATE_IN_GUEST(r13)
 	b	guest_exit_cont
 
+.Lradix_hdsi:
+	std	r4, VCPU_FAULT_DAR(r9)
+	stw	r6, VCPU_FAULT_DSISR(r9)
+.Lradix_hisi:
+	mfspr	r5, SPRN_ASDR
+	std	r5, VCPU_FAULT_GPA(r9)
+	b	guest_exit_cont
+
 /*
  * Similarly for an HISI, reflect it to the guest as an ISI unless
  * it is an HPTE not found fault for a page that we have paged out.
  */
 kvmppc_hisi:
+	ld	r3, VCPU_KVM(r9)
+	lbz	r0, KVM_RADIX(r3)
+	cmpwi	r0, 0
+	bne	.Lradix_hisi		/* for radix, just save ASDR */
 	andis.	r0, r11, SRR1_ISI_NOPT@h
 	beq	1f
 BEGIN_FTR_SECTION
-- 
cgit v1.2.3


From 5a319350a46572d073042a3194676099dd2c135d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:46 +1100
Subject: KVM: PPC: Book3S HV: Page table construction and page faults for
 radix guests

This adds the code to construct the second-level ("partition-scoped" in
architecturese) page tables for guests using the radix MMU.  Apart from
the PGD level, which is allocated when the guest is created, the rest
of the tree is all constructed in response to hypervisor page faults.

As well as hypervisor page faults for missing pages, we also get faults
for reference/change (RC) bits needing to be set, as well as various
other error conditions.  For now, we only set the R or C bit in the
guest page table if the same bit is set in the host PTE for the
backing page.

This code can take advantage of the guest being backed with either
transparent or ordinary 2MB huge pages, and insert 2MB page entries
into the guest page tables.  There is no support for 1GB huge pages
yet.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_book3s.h  |   8 +
 arch/powerpc/kvm/book3s.c              |   1 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |   7 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 385 +++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c           |  17 +-
 5 files changed, 415 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 7adfcc03a35f..ff5cd5c5ce8d 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -170,6 +170,8 @@ extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
 			unsigned long status);
 extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
 			unsigned long slb_v, unsigned long valid);
+extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			unsigned long gpa, gva_t ea, int is_store);
 
 extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
@@ -182,8 +184,14 @@ extern void kvmppc_mmu_hpte_sysexit(void);
 extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
 
+extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
+			struct kvm_vcpu *vcpu,
+			unsigned long ea, unsigned long dsisr);
 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 			struct kvmppc_pte *gpte, bool data, bool iswrite);
+extern void kvmppc_free_radix(struct kvm *kvm);
+extern int kvmppc_radix_init(void);
+extern void kvmppc_radix_exit(void);
 
 /* XXX remove this export when load_last_inst() is generic */
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 019f008775b9..b6b5c185bd92 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -239,6 +239,7 @@ void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
 	kvmppc_set_dsisr(vcpu, flags);
 	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
 }
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage);	/* used by kvm_hv */
 
 void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
 {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index c208bf3b252f..57690c22716d 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -395,8 +395,8 @@ static int instruction_is_store(unsigned int instr)
 	return (instr & mask) != 0;
 }
 
-static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
-				  unsigned long gpa, gva_t ea, int is_store)
+int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			   unsigned long gpa, gva_t ea, int is_store)
 {
 	u32 last_inst;
 
@@ -461,6 +461,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	unsigned long rcbits;
 	long mmio_update;
 
+	if (kvm_is_radix(kvm))
+		return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
+
 	/*
 	 * Real-mode code has already searched the HPT and found the
 	 * entry we're interested in.  Lock the entry and check that
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 9091407fbfd4..865ea9bca364 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -137,3 +137,388 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	return 0;
 }
 
+#ifdef CONFIG_PPC_64K_PAGES
+#define MMU_BASE_PSIZE	MMU_PAGE_64K
+#else
+#define MMU_BASE_PSIZE	MMU_PAGE_4K
+#endif
+
+static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
+				    unsigned int pshift)
+{
+	int psize = MMU_BASE_PSIZE;
+
+	if (pshift >= PMD_SHIFT)
+		psize = MMU_PAGE_2M;
+	addr &= ~0xfffUL;
+	addr |= mmu_psize_defs[psize].ap << 5;
+	asm volatile("ptesync": : :"memory");
+	asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
+		     : : "r" (addr), "r" (kvm->arch.lpid) : "memory");
+	asm volatile("ptesync": : :"memory");
+}
+
+void kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr,
+			     unsigned long set, unsigned long addr,
+			     unsigned int shift)
+{
+	if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) &&
+	    pte_present(*ptep)) {
+		/* have to invalidate it first */
+		__radix_pte_update(ptep, _PAGE_PRESENT, 0);
+		kvmppc_radix_tlbie_page(kvm, addr, shift);
+		set |= _PAGE_PRESENT;
+	}
+	__radix_pte_update(ptep, clr, set);
+}
+
+void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
+			     pte_t *ptep, pte_t pte)
+{
+	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
+}
+
+static struct kmem_cache *kvm_pte_cache;
+
+static pte_t *kvmppc_pte_alloc(void)
+{
+	return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
+}
+
+static void kvmppc_pte_free(pte_t *ptep)
+{
+	kmem_cache_free(kvm_pte_cache, ptep);
+}
+
+static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
+			     unsigned int level, unsigned long mmu_seq)
+{
+	pgd_t *pgd;
+	pud_t *pud, *new_pud = NULL;
+	pmd_t *pmd, *new_pmd = NULL;
+	pte_t *ptep, *new_ptep = NULL;
+	int ret;
+
+	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
+	pgd = kvm->arch.pgtable + pgd_index(gpa);
+	pud = NULL;
+	if (pgd_present(*pgd))
+		pud = pud_offset(pgd, gpa);
+	else
+		new_pud = pud_alloc_one(kvm->mm, gpa);
+
+	pmd = NULL;
+	if (pud && pud_present(*pud))
+		pmd = pmd_offset(pud, gpa);
+	else
+		new_pmd = pmd_alloc_one(kvm->mm, gpa);
+
+	if (level == 0 && !(pmd && pmd_present(*pmd)))
+		new_ptep = kvmppc_pte_alloc();
+
+	/* Check if we might have been invalidated; let the guest retry if so */
+	spin_lock(&kvm->mmu_lock);
+	ret = -EAGAIN;
+	if (mmu_notifier_retry(kvm, mmu_seq))
+		goto out_unlock;
+
+	/* Now traverse again under the lock and change the tree */
+	ret = -ENOMEM;
+	if (pgd_none(*pgd)) {
+		if (!new_pud)
+			goto out_unlock;
+		pgd_populate(kvm->mm, pgd, new_pud);
+		new_pud = NULL;
+	}
+	pud = pud_offset(pgd, gpa);
+	if (pud_none(*pud)) {
+		if (!new_pmd)
+			goto out_unlock;
+		pud_populate(kvm->mm, pud, new_pmd);
+		new_pmd = NULL;
+	}
+	pmd = pmd_offset(pud, gpa);
+	if (pmd_large(*pmd)) {
+		/* Someone else has instantiated a large page here; retry */
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+	if (level == 1 && !pmd_none(*pmd)) {
+		/*
+		 * There's a page table page here, but we wanted
+		 * to install a large page.  Tell the caller and let
+		 * it try installing a normal page if it wants.
+		 */
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+	if (level == 0) {
+		if (pmd_none(*pmd)) {
+			if (!new_ptep)
+				goto out_unlock;
+			pmd_populate(kvm->mm, pmd, new_ptep);
+			new_ptep = NULL;
+		}
+		ptep = pte_offset_kernel(pmd, gpa);
+		if (pte_present(*ptep)) {
+			/* PTE was previously valid, so invalidate it */
+			kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
+						0, gpa, 0);
+			kvmppc_radix_tlbie_page(kvm, gpa, 0);
+		}
+		kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+	} else {
+		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+	}
+	ret = 0;
+
+ out_unlock:
+	spin_unlock(&kvm->mmu_lock);
+	if (new_pud)
+		pud_free(kvm->mm, new_pud);
+	if (new_pmd)
+		pmd_free(kvm->mm, new_pmd);
+	if (new_ptep)
+		kvmppc_pte_free(new_ptep);
+	return ret;
+}
+
+int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				   unsigned long ea, unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long mmu_seq, pte_size;
+	unsigned long gpa, gfn, hva, pfn;
+	struct kvm_memory_slot *memslot;
+	struct page *page = NULL, *pages[1];
+	long ret, npages, ok;
+	unsigned int writing;
+	struct vm_area_struct *vma;
+	unsigned long flags;
+	pte_t pte, *ptep;
+	unsigned long pgflags;
+	unsigned int shift, level;
+
+	/* Check for unusual errors */
+	if (dsisr & DSISR_UNSUPP_MMU) {
+		pr_err("KVM: Got unsupported MMU fault\n");
+		return -EFAULT;
+	}
+	if (dsisr & DSISR_BADACCESS) {
+		/* Reflect to the guest as DSI */
+		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
+		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+		return RESUME_GUEST;
+	}
+
+	/* Translate the logical address and get the page */
+	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
+	gpa &= ~0xF000000000000000ul;
+	gfn = gpa >> PAGE_SHIFT;
+	if (!(dsisr & DSISR_PGDIRFAULT))
+		gpa |= ea & 0xfff;
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	/* No memslot means it's an emulated MMIO region */
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+		if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS |
+			     DSISR_SET_RC)) {
+			/*
+			 * Bad address in guest page table tree, or other
+			 * unusual error - reflect it to the guest as DSI.
+			 */
+			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+			return RESUME_GUEST;
+		}
+		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
+					      dsisr & DSISR_ISSTORE);
+	}
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	writing = (dsisr & DSISR_ISSTORE) != 0;
+	hva = gfn_to_hva_memslot(memslot, gfn);
+	if (dsisr & DSISR_SET_RC) {
+		/*
+		 * Need to set an R or C bit in the 2nd-level tables;
+		 * if the relevant bits aren't already set in the linux
+		 * page tables, fall through to do the gup_fast to
+		 * set them in the linux page tables too.
+		 */
+		ok = 0;
+		pgflags = _PAGE_ACCESSED;
+		if (writing)
+			pgflags |= _PAGE_DIRTY;
+		local_irq_save(flags);
+		ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva,
+						   NULL, NULL);
+		if (ptep) {
+			pte = READ_ONCE(*ptep);
+			if (pte_present(pte) &&
+			    (pte_val(pte) & pgflags) == pgflags)
+				ok = 1;
+		}
+		local_irq_restore(flags);
+		if (ok) {
+			spin_lock(&kvm->mmu_lock);
+			if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
+				spin_unlock(&kvm->mmu_lock);
+				return RESUME_GUEST;
+			}
+			ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable,
+							gpa, NULL, &shift);
+			if (ptep && pte_present(*ptep)) {
+				kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
+							gpa, shift);
+				spin_unlock(&kvm->mmu_lock);
+				return RESUME_GUEST;
+			}
+			spin_unlock(&kvm->mmu_lock);
+		}
+	}
+
+	ret = -EFAULT;
+	pfn = 0;
+	pte_size = PAGE_SIZE;
+	pgflags = _PAGE_READ | _PAGE_EXEC;
+	level = 0;
+	npages = get_user_pages_fast(hva, 1, writing, pages);
+	if (npages < 1) {
+		/* Check if it's an I/O mapping */
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, hva);
+		if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
+		    (vma->vm_flags & VM_PFNMAP)) {
+			pfn = vma->vm_pgoff +
+				((hva - vma->vm_start) >> PAGE_SHIFT);
+			pgflags = pgprot_val(vma->vm_page_prot);
+		}
+		up_read(&current->mm->mmap_sem);
+		if (!pfn)
+			return -EFAULT;
+	} else {
+		page = pages[0];
+		pfn = page_to_pfn(page);
+		if (PageHuge(page)) {
+			page = compound_head(page);
+			pte_size <<= compound_order(page);
+			/* See if we can insert a 2MB large-page PTE here */
+			if (pte_size >= PMD_SIZE &&
+			    (gpa & PMD_MASK & PAGE_MASK) ==
+			    (hva & PMD_MASK & PAGE_MASK)) {
+				level = 1;
+				pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
+			}
+		}
+		/* See if we can provide write access */
+		if (writing) {
+			/*
+			 * We assume gup_fast has set dirty on the host PTE.
+			 */
+			pgflags |= _PAGE_WRITE;
+		} else {
+			local_irq_save(flags);
+			ptep = __find_linux_pte_or_hugepte(current->mm->pgd,
+							hva, NULL, NULL);
+			if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
+				pgflags |= _PAGE_WRITE;
+			local_irq_restore(flags);
+		}
+	}
+
+	/*
+	 * Compute the PTE value that we need to insert.
+	 */
+	pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
+	if (pgflags & _PAGE_WRITE)
+		pgflags |= _PAGE_DIRTY;
+	pte = pfn_pte(pfn, __pgprot(pgflags));
+
+	/* Allocate space in the tree and write the PTE */
+	ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+	if (ret == -EBUSY) {
+		/*
+		 * There's already a PMD where wanted to install a large page;
+		 * for now, fall back to installing a small page.
+		 */
+		level = 0;
+		pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1);
+		pte = pfn_pte(pfn, __pgprot(pgflags));
+		ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+	}
+	if (ret == 0 || ret == -EAGAIN)
+		ret = RESUME_GUEST;
+
+	if (page) {
+		/*
+		 * We drop pages[0] here, not page because page might
+		 * have been set to the head page of a compound, but
+		 * we have to drop the reference on the correct tail
+		 * page to match the get inside gup()
+		 */
+		put_page(pages[0]);
+	}
+	return ret;
+}
+
+void kvmppc_free_radix(struct kvm *kvm)
+{
+	unsigned long ig, iu, im;
+	pte_t *pte;
+	pmd_t *pmd;
+	pud_t *pud;
+	pgd_t *pgd;
+
+	if (!kvm->arch.pgtable)
+		return;
+	pgd = kvm->arch.pgtable;
+	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
+		if (!pgd_present(*pgd))
+			continue;
+		pud = pud_offset(pgd, 0);
+		for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
+			if (!pud_present(*pud))
+				continue;
+			pmd = pmd_offset(pud, 0);
+			for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
+				if (pmd_huge(*pmd)) {
+					pmd_clear(pmd);
+					continue;
+				}
+				if (!pmd_present(*pmd))
+					continue;
+				pte = pte_offset_map(pmd, 0);
+				memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
+				kvmppc_pte_free(pte);
+				pmd_clear(pmd);
+			}
+			pmd_free(kvm->mm, pmd_offset(pud, 0));
+			pud_clear(pud);
+		}
+		pud_free(kvm->mm, pud_offset(pgd, 0));
+		pgd_clear(pgd);
+	}
+	pgd_free(kvm->mm, kvm->arch.pgtable);
+}
+
+static void pte_ctor(void *addr)
+{
+	memset(addr, 0, PTE_TABLE_SIZE);
+}
+
+int kvmppc_radix_init(void)
+{
+	unsigned long size = sizeof(void *) << PTE_INDEX_SIZE;
+
+	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
+	if (!kvm_pte_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void kvmppc_radix_exit(void)
+{
+	kmem_cache_destroy(kvm_pte_cache);
+}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index bb2854314ba4..d50251f9a3c9 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3356,7 +3356,10 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 
 	kvmppc_free_vcores(kvm);
 
-	kvmppc_free_hpt(kvm);
+	if (kvm_is_radix(kvm))
+		kvmppc_free_radix(kvm);
+	else
+		kvmppc_free_hpt(kvm);
 
 	kvmppc_free_pimap(kvm);
 }
@@ -3768,6 +3771,11 @@ static int kvm_init_subcore_bitmap(void)
 	return 0;
 }
 
+static int kvmppc_radix_possible(void)
+{
+	return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled();
+}
+
 static int kvmppc_book3s_init_hv(void)
 {
 	int r;
@@ -3807,12 +3815,19 @@ static int kvmppc_book3s_init_hv(void)
 	init_vcore_lists();
 
 	r = kvmppc_mmu_hv_init();
+	if (r)
+		return r;
+
+	if (kvmppc_radix_possible())
+		r = kvmppc_radix_init();
 	return r;
 }
 
 static void kvmppc_book3s_exit_hv(void)
 {
 	kvmppc_free_host_rm_ops();
+	if (kvmppc_radix_possible())
+		kvmppc_radix_exit();
 	kvmppc_hv_ops = NULL;
 }
 
-- 
cgit v1.2.3


From 01756099e0a5f431bbada9693d566269acfb51f9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:47 +1100
Subject: KVM: PPC: Book3S HV: MMU notifier callbacks for radix guests

This adapts our implementations of the MMU notifier callbacks
(unmap_hva, unmap_hva_range, age_hva, test_age_hva, set_spte_hva)
to call radix functions when the guest is using radix.  These
implementations are much simpler than for HPT guests because we
have only one PTE to deal with, so we don't need to traverse
rmap chains.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_book3s.h  |  6 ++++
 arch/powerpc/kvm/book3s_64_mmu_hv.c    | 64 +++++++++++++++++++++++-----------
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 54 ++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index ff5cd5c5ce8d..952cc4b954a1 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -192,6 +192,12 @@ extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 extern void kvmppc_free_radix(struct kvm *kvm);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
+extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn);
+extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn);
+extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			unsigned long gfn);
 
 /* XXX remove this export when load_last_inst() is generic */
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 57690c22716d..088c82bb7ba4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -701,12 +701,13 @@ static void kvmppc_rmap_reset(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
+typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			      unsigned long gfn);
+
 static int kvm_handle_hva_range(struct kvm *kvm,
 				unsigned long start,
 				unsigned long end,
-				int (*handler)(struct kvm *kvm,
-					       unsigned long *rmapp,
-					       unsigned long gfn))
+				hva_handler_fn handler)
 {
 	int ret;
 	int retval = 0;
@@ -731,9 +732,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
 		for (; gfn < gfn_end; ++gfn) {
-			gfn_t gfn_offset = gfn - memslot->base_gfn;
-
-			ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
+			ret = handler(kvm, memslot, gfn);
 			retval |= ret;
 		}
 	}
@@ -742,20 +741,21 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 }
 
 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
-					 unsigned long gfn))
+			  hva_handler_fn handler)
 {
 	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			   unsigned long gfn)
 {
 	struct revmap_entry *rev = kvm->arch.revmap;
 	unsigned long h, i, j;
 	__be64 *hptep;
 	unsigned long ptel, psize, rcbits;
+	unsigned long *rmapp;
 
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 	for (;;) {
 		lock_rmap(rmapp);
 		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
@@ -816,26 +816,36 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
 {
-	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+	kvm_handle_hva(kvm, hva, handler);
 	return 0;
 }
 
 int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+	kvm_handle_hva_range(kvm, start, end, handler);
 	return 0;
 }
 
 void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 				  struct kvm_memory_slot *memslot)
 {
-	unsigned long *rmapp;
 	unsigned long gfn;
 	unsigned long n;
+	unsigned long *rmapp;
 
-	rmapp = memslot->arch.rmap;
 	gfn = memslot->base_gfn;
-	for (n = memslot->npages; n; --n) {
+	rmapp = memslot->arch.rmap;
+	for (n = memslot->npages; n; --n, ++gfn) {
+		if (kvm_is_radix(kvm)) {
+			kvm_unmap_radix(kvm, memslot, gfn);
+			continue;
+		}
 		/*
 		 * Testing the present bit without locking is OK because
 		 * the memslot has been marked invalid already, and hence
@@ -843,20 +853,21 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 		 * thus the present bit can't go from 0 to 1.
 		 */
 		if (*rmapp & KVMPPC_RMAP_PRESENT)
-			kvm_unmap_rmapp(kvm, rmapp, gfn);
+			kvm_unmap_rmapp(kvm, memslot, gfn);
 		++rmapp;
-		++gfn;
 	}
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			 unsigned long gfn)
 {
 	struct revmap_entry *rev = kvm->arch.revmap;
 	unsigned long head, i, j;
 	__be64 *hptep;
 	int ret = 0;
+	unsigned long *rmapp;
 
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
  retry:
 	lock_rmap(rmapp);
 	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
@@ -904,17 +915,22 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
+	return kvm_handle_hva_range(kvm, start, end, handler);
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			      unsigned long gfn)
 {
 	struct revmap_entry *rev = kvm->arch.revmap;
 	unsigned long head, i, j;
 	unsigned long *hp;
 	int ret = 1;
+	unsigned long *rmapp;
 
+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 	if (*rmapp & KVMPPC_RMAP_REFERENCED)
 		return 1;
 
@@ -940,12 +956,18 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
+	return kvm_handle_hva(kvm, hva, handler);
 }
 
 void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
-	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	hva_handler_fn handler;
+
+	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
+	kvm_handle_hva(kvm, hva, handler);
 }
 
 static int vcpus_running(struct kvm *kvm)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 865ea9bca364..69cabadc121a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -463,6 +463,60 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return ret;
 }
 
+/* Called with kvm->lock held */
+int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		    unsigned long gfn)
+{
+	pte_t *ptep;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	unsigned int shift;
+
+	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+					   NULL, &shift);
+	if (ptep && pte_present(*ptep)) {
+		kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
+					gpa, shift);
+		kvmppc_radix_tlbie_page(kvm, gpa, shift);
+	}
+	return 0;				
+}
+
+/* Called with kvm->lock held */
+int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		  unsigned long gfn)
+{
+	pte_t *ptep;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	unsigned int shift;
+	int ref = 0;
+
+	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+					   NULL, &shift);
+	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
+		kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
+					gpa, shift);
+		/* XXX need to flush tlb here? */
+		ref = 1;
+	}
+	return ref;
+}
+
+/* Called with kvm->lock held */
+int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		       unsigned long gfn)
+{
+	pte_t *ptep;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	unsigned int shift;
+	int ref = 0;
+
+	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+					   NULL, &shift);
+	if (ptep && pte_present(*ptep) && pte_young(*ptep))
+		ref = 1;
+	return ref;
+}
+
 void kvmppc_free_radix(struct kvm *kvm)
 {
 	unsigned long ig, iu, im;
-- 
cgit v1.2.3


From 8f7b79b8379a85fb8dd0c3f42d9f452ec5552161 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:48 +1100
Subject: KVM: PPC: Book3S HV: Implement dirty page logging for radix guests

This adds code to keep track of dirty pages when requested (that is,
when memslot->dirty_bitmap is non-NULL) for radix guests.  We use the
dirty bits in the PTEs in the second-level (partition-scoped) page
tables, together with a bitmap of pages that were dirty when their
PTE was invalidated (e.g., when the page was paged out).  This bitmap
is stored in the first half of the memslot->dirty_bitmap area, and
kvm_vm_ioctl_get_dirty_log_hv() now uses the second half for the
bitmap that gets returned to userspace.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_book3s.h  |   7 ++-
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |  28 ++++-----
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 111 ++++++++++++++++++++++++++++++---
 arch/powerpc/kvm/book3s_hv.c           |  31 +++++++--
 4 files changed, 144 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 952cc4b954a1..57dc407cec4a 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -198,6 +198,8 @@ extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			unsigned long gfn);
 extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			unsigned long gfn);
+extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map);
 
 /* XXX remove this export when load_last_inst() is generic */
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
@@ -228,8 +230,11 @@ extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 			unsigned long pte_index, unsigned long avpn,
 			unsigned long *hpret);
-extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
+extern long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
 			struct kvm_memory_slot *memslot, unsigned long *map);
+extern void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+			struct kvm_memory_slot *memslot,
+			unsigned long *map);
 extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr,
 			unsigned long mask);
 extern void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 088c82bb7ba4..c9e587a2849d 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1068,7 +1068,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 	return npages_dirty;
 }
 
-static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
 			      struct kvm_memory_slot *memslot,
 			      unsigned long *map)
 {
@@ -1086,12 +1086,11 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
 		__set_bit_le(gfn - memslot->base_gfn, map);
 }
 
-long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			     unsigned long *map)
+long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map)
 {
 	unsigned long i, j;
 	unsigned long *rmapp;
-	struct kvm_vcpu *vcpu;
 
 	preempt_disable();
 	rmapp = memslot->arch.rmap;
@@ -1107,15 +1106,6 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
 				__set_bit_le(j, map);
 		++rmapp;
 	}
-
-	/* Harvest dirty bits from VPA and DTL updates */
-	/* Note: we never modify the SLB shadow buffer areas */
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		spin_lock(&vcpu->arch.vpa_update_lock);
-		harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
-		harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
-		spin_unlock(&vcpu->arch.vpa_update_lock);
-	}
 	preempt_enable();
 	return 0;
 }
@@ -1170,10 +1160,14 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
 	srcu_idx = srcu_read_lock(&kvm->srcu);
 	memslot = gfn_to_memslot(kvm, gfn);
 	if (memslot) {
-		rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
-		lock_rmap(rmap);
-		*rmap |= KVMPPC_RMAP_CHANGED;
-		unlock_rmap(rmap);
+		if (!kvm_is_radix(kvm)) {
+			rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+			lock_rmap(rmap);
+			*rmap |= KVMPPC_RMAP_CHANGED;
+			unlock_rmap(rmap);
+		} else if (memslot->dirty_bitmap) {
+			mark_page_dirty(kvm, gfn);
+		}
 	}
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 69cabadc121a..125cc7ce1525 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -158,18 +158,21 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
 	asm volatile("ptesync": : :"memory");
 }
 
-void kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr,
-			     unsigned long set, unsigned long addr,
-			     unsigned int shift)
+unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
+				      unsigned long clr, unsigned long set,
+				      unsigned long addr, unsigned int shift)
 {
+	unsigned long old = 0;
+
 	if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) &&
 	    pte_present(*ptep)) {
 		/* have to invalidate it first */
-		__radix_pte_update(ptep, _PAGE_PRESENT, 0);
+		old = __radix_pte_update(ptep, _PAGE_PRESENT, 0);
 		kvmppc_radix_tlbie_page(kvm, addr, shift);
 		set |= _PAGE_PRESENT;
+		old &= _PAGE_PRESENT;
 	}
-	__radix_pte_update(ptep, clr, set);
+	return __radix_pte_update(ptep, clr, set) | old;
 }
 
 void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
@@ -197,6 +200,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 	pud_t *pud, *new_pud = NULL;
 	pmd_t *pmd, *new_pmd = NULL;
 	pte_t *ptep, *new_ptep = NULL;
+	unsigned long old;
 	int ret;
 
 	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
@@ -262,9 +266,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 		ptep = pte_offset_kernel(pmd, gpa);
 		if (pte_present(*ptep)) {
 			/* PTE was previously valid, so invalidate it */
-			kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
-						0, gpa, 0);
+			old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
+						      0, gpa, 0);
 			kvmppc_radix_tlbie_page(kvm, gpa, 0);
+			if (old & _PAGE_DIRTY)
+				mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
 		}
 		kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
 	} else {
@@ -463,6 +469,26 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return ret;
 }
 
+static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			     unsigned long gfn, unsigned int order)
+{
+	unsigned long i, limit;
+	unsigned long *dp;
+
+	if (!memslot->dirty_bitmap)
+		return;
+	limit = 1ul << order;
+	if (limit < BITS_PER_LONG) {
+		for (i = 0; i < limit; ++i)
+			mark_page_dirty(kvm, gfn + i);
+		return;
+	}
+	dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn);
+	limit /= BITS_PER_LONG;
+	for (i = 0; i < limit; ++i)
+		*dp++ = ~0ul;
+}
+
 /* Called with kvm->lock held */
 int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		    unsigned long gfn)
@@ -470,13 +496,21 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	pte_t *ptep;
 	unsigned long gpa = gfn << PAGE_SHIFT;
 	unsigned int shift;
+	unsigned long old;
 
 	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
 					   NULL, &shift);
 	if (ptep && pte_present(*ptep)) {
-		kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
-					gpa, shift);
+		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
+					      gpa, shift);
 		kvmppc_radix_tlbie_page(kvm, gpa, shift);
+		if (old & _PAGE_DIRTY) {
+			if (!shift)
+				mark_page_dirty(kvm, gfn);
+			else
+				mark_pages_dirty(kvm, memslot,
+						 gfn, shift - PAGE_SHIFT);
+		}
 	}
 	return 0;				
 }
@@ -517,6 +551,65 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	return ref;
 }
 
+/* Test and clear a page's dirty bit; returns the number of dirty PAGE_SIZE pages */
+static int kvm_radix_test_clear_dirty(struct kvm *kvm,
+				struct kvm_memory_slot *memslot, int pagenum)
+{
+	unsigned long gfn = memslot->base_gfn + pagenum;
+	unsigned long gpa = gfn << PAGE_SHIFT;
+	pte_t *ptep;
+	unsigned int shift;
+	int ret = 0;
+	/* shift is filled in by the lookup; nonzero means the PTE spans 2^shift bytes */
+	ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa,
+					   NULL, &shift);
+	if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
+		ret = 1;	/* a zero shift is counted as one page */
+		if (shift)
+			ret = 1 << (shift - PAGE_SHIFT);
+		kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
+					gpa, shift);
+		kvmppc_radix_tlbie_page(kvm, gpa, shift);
+	}
+	return ret;	/* 0 when no present, dirty PTE was found */
+}
+
+long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
+			struct kvm_memory_slot *memslot, unsigned long *map)
+{
+	unsigned long i, j;
+	unsigned long n, *p;
+	int npages;
+	/* Collect this memslot's dirty bits into 'map'; always returns 0 */
+	/*
+	 * Radix accumulates dirty bits in the first half of the
+	 * memslot's dirty_bitmap area, for when pages are paged
+	 * out or modified by the host directly.  Pick up these
+	 * bits and add them to the map.
+	 */
+	n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
+	p = memslot->dirty_bitmap;
+	for (i = 0; i < n; ++i)
+		map[i] |= xchg(&p[i], 0);	/* fetch and zero each word */
+	/* Then test and clear the dirty bit of the PTE behind each page */
+	for (i = 0; i < memslot->npages; i = j) {
+		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
+
+		/*
+		 * Note that if npages > 0 then i must be a multiple of npages,
+		 * since huge pages are only used to back the guest at guest
+		 * real addresses that are a multiple of their size.
+		 * Since we have at most one PTE covering any given guest
+		 * real address, if npages > 1 we can skip to i + npages.
+		 */
+		j = i + 1;
+		if (npages)
+			for (j = i; npages; ++j, --npages)
+				__set_bit_le(j, map);
+	}
+	return 0;
+}
+
 void kvmppc_free_radix(struct kvm *kvm)
 {
 	unsigned long ig, iu, im;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d50251f9a3c9..401e4cc8a91f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2961,8 +2961,10 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	int r;
+	int i, r;
 	unsigned long n;
+	unsigned long *buf;
+	struct kvm_vcpu *vcpu;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -2976,15 +2978,32 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
 	if (!memslot->dirty_bitmap)
 		goto out;
 
+	/*
+	 * Use second half of bitmap area because radix accumulates
+	 * bits in the first half.
+	 */
 	n = kvm_dirty_bitmap_bytes(memslot);
-	memset(memslot->dirty_bitmap, 0, n);
+	buf = memslot->dirty_bitmap + n / sizeof(long);
+	memset(buf, 0, n);
 
-	r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
+	if (kvm_is_radix(kvm))
+		r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
+	else
+		r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
 	if (r)
 		goto out;
 
+	/* Harvest dirty bits from VPA and DTL updates */
+	/* Note: we never modify the SLB shadow buffer areas */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
+		kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+	}
+
 	r = -EFAULT;
-	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+	if (copy_to_user(log->dirty_bitmap, buf, n))
 		goto out;
 
 	r = 0;
@@ -3037,7 +3056,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 	if (npages)
 		atomic64_inc(&kvm->arch.mmio_update);
 
-	if (npages && old->npages) {
+	if (npages && old->npages && !kvm_is_radix(kvm)) {
 		/*
 		 * If modifying a memslot, reset all the rmap dirty bits.
 		 * If this is a new memslot, we don't need to do anything
@@ -3046,7 +3065,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 		 */
 		slots = kvm_memslots(kvm);
 		memslot = id_to_memslot(slots, mem->slot);
-		kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
+		kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
 	}
 }
 
-- 
cgit v1.2.3


From 65dae5403a162fe6ef7cd8b2835de9d23c303891 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:49 +1100
Subject: KVM: PPC: Book3S HV: Make HPT-specific hypercalls return error in
 radix mode

If the guest is in radix mode, then it doesn't have a hashed page
table (HPT), so all of the hypercalls that manipulate the HPT can't
work and should return an error.  This adds checks to make them
return H_FUNCTION ("function not supported").

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 9ef3c4be952f..6c1ac3d21b91 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -182,6 +182,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	unsigned long mmu_seq;
 	unsigned long rcbits, irq_flags = 0;
 
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
 	psize = hpte_page_size(pteh, ptel);
 	if (!psize)
 		return H_PARAMETER;
@@ -458,6 +460,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	struct revmap_entry *rev;
 	u64 pte, orig_pte, pte_r;
 
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
 	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
@@ -529,6 +533,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 	struct revmap_entry *rev, *revs[4];
 	u64 hp0, hp1;
 
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
 	global = global_invalidates(kvm, 0);
 	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
 		n = 0;
@@ -642,6 +648,8 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long v, r, rb, mask, bits;
 	u64 pte_v, pte_r;
 
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
 	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 
@@ -711,6 +719,8 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 	int i, n = 1;
 	struct revmap_entry *rev = NULL;
 
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
 	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 	if (flags & H_READ_4) {
@@ -750,6 +760,8 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long *rmap;
 	long ret = H_NOT_FOUND;
 
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
 	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 
@@ -796,6 +808,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
 	unsigned long *rmap;
 	long ret = H_NOT_FOUND;
 
+	if (kvm_is_radix(kvm))
+		return H_FUNCTION;
 	if (pte_index >= kvm->arch.hpt_npte)
 		return H_PARAMETER;
 
-- 
cgit v1.2.3


From a29ebeaf5575d03eef178bb87c425a1e46cae1ca Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:50 +1100
Subject: KVM: PPC: Book3S HV: Invalidate TLB on radix guest vcpu movement

With radix, the guest can do TLB invalidations itself using the tlbie
(global) and tlbiel (local) TLB invalidation instructions.  Linux guests
use local TLB invalidations for translations that have only ever been
accessed on one vcpu.  However, that doesn't mean that the translations
have only been accessed on one physical cpu (pcpu) since vcpus can move
around from one pcpu to another.  Thus a tlbiel might leave behind stale
TLB entries on a pcpu where the vcpu previously ran, and if that task
then moves back to that previous pcpu, it could see those stale TLB
entries and thus access memory incorrectly.  The usual symptom of this
is random segfaults in userspace programs in the guest.

To cope with this, we detect when a vcpu is about to start executing on
a thread in a core that is a different core from the last time it
executed.  If that is the case, then we mark the core as needing a
TLB flush and then send an interrupt to any thread in the core that is
currently running a vcpu from the same guest.  This will get those vcpus
out of the guest, and the first one to re-enter the guest will do the
TLB flush.  The reason for interrupting the vcpus executing on the old
core is to cope with the following scenario:

	CPU 0			CPU 1			CPU 4
	(core 0)			(core 0)			(core 1)

	VCPU 0 runs task X      VCPU 1 runs
	core 0 TLB gets
	entries from task X
	VCPU 0 moves to CPU 4
							VCPU 0 runs task X
							Unmap pages of task X
							tlbiel

				(still VCPU 1)			task X moves to VCPU 1
				task X runs
				task X sees stale TLB
				entries

That is, as soon as the VCPU starts executing on the new core, it
could unmap and tlbiel some page table entries, and then the task
could migrate to one of the VCPUs running on the old core and
potentially see stale TLB entries.

Since the TLB is shared between all the threads in a core, we only
use the bit of kvm->arch.need_tlb_flush corresponding to the first
thread in the core.  To ensure that we don't have a window where we
can miss a flush, this moves the clearing of the bit from before the
actual flush to after it.  This way, two threads might both do the
flush, but we prevent the situation where one thread can enter the
guest before the flush is finished.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_host.h     |  2 ++
 arch/powerpc/kvm/book3s_hv.c            | 45 +++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c     | 11 ++++++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 38 +++++++++++++++++++---------
 4 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index da1421a4d6f2..b2dbeac3f450 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -263,6 +263,7 @@ struct kvm_arch {
 	unsigned long hpt_mask;
 	atomic_t hpte_mod_interest;
 	cpumask_t need_tlb_flush;
+	cpumask_t cpu_in_guest;
 	int hpt_cma_alloc;
 	u8 radix;
 	pgd_t *pgtable;
@@ -661,6 +662,7 @@ struct kvm_vcpu_arch {
 	int state;
 	int ptid;
 	int thread_cpu;
+	int prev_cpu;
 	bool timer_running;
 	wait_queue_head_t cpu_run;
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 401e4cc8a91f..50c230e83f9b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1821,6 +1821,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	vcpu->arch.vcore = vcore;
 	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
 	vcpu->arch.thread_cpu = -1;
+	vcpu->arch.prev_cpu = -1;
 
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
@@ -1950,11 +1951,33 @@ static void kvmppc_release_hwthread(int cpu)
 	tpaca->kvm_hstate.kvm_split_mode = NULL;
 }
 
+static void do_nothing(void *x)
+{	/* Intentionally empty: radix_flush_cpu() passes this as the cross-CPU callback */
+}
+
+static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
+{
+	int i;
+	/* Flag cpu's core (first-thread bit) as needing a TLB flush; vcpu is unused */
+	cpu = cpu_first_thread_sibling(cpu);
+	cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+	/*
+	 * Make sure setting of bit in need_tlb_flush precedes
+	 * testing of cpu_in_guest bits.  The matching barrier on
+	 * the other side is the first smp_mb() in kvmppc_run_core().
+	 */
+	smp_mb();
+	for (i = 0; i < threads_per_core; ++i)	/* every thread of that core */
+		if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
+			smp_call_function_single(cpu + i, do_nothing, NULL, 1);
+}
+
 static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 {
 	int cpu;
 	struct paca_struct *tpaca;
 	struct kvmppc_vcore *mvc = vc->master_vcore;
+	struct kvm *kvm = vc->kvm;
 
 	cpu = vc->pcpu;
 	if (vcpu) {
@@ -1965,6 +1988,27 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 		cpu += vcpu->arch.ptid;
 		vcpu->cpu = mvc->pcpu;
 		vcpu->arch.thread_cpu = cpu;
+
+		/*
+		 * With radix, the guest can do TLB invalidations itself,
+		 * and it could choose to use the local form (tlbiel) if
+		 * it is invalidating a translation that has only ever been
+		 * used on one vcpu.  However, that doesn't mean it has
+		 * only ever been used on one physical cpu, since vcpus
+		 * can move around between pcpus.  To cope with this, when
+		 * a vcpu moves from one pcpu to another, we need to tell
+		 * any vcpus running on the same core as this vcpu previously
+		 * ran to flush the TLB.  The TLB is shared between threads,
+		 * so we use a single bit in .need_tlb_flush for all 4 threads.
+		 */
+		if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
+			if (vcpu->arch.prev_cpu >= 0 &&
+			    cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+			    cpu_first_thread_sibling(cpu))
+				radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
+			vcpu->arch.prev_cpu = cpu;
+		}
+		cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
 	}
 	tpaca = &paca[cpu];
 	tpaca->kvm_hstate.kvm_vcpu = vcpu;
@@ -2552,6 +2596,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		kvmppc_release_hwthread(pcpu + i);
 		if (sip && sip->napped[i])
 			kvmppc_ipi_thread(pcpu + i);
+		cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
 	}
 
 	kvmppc_set_host_core(pcpu);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 6c1ac3d21b91..b095afcd4309 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -43,6 +43,7 @@ static void *real_vmalloc_addr(void *x)
 static int global_invalidates(struct kvm *kvm, unsigned long flags)
 {
 	int global;
+	int cpu;
 
 	/*
 	 * If there is only one vcore, and it's currently running,
@@ -60,8 +61,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags)
 		/* any other core might now have stale TLB entries... */
 		smp_wmb();
 		cpumask_setall(&kvm->arch.need_tlb_flush);
-		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
-				  &kvm->arch.need_tlb_flush);
+		cpu = local_paca->kvm_hstate.kvm_vcore->pcpu;
+		/*
+		 * On POWER9, threads are independent but the TLB is shared,
+		 * so use the bit for the first thread to represent the core.
+		 */
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			cpu = cpu_first_thread_sibling(cpu);
+		cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
 	}
 
 	return global;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 7fc7a9221509..dcc67a87d688 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -598,30 +598,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
 	/* See if we need to flush the TLB */
 	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
+BEGIN_FTR_SECTION
+	/*
+	 * On POWER9, individual threads can come in here, but the
+	 * TLB is shared between the 4 threads in a core, hence
+	 * invalidating on one thread invalidates for all.
+	 * Thus we make all 4 threads use the same bit here.
+	 */
+	clrrdi	r6,r6,2
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
 	srdi	r6,r6,6			/* doubleword number */
 	sldi	r6,r6,3			/* address offset */
 	add	r6,r6,r9
 	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
-	li	r0,1
-	sld	r0,r0,r7
+	li	r8,1
+	sld	r8,r8,r7
 	ld	r7,0(r6)
-	and.	r7,r7,r0
+	and.	r7,r7,r8
 	beq	22f
-23:	ldarx	r7,0,r6			/* if set, clear the bit */
-	andc	r7,r7,r0
-	stdcx.	r7,0,r6
-	bne	23b
 	/* Flush the TLB of any entries for this LPID */
-	lwz	r6,KVM_TLB_SETS(r9)
-	li	r0,0			/* RS for P9 version of tlbiel */
-	mtctr	r6
+	lwz	r0,KVM_TLB_SETS(r9)
+	mtctr	r0
 	li	r7,0x800		/* IS field = 0b10 */
 	ptesync
-28:	tlbiel	r7
+	li	r0,0			/* RS for P9 version of tlbiel */
+	bne	cr7, 29f
+28:	tlbiel	r7			/* On P9, rs=0, RIC=0, PRS=0, R=0 */
 	addi	r7,r7,0x1000
 	bdnz	28b
-	ptesync
+	b	30f
+29:	PPC_TLBIEL(7,0,2,1,1)		/* for radix, RIC=2, PRS=1, R=1 */
+	addi	r7,r7,0x1000
+	bdnz	29b
+30:	ptesync
+23:	ldarx	r7,0,r6			/* clear the bit after TLB flushed */
+	andc	r7,r7,r8
+	stdcx.	r7,0,r6
+	bne	23b
 
 	/* Add timebase offset onto timebase */
 22:	ld	r8,VCORE_TB_OFFSET(r5)
-- 
cgit v1.2.3


From 53af3ba2e8195f504d6a3a0667ccb5e7d4c57599 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:51 +1100
Subject: KVM: PPC: Book3S HV: Allow guest exit path to have MMU on

If we allow LPCR[AIL] to be set for radix guests, then interrupts from
the guest to the host can be delivered by the hardware with relocation
on, and thus the code path starting at kvmppc_interrupt_hv can be
executed in virtual mode (MMU on) for radix guests (previously it was
only ever executed in real mode).

Most of the code is indifferent to whether the MMU is on or off, but
the calls to OPAL that use the real-mode OPAL entry code need to
be switched to use the virtual-mode code instead.  The affected
calls are the calls to the OPAL XICS emulation functions in
kvmppc_read_one_intr() and related functions.  We test the MSR[IR]
bit to detect whether we are in real or virtual mode, and call the
opal_rm_* or opal_* function as appropriate.

The other place that depends on the MMU being off is the optimization
where the guest exit code jumps to the external interrupt vector or
hypervisor doorbell interrupt vector, or returns to its caller (which
is __kvmppc_vcore_entry).  If the MMU is on and we are returning to
the caller, then we don't need to use an rfid instruction since the
MMU is already on; a simple blr suffices.  If there is an external
or hypervisor doorbell interrupt to handle, we branch to the
relocation-on version of the interrupt vector.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/book3s_hv_builtin.c    | 38 ++++++++++++++++++++++-----------
 arch/powerpc/kvm/book3s_hv_rm_xics.c    |  8 +++----
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 29 +++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5bb24be0b346..fe08fea54b70 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -29,6 +29,11 @@
 #include <asm/opal.h>
 #include <asm/smp.h>
 
+static bool in_realmode(void)
+{	/* Real mode is detected by MSR[IR] being clear in the current MSR */
+	return !(mfmsr() & MSR_IR);
+}
+
 #define KVM_CMA_CHUNK_ORDER	18
 
 /*
@@ -200,7 +205,6 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
 
 /*
  * Send an interrupt or message to another CPU.
- * This can only be called in real mode.
  * The caller needs to include any barrier needed to order writes
  * to memory vs. the IPI/message.
  */
@@ -226,7 +230,9 @@ void kvmhv_rm_send_ipi(int cpu)
 
 	/* Else poke the target with an IPI */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
-	if (xics_phys)
+	if (!in_realmode())
+		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
+	else if (xics_phys)
 		rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
 	else
 		opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu),
@@ -412,14 +418,15 @@ static long kvmppc_read_one_intr(bool *again)
 
 	/* Now read the interrupt from the ICP */
 	xics_phys = local_paca->kvm_hstate.xics_phys;
-	if (!xics_phys) {
-		/* Use OPAL to read the XIRR */
+	rc = 0;
+	if (!in_realmode())
+		rc = opal_int_get_xirr(&xirr, false);
+	else if (!xics_phys)
 		rc = opal_rm_int_get_xirr(&xirr, false);
-		if (rc < 0)
-			return 1;
-	} else {
+	else
 		xirr = _lwzcix(xics_phys + XICS_XIRR);
-	}
+	if (rc < 0)
+		return 1;
 
 	/*
 	 * Save XIRR for later. Since we get control in reverse endian
@@ -445,15 +452,19 @@ static long kvmppc_read_one_intr(bool *again)
 	 * If it is an IPI, clear the MFRR and EOI it.
 	 */
 	if (xisr == XICS_IPI) {
-		if (xics_phys) {
+		rc = 0;
+		if (!in_realmode()) {
+			opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
+			rc = opal_int_eoi(h_xirr);
+		} else if (xics_phys) {
 			_stbcix(xics_phys + XICS_MFRR, 0xff);
 			_stwcix(xics_phys + XICS_XIRR, xirr);
 		} else {
 			opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff);
 			rc = opal_rm_int_eoi(h_xirr);
-			/* If rc > 0, there is another interrupt pending */
-			*again = rc > 0;
 		}
+		/* If rc > 0, there is another interrupt pending */
+		*again = rc > 0;
 
 		/*
 		 * Need to ensure side effects of above stores
@@ -471,7 +482,10 @@ static long kvmppc_read_one_intr(bool *again)
 			/* We raced with the host,
 			 * we need to resend that IPI, bummer
 			 */
-			if (xics_phys)
+			if (!in_realmode())
+				opal_int_set_mfrr(hard_smp_processor_id(),
+						  IPI_PRIORITY);
+			else if (xics_phys)
 				_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
 			else
 				opal_rm_int_set_mfrr(hard_smp_processor_id(),
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 06edc4366639..7e2eb3e865b3 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -70,11 +70,9 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
 	hcpu = hcore << threads_shift;
 	kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
 	smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
-	if (paca[hcpu].kvm_hstate.xics_phys)
-		icp_native_cause_ipi_rm(hcpu);
-	else
-		opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu),
-				     IPI_PRIORITY);
+	kvmppc_set_host_ipi(hcpu, 1);
+	smp_mb();
+	kvmhv_rm_send_ipi(hcpu);
 }
 #else
 static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index dcc67a87d688..46c1c1fe55c8 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -148,6 +148,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	addi	r1, r1, 112
 	ld	r7, HSTATE_HOST_MSR(r13)
 
+	/*
+	 * If we came back from the guest via a relocation-on interrupt,
+	 * we will be in virtual mode at this point, which makes it a
+	 * little easier to get back to the caller.
+	 */
+	mfmsr	r0
+	andi.	r0, r0, MSR_IR		/* in real mode? */
+	bne	.Lvirt_return
+
 	cmpwi	cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
 	beq	11f
@@ -181,6 +190,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtspr SPRN_HSRR1, r7
 	ba    0xe80
 
+	/* Virtual-mode return - can't get here for HMI or machine check */
+.Lvirt_return:
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	16f
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	beq	17f
+	andi.	r0, r7, MSR_EE		/* were interrupts hard-enabled? */
+	beq	18f
+	mtmsrd	r7, 1			/* if so then re-enable them */
+18:	mtlr	r8
+	blr
+
+16:	mtspr	SPRN_HSRR0, r8		/* jump to reloc-on external vector */
+	mtspr	SPRN_HSRR1, r7
+	b	exc_virt_0x4500_hardware_interrupt
+
+17:	mtspr	SPRN_HSRR0, r8
+	mtspr	SPRN_HSRR1, r7
+	b	exc_virt_0x4e80_h_doorbell
+
 kvmppc_primary_no_guest:
 	/* We handle this much like a ceded vcpu */
 	/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
-- 
cgit v1.2.3


From f11f6f79b606fb54bb388d0ea652ed889b2fdf86 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:52 +1100
Subject: KVM: PPC: Book3S HV: Invalidate ERAT on guest entry/exit for POWER9
 DD1

On POWER9 DD1, we need to invalidate the ERAT (effective to real
address translation cache) when changing the PIDR register, which
we do as part of guest entry and exit.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 46c1c1fe55c8..47414a6fe2dd 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -876,6 +876,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_BESCR, r6
 	mtspr	SPRN_PID, r7
 	mtspr	SPRN_WORT, r8
+BEGIN_FTR_SECTION
+	PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
 BEGIN_FTR_SECTION
 	/* POWER8-only registers */
 	ld	r5, VCPU_TCSCR(r4)
@@ -1620,6 +1623,9 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_PSSCR, r6
 	mtspr	SPRN_PID, r7
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+BEGIN_FTR_SECTION
+	PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
 
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
-- 
cgit v1.2.3


From 8cf4ecc0ca9bd9bdc9b4ca0a99f7445a1e74afed Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 30 Jan 2017 21:21:53 +1100
Subject: KVM: PPC: Book3S HV: Enable radix guest support

This adds a few last pieces of the support for radix guests:

* Implement the backends for the KVM_PPC_CONFIGURE_V3_MMU and
  KVM_PPC_GET_RMMU_INFO ioctls for radix guests

* On POWER9, allow secondary threads to be on/off-lined while guests
  are running.

* Set up LPCR and the partition table entry for radix guests.

* Don't allocate the rmap array in the kvm_memory_slot structure
  on radix.

* Don't try to initialize the HPT for radix guests, since they don't
  have an HPT.

* Take out the code that prevents the HV KVM module from
  initializing on radix hosts.

At this stage, we only support radix guests if the host is running
in radix mode, and only support HPT guests if the host is running in
HPT mode.  Thus a guest cannot switch from one mode to the other,
which enables some simplifications.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_book3s.h  |  2 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c    |  1 -
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 45 +++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c           | 88 ++++++++++++++++++++++++----------
 arch/powerpc/kvm/powerpc.c             |  2 +-
 5 files changed, 111 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 57dc407cec4a..2bf35017ffc0 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -189,6 +189,7 @@ extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
 			unsigned long ea, unsigned long dsisr);
 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 			struct kvmppc_pte *gpte, bool data, bool iswrite);
+extern int kvmppc_init_vm_radix(struct kvm *kvm);
 extern void kvmppc_free_radix(struct kvm *kvm);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
@@ -200,6 +201,7 @@ extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			unsigned long gfn);
 extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 			struct kvm_memory_slot *memslot, unsigned long *map);
+extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
 
 /* XXX remove this export when load_last_inst() is generic */
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index c9e587a2849d..9df3d940acec 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -155,7 +155,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 
 void kvmppc_free_hpt(struct kvm *kvm)
 {
-	kvmppc_free_lpid(kvm->arch.lpid);
 	vfree(kvm->arch.revmap);
 	if (kvm->arch.hpt_cma_alloc)
 		kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 125cc7ce1525..4344651f408c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -610,6 +610,51 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 	return 0;
 }
 
+static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
+				 int psize, int *indexp)
+{	/* Store psize's encoding at info->ap_encodings[*indexp] and advance *indexp */
+	if (!mmu_psize_defs[psize].shift)	/* zero shift: skip this page size */
+		return;
+	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
+		(mmu_psize_defs[psize].ap << 29);	/* AP value in bits 29 and up */
+	++(*indexp);
+}
+
+int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
+{
+	int i;
+	/* Fill *info with radix geometries and AP encodings; 0 on success */
+	if (!radix_enabled())
+		return -EINVAL;
+	memset(info, 0, sizeof(*info));
+
+	/* 4k page size */
+	info->geometries[0].page_shift = 12;
+	info->geometries[0].level_bits[0] = 9;
+	for (i = 1; i < 4; ++i)
+		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
+	/* 64k page size */
+	info->geometries[1].page_shift = 16;
+	for (i = 0; i < 4; ++i)
+		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
+	/* Add the AP encoding of each page size below that has a nonzero shift */
+	i = 0;
+	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
+	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
+	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
+	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
+
+	return 0;
+}
+
+int kvmppc_init_vm_radix(struct kvm *kvm)
+{	/* Allocate kvm->arch.pgtable with pgd_alloc(); returns 0 or -ENOMEM */
+	kvm->arch.pgtable = pgd_alloc(kvm->mm);
+	if (!kvm->arch.pgtable)
+		return -ENOMEM;
+	return 0;
+}
+
 void kvmppc_free_radix(struct kvm *kvm)
 {
 	unsigned long ig, iu, im;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 50c230e83f9b..e4a79679342e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1135,7 +1135,7 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
 	/*
 	 * Userspace can only modify DPFD (default prefetch depth),
 	 * ILE (interrupt little-endian) and TC (translation control).
-	 * On POWER8 userspace can also modify AIL (alt. interrupt loc.)
+	 * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.).
 	 */
 	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
@@ -2922,7 +2922,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	smp_mb();
 
 	/* On the first time here, set up HTAB and VRMA */
-	if (!vcpu->kvm->arch.hpte_setup_done) {
+	if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) {
 		r = kvmppc_hv_setup_htab_rma(vcpu);
 		if (r)
 			goto out;
@@ -2984,6 +2984,13 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
 {
 	struct kvm_ppc_one_seg_page_size *sps;
 
+	/*
+	 * Since we don't yet support HPT guests on a radix host,
+	 * return an error if the host uses radix.
+	 */
+	if (radix_enabled())
+		return -EINVAL;
+
 	info->flags = KVM_PPC_PAGE_SIZES_REAL;
 	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
 		info->flags |= KVM_PPC_1T_SEGMENTS;
@@ -3069,6 +3076,15 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
 static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
 					 unsigned long npages)
 {
+	/*
+	 * For now, if radix_enabled() then we only support radix guests,
+	 * and in that case we don't need the rmap array.
+	 */
+	if (radix_enabled()) {
+		slot->arch.rmap = NULL;
+		return 0;
+	}
+
 	slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
 	if (!slot->arch.rmap)
 		return -ENOMEM;
@@ -3149,14 +3165,20 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
 {
 	unsigned long dw0, dw1;
 
-	/* PS field - page size for VRMA */
-	dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
-		((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
-	/* HTABSIZE and HTABORG fields */
-	dw0 |= kvm->arch.sdr1;
+	if (!kvm_is_radix(kvm)) {
+		/* PS field - page size for VRMA */
+		dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
+			((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
+		/* HTABSIZE and HTABORG fields */
+		dw0 |= kvm->arch.sdr1;
 
-	/* Second dword as set by userspace */
-	dw1 = kvm->arch.process_table;
+		/* Second dword as set by userspace */
+		dw1 = kvm->arch.process_table;
+	} else {
+		dw0 = PATB_HR | radix__get_tree_size() |
+			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
+		dw1 = PATB_GR | kvm->arch.process_table;
+	}
 
 	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
 }
@@ -3326,6 +3348,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
 	unsigned long lpcr, lpid;
 	char buf[32];
+	int ret;
 
 	/* Allocate the guest's logical partition ID */
 
@@ -3373,13 +3396,30 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 		lpcr |= LPCR_HVICE;
 	}
 
+	/*
+	 * For now, if the host uses radix, the guest must be radix.
+	 */
+	if (radix_enabled()) {
+		kvm->arch.radix = 1;
+		lpcr &= ~LPCR_VPM1;
+		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+		ret = kvmppc_init_vm_radix(kvm);
+		if (ret) {
+			kvmppc_free_lpid(kvm->arch.lpid);
+			return ret;
+		}
+		kvmppc_setup_partition_table(kvm);
+	}
+
 	kvm->arch.lpcr = lpcr;
 
 	/*
 	 * Work out how many sets the TLB has, for the use of
 	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
 	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
+	if (kvm_is_radix(kvm))
+		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
+	else if (cpu_has_feature(CPU_FTR_ARCH_300))
 		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
 	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
 		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
@@ -3389,8 +3429,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	/*
 	 * Track that we now have a HV mode VM active. This blocks secondary
 	 * CPU threads from coming online.
+	 * On POWER9, we only need to do this for HPT guests on a radix
+	 * host, which is not yet supported.
 	 */
-	kvm_hv_vm_activated();
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm_hv_vm_activated();
 
 	/*
 	 * Create a debugfs directory for the VM
@@ -3416,10 +3459,13 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
 	debugfs_remove_recursive(kvm->arch.debugfs_dir);
 
-	kvm_hv_vm_deactivated();
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm_hv_vm_deactivated();
 
 	kvmppc_free_vcores(kvm);
 
+	kvmppc_free_lpid(kvm->arch.lpid);
+
 	if (kvm_is_radix(kvm))
 		kvmppc_free_radix(kvm);
 	else
@@ -3452,11 +3498,6 @@ static int kvmppc_core_check_processor_compat_hv(void)
 	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
 	    !cpu_has_feature(CPU_FTR_ARCH_206))
 		return -EIO;
-	/*
-	 * Disable KVM for Power9 in radix mode.
-	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
-		return -EIO;
 
 	return 0;
 }
@@ -3727,6 +3768,7 @@ static void init_default_hcalls(void)
 static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 {
 	unsigned long lpcr;
+	int radix;
 
 	/* If not on a POWER9, reject it */
 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
@@ -3736,12 +3778,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
 		return -EINVAL;
 
-	/* We can't do radix yet */
-	if (cfg->flags & KVM_PPC_MMUV3_RADIX)
+	/* We can't change a guest to/from radix yet */
+	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
+	if (radix != kvm_is_radix(kvm))
 		return -EINVAL;
 
 	/* GR (guest radix) bit in process_table field must match */
-	if (cfg->process_table & PATB_GR)
+	if (!!(cfg->process_table & PATB_GR) != radix)
 		return -EINVAL;
 
 	/* Process table size field must be reasonable, i.e. <= 24 */
@@ -3757,11 +3800,6 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 	return 0;
 }
 
-static int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
-{
-	return -EINVAL;
-}
-
 static struct kvmppc_ops kvm_ops_hv = {
 	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
 	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1476a480745e..40a5b2d75ed1 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -566,7 +566,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = kvmppc_hwrng_present();
 		break;
 	case KVM_CAP_PPC_MMU_RADIX:
-		r = !!(0 && hv_enabled && radix_enabled());
+		r = !!(hv_enabled && radix_enabled());
 		break;
 	case KVM_CAP_PPC_MMU_HASH_V3:
 		r = !!(hv_enabled && !radix_enabled() &&
-- 
cgit v1.2.3


From 0b1c764339ef35dccaa49d8466ec2bb08362c233 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 2 Feb 2017 16:21:44 +1100
Subject: powerpc/powernv: Fix section mismatch from opal_lpc_init()

opal_lpc_init() is called from an __init routine, and calls other __init
routines, so should also be __init, init?

Fixes: 023b13a50183 ("powerpc/powernv: Add support for direct mapped LPC on POWER9")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-lpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index 1a8cd54c1e74..399908bd9954 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -387,7 +387,7 @@ static int opal_lpc_init_debugfs(void)
 machine_device_initcall(powernv, opal_lpc_init_debugfs);
 #endif  /* CONFIG_DEBUG_FS */
 
-void opal_lpc_init(void)
+void __init opal_lpc_init(void)
 {
 	struct device_node *np;
 
-- 
cgit v1.2.3


From f84ed59a612d866cde0bd17ad2a52acb524d44c9 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Mon, 30 Jan 2017 17:41:53 +1100
Subject: powerpc/sparse: Constify the address pointer in __get_user_check()

In __get_user_check, we create an intermediate pointer for the
user address we're about to fetch. We currently don't tag this
pointer as const. Make it const, as we are simply dereferencing
it, and its scope is limited to the __get_user_check macro.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index a15d84d59356..71d81cbe3781 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -274,7 +274,7 @@ do {								\
 ({									\
 	long __gu_err = -EFAULT;					\
 	unsigned long  __gu_val = 0;					\
-	__typeof__(*(ptr)) __user *__gu_addr = (ptr);		\
+	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);		\
 	might_fault();							\
 	if (access_ok(VERIFY_READ, __gu_addr, (size)))			\
 		__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
-- 
cgit v1.2.3


From d466f6c5cac17e0c9f22bd4250020bf885049db7 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Mon, 30 Jan 2017 17:41:54 +1100
Subject: powerpc/sparse: Constify the address pointer in __get_user_nocheck()

In __get_user_nocheck, we create an intermediate pointer for the
user address we're about to fetch. We currently don't tag this
pointer as const. Make it const, as we are simply dereferencing
it, and its scope is limited to the __get_user_nocheck macro.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 71d81cbe3781..44ded4193001 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -261,7 +261,7 @@ do {								\
 ({								\
 	long __gu_err;						\
 	unsigned long __gu_val;					\
-	__typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
+	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
 	__chk_user_ptr(ptr);					\
 	if (!is_kernel_addr((unsigned long)__gu_addr))		\
 		might_fault();					\
-- 
cgit v1.2.3


From f2ca809059294b27703d709a3c4218197c5f16dc Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Mon, 30 Jan 2017 17:41:55 +1100
Subject: powerpc/sparse: Constify the address pointer in __get_user_nosleep()

In __get_user_nosleep, we create an intermediate pointer for the
user address we're about to fetch. We currently don't tag this
pointer as const. Make it const, as we are simply dereferencing
it, and its scope is limited to the __get_user_nosleep macro.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 44ded4193001..0e6add3187bc 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -286,7 +286,7 @@ do {								\
 ({								\
 	long __gu_err;						\
 	unsigned long __gu_val;					\
-	__typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
+	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
 	__chk_user_ptr(ptr);					\
 	__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
-- 
cgit v1.2.3


From d6c569b99558b219fcf0ce0d3af8ec8f077ba924 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 12 Jan 2017 21:17:33 +1100
Subject: powerpc/64: Move HAVE_CONTEXT_TRACKING from pseries to common Kconfig

We added support for HAVE_CONTEXT_TRACKING, but placed the option inside
PPC_PSERIES.

This has the undesirable effect that NO_HZ_FULL can be enabled on a
kernel with both powernv and pseries support, but cannot on a kernel
with powernv only support.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig                   | 1 +
 arch/powerpc/platforms/pseries/Kconfig | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e022859340b7..33f5b8380a7d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -165,6 +165,7 @@ config PPC
 	select HAVE_ARCH_HARDENED_USERCOPY
 	select HAVE_KERNEL_GZIP
 	select HAVE_CC_STACKPROTECTOR
+	select HAVE_CONTEXT_TRACKING if PPC64
 
 config GENERIC_CSUM
 	def_bool n
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index e1c280a95d58..30ec04f1c67c 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -17,7 +17,6 @@ config PPC_PSERIES
 	select PPC_UDBG_16550
 	select PPC_NATIVE
 	select PPC_DOORBELL
-	select HAVE_CONTEXT_TRACKING
 	select HOTPLUG_CPU if SMP
 	select ARCH_RANDOM
 	select PPC_DOORBELL
-- 
cgit v1.2.3


From 1925febe4bc4a6f3abc3b2e322f39348d090509c Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 12 Jan 2017 21:17:34 +1100
Subject: powerpc/64: Add BPF_JIT to powernv and pseries defconfigs

Commit db9112173b18 ("powerpc: Turn on BPF_JIT in ppc64_defconfig")
only added BPF_JIT to the ppc64 defconfig. Add it to our powernv
and pseries defconfigs too.

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/powernv_defconfig | 1 +
 arch/powerpc/configs/pseries_defconfig | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index e4d53fe5976a..b793550fac91 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -79,6 +79,7 @@ CONFIG_NETFILTER=y
 # CONFIG_NETFILTER_ADVANCED is not set
 CONFIG_BRIDGE=m
 CONFIG_VLAN_8021Q=m
+CONFIG_BPF_JIT=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 5a06bdde1674..d99734f3b868 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -82,6 +82,7 @@ CONFIG_NETFILTER=y
 # CONFIG_NETFILTER_ADVANCED is not set
 CONFIG_BRIDGE=m
 CONFIG_VLAN_8021Q=m
+CONFIG_BPF_JIT=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
-- 
cgit v1.2.3


From 1c877f71b7b9c0a5144e29d599eac2c62c91070c Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 1 Feb 2017 13:21:22 +1100
Subject: powerpc/debug: PTDUMP should depend on DEBUG_FS

CONFIG_PPC_PTDUMP currently selects CONFIG_DEBUG_FS. But CONFIG_DEBUG_FS
is user-selectable, so we shouldn't select it. Instead depend on it.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig.debug | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 949258d412d0..c86df246339e 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -356,8 +356,7 @@ config FAIL_IOMMU
 
 config PPC_PTDUMP
         bool "Export kernel pagetable layout to userspace via debugfs"
-        depends on DEBUG_KERNEL
-        select DEBUG_FS
+        depends on DEBUG_KERNEL && DEBUG_FS
         help
 	  This option exports the state of the kernel pagetables to a
 	  debugfs file. This is only useful for kernel developers who are
-- 
cgit v1.2.3


From 4eb43875a1859b8d4fb6c56a441a18bcf5957413 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 1 Feb 2017 16:59:35 +1100
Subject: powerpc/boot: Update .gitignore

Add a few things that have been missed from .gitignore over the years.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/boot/.gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/boot/.gitignore b/arch/powerpc/boot/.gitignore
index d61c03525777..84774ccba1c2 100644
--- a/arch/powerpc/boot/.gitignore
+++ b/arch/powerpc/boot/.gitignore
@@ -1,4 +1,5 @@
 addnote
+decompress_inflate.c
 empty.c
 hack-coff
 inffast.c
@@ -13,11 +14,13 @@ infutil.h
 kernel-vmlinux.strip.c
 kernel-vmlinux.strip.gz
 mktree
+otheros.bld
 uImage
 cuImage.*
 dtbImage.*
 *.dtb
 treeImage.*
+vmlinux.strip
 zImage
 zImage.initrd
 zImage.bin.*
@@ -26,6 +29,7 @@ zImage.coff
 zImage.epapr
 zImage.holly
 zImage.*lds
+zImage.maple
 zImage.miboot
 zImage.pmac
 zImage.pseries
-- 
cgit v1.2.3


From e71ff89c712cb387914abff373ac830d6298b012 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Date: Thu, 5 Jan 2017 16:38:15 +0530
Subject: powerpc/xmon: Cleanup to use is_kernel_addr macro

Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 9c0e17cf6886..a44b049b9cf6 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1403,7 +1403,7 @@ static void xmon_show_stack(unsigned long sp, unsigned long lr,
 	struct pt_regs regs;
 
 	while (max_to_print--) {
-		if (sp < PAGE_OFFSET) {
+		if (!is_kernel_addr(sp)) {
 			if (sp != 0)
 				printf("SP (%lx) is in userspace\n", sp);
 			break;
@@ -1431,12 +1431,12 @@ static void xmon_show_stack(unsigned long sp, unsigned long lr,
 				mread(newsp + LRSAVE_OFFSET, &nextip,
 				      sizeof(unsigned long));
 			if (lr == ip) {
-				if (lr < PAGE_OFFSET
+				if (!is_kernel_addr(lr)
 				    || (fnstart <= lr && lr < fnend))
 					printip = 0;
 			} else if (lr == nextip) {
 				printip = 0;
-			} else if (lr >= PAGE_OFFSET
+			} else if (is_kernel_addr(lr)
 				   && !(fnstart <= lr && lr < fnend)) {
 				printf("[link register   ] ");
 				xmon_print_symbol(lr, " ", "\n");
@@ -1496,7 +1496,7 @@ static void print_bug_trap(struct pt_regs *regs)
 	if (regs->msr & MSR_PR)
 		return;		/* not in kernel */
 	addr = regs->nip;	/* address of trap instruction */
-	if (addr < PAGE_OFFSET)
+	if (!is_kernel_addr(addr))
 		return;
 	bug = find_bug(regs->nip);
 	if (bug == NULL)
-- 
cgit v1.2.3


From c21f515c743687c6c2b3d38227e6ad8e6b733409 Mon Sep 17 00:00:00 2001
From: John Allen <jallen@linux.vnet.ibm.com>
Date: Fri, 6 Jan 2017 13:25:53 -0600
Subject: powerpc/pseries: Make the acquire/release of the drc for memory a
 separate step

When adding and removing LMBs we should make the acquire/release of
the DRC a separate step to allow for a few improvements. First
this will ensure that LMBs removed during a remove by count operation
are all available if an error occurs and we need to add them back. By
first removing all the LMBs from the kernel before releasing their
DRCs the LMBs are available to add back should an error occur.

Also, this will allow for faster re-add operations of memory for
PRRN event handling since we can skip the unneeded step of having
to release the DRC and then acquire it back.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/hotplug-memory.c | 34 ++++++++++++++++---------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2617f9f356bd..be11fc3cdeb0 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -446,9 +446,7 @@ static int dlpar_remove_lmb(struct of_drconf_cell *lmb)
 	/* Update memory regions for memory remove */
 	memblock_remove(lmb->base_addr, block_sz);
 
-	dlpar_release_drc(lmb->drc_index);
 	dlpar_remove_device_tree_lmb(lmb);
-
 	return 0;
 }
 
@@ -516,6 +514,7 @@ static int dlpar_memory_remove_by_count(u32 lmbs_to_remove,
 			if (!lmbs[i].reserved)
 				continue;
 
+			dlpar_release_drc(lmbs[i].drc_index);
 			pr_info("Memory at %llx was hot-removed\n",
 				lmbs[i].base_addr);
 
@@ -545,6 +544,9 @@ static int dlpar_memory_remove_by_index(u32 drc_index, struct property *prop)
 		if (lmbs[i].drc_index == drc_index) {
 			lmb_found = 1;
 			rc = dlpar_remove_lmb(&lmbs[i]);
+			if (!rc)
+				dlpar_release_drc(lmbs[i].drc_index);
+
 			break;
 		}
 	}
@@ -599,10 +601,6 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb)
 	if (lmb->flags & DRCONF_MEM_ASSIGNED)
 		return -EINVAL;
 
-	rc = dlpar_acquire_drc(lmb->drc_index);
-	if (rc)
-		return rc;
-
 	rc = dlpar_add_device_tree_lmb(lmb);
 	if (rc) {
 		pr_err("Couldn't update device tree for drc index %x\n",
@@ -618,12 +616,10 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb)
 
 	/* Add the memory */
 	rc = add_memory(nid, lmb->base_addr, block_sz);
-	if (rc) {
+	if (rc)
 		dlpar_remove_device_tree_lmb(lmb);
-		dlpar_release_drc(lmb->drc_index);
-	} else {
+	else
 		lmb->flags |= DRCONF_MEM_ASSIGNED;
-	}
 
 	return rc;
 }
@@ -655,10 +651,16 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop)
 		return -EINVAL;
 
 	for (i = 0; i < num_lmbs && lmbs_to_add != lmbs_added; i++) {
-		rc = dlpar_add_lmb(&lmbs[i]);
+		rc = dlpar_acquire_drc(lmbs[i].drc_index);
 		if (rc)
 			continue;
 
+		rc = dlpar_add_lmb(&lmbs[i]);
+		if (rc) {
+			dlpar_release_drc(lmbs[i].drc_index);
+			continue;
+		}
+
 		lmbs_added++;
 
 		/* Mark this lmb so we can remove it later if all of the
@@ -678,6 +680,8 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop)
 			if (rc)
 				pr_err("Failed to remove LMB, drc index %x\n",
 				       be32_to_cpu(lmbs[i].drc_index));
+			else
+				dlpar_release_drc(lmbs[i].drc_index);
 		}
 		rc = -EINVAL;
 	} else {
@@ -711,7 +715,13 @@ static int dlpar_memory_add_by_index(u32 drc_index, struct property *prop)
 	for (i = 0; i < num_lmbs; i++) {
 		if (lmbs[i].drc_index == drc_index) {
 			lmb_found = 1;
-			rc = dlpar_add_lmb(&lmbs[i]);
+			rc = dlpar_acquire_drc(lmbs[i].drc_index);
+			if (!rc) {
+				rc = dlpar_add_lmb(&lmbs[i]);
+				if (rc)
+					dlpar_release_drc(lmbs[i].drc_index);
+			}
+
 			break;
 		}
 	}
-- 
cgit v1.2.3


From e70d59700fc32c9249b26acd4120303c497e84f1 Mon Sep 17 00:00:00 2001
From: John Allen <jallen@linux.vnet.ibm.com>
Date: Fri, 6 Jan 2017 13:27:26 -0600
Subject: powerpc/pseries: Introduce memory hotplug READD operation

Currently, memory must be hot removed and subsequently re-added in order
to dynamically update the affinity of LMBs specified by a PRRN event.
Earlier implementations of the PRRN event handler ran into issues in which
the hot remove would occur successfully, but a hotplug event would be
initiated from another source and grab the hotplug lock preventing the hot
add from occurring. To prevent this situation, this patch introduces the
notion of a hot "readd" action for memory which atomizes a hot remove and
a hot add into a single, serialized operation on the hotplug queue.

Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
Reviewed-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/rtas.h                 |  1 +
 arch/powerpc/platforms/pseries/hotplug-memory.c | 41 +++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 9c23baa10b81..076b89247ab5 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -318,6 +318,7 @@ struct pseries_hp_errorlog {
 
 #define PSERIES_HP_ELOG_ACTION_ADD	1
 #define PSERIES_HP_ELOG_ACTION_REMOVE	2
+#define PSERIES_HP_ELOG_ACTION_READD	3
 
 #define PSERIES_HP_ELOG_ID_DRC_NAME	1
 #define PSERIES_HP_ELOG_ID_DRC_INDEX	2
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index be11fc3cdeb0..3381c20edbc0 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -563,6 +563,44 @@ static int dlpar_memory_remove_by_index(u32 drc_index, struct property *prop)
 	return rc;
 }
 
+static int dlpar_memory_readd_by_index(u32 drc_index, struct property *prop)
+{
+	struct of_drconf_cell *lmbs;
+	u32 num_lmbs, *p;
+	int lmb_found;
+	int i, rc;
+
+	pr_info("Attempting to update LMB, drc index %x\n", drc_index);
+
+	p = prop->value;
+	num_lmbs = *p++;
+	lmbs = (struct of_drconf_cell *)p;
+
+	lmb_found = 0;
+	for (i = 0; i < num_lmbs; i++) {
+		if (lmbs[i].drc_index == drc_index) {
+			lmb_found = 1;
+			rc = dlpar_remove_lmb(&lmbs[i]);
+			if (!rc) {
+				rc = dlpar_add_lmb(&lmbs[i]);
+				if (rc)
+					dlpar_release_drc(lmbs[i].drc_index);
+			}
+			break;
+		}
+	}
+
+	if (!lmb_found)
+		rc = -EINVAL;
+
+	if (rc)
+		pr_info("Failed to update memory at %llx\n",
+			lmbs[i].base_addr);
+	else
+		pr_info("Memory at %llx was updated\n", lmbs[i].base_addr);
+
+	return rc;
+}
 #else
 static inline int pseries_remove_memblock(unsigned long base,
 					  unsigned int memblock_size)
@@ -779,6 +817,9 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
 		else
 			rc = -EINVAL;
 		break;
+	case PSERIES_HP_ELOG_ACTION_READD:
+		rc = dlpar_memory_readd_by_index(drc_index, prop);
+		break;
 	default:
 		pr_err("Invalid action (%d) specified\n", hp_elog->action);
 		rc = -EINVAL;
-- 
cgit v1.2.3


From 675d8ee685195249d1b35e12a29f614a9888b16d Mon Sep 17 00:00:00 2001
From: John Allen <jallen@linux.vnet.ibm.com>
Date: Fri, 6 Jan 2017 13:28:54 -0600
Subject: powerpc/pseries: Update affinity for memory and cpus specified in a
 PRRN event

Extend the existing PRRN infrastructure to perform the actual affinity
updating for cpus and memory in addition to the device tree updating.
For cpus, dynamic affinity updating already appears to exist in the
kernel in the form of arch_update_cpu_topology(). For memory, we must
place a READD operation on the hotplug queue for any phandle included in
the PRRN event that is determined to be an LMB.

Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
Reviewed-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/rtasd.c               |  7 ++++++-
 arch/powerpc/platforms/pseries/mobility.c | 34 +++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 2bf1f9b5b34b..3650732639ed 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -21,6 +21,7 @@
 #include <linux/cpu.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
+#include <linux/topology.h>
 
 #include <linux/uaccess.h>
 #include <asm/io.h>
@@ -282,6 +283,7 @@ static void prrn_work_fn(struct work_struct *work)
 	 * the RTAS event.
 	 */
 	pseries_devicetree_update(-prrn_update_scope);
+	arch_update_cpu_topology();
 }
 
 static DECLARE_WORK(prrn_work, prrn_work_fn);
@@ -434,7 +436,10 @@ static void do_event_scan(void)
 		}
 
 		if (error == 0) {
-			pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+			if (rtas_error_type((struct rtas_error_log *)logdata) !=
+			    RTAS_TYPE_PRRN)
+				pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG,
+						  0);
 			handle_rtas_event((struct rtas_error_log *)logdata);
 		}
 
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index a560a98bcf3b..5a0c7ba429ce 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -39,6 +39,7 @@ struct update_props_workarea {
 #define ADD_DT_NODE	0x03000000
 
 #define MIGRATION_SCOPE	(1)
+#define PRRN_SCOPE -2
 
 static int mobility_rtas_call(int token, char *buf, s32 scope)
 {
@@ -236,6 +237,35 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index)
 	return rc;
 }
 
+static void prrn_update_node(__be32 phandle)
+{
+	struct pseries_hp_errorlog *hp_elog;
+	struct device_node *dn;
+
+	/*
+	 * If a node is found from the given phandle, the phandle does not
+	 * represent the drc index of an LMB and we can ignore.
+	 */
+	dn = of_find_node_by_phandle(be32_to_cpu(phandle));
+	if (dn) {
+		of_node_put(dn);
+		return;
+	}
+
+	hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL);
+	if(!hp_elog)
+		return;
+
+	hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM;
+	hp_elog->action = PSERIES_HP_ELOG_ACTION_READD;
+	hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX;
+	hp_elog->_drc_u.drc_index = phandle;
+
+	queue_hotplug_event(hp_elog, NULL, NULL);
+
+	kfree(hp_elog);
+}
+
 int pseries_devicetree_update(s32 scope)
 {
 	char *rtas_buf;
@@ -274,6 +304,10 @@ int pseries_devicetree_update(s32 scope)
 					break;
 				case UPDATE_DT_NODE:
 					update_dt_node(phandle, scope);
+
+					if (scope == PRRN_SCOPE)
+						prrn_update_node(phandle);
+
 					break;
 				case ADD_DT_NODE:
 					drc_index = *data++;
-- 
cgit v1.2.3


From 673bc4354d42731018494bb69d63b6513f9ae2bb Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 11 Jan 2017 12:00:58 -0500
Subject: powerpc/pseries: Report DLPAR capabilities

As we add the ability to do DLPAR of additional devices through
the sysfs interface we need to know which devices are supported.
This adds the reporting of supported devices with a comma separated
list reported in the existing /sys/kernel/dlpar.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/dlpar.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 5cb2e4beffc5..d3a81e746fc4 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -551,7 +551,13 @@ dlpar_store_out:
 	return rc ? rc : count;
 }
 
-static CLASS_ATTR(dlpar, S_IWUSR, NULL, dlpar_store);
+static ssize_t dlpar_show(struct class *class, struct class_attribute *attr,
+			  char *buf)
+{
+	return sprintf(buf, "%s\n", "memory,cpu");
+}
+
+static CLASS_ATTR(dlpar, S_IWUSR | S_IRUSR, dlpar_show, dlpar_store);
 
 static int __init pseries_dlpar_init(void)
 {
-- 
cgit v1.2.3


From 39d40871526627fd0e2cfc1e2fb88500a5049c4c Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Wed, 1 Feb 2017 14:22:07 +1100
Subject: cxl: Fix build when CONFIG_DEBUG_FS=n

Stub out the debugfs functions so that the build doesn't break when
CONFIG_DEBUG_FS=n.

Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/Makefile |  3 ++-
 drivers/misc/cxl/cxl.h    | 59 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile
index 56e9a4732ef0..c14fd6b65b5a 100644
--- a/drivers/misc/cxl/Makefile
+++ b/drivers/misc/cxl/Makefile
@@ -2,9 +2,10 @@ ccflags-y			:= $(call cc-disable-warning, unused-const-variable)
 ccflags-$(CONFIG_PPC_WERROR)	+= -Werror
 
 cxl-y				+= main.o file.o irq.o fault.o native.o
-cxl-y				+= context.o sysfs.o debugfs.o pci.o trace.o
+cxl-y				+= context.o sysfs.o pci.o trace.o
 cxl-y				+= vphb.o phb.o api.o
 cxl-$(CONFIG_PPC_PSERIES)	+= flash.o guest.o of.o hcalls.o
+cxl-$(CONFIG_DEBUG_FS)		+= debugfs.o
 obj-$(CONFIG_CXL)		+= cxl.o
 obj-$(CONFIG_CXL_BASE)		+= base.o
 
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index b4a43fd14b99..6c722d96b775 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -802,12 +802,67 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count);
 void afu_release_irqs(struct cxl_context *ctx, void *cookie);
 void afu_irq_name_free(struct cxl_context *ctx);
 
+#ifdef CONFIG_DEBUG_FS
+
 int cxl_debugfs_init(void);
 void cxl_debugfs_exit(void);
 int cxl_debugfs_adapter_add(struct cxl *adapter);
 void cxl_debugfs_adapter_remove(struct cxl *adapter);
 int cxl_debugfs_afu_add(struct cxl_afu *afu);
 void cxl_debugfs_afu_remove(struct cxl_afu *afu);
+void cxl_stop_trace(struct cxl *cxl);
+void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir);
+
+#else /* CONFIG_DEBUG_FS */
+
+static inline int __init cxl_debugfs_init(void)
+{
+	return 0;
+}
+
+static inline void cxl_debugfs_exit(void)
+{
+}
+
+static inline int cxl_debugfs_adapter_add(struct cxl *adapter)
+{
+	return 0;
+}
+
+static inline void cxl_debugfs_adapter_remove(struct cxl *adapter)
+{
+}
+
+static inline int cxl_debugfs_afu_add(struct cxl_afu *afu)
+{
+	return 0;
+}
+
+static inline void cxl_debugfs_afu_remove(struct cxl_afu *afu)
+{
+}
+
+static inline void cxl_stop_trace(struct cxl *cxl)
+{
+}
+
+static inline void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter,
+						    struct dentry *dir)
+{
+}
+
+static inline void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter,
+						    struct dentry *dir)
+{
+}
+
+static inline void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
 
 void cxl_handle_fault(struct work_struct *work);
 void cxl_prefault(struct cxl_context *ctx, u64 wed);
@@ -872,12 +927,8 @@ int cxl_data_cache_flush(struct cxl *adapter);
 int cxl_afu_disable(struct cxl_afu *afu);
 int cxl_psl_purge(struct cxl_afu *afu);
 
-void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir);
-void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir);
-void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir);
 void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx);
 void cxl_native_err_irq_dump_regs(struct cxl *adapter);
-void cxl_stop_trace(struct cxl *cxl);
 int cxl_pci_vphb_add(struct cxl_afu *afu);
 void cxl_pci_vphb_remove(struct cxl_afu *afu);
 void cxl_release_mapping(struct cxl_context *ctx);
-- 
cgit v1.2.3


From 7b4010edff09929c253e6626ab19cade9e250505 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Tue, 6 Dec 2016 17:27:58 +1100
Subject: gcc-plugins: Fix definition of DISABLE_LATENT_ENTROPY_PLUGIN

The variable DISABLE_LATENT_ENTROPY_PLUGIN is defined when
CONFIG_PAX_LATENT_ENTROPY is set. This is leftover from the original PaX
version of the plugin code and doesn't actually exist. Change the condition
to depend on CONFIG_GCC_PLUGIN_LATENT_ENTROPY instead.

Fixes: 38addce8b600 ("gcc-plugins: Add latent_entropy plugin")
Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 scripts/Makefile.gcc-plugins | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
index 060d2cb373db..26c67b735dc9 100644
--- a/scripts/Makefile.gcc-plugins
+++ b/scripts/Makefile.gcc-plugins
@@ -8,7 +8,7 @@ ifdef CONFIG_GCC_PLUGINS
 
   gcc-plugin-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY)	+= latent_entropy_plugin.so
   gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY)	+= -DLATENT_ENTROPY_PLUGIN
-  ifdef CONFIG_PAX_LATENT_ENTROPY
+  ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
     DISABLE_LATENT_ENTROPY_PLUGIN			+= -fplugin-arg-latent_entropy_plugin-disable
   endif
 
-- 
cgit v1.2.3


From eac6f8b0c7adb003776dbad9d037ee2fc64f9d62 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Tue, 6 Dec 2016 17:27:59 +1100
Subject: powerpc: Correctly disable latent entropy GCC plugin on prom_init.o

Commit 38addce8b600 ("gcc-plugins: Add latent_entropy plugin") excludes
certain powerpc early boot code from the latent entropy plugin by adding
appropriate CFLAGS. It looks like this was supposed to cover
prom_init.o, but ended up saying init.o (which doesn't exist) instead.
Fix the typo.

Fixes: 38addce8b600 ("gcc-plugins: Add latent_entropy plugin")
Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 23f8082d7bfa..f4898e6ad18d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -15,7 +15,7 @@ CFLAGS_btext.o		+= -fPIC
 endif
 
 CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
-CFLAGS_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
+CFLAGS_prom_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
 CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
 CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
 
-- 
cgit v1.2.3


From 65c059bcaa73197ca71e8d4cc9a6c903560506c1 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Tue, 6 Dec 2016 17:28:00 +1100
Subject: powerpc: Enable support for GCC plugins

Enable support for GCC plugins on powerpc.

Add an additional version check in gcc-plugins-check to advise users to
upgrade to gcc 5.2+ on powerpc to avoid issues with header files (gcc <=
4.6) or missing copies of rs6000-cpus.def (4.8 to 5.1 on 64-bit
targets).

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig         | 1 +
 scripts/Makefile.gcc-plugins | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 33f5b8380a7d..bfdd80e7754c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -93,6 +93,7 @@ config PPC
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_GCC_PLUGINS
 	select SYSCTL_EXCEPTION_TRACE
 	select VIRT_TO_BUS if !PPC64
 	select HAVE_IDE
diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
index 26c67b735dc9..9835a757d52a 100644
--- a/scripts/Makefile.gcc-plugins
+++ b/scripts/Makefile.gcc-plugins
@@ -47,6 +47,14 @@ gcc-plugins-check: FORCE
 ifdef CONFIG_GCC_PLUGINS
   ifeq ($(PLUGINCC),)
     ifneq ($(GCC_PLUGINS_CFLAGS),)
+      # Various gccs between 4.5 and 5.1 have bugs on powerpc due to missing
+      # header files. gcc <= 4.6 doesn't work at all, gccs from 4.8 to 5.1 have
+      # issues with 64-bit targets.
+      ifeq ($(ARCH),powerpc)
+        ifeq ($(call cc-ifversion, -le, 0501, y), y)
+	  @echo "Cannot use CONFIG_GCC_PLUGINS: plugin support on gcc <= 5.1 is buggy on powerpc, please upgrade to gcc 5.2 or newer" >&2 && exit 1
+        endif
+      endif
       ifeq ($(call cc-ifversion, -ge, 0405, y), y)
 	$(Q)$(srctree)/scripts/gcc-plugin.sh --show-error "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)" || true
 	@echo "Cannot use CONFIG_GCC_PLUGINS: your gcc installation does not support plugins, perhaps the necessary headers are missing?" >&2 && exit 1
-- 
cgit v1.2.3


From 2a196e24b39aa85351ecd9eb7cf511914157f14b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 8 Jan 2017 17:31:42 -0600
Subject: powerpc: Move ARCH_DLINFO out of uapi

It's a kernel-private macro, so it doesn't belong there.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/elf.h      | 22 ++++++++++++++++++++++
 arch/powerpc/include/uapi/asm/elf.h | 23 -----------------------
 2 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
index ee46ffef608e..730c27ed10e1 100644
--- a/arch/powerpc/include/asm/elf.h
+++ b/arch/powerpc/include/asm/elf.h
@@ -136,4 +136,26 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 
 #endif /* CONFIG_SPU_BASE */
 
+/*
+ * The requirements here are:
+ * - keep the final alignment of sp (sp & 0xf)
+ * - make sure the 32-bit value at the first 16 byte aligned position of
+ *   AUXV is greater than 16 for glibc compatibility.
+ *   AT_IGNOREPPC is used for that.
+ * - for compatibility with glibc ARCH_DLINFO must always be defined on PPC,
+ *   even if DLINFO_ARCH_ITEMS goes to zero or is undefined.
+ * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes
+ */
+#define ARCH_DLINFO							\
+do {									\
+	/* Handle glibc compatibility. */				\
+	NEW_AUX_ENT(AT_IGNOREPPC, AT_IGNOREPPC);			\
+	NEW_AUX_ENT(AT_IGNOREPPC, AT_IGNOREPPC);			\
+	/* Cache size items */						\
+	NEW_AUX_ENT(AT_DCACHEBSIZE, dcache_bsize);			\
+	NEW_AUX_ENT(AT_ICACHEBSIZE, icache_bsize);			\
+	NEW_AUX_ENT(AT_UCACHEBSIZE, ucache_bsize);			\
+	VDSO_AUX_ENT(AT_SYSINFO_EHDR, current->mm->context.vdso_base);	\
+} while (0)
+
 #endif /* _ASM_POWERPC_ELF_H */
diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h
index 3a9e44c45c78..b2c6fdd5ac30 100644
--- a/arch/powerpc/include/uapi/asm/elf.h
+++ b/arch/powerpc/include/uapi/asm/elf.h
@@ -162,29 +162,6 @@ typedef elf_vrreg_t elf_vrregset_t32[ELF_NVRREG32];
 typedef elf_fpreg_t elf_vsrreghalf_t32[ELF_NVSRHALFREG];
 #endif
 
-
-/*
- * The requirements here are:
- * - keep the final alignment of sp (sp & 0xf)
- * - make sure the 32-bit value at the first 16 byte aligned position of
- *   AUXV is greater than 16 for glibc compatibility.
- *   AT_IGNOREPPC is used for that.
- * - for compatibility with glibc ARCH_DLINFO must always be defined on PPC,
- *   even if DLINFO_ARCH_ITEMS goes to zero or is undefined.
- * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes
- */
-#define ARCH_DLINFO							\
-do {									\
-	/* Handle glibc compatibility. */				\
-	NEW_AUX_ENT(AT_IGNOREPPC, AT_IGNOREPPC);			\
-	NEW_AUX_ENT(AT_IGNOREPPC, AT_IGNOREPPC);			\
-	/* Cache size items */						\
-	NEW_AUX_ENT(AT_DCACHEBSIZE, dcache_bsize);			\
-	NEW_AUX_ENT(AT_ICACHEBSIZE, icache_bsize);			\
-	NEW_AUX_ENT(AT_UCACHEBSIZE, ucache_bsize);			\
-	VDSO_AUX_ENT(AT_SYSINFO_EHDR, current->mm->context.vdso_base);	\
-} while (0)
-
 /* PowerPC64 relocations defined by the ABIs */
 #define R_PPC64_NONE    R_PPC_NONE
 #define R_PPC64_ADDR32  R_PPC_ADDR32  /* 32bit absolute address.  */
-- 
cgit v1.2.3


From 33ec723cac63529e5d0efa1125c893d2049c023d Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 8 Jan 2017 17:31:43 -0600
Subject: powerpc: Move {d,i,u}cache_bsize definitions to a common place

The variables are defined twice, in setup_32.c and setup_64.c. Define them
once in setup-common.c instead.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/setup-common.c | 9 +++++++++
 arch/powerpc/kernel/setup_32.c     | 8 --------
 arch/powerpc/kernel/setup_64.c     | 8 --------
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index f516ac508ae3..4697da895133 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -87,6 +87,15 @@ EXPORT_SYMBOL(machine_id);
 int boot_cpuid = -1;
 EXPORT_SYMBOL_GPL(boot_cpuid);
 
+/*
+ * These are used in binfmt_elf.c to put aux entries on the stack
+ * for each elf executable being started.
+ */
+int dcache_bsize;
+int icache_bsize;
+int ucache_bsize;
+
+
 unsigned long klimit = (unsigned long) _end;
 
 /*
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 7fcf1f7f01c1..2f88f6cf1a42 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -58,14 +58,6 @@ EXPORT_SYMBOL(ISA_DMA_THRESHOLD);
 EXPORT_SYMBOL(DMA_MODE_READ);
 EXPORT_SYMBOL(DMA_MODE_WRITE);
 
-/*
- * These are used in binfmt_elf.c to put aux entries on the stack
- * for each elf executable being started.
- */
-int dcache_bsize;
-int icache_bsize;
-int ucache_bsize;
-
 /*
  * We're called here very early in the boot.
  *
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 6824157e4d2e..ed3362bc9a2a 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -88,14 +88,6 @@ struct ppc64_caches ppc64_caches = {
 };
 EXPORT_SYMBOL_GPL(ppc64_caches);
 
-/*
- * These are used in binfmt_elf.c to put aux entries on the stack
- * for each elf executable being started.
- */
-int dcache_bsize;
-int icache_bsize;
-int ucache_bsize;
-
 #if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
 void __init setup_tlb_core_data(void)
 {
-- 
cgit v1.2.3


From f9e473f1aa7597affff87bc6a599cf0aa389f0c1 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 8 Jan 2017 17:31:44 -0600
Subject: powerpc: Remove obsolete comment about patching instructions

We don't patch instructions based on the cache lines or block
sizes these days.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/setup_64.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index ed3362bc9a2a..ae84d345c13c 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -77,9 +77,6 @@
 int spinning_secondaries;
 u64 ppc64_pft_size;
 
-/* Pick defaults since we might want to patch instructions
- * before we've read this from the device tree.
- */
 struct ppc64_caches ppc64_caches = {
 	.dline_size = 0x40,
 	.log_dline_size = 6,
-- 
cgit v1.2.3


From bd067f83b0840e798328d14133ce4542d3bf9e71 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 8 Jan 2017 17:31:45 -0600
Subject: powerpc/64: Fix naming of cache block vs. cache line

In a number of places we called "cache line size" what is actually
the cache block size, which in the powerpc architecture, means the
effective size to use with cache management instructions (it can
be different from the actual cache line size).

We fix the naming across the board and properly retrieve both
pieces of information when available in the device-tree.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cache.h   | 12 ++++---
 arch/powerpc/include/asm/page_64.h |  4 +--
 arch/powerpc/kernel/align.c        |  2 +-
 arch/powerpc/kernel/asm-offsets.c  | 12 +++----
 arch/powerpc/kernel/misc_64.S      | 28 ++++++++--------
 arch/powerpc/kernel/setup_64.c     | 65 +++++++++++++++++++++-----------------
 arch/powerpc/kernel/vdso.c         | 10 +++---
 arch/powerpc/lib/copypage_64.S     |  4 +--
 arch/powerpc/lib/string_64.S       |  6 ++--
 9 files changed, 75 insertions(+), 68 deletions(-)

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 7657aa897a38..25ee433a8261 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -33,12 +33,14 @@
 struct ppc64_caches {
 	u32	dsize;			/* L1 d-cache size */
 	u32	dline_size;		/* L1 d-cache line size	*/
-	u32	log_dline_size;
-	u32	dlines_per_page;
+	u32	dblock_size;		/* L1 d-cache block size */
+	u32	log_dblock_size;
+	u32	dblocks_per_page;
 	u32	isize;			/* L1 i-cache size */
-	u32	iline_size;		/* L1 i-cache line size	*/
-	u32	log_iline_size;
-	u32	ilines_per_page;
+	u32	iline_size;		/* L1 d-cache line size	*/
+	u32	iblock_size;		/* L1 i-cache block size */
+	u32	log_iblock_size;
+	u32	iblocks_per_page;
 };
 
 extern struct ppc64_caches ppc64_caches;
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index dd5f0712afa2..c50a666308dd 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -47,14 +47,14 @@ static inline void clear_page(void *addr)
 	unsigned long iterations;
 	unsigned long onex, twox, fourx, eightx;
 
-	iterations = ppc64_caches.dlines_per_page / 8;
+	iterations = ppc64_caches.dblocks_per_page / 8;
 
 	/*
 	 * Some verisions of gcc use multiply instructions to
 	 * calculate the offsets so lets give it a hand to
 	 * do better.
 	 */
-	onex = ppc64_caches.dline_size;
+	onex = ppc64_caches.dblock_size;
 	twox = onex << 1;
 	fourx = onex << 2;
 	eightx = onex << 3;
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 8d58c61908f7..30ff6590a2dd 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -204,7 +204,7 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr)
 	int i, size;
 
 #ifdef __powerpc64__
-	size = ppc64_caches.dline_size;
+	size = ppc64_caches.dblock_size;
 #else
 	size = L1_CACHE_BYTES;
 #endif
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 0601e6a7297c..125442107b40 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -163,12 +163,12 @@ int main(void)
 	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
 
 #ifdef CONFIG_PPC64
-	DEFINE(DCACHEL1LINESIZE, offsetof(struct ppc64_caches, dline_size));
-	DEFINE(DCACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_dline_size));
-	DEFINE(DCACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, dlines_per_page));
-	DEFINE(ICACHEL1LINESIZE, offsetof(struct ppc64_caches, iline_size));
-	DEFINE(ICACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_iline_size));
-	DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
+	DEFINE(DCACHEL1BLOCKSIZE, offsetof(struct ppc64_caches, dblock_size));
+	DEFINE(DCACHEL1LOGBLOCKSIZE, offsetof(struct ppc64_caches, log_dblock_size));
+	DEFINE(DCACHEL1BLOCKSPERPAGE, offsetof(struct ppc64_caches, dblocks_per_page));
+	DEFINE(ICACHEL1BLOCKSIZE, offsetof(struct ppc64_caches, iblock_size));
+	DEFINE(ICACHEL1LOGBLOCKSIZE, offsetof(struct ppc64_caches, log_iblock_size));
+	DEFINE(ICACHEL1BLOCKSPERPAGE, offsetof(struct ppc64_caches, iblocks_per_page));
 	/* paca */
 	DEFINE(PACA_SIZE, sizeof(struct paca_struct));
 	DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 32be2a844947..ae179cb1bb3c 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -80,12 +80,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
  * each other.
  */
  	ld	r10,PPC64_CACHES@toc(r2)
-	lwz	r7,DCACHEL1LINESIZE(r10)/* Get cache line size */
+	lwz	r7,DCACHEL1BLOCKSIZE(r10)/* Get cache block size */
 	addi	r5,r7,-1
 	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5		/* ensure we get enough */
-	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of cache line size */
+	lwz	r9,DCACHEL1LOGBLOCKSIZE(r10)	/* Get log-2 of cache block size */
 	srw.	r8,r8,r9		/* compute line count */
 	beqlr				/* nothing to do? */
 	mtctr	r8
@@ -96,12 +96,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 
 /* Now invalidate the instruction cache */
 	
-	lwz	r7,ICACHEL1LINESIZE(r10)	/* Get Icache line size */
+	lwz	r7,ICACHEL1BLOCKSIZE(r10)	/* Get Icache block size */
 	addi	r5,r7,-1
 	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5
-	lwz	r9,ICACHEL1LOGLINESIZE(r10)	/* Get log-2 of Icache line size */
+	lwz	r9,ICACHEL1LOGBLOCKSIZE(r10)	/* Get log-2 of Icache block size */
 	srw.	r8,r8,r9		/* compute line count */
 	beqlr				/* nothing to do? */
 	mtctr	r8
@@ -128,12 +128,12 @@ _GLOBAL(flush_dcache_range)
  * Different systems have different cache line sizes
  */
  	ld	r10,PPC64_CACHES@toc(r2)
-	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
+	lwz	r7,DCACHEL1BLOCKSIZE(r10)	/* Get dcache block size */
 	addi	r5,r7,-1
 	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5		/* ensure we get enough */
-	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of dcache line size */
+	lwz	r9,DCACHEL1LOGBLOCKSIZE(r10)	/* Get log-2 of dcache block size */
 	srw.	r8,r8,r9		/* compute line count */
 	beqlr				/* nothing to do? */
 	mtctr	r8
@@ -156,12 +156,12 @@ EXPORT_SYMBOL(flush_dcache_range)
  */
 _GLOBAL(flush_dcache_phys_range)
  	ld	r10,PPC64_CACHES@toc(r2)
-	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
+	lwz	r7,DCACHEL1BLOCKSIZE(r10)	/* Get dcache block size */
 	addi	r5,r7,-1
 	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5		/* ensure we get enough */
-	lwz	r9,DCACHEL1LOGLINESIZE(r10)	/* Get log-2 of dcache line size */
+	lwz	r9,DCACHEL1LOGBLOCKSIZE(r10)	/* Get log-2 of dcache block size */
 	srw.	r8,r8,r9		/* compute line count */
 	beqlr				/* nothing to do? */
 	mfmsr	r5			/* Disable MMU Data Relocation */
@@ -184,12 +184,12 @@ _GLOBAL(flush_dcache_phys_range)
 
 _GLOBAL(flush_inval_dcache_range)
  	ld	r10,PPC64_CACHES@toc(r2)
-	lwz	r7,DCACHEL1LINESIZE(r10)	/* Get dcache line size */
+	lwz	r7,DCACHEL1BLOCKSIZE(r10)	/* Get dcache block size */
 	addi	r5,r7,-1
 	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5		/* ensure we get enough */
-	lwz	r9,DCACHEL1LOGLINESIZE(r10)/* Get log-2 of dcache line size */
+	lwz	r9,DCACHEL1LOGBLOCKSIZE(r10)/* Get log-2 of dcache block size */
 	srw.	r8,r8,r9		/* compute line count */
 	beqlr				/* nothing to do? */
 	sync
@@ -225,8 +225,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 /* Flush the dcache */
  	ld	r7,PPC64_CACHES@toc(r2)
 	clrrdi	r3,r3,PAGE_SHIFT           	    /* Page align */
-	lwz	r4,DCACHEL1LINESPERPAGE(r7)	/* Get # dcache lines per page */
-	lwz	r5,DCACHEL1LINESIZE(r7)		/* Get dcache line size */
+	lwz	r4,DCACHEL1BLOCKSPERPAGE(r7)	/* Get # dcache blocks per page */
+	lwz	r5,DCACHEL1BLOCKSIZE(r7)	/* Get dcache block size */
 	mr	r6,r3
 	mtctr	r4
 0:	dcbst	0,r6
@@ -236,8 +236,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 
 /* Now invalidate the icache */	
 
-	lwz	r4,ICACHEL1LINESPERPAGE(r7)	/* Get # icache lines per page */
-	lwz	r5,ICACHEL1LINESIZE(r7)		/* Get icache line size */
+	lwz	r4,ICACHEL1BLOCKSPERPAGE(r7)	/* Get # icache blocks per page */
+	lwz	r5,ICACHEL1BLOCKSIZE(r7)	/* Get icache block size */
 	mtctr	r4
 1:	icbi	0,r3
 	add	r3,r3,r5
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index ae84d345c13c..08cccb2501e7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -78,10 +78,10 @@ int spinning_secondaries;
 u64 ppc64_pft_size;
 
 struct ppc64_caches ppc64_caches = {
-	.dline_size = 0x40,
-	.log_dline_size = 6,
-	.iline_size = 0x40,
-	.log_iline_size = 6
+	.dblock_size = 0x40,
+	.log_dblock_size = 6,
+	.iblock_size = 0x40,
+	.log_iblock_size = 6
 };
 EXPORT_SYMBOL_GPL(ppc64_caches);
 
@@ -412,59 +412,66 @@ void __init initialize_cache_info(void)
 		 * d-cache and i-cache sizes... -Peter
 		 */
 		if (num_cpus == 1) {
-			const __be32 *sizep, *lsizep;
-			u32 size, lsize;
+			const __be32 *sizep, *lsizep, *bsizep;
+			u32 size, lsize, bsize;
 
 			size = 0;
-			lsize = cur_cpu_spec->dcache_bsize;
+			lsize = bsize = cur_cpu_spec->dcache_bsize;
 			sizep = of_get_property(np, "d-cache-size", NULL);
 			if (sizep != NULL)
 				size = be32_to_cpu(*sizep);
-			lsizep = of_get_property(np, "d-cache-block-size",
+			bsizep = of_get_property(np, "d-cache-block-size",
 						 NULL);
-			/* fallback if block size missing */
-			if (lsizep == NULL)
-				lsizep = of_get_property(np,
-							 "d-cache-line-size",
-							 NULL);
+			lsizep = of_get_property(np, "d-cache-line-size",
+						 NULL);
+			if (bsizep == NULL)
+				bsizep = lsizep;
 			if (lsizep != NULL)
 				lsize = be32_to_cpu(*lsizep);
-			if (sizep == NULL || lsizep == NULL)
+			if (bsizep != NULL)
+				bsize = be32_to_cpu(*bsizep);
+			if (sizep == NULL || bsizep == NULL || lsizep == NULL)
 				DBG("Argh, can't find dcache properties ! "
-				    "sizep: %p, lsizep: %p\n", sizep, lsizep);
+				    "sizep: %p, bsizep: %p, lsizep: %p\n",
+				    sizep, bsizep, lsizep);
 
 			ppc64_caches.dsize = size;
 			ppc64_caches.dline_size = lsize;
-			ppc64_caches.log_dline_size = __ilog2(lsize);
-			ppc64_caches.dlines_per_page = PAGE_SIZE / lsize;
+			ppc64_caches.dblock_size = bsize;
+			ppc64_caches.log_dblock_size = __ilog2(bsize);
+			ppc64_caches.dblocks_per_page = PAGE_SIZE / bsize;
 
 			size = 0;
-			lsize = cur_cpu_spec->icache_bsize;
+			lsize = bsize = cur_cpu_spec->icache_bsize;
 			sizep = of_get_property(np, "i-cache-size", NULL);
 			if (sizep != NULL)
 				size = be32_to_cpu(*sizep);
-			lsizep = of_get_property(np, "i-cache-block-size",
+			bsizep = of_get_property(np, "i-cache-block-size",
+						 NULL);
+			lsizep = of_get_property(np, "i-cache-line-size",
 						 NULL);
-			if (lsizep == NULL)
-				lsizep = of_get_property(np,
-							 "i-cache-line-size",
-							 NULL);
+			if (bsizep == NULL)
+				bsizep = lsizep;
 			if (lsizep != NULL)
 				lsize = be32_to_cpu(*lsizep);
-			if (sizep == NULL || lsizep == NULL)
+			if (bsizep != NULL)
+				bsize = be32_to_cpu(*bsizep);
+			if (sizep == NULL || bsizep == NULL || lsizep == NULL)
 				DBG("Argh, can't find icache properties ! "
-				    "sizep: %p, lsizep: %p\n", sizep, lsizep);
+				    "sizep: %p, bsizep: %p, lsizep: %p\n",
+				    sizep, bsizep, lsizep);
 
 			ppc64_caches.isize = size;
 			ppc64_caches.iline_size = lsize;
-			ppc64_caches.log_iline_size = __ilog2(lsize);
-			ppc64_caches.ilines_per_page = PAGE_SIZE / lsize;
+			ppc64_caches.iblock_size = bsize;
+			ppc64_caches.log_iblock_size = __ilog2(bsize);
+			ppc64_caches.iblocks_per_page = PAGE_SIZE / bsize;
 		}
 	}
 
 	/* For use by binfmt_elf */
-	dcache_bsize = ppc64_caches.dline_size;
-	icache_bsize = ppc64_caches.iline_size;
+	dcache_bsize = ppc64_caches.dblock_size;
+	icache_bsize = ppc64_caches.iblock_size;
 
 	DBG(" <- initialize_cache_info()\n");
 }
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 4111d30badfa..9c0a85776c6c 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -740,12 +740,10 @@ static int __init vdso_init(void)
 	vdso_data->dcache_line_size = ppc64_caches.dline_size;
 	vdso_data->icache_size = ppc64_caches.isize;
 	vdso_data->icache_line_size = ppc64_caches.iline_size;
-
-	/* XXXOJN: Blocks should be added to ppc64_caches and used instead */
-	vdso_data->dcache_block_size = ppc64_caches.dline_size;
-	vdso_data->icache_block_size = ppc64_caches.iline_size;
-	vdso_data->dcache_log_block_size = ppc64_caches.log_dline_size;
-	vdso_data->icache_log_block_size = ppc64_caches.log_iline_size;
+	vdso_data->dcache_block_size = ppc64_caches.dblock_size;
+	vdso_data->icache_block_size = ppc64_caches.iblock_size;
+	vdso_data->dcache_log_block_size = ppc64_caches.log_dblock_size;
+	vdso_data->icache_log_block_size = ppc64_caches.log_iblock_size;
 
 	/*
 	 * Calculate the size of the 64 bits vDSO
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 21367b3a8146..4bcc9e76fb55 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -26,8 +26,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 	ori	r5,r5,PAGE_SIZE@l
 BEGIN_FTR_SECTION
 	ld      r10,PPC64_CACHES@toc(r2)
-	lwz	r11,DCACHEL1LOGLINESIZE(r10)	/* log2 of cache line size */
-	lwz     r12,DCACHEL1LINESIZE(r10)	/* get cache line size */
+	lwz	r11,DCACHEL1LOGBLOCKSIZE(r10)	/* log2 of cache block size */
+	lwz     r12,DCACHEL1BLOCKSIZE(r10)	/* get cache block size */
 	li	r9,0
 	srd	r8,r5,r11
 
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S
index c100f4d5d5d0..d5b4d9498c54 100644
--- a/arch/powerpc/lib/string_64.S
+++ b/arch/powerpc/lib/string_64.S
@@ -152,9 +152,9 @@ err2;	std	r0,0(r3)
 	addi	r3,r3,8
 	addi	r4,r4,-8
 
-	/* Destination is 16 byte aligned, need to get it cacheline aligned */
-11:	lwz	r7,DCACHEL1LOGLINESIZE(r5)
-	lwz	r9,DCACHEL1LINESIZE(r5)
+	/* Destination is 16 byte aligned, need to get it cache block aligned */
+11:	lwz	r7,DCACHEL1LOGBLOCKSIZE(r5)
+	lwz	r9,DCACHEL1BLOCKSIZE(r5)
 
 	/*
 	 * With worst case alignment the long clear loop takes a minimum
-- 
cgit v1.2.3


From 5d451a87e5ebbde18c2b48284778f29d308816c2 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 8 Jan 2017 17:31:46 -0600
Subject: powerpc/64: Retrieve number of L1 cache sets from device-tree

It will be used to calculate the associativity

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cache.h |  2 ++
 arch/powerpc/kernel/setup_64.c   | 28 ++++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 25ee433a8261..1fa364340146 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -36,11 +36,13 @@ struct ppc64_caches {
 	u32	dblock_size;		/* L1 d-cache block size */
 	u32	log_dblock_size;
 	u32	dblocks_per_page;
+	u32	dsets;
 	u32	isize;			/* L1 i-cache size */
 	u32	iline_size;		/* L1 d-cache line size	*/
 	u32	iblock_size;		/* L1 i-cache block size */
 	u32	log_iblock_size;
 	u32	iblocks_per_page;
+	u32	isets;
 };
 
 extern struct ppc64_caches ppc64_caches;
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 08cccb2501e7..75c9a8641ba1 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -412,14 +412,18 @@ void __init initialize_cache_info(void)
 		 * d-cache and i-cache sizes... -Peter
 		 */
 		if (num_cpus == 1) {
-			const __be32 *sizep, *lsizep, *bsizep;
-			u32 size, lsize, bsize;
+			const __be32 *sizep, *lsizep, *bsizep, *setsp;
+			u32 size, lsize, bsize, sets;
 
 			size = 0;
+			sets = -1u;
 			lsize = bsize = cur_cpu_spec->dcache_bsize;
 			sizep = of_get_property(np, "d-cache-size", NULL);
 			if (sizep != NULL)
 				size = be32_to_cpu(*sizep);
+			setsp = of_get_property(np, "d-cache-sets", NULL);
+			if (setsp != NULL)
+				sets = be32_to_cpu(*setsp);
 			bsizep = of_get_property(np, "d-cache-block-size",
 						 NULL);
 			lsizep = of_get_property(np, "d-cache-line-size",
@@ -435,17 +439,32 @@ void __init initialize_cache_info(void)
 				    "sizep: %p, bsizep: %p, lsizep: %p\n",
 				    sizep, bsizep, lsizep);
 
+			/*
+			 * OF is weird .. it represents fully associative caches
+			 * as "1 way" which doesn't make much sense and doesn't
+			 * leave room for direct mapped. We'll assume that 0
+			 * in OF means direct mapped for that reason.
+			 */
+			if (sets == 1)
+				sets = 0;
+			else if (sets == 0)
+				sets = 1;
 			ppc64_caches.dsize = size;
+			ppc64_caches.dsets = sets;
 			ppc64_caches.dline_size = lsize;
 			ppc64_caches.dblock_size = bsize;
 			ppc64_caches.log_dblock_size = __ilog2(bsize);
 			ppc64_caches.dblocks_per_page = PAGE_SIZE / bsize;
 
 			size = 0;
+			sets = -1u;
 			lsize = bsize = cur_cpu_spec->icache_bsize;
 			sizep = of_get_property(np, "i-cache-size", NULL);
 			if (sizep != NULL)
 				size = be32_to_cpu(*sizep);
+			setsp = of_get_property(np, "i-cache-sets", NULL);
+			if (setsp != NULL)
+				sets = be32_to_cpu(*setsp);
 			bsizep = of_get_property(np, "i-cache-block-size",
 						 NULL);
 			lsizep = of_get_property(np, "i-cache-line-size",
@@ -461,7 +480,12 @@ void __init initialize_cache_info(void)
 				    "sizep: %p, bsizep: %p, lsizep: %p\n",
 				    sizep, bsizep, lsizep);
 
+			if (sets == 1)
+				sets = 0;
+			else if (sets == 0)
+				sets = 1;
 			ppc64_caches.isize = size;
+			ppc64_caches.isets = sets;
 			ppc64_caches.iline_size = lsize;
 			ppc64_caches.iblock_size = bsize;
 			ppc64_caches.log_iblock_size = __ilog2(bsize);
-- 
cgit v1.2.3


From e2827fe5c1566f66a922dd7493cbe4522c50580a Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 8 Jan 2017 17:31:47 -0600
Subject: powerpc/64: Clean up ppc64_caches using a struct per cache

We have two sets of identical struct members for the I and D sides
and mostly identical bunches of code to parse the device-tree to
populate them. Instead, make a ppc_cache_info structure with one
copy for I and one for D.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cache.h   |  24 ++---
 arch/powerpc/include/asm/page_64.h |   4 +-
 arch/powerpc/kernel/align.c        |   2 +-
 arch/powerpc/kernel/asm-offsets.c  |  12 +--
 arch/powerpc/kernel/setup_64.c     | 183 ++++++++++++++++++-------------------
 arch/powerpc/kernel/vdso.c         |  16 ++--
 6 files changed, 119 insertions(+), 122 deletions(-)

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 1fa364340146..823750fa6e66 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -30,19 +30,19 @@
 #define IFETCH_ALIGN_BYTES	(1 << IFETCH_ALIGN_SHIFT)
 
 #if defined(__powerpc64__) && !defined(__ASSEMBLY__)
+
+struct ppc_cache_info {
+	u32 size;
+	u32 line_size;
+	u32 block_size;	/* L1 only */
+	u32 log_block_size;
+	u32 blocks_per_page;
+	u32 sets;
+};
+
 struct ppc64_caches {
-	u32	dsize;			/* L1 d-cache size */
-	u32	dline_size;		/* L1 d-cache line size	*/
-	u32	dblock_size;		/* L1 d-cache block size */
-	u32	log_dblock_size;
-	u32	dblocks_per_page;
-	u32	dsets;
-	u32	isize;			/* L1 i-cache size */
-	u32	iline_size;		/* L1 d-cache line size	*/
-	u32	iblock_size;		/* L1 i-cache block size */
-	u32	log_iblock_size;
-	u32	iblocks_per_page;
-	u32	isets;
+	struct ppc_cache_info l1d;
+	struct ppc_cache_info l1i;
 };
 
 extern struct ppc64_caches ppc64_caches;
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index c50a666308dd..3e83d2a20b6f 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -47,14 +47,14 @@ static inline void clear_page(void *addr)
 	unsigned long iterations;
 	unsigned long onex, twox, fourx, eightx;
 
-	iterations = ppc64_caches.dblocks_per_page / 8;
+	iterations = ppc64_caches.l1d.blocks_per_page / 8;
 
 	/*
 	 * Some verisions of gcc use multiply instructions to
 	 * calculate the offsets so lets give it a hand to
 	 * do better.
 	 */
-	onex = ppc64_caches.dblock_size;
+	onex = ppc64_caches.l1d.block_size;
 	twox = onex << 1;
 	fourx = onex << 2;
 	eightx = onex << 3;
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 30ff6590a2dd..cbc7c42cdb74 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -204,7 +204,7 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr)
 	int i, size;
 
 #ifdef __powerpc64__
-	size = ppc64_caches.dblock_size;
+	size = ppc64_caches.l1d.block_size;
 #else
 	size = L1_CACHE_BYTES;
 #endif
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 125442107b40..b4324db94741 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -163,12 +163,12 @@ int main(void)
 	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
 
 #ifdef CONFIG_PPC64
-	DEFINE(DCACHEL1BLOCKSIZE, offsetof(struct ppc64_caches, dblock_size));
-	DEFINE(DCACHEL1LOGBLOCKSIZE, offsetof(struct ppc64_caches, log_dblock_size));
-	DEFINE(DCACHEL1BLOCKSPERPAGE, offsetof(struct ppc64_caches, dblocks_per_page));
-	DEFINE(ICACHEL1BLOCKSIZE, offsetof(struct ppc64_caches, iblock_size));
-	DEFINE(ICACHEL1LOGBLOCKSIZE, offsetof(struct ppc64_caches, log_iblock_size));
-	DEFINE(ICACHEL1BLOCKSPERPAGE, offsetof(struct ppc64_caches, iblocks_per_page));
+	DEFINE(DCACHEL1BLOCKSIZE, offsetof(struct ppc64_caches, l1d.block_size));
+	DEFINE(DCACHEL1LOGBLOCKSIZE, offsetof(struct ppc64_caches, l1d.log_block_size));
+	DEFINE(DCACHEL1BLOCKSPERPAGE, offsetof(struct ppc64_caches, l1d.blocks_per_page));
+	DEFINE(ICACHEL1BLOCKSIZE, offsetof(struct ppc64_caches, l1i.block_size));
+	DEFINE(ICACHEL1LOGBLOCKSIZE, offsetof(struct ppc64_caches, l1i.log_block_size));
+	DEFINE(ICACHEL1BLOCKSPERPAGE, offsetof(struct ppc64_caches, l1i.blocks_per_page));
 	/* paca */
 	DEFINE(PACA_SIZE, sizeof(struct paca_struct));
 	DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 75c9a8641ba1..b87dcb2968d9 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -78,10 +78,14 @@ int spinning_secondaries;
 u64 ppc64_pft_size;
 
 struct ppc64_caches ppc64_caches = {
-	.dblock_size = 0x40,
-	.log_dblock_size = 6,
-	.iblock_size = 0x40,
-	.log_iblock_size = 6
+	.l1d = {
+		.block_size = 0x40,
+		.log_block_size = 6,
+	},
+	.l1i = {
+		.block_size = 0x40,
+		.log_block_size = 6
+	},
 };
 EXPORT_SYMBOL_GPL(ppc64_caches);
 
@@ -397,105 +401,98 @@ void smp_release_cpus(void)
  * cache informations about the CPU that will be used by cache flush
  * routines and/or provided to userland
  */
+/* Fill @info from raw geometry. NOTE(review): bsize == 0 divides by zero. */
+static void init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize,
+			    u32 bsize, u32 sets)
+{
+	info->size = size;
+	info->sets = sets;
+	info->line_size = lsize;
+	info->block_size = bsize;
+	info->log_block_size = __ilog2(bsize);
+	info->blocks_per_page = PAGE_SIZE / bsize;
+}
+
+/*
+ * Parse one cache's properties from @np into @info; @icache selects the
+ * "i-cache-*" names. Returns false if the size or line size is missing.
+ */
+static bool __init parse_cache_info(struct device_node *np,
+				    bool icache,
+				    struct ppc_cache_info *info)
+{
+	static const char *ipropnames[] __initdata = {
+		"i-cache-size",
+		"i-cache-sets",
+		"i-cache-block-size",
+		"i-cache-line-size",
+	};
+	static const char *dpropnames[] __initdata = {
+		"d-cache-size",
+		"d-cache-sets",
+		"d-cache-block-size",
+		"d-cache-line-size",
+	};
+	const char **propnames = icache ? ipropnames : dpropnames;
+	const __be32 *sizep, *lsizep, *bsizep, *setsp;
+	u32 size = 0, lsize, bsize, sets = -1u;
+
+	lsize = bsize = icache ? cur_cpu_spec->icache_bsize :
+				 cur_cpu_spec->dcache_bsize;
+	sizep = of_get_property(np, propnames[0], NULL);
+	if (sizep != NULL)
+		size = be32_to_cpu(*sizep);
+	setsp = of_get_property(np, propnames[1], NULL);
+	if (setsp != NULL)
+		sets = be32_to_cpu(*setsp);
+	bsizep = of_get_property(np, propnames[2], NULL);
+	lsizep = of_get_property(np, propnames[3], NULL);
+	if (bsizep == NULL)
+		bsizep = lsizep;
+	if (lsizep != NULL)
+		lsize = be32_to_cpu(*lsizep);
+	if (bsizep != NULL)
+		bsize = be32_to_cpu(*bsizep);
+
+	/*
+	 * OF is weird .. it represents fully associative caches
+	 * as "1 way" which doesn't make much sense and doesn't
+	 * leave room for direct mapped. We'll assume that 0
+	 * in OF means direct mapped for that reason.
+	 */
+	if (sets == 1)
+		sets = 0;
+	else if (sets == 0)
+		sets = 1;
+
+	init_cache_info(info, size, lsize, bsize, sets);
+
+	return sizep != NULL && bsizep != NULL && lsizep != NULL;
+}
+
 void __init initialize_cache_info(void)
 {
 	struct device_node *np;
-	unsigned long num_cpus = 0;
 
 	DBG(" -> initialize_cache_info()\n");
 
-	for_each_node_by_type(np, "cpu") {
-		num_cpus += 1;
+	np  = of_find_node_by_type(NULL, "cpu");
 
-		/*
-		 * We're assuming *all* of the CPUs have the same
-		 * d-cache and i-cache sizes... -Peter
-		 */
-		if (num_cpus == 1) {
-			const __be32 *sizep, *lsizep, *bsizep, *setsp;
-			u32 size, lsize, bsize, sets;
-
-			size = 0;
-			sets = -1u;
-			lsize = bsize = cur_cpu_spec->dcache_bsize;
-			sizep = of_get_property(np, "d-cache-size", NULL);
-			if (sizep != NULL)
-				size = be32_to_cpu(*sizep);
-			setsp = of_get_property(np, "d-cache-sets", NULL);
-			if (setsp != NULL)
-				sets = be32_to_cpu(*setsp);
-			bsizep = of_get_property(np, "d-cache-block-size",
-						 NULL);
-			lsizep = of_get_property(np, "d-cache-line-size",
-						 NULL);
-			if (bsizep == NULL)
-				bsizep = lsizep;
-			if (lsizep != NULL)
-				lsize = be32_to_cpu(*lsizep);
-			if (bsizep != NULL)
-				bsize = be32_to_cpu(*bsizep);
-			if (sizep == NULL || bsizep == NULL || lsizep == NULL)
-				DBG("Argh, can't find dcache properties ! "
-				    "sizep: %p, bsizep: %p, lsizep: %p\n",
-				    sizep, bsizep, lsizep);
-
-			/*
-			 * OF is weird .. it represents fully associative caches
-			 * as "1 way" which doesn't make much sense and doesn't
-			 * leave room for direct mapped. We'll assume that 0
-			 * in OF means direct mapped for that reason.
-			 */
-			if (sets == 1)
-				sets = 0;
-			else if (sets == 0)
-				sets = 1;
-			ppc64_caches.dsize = size;
-			ppc64_caches.dsets = sets;
-			ppc64_caches.dline_size = lsize;
-			ppc64_caches.dblock_size = bsize;
-			ppc64_caches.log_dblock_size = __ilog2(bsize);
-			ppc64_caches.dblocks_per_page = PAGE_SIZE / bsize;
-
-			size = 0;
-			sets = -1u;
-			lsize = bsize = cur_cpu_spec->icache_bsize;
-			sizep = of_get_property(np, "i-cache-size", NULL);
-			if (sizep != NULL)
-				size = be32_to_cpu(*sizep);
-			setsp = of_get_property(np, "i-cache-sets", NULL);
-			if (setsp != NULL)
-				sets = be32_to_cpu(*setsp);
-			bsizep = of_get_property(np, "i-cache-block-size",
-						 NULL);
-			lsizep = of_get_property(np, "i-cache-line-size",
-						 NULL);
-			if (bsizep == NULL)
-				bsizep = lsizep;
-			if (lsizep != NULL)
-				lsize = be32_to_cpu(*lsizep);
-			if (bsizep != NULL)
-				bsize = be32_to_cpu(*bsizep);
-			if (sizep == NULL || bsizep == NULL || lsizep == NULL)
-				DBG("Argh, can't find icache properties ! "
-				    "sizep: %p, bsizep: %p, lsizep: %p\n",
-				    sizep, bsizep, lsizep);
-
-			if (sets == 1)
-				sets = 0;
-			else if (sets == 0)
-				sets = 1;
-			ppc64_caches.isize = size;
-			ppc64_caches.isets = sets;
-			ppc64_caches.iline_size = lsize;
-			ppc64_caches.iblock_size = bsize;
-			ppc64_caches.log_iblock_size = __ilog2(bsize);
-			ppc64_caches.iblocks_per_page = PAGE_SIZE / bsize;
-		}
+	/*
+	 * We're assuming *all* of the CPUs have the same
+	 * d-cache and i-cache sizes... -Peter
+	 */
+	if (np) {
+		if (!parse_cache_info(np, false, &ppc64_caches.l1d))
+			DBG("Argh, can't find dcache properties !\n");
+
+		if (!parse_cache_info(np, true, &ppc64_caches.l1i))
+			DBG("Argh, can't find icache properties !\n");
 	}
 
 	/* For use by binfmt_elf */
-	dcache_bsize = ppc64_caches.dblock_size;
-	icache_bsize = ppc64_caches.iblock_size;
+	dcache_bsize = ppc64_caches.l1d.block_size;
+	icache_bsize = ppc64_caches.l1i.block_size;
 
 	DBG(" <- initialize_cache_info()\n");
 }
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 9c0a85776c6c..22b01a3962f0 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -736,14 +736,14 @@ static int __init vdso_init(void)
 	if (firmware_has_feature(FW_FEATURE_LPAR))
 		vdso_data->platform |= 1;
 	vdso_data->physicalMemorySize = memblock_phys_mem_size();
-	vdso_data->dcache_size = ppc64_caches.dsize;
-	vdso_data->dcache_line_size = ppc64_caches.dline_size;
-	vdso_data->icache_size = ppc64_caches.isize;
-	vdso_data->icache_line_size = ppc64_caches.iline_size;
-	vdso_data->dcache_block_size = ppc64_caches.dblock_size;
-	vdso_data->icache_block_size = ppc64_caches.iblock_size;
-	vdso_data->dcache_log_block_size = ppc64_caches.log_dblock_size;
-	vdso_data->icache_log_block_size = ppc64_caches.log_iblock_size;
+	vdso_data->dcache_size = ppc64_caches.l1d.size;
+	vdso_data->dcache_line_size = ppc64_caches.l1d.line_size;
+	vdso_data->icache_size = ppc64_caches.l1i.size;
+	vdso_data->icache_line_size = ppc64_caches.l1i.line_size;
+	vdso_data->dcache_block_size = ppc64_caches.l1d.block_size;
+	vdso_data->icache_block_size = ppc64_caches.l1i.block_size;
+	vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size;
+	vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size;
 
 	/*
 	 * Calculate the size of the 64 bits vDSO
-- 
cgit v1.2.3


From 65e01f386fcddb3460be78fc886856889f80ecc7 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 8 Jan 2017 17:31:48 -0600
Subject: powerpc/64: Add L2 and L3 cache shape info

Retrieved from device-tree when available

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cache.h |  2 ++
 arch/powerpc/kernel/setup_64.c   | 26 +++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 823750fa6e66..d7cf60f87604 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -43,6 +43,8 @@ struct ppc_cache_info {
 struct ppc64_caches {
 	struct ppc_cache_info l1d;
 	struct ppc_cache_info l1i;
+	struct ppc_cache_info l2;
+	struct ppc_cache_info l3;
 };
 
 extern struct ppc64_caches ppc64_caches;
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index b87dcb2968d9..db18f7b68a1d 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -472,22 +472,38 @@ static bool __init parse_cache_info(struct device_node *np,
 
 void __init initialize_cache_info(void)
 {
-	struct device_node *np;
+	struct device_node *cpu, *l2, *l3 = NULL;
 
 	DBG(" -> initialize_cache_info()\n");
 
-	np  = of_find_node_by_type(NULL, "cpu");
+	cpu = of_find_node_by_type(NULL, "cpu");
 
 	/*
 	 * We're assuming *all* of the CPUs have the same
 	 * d-cache and i-cache sizes... -Peter
 	 */
-	if (np) {
-		if (!parse_cache_info(np, false, &ppc64_caches.l1d))
+	if (cpu) {
+		if (!parse_cache_info(cpu, false, &ppc64_caches.l1d))
 			DBG("Argh, can't find dcache properties !\n");
 
-		if (!parse_cache_info(np, true, &ppc64_caches.l1i))
+		if (!parse_cache_info(cpu, true, &ppc64_caches.l1i))
 			DBG("Argh, can't find icache properties !\n");
+
+		/*
+		 * Try to find the L2 and L3 if any. Assume they are
+		 * unified and use the D-side properties.
+		 */
+		l2 = of_find_next_cache_node(cpu);
+		of_node_put(cpu);
+		if (l2) {
+			parse_cache_info(l2, false, &ppc64_caches.l2);
+			l3 = of_find_next_cache_node(l2);
+			of_node_put(l2);
+		}
+		if (l3) {
+			parse_cache_info(l3, false, &ppc64_caches.l3);
+			of_node_put(l3);
+		}
 	}
 
 	/* For use by binfmt_elf */
-- 
cgit v1.2.3


From 608b42140e966a65cabc68d997875065f3e63c2f Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sun, 8 Jan 2017 17:31:49 -0600
Subject: powerpc/64: Hard code cache geometry on POWER8

All shipping firmware versions have it wrong in the device-tree

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/setup_64.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index db18f7b68a1d..364fbffd7e83 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -472,11 +472,27 @@ static bool __init parse_cache_info(struct device_node *np,
 
 void __init initialize_cache_info(void)
 {
-	struct device_node *cpu, *l2, *l3 = NULL;
+	struct device_node *cpu = NULL, *l2, *l3 = NULL;
+	u32 pvr;
 
 	DBG(" -> initialize_cache_info()\n");
 
-	cpu = of_find_node_by_type(NULL, "cpu");
+	/*
+	 * All shipping POWER8 machines have a firmware bug that
+	 * puts incorrect information in the device-tree. This will
+	 * be (hopefully) fixed for future chips but for now hard
+	 * code the values if we are running on one of these
+	 */
+	pvr = PVR_VER(mfspr(SPRN_PVR));
+	if (pvr == PVR_POWER8 || pvr == PVR_POWER8E ||
+	    pvr == PVR_POWER8NVL) {
+						/* size    lsize   blk  sets */
+		init_cache_info(&ppc64_caches.l1i, 0x8000,   128,  128, 32);
+		init_cache_info(&ppc64_caches.l1d, 0x10000,  128,  128, 64);
+		init_cache_info(&ppc64_caches.l2,  0x80000,  128,  0,   512);
+		init_cache_info(&ppc64_caches.l3,  0x800000, 128,  0,   8192);
+	} else
+		cpu = of_find_node_by_type(NULL, "cpu");
 
 	/*
 	 * We're assuming *all* of the CPUs have the same
-- 
cgit v1.2.3


From 98a5f361b8625c6f4841d6ba013bbf0e80d08147 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 3 Feb 2017 17:20:07 +1100
Subject: powerpc: Add new cache geometry aux vectors

This adds AUX vectors for the L1I,D, L2 and L3 cache levels
providing for each cache level the size of the cache in bytes
and the geometry (line size and number of ways).

We chose to not use the existing alpha/sh definition which
packs all the information in a single entry per cache level as
it is too restricted to represent some of the geometries used
on POWER.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cache.h       |  1 +
 arch/powerpc/include/asm/elf.h         | 20 ++++++++++++++++++++
 arch/powerpc/include/uapi/asm/auxvec.h | 33 ++++++++++++++++++++++++++++++++-
 arch/powerpc/kernel/setup_64.c         |  5 +++++
 4 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index d7cf60f87604..5a90292afbad 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -38,6 +38,7 @@ struct ppc_cache_info {
 	u32 log_block_size;
 	u32 blocks_per_page;
 	u32 sets;
+	u32 assoc;
 };
 
 struct ppc64_caches {
diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
index 730c27ed10e1..93b9b84568e8 100644
--- a/arch/powerpc/include/asm/elf.h
+++ b/arch/powerpc/include/asm/elf.h
@@ -136,6 +136,25 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 
 #endif /* CONFIG_SPU_BASE */
 
+#ifdef CONFIG_PPC64
+
+#define get_cache_geometry(level) \
+	(ppc64_caches.level.assoc << 16 | ppc64_caches.level.line_size)
+/* One size + geometry (assoc << 16 | line size) aux entry pair per level. */
+#define ARCH_DLINFO_CACHE_GEOMETRY					\
+	NEW_AUX_ENT(AT_L1I_CACHESIZE, ppc64_caches.l1i.size);		\
+	NEW_AUX_ENT(AT_L1I_CACHEGEOMETRY, get_cache_geometry(l1i));	\
+	NEW_AUX_ENT(AT_L1D_CACHESIZE, ppc64_caches.l1d.size);		\
+	NEW_AUX_ENT(AT_L1D_CACHEGEOMETRY, get_cache_geometry(l1d));	\
+	NEW_AUX_ENT(AT_L2_CACHESIZE, ppc64_caches.l2.size);		\
+	NEW_AUX_ENT(AT_L2_CACHEGEOMETRY, get_cache_geometry(l2));	\
+	NEW_AUX_ENT(AT_L3_CACHESIZE, ppc64_caches.l3.size);		\
+	NEW_AUX_ENT(AT_L3_CACHEGEOMETRY, get_cache_geometry(l3))
+
+#else
+#define ARCH_DLINFO_CACHE_GEOMETRY
+#endif
+
 /*
  * The requirements here are:
  * - keep the final alignment of sp (sp & 0xf)
@@ -156,6 +175,7 @@ do {									\
 	NEW_AUX_ENT(AT_ICACHEBSIZE, icache_bsize);			\
 	NEW_AUX_ENT(AT_UCACHEBSIZE, ucache_bsize);			\
 	VDSO_AUX_ENT(AT_SYSINFO_EHDR, current->mm->context.vdso_base);	\
+	ARCH_DLINFO_CACHE_GEOMETRY;					\
 } while (0)
 
 #endif /* _ASM_POWERPC_ELF_H */
diff --git a/arch/powerpc/include/uapi/asm/auxvec.h b/arch/powerpc/include/uapi/asm/auxvec.h
index ce17d2c9eb4e..be6e94ecec42 100644
--- a/arch/powerpc/include/uapi/asm/auxvec.h
+++ b/arch/powerpc/include/uapi/asm/auxvec.h
@@ -16,6 +16,37 @@
  */
 #define AT_SYSINFO_EHDR		33
 
-#define AT_VECTOR_SIZE_ARCH 6 /* entries in ARCH_DLINFO */
+/*
+ * AT_*CACHEBSIZE above represent the cache *block* size which is
+ * the size that is affected by the cache management instructions.
+ *
+ * It doesn't necessarily match the cache *line* size which is
+ * more of a performance tuning hint. Additionally the latter can
+ * be different for the different cache levels.
+ *
+ * The set of entries below represent more extensive information
+ * about the caches, in the form of two entries per cache type,
+ * one entry containing the cache size in bytes, and the other
+ * containing the cache line size in bytes in the bottom 16 bits
+ * and the cache associativity in the next 16 bits.
+ *
+ * The associativity is such that if N is the 16-bit value, the
+ * cache is N way set associative. A value of 0xffff means fully
+ * associative, a value of 1 means directly mapped.
+ *
+ * For all these fields, a value of 0 means that the information
+ * is not known.
+ */
+
+#define AT_L1I_CACHESIZE	40
+#define AT_L1I_CACHEGEOMETRY	41
+#define AT_L1D_CACHESIZE	42
+#define AT_L1D_CACHEGEOMETRY	43
+#define AT_L2_CACHESIZE		44
+#define AT_L2_CACHEGEOMETRY	45
+#define AT_L3_CACHESIZE		46
+#define AT_L3_CACHEGEOMETRY	47
+
+#define AT_VECTOR_SIZE_ARCH	14 /* entries in ARCH_DLINFO */
 
 #endif
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 364fbffd7e83..b9855f1b290a 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -411,6 +411,11 @@ static void init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize,
 	info->block_size = bsize;
 	info->log_block_size = __ilog2(bsize);
 	info->blocks_per_page = PAGE_SIZE / bsize;
+
+	if (sets == 0)
+		info->assoc = 0xffff;
+	else
+		info->assoc = size / (sets * lsize);
 }
 
 static bool __init parse_cache_info(struct device_node *np,
-- 
cgit v1.2.3


From 852e5da99d15d0631c09e718abaa4b2fccda1185 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 6 Dec 2016 11:40:15 +1000
Subject: powerpc/64s: Tidy up after exception handler rework

Somewhere along the line, search/replace left some naming garbled,
and untidy alignment (aka. mpe stuffed it up). Might as well fix them
all up now while git blame history doesn't extend too far.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/head-64.h   | 160 +++++++++++++++++------------------
 arch/powerpc/kernel/exceptions-64s.S |   2 +-
 2 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index fca7033839a9..c691fc2f5dae 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -102,7 +102,7 @@ name:
 #define FIXED_SECTION_ENTRY_BEGIN(sname, name)			\
 	__FIXED_SECTION_ENTRY_BEGIN(sname, name, IFETCH_ALIGN_BYTES)
 
-#define FIXED_SECTION_ENTRY_BEGIN_LOCATION(sname, name, start)		\
+#define FIXED_SECTION_ENTRY_BEGIN_LOCATION(sname, name, start)	\
 	USE_FIXED_SECTION(sname);				\
 	name##_start = (start);					\
 	.if (start) < sname##_start;				\
@@ -113,7 +113,7 @@ name:
 	.global name;						\
 name:
 
-#define FIXED_SECTION_ENTRY_END_LOCATION(sname, name, end)		\
+#define FIXED_SECTION_ENTRY_END_LOCATION(sname, name, end)	\
 	.if (end) > sname##_end;				\
 	.error "Fixed section overflow";			\
 	.abort;							\
@@ -147,12 +147,12 @@ name:
  * Following are the BOOK3S exception handler helper macros.
  * Handlers come in a number of types, and each type has a number of varieties.
  *
- * EXC_REAL_*        - real, unrelocated exception vectors
- * EXC_VIRT_*        - virt (AIL), unrelocated exception vectors
+ * EXC_REAL_*     - real, unrelocated exception vectors
+ * EXC_VIRT_*     - virt (AIL), unrelocated exception vectors
  * TRAMP_REAL_*   - real, unrelocated helpers (virt can call these)
- * TRAMP_VIRT_*  - virt, unreloc helpers (in practice, real can use)
- * TRAMP_KVM         - KVM handlers that get put into real, unrelocated
- * EXC_COMMON_*  - virt, relocated common handlers
+ * TRAMP_VIRT_*   - virt, unreloc helpers (in practice, real can use)
+ * TRAMP_KVM      - KVM handlers that get put into real, unrelocated
+ * EXC_COMMON_*   - virt, relocated common handlers
  *
  * The EXC handlers are given a name, and branch to name_common, or the
  * appropriate KVM or masking function. Vector handler verieties are as
@@ -194,20 +194,20 @@ name:
 #define EXC_REAL_BEGIN(name, start, end)			\
 	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start)
 
-#define EXC_REAL_END(name, start, end)			\
+#define EXC_REAL_END(name, start, end)				\
 	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, end)
 
 #define EXC_VIRT_BEGIN(name, start, end)			\
 	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start)
 
-#define EXC_VIRT_END(name, start, end)			\
+#define EXC_VIRT_END(name, start, end)				\
 	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, end)
 
-#define EXC_COMMON_BEGIN(name)						\
-	USE_TEXT_SECTION();						\
-	.balign IFETCH_ALIGN_BYTES;					\
-	.global name;							\
-	DEFINE_FIXED_SYMBOL(name);					\
+#define EXC_COMMON_BEGIN(name)					\
+	USE_TEXT_SECTION();					\
+	.balign IFETCH_ALIGN_BYTES;				\
+	.global name;						\
+	DEFINE_FIXED_SYMBOL(name);				\
 name:
 
 #define TRAMP_REAL_BEGIN(name)					\
@@ -217,7 +217,7 @@ name:
 	FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name)
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#define TRAMP_KVM_BEGIN(name)						\
+#define TRAMP_KVM_BEGIN(name)					\
 	TRAMP_REAL_BEGIN(name)
 #else
 #define TRAMP_KVM_BEGIN(name)
@@ -232,132 +232,132 @@ name:
 	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, end);
 
 
-#define EXC_REAL(name, start, end)				\
-	EXC_REAL_BEGIN(name, start, end);			\
+#define EXC_REAL(name, start, end)					\
+	EXC_REAL_BEGIN(name, start, end);				\
 	STD_EXCEPTION_PSERIES(start, name##_common);			\
 	EXC_REAL_END(name, start, end);
 
-#define EXC_VIRT(name, start, end, realvec)			\
-	EXC_VIRT_BEGIN(name, start, end);			\
+#define EXC_VIRT(name, start, end, realvec)				\
+	EXC_VIRT_BEGIN(name, start, end);				\
 	STD_RELON_EXCEPTION_PSERIES(start, realvec, name##_common);	\
 	EXC_VIRT_END(name, start, end);
 
-#define EXC_REAL_MASKABLE(name, start, end)			\
-	EXC_REAL_BEGIN(name, start, end);			\
+#define EXC_REAL_MASKABLE(name, start, end)				\
+	EXC_REAL_BEGIN(name, start, end);				\
 	MASKABLE_EXCEPTION_PSERIES(start, start, name##_common);	\
 	EXC_REAL_END(name, start, end);
 
-#define EXC_VIRT_MASKABLE(name, start, end, realvec)		\
-	EXC_VIRT_BEGIN(name, start, end);			\
+#define EXC_VIRT_MASKABLE(name, start, end, realvec)			\
+	EXC_VIRT_BEGIN(name, start, end);				\
 	MASKABLE_RELON_EXCEPTION_PSERIES(start, realvec, name##_common); \
 	EXC_VIRT_END(name, start, end);
 
-#define EXC_REAL_HV(name, start, end)			\
-	EXC_REAL_BEGIN(name, start, end);			\
+#define EXC_REAL_HV(name, start, end)					\
+	EXC_REAL_BEGIN(name, start, end);				\
 	STD_EXCEPTION_HV(start, start, name##_common);			\
 	EXC_REAL_END(name, start, end);
 
-#define EXC_VIRT_HV(name, start, end, realvec)		\
-	EXC_VIRT_BEGIN(name, start, end);			\
+#define EXC_VIRT_HV(name, start, end, realvec)				\
+	EXC_VIRT_BEGIN(name, start, end);				\
 	STD_RELON_EXCEPTION_HV(start, realvec, name##_common);		\
 	EXC_VIRT_END(name, start, end);
 
-#define __EXC_REAL_OOL(name, start, end)			\
-	EXC_REAL_BEGIN(name, start, end);			\
+#define __EXC_REAL_OOL(name, start, end)				\
+	EXC_REAL_BEGIN(name, start, end);				\
 	__OOL_EXCEPTION(start, label, tramp_real_##name);		\
 	EXC_REAL_END(name, start, end);
 
-#define __TRAMP_REAL_REAL_OOL(name, vec)				\
+#define __TRAMP_REAL_OOL(name, vec)					\
 	TRAMP_REAL_BEGIN(tramp_real_##name);				\
 	STD_EXCEPTION_PSERIES_OOL(vec, name##_common);			\
 
-#define EXC_REAL_OOL(name, start, end)			\
-	__EXC_REAL_OOL(name, start, end);			\
-	__TRAMP_REAL_REAL_OOL(name, start);
+#define EXC_REAL_OOL(name, start, end)					\
+	__EXC_REAL_OOL(name, start, end);				\
+	__TRAMP_REAL_OOL(name, start);
 
-#define __EXC_REAL_OOL_MASKABLE(name, start, end)		\
+#define __EXC_REAL_OOL_MASKABLE(name, start, end)			\
 	__EXC_REAL_OOL(name, start, end);
 
-#define __TRAMP_REAL_REAL_OOL_MASKABLE(name, vec)			\
+#define __TRAMP_REAL_OOL_MASKABLE(name, vec)				\
 	TRAMP_REAL_BEGIN(tramp_real_##name);				\
 	MASKABLE_EXCEPTION_PSERIES_OOL(vec, name##_common);		\
 
-#define EXC_REAL_OOL_MASKABLE(name, start, end)		\
-	__EXC_REAL_OOL_MASKABLE(name, start, end);		\
-	__TRAMP_REAL_REAL_OOL_MASKABLE(name, start);
+#define EXC_REAL_OOL_MASKABLE(name, start, end)				\
+	__EXC_REAL_OOL_MASKABLE(name, start, end);			\
+	__TRAMP_REAL_OOL_MASKABLE(name, start);
 
-#define __EXC_REAL_OOL_HV_DIRECT(name, start, end, handler)	\
-	EXC_REAL_BEGIN(name, start, end);			\
+#define __EXC_REAL_OOL_HV_DIRECT(name, start, end, handler)		\
+	EXC_REAL_BEGIN(name, start, end);				\
 	__OOL_EXCEPTION(start, label, handler);				\
 	EXC_REAL_END(name, start, end);
 
-#define __EXC_REAL_OOL_HV(name, start, end)			\
+#define __EXC_REAL_OOL_HV(name, start, end)				\
 	__EXC_REAL_OOL(name, start, end);
 
-#define __TRAMP_REAL_REAL_OOL_HV(name, vec)				\
+#define __TRAMP_REAL_OOL_HV(name, vec)					\
 	TRAMP_REAL_BEGIN(tramp_real_##name);				\
 	STD_EXCEPTION_HV_OOL(vec, name##_common);			\
 
-#define EXC_REAL_OOL_HV(name, start, end)			\
-	__EXC_REAL_OOL_HV(name, start, end);			\
-	__TRAMP_REAL_REAL_OOL_HV(name, start);
+#define EXC_REAL_OOL_HV(name, start, end)				\
+	__EXC_REAL_OOL_HV(name, start, end);				\
+	__TRAMP_REAL_OOL_HV(name, start);
 
-#define __EXC_REAL_OOL_MASKABLE_HV(name, start, end)		\
+#define __EXC_REAL_OOL_MASKABLE_HV(name, start, end)			\
 	__EXC_REAL_OOL(name, start, end);
 
-#define __TRAMP_REAL_REAL_OOL_MASKABLE_HV(name, vec)			\
+#define __TRAMP_REAL_OOL_MASKABLE_HV(name, vec)				\
 	TRAMP_REAL_BEGIN(tramp_real_##name);				\
 	MASKABLE_EXCEPTION_HV_OOL(vec, name##_common);			\
 
-#define EXC_REAL_OOL_MASKABLE_HV(name, start, end)		\
-	__EXC_REAL_OOL_MASKABLE_HV(name, start, end);	\
-	__TRAMP_REAL_REAL_OOL_MASKABLE_HV(name, start);
+#define EXC_REAL_OOL_MASKABLE_HV(name, start, end)			\
+	__EXC_REAL_OOL_MASKABLE_HV(name, start, end);			\
+	__TRAMP_REAL_OOL_MASKABLE_HV(name, start);
 
-#define __EXC_VIRT_OOL(name, start, end)			\
-	EXC_VIRT_BEGIN(name, start, end);			\
+#define __EXC_VIRT_OOL(name, start, end)				\
+	EXC_VIRT_BEGIN(name, start, end);				\
 	__OOL_EXCEPTION(start, label, tramp_virt_##name);		\
 	EXC_VIRT_END(name, start, end);
 
-#define __TRAMP_REAL_VIRT_OOL(name, realvec)				\
-	TRAMP_VIRT_BEGIN(tramp_virt_##name);			\
+#define __TRAMP_VIRT_OOL(name, realvec)					\
+	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
 	STD_RELON_EXCEPTION_PSERIES_OOL(realvec, name##_common);	\
 
-#define EXC_VIRT_OOL(name, start, end, realvec)		\
-	__EXC_VIRT_OOL(name, start, end);			\
-	__TRAMP_REAL_VIRT_OOL(name, realvec);
+#define EXC_VIRT_OOL(name, start, end, realvec)				\
+	__EXC_VIRT_OOL(name, start, end);				\
+	__TRAMP_VIRT_OOL(name, realvec);
 
-#define __EXC_VIRT_OOL_MASKABLE(name, start, end)		\
+#define __EXC_VIRT_OOL_MASKABLE(name, start, end)			\
 	__EXC_VIRT_OOL(name, start, end);
 
-#define __TRAMP_REAL_VIRT_OOL_MASKABLE(name, realvec)		\
-	TRAMP_VIRT_BEGIN(tramp_virt_##name);			\
+#define __TRAMP_VIRT_OOL_MASKABLE(name, realvec)			\
+	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
 	MASKABLE_RELON_EXCEPTION_PSERIES_OOL(realvec, name##_common);	\
 
-#define EXC_VIRT_OOL_MASKABLE(name, start, end, realvec)	\
-	__EXC_VIRT_OOL_MASKABLE(name, start, end);		\
-	__TRAMP_REAL_VIRT_OOL_MASKABLE(name, realvec);
+#define EXC_VIRT_OOL_MASKABLE(name, start, end, realvec)		\
+	__EXC_VIRT_OOL_MASKABLE(name, start, end);			\
+	__TRAMP_VIRT_OOL_MASKABLE(name, realvec);
 
-#define __EXC_VIRT_OOL_HV(name, start, end)			\
+#define __EXC_VIRT_OOL_HV(name, start, end)				\
 	__EXC_VIRT_OOL(name, start, end);
 
-#define __TRAMP_REAL_VIRT_OOL_HV(name, realvec)			\
-	TRAMP_VIRT_BEGIN(tramp_virt_##name);			\
+#define __TRAMP_VIRT_OOL_HV(name, realvec)				\
+	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
 	STD_RELON_EXCEPTION_HV_OOL(realvec, name##_common);		\
 
-#define EXC_VIRT_OOL_HV(name, start, end, realvec)		\
-	__EXC_VIRT_OOL_HV(name, start, end);			\
-	__TRAMP_REAL_VIRT_OOL_HV(name, realvec);
+#define EXC_VIRT_OOL_HV(name, start, end, realvec)			\
+	__EXC_VIRT_OOL_HV(name, start, end);				\
+	__TRAMP_VIRT_OOL_HV(name, realvec);
 
-#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, end)		\
+#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, end)			\
 	__EXC_VIRT_OOL(name, start, end);
 
-#define __TRAMP_REAL_VIRT_OOL_MASKABLE_HV(name, realvec)		\
-	TRAMP_VIRT_BEGIN(tramp_virt_##name);			\
+#define __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec)			\
+	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
 	MASKABLE_RELON_EXCEPTION_HV_OOL(realvec, name##_common);	\
 
-#define EXC_VIRT_OOL_MASKABLE_HV(name, start, end, realvec)	\
-	__EXC_VIRT_OOL_MASKABLE_HV(name, start, end);	\
-	__TRAMP_REAL_VIRT_OOL_MASKABLE_HV(name, realvec);
+#define EXC_VIRT_OOL_MASKABLE_HV(name, start, end, realvec)		\
+	__EXC_VIRT_OOL_MASKABLE_HV(name, start, end);			\
+	__TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec);
 
 #define TRAMP_KVM(area, n)						\
 	TRAMP_KVM_BEGIN(do_kvm_##n);					\
@@ -378,16 +378,16 @@ name:
 	TRAMP_KVM_BEGIN(do_kvm_H##n);					\
 	KVM_HANDLER_SKIP(area, EXC_HV, n + 0x2);			\
 
-#define EXC_COMMON(name, realvec, hdlr)				\
-	EXC_COMMON_BEGIN(name);					\
+#define EXC_COMMON(name, realvec, hdlr)					\
+	EXC_COMMON_BEGIN(name);						\
 	STD_EXCEPTION_COMMON(realvec, name, hdlr);			\
 
-#define EXC_COMMON_ASYNC(name, realvec, hdlr)			\
-	EXC_COMMON_BEGIN(name);					\
+#define EXC_COMMON_ASYNC(name, realvec, hdlr)				\
+	EXC_COMMON_BEGIN(name);						\
 	STD_EXCEPTION_COMMON_ASYNC(realvec, name, hdlr);		\
 
 #define EXC_COMMON_HV(name, realvec, hdlr)				\
-	EXC_COMMON_BEGIN(name);					\
+	EXC_COMMON_BEGIN(name);						\
 	STD_EXCEPTION_COMMON(realvec + 0x2, name, hdlr);		\
 
 #endif	/* _ASM_POWERPC_HEAD_64_H */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 069aac8af909..bc1c2479a04c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -960,7 +960,7 @@ EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
  * mode.
  */
 __EXC_REAL_OOL_HV_DIRECT(hmi_exception, 0xe60, 0xe80, hmi_exception_early)
-__TRAMP_REAL_REAL_OOL_MASKABLE_HV(hmi_exception, 0xe60)
+__TRAMP_REAL_OOL_MASKABLE_HV(hmi_exception, 0xe60)
 EXC_VIRT_NONE(0x4e60, 0x4e80)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
 TRAMP_REAL_BEGIN(hmi_exception_early)
-- 
cgit v1.2.3


From 1a6822d194c3f627eeb6aaca6688a5d0a444663e Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 6 Dec 2016 11:41:12 +1000
Subject: powerpc/64s: Use (start, size) rather than (start, end) for exception
 handlers

start,size has the benefit of being easier to search for (start,end
usually gives you the preceding vector from the one you want, as first
result).

Suggested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/head-64.h   | 158 ++++++++++++++--------------
 arch/powerpc/kernel/exceptions-64s.S | 195 ++++++++++++++++++-----------------
 2 files changed, 185 insertions(+), 168 deletions(-)

diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index c691fc2f5dae..a475711cd9c3 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -38,8 +38,8 @@
  *     li  r10,128
  *     mv  r11,r10
 
- * FIXED_SECTION_ENTRY_BEGIN_LOCATION(section_name, label2, start_address)
- * FIXED_SECTION_ENTRY_END_LOCATION(section_name, label2, end_address)
+ * FIXED_SECTION_ENTRY_BEGIN_LOCATION(section_name, label2, start_address, size)
+ * FIXED_SECTION_ENTRY_END_LOCATION(section_name, label2, start_address, size)
  * CLOSE_FIXED_SECTION(section_name)
  *
  * ZERO_FIXED_SECTION can be used to emit zeroed data.
@@ -102,9 +102,15 @@ name:
 #define FIXED_SECTION_ENTRY_BEGIN(sname, name)			\
 	__FIXED_SECTION_ENTRY_BEGIN(sname, name, IFETCH_ALIGN_BYTES)
 
-#define FIXED_SECTION_ENTRY_BEGIN_LOCATION(sname, name, start)	\
+#define FIXED_SECTION_ENTRY_BEGIN_LOCATION(sname, name, start, size) \
 	USE_FIXED_SECTION(sname);				\
 	name##_start = (start);					\
+	.if ((start) % (size) != 0);				\
+	.error "Fixed section exception vector misalignment";	\
+	.endif;							\
+	.if ((size) != 0x20) && ((size) != 0x80) && ((size) != 0x100); \
+	.error "Fixed section exception vector bad size";	\
+	.endif;							\
 	.if (start) < sname##_start;				\
 	.error "Fixed section underflow";			\
 	.abort;							\
@@ -113,16 +119,16 @@ name:
 	.global name;						\
 name:
 
-#define FIXED_SECTION_ENTRY_END_LOCATION(sname, name, end)	\
-	.if (end) > sname##_end;				\
+#define FIXED_SECTION_ENTRY_END_LOCATION(sname, name, start, size) \
+	.if (start) + (size) > sname##_end;			\
 	.error "Fixed section overflow";			\
 	.abort;							\
 	.endif;							\
-	.if (. - name > end - name##_start);			\
+	.if (. - name > (start) + (size) - name##_start);	\
 	.error "Fixed entry overflow";				\
 	.abort;							\
 	.endif;							\
-	. = ((end) - sname##_start);				\
+	. = ((start) + (size) - sname##_start);			\
 
 
 /*
@@ -191,17 +197,17 @@ name:
  * and OOL handlers are implemented as types of TRAMP and TRAMP_VIRT handlers.
  */
 
-#define EXC_REAL_BEGIN(name, start, end)			\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start)
+#define EXC_REAL_BEGIN(name, start, size)			\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
 
-#define EXC_REAL_END(name, start, end)				\
-	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, end)
+#define EXC_REAL_END(name, start, size)				\
+	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size)
 
-#define EXC_VIRT_BEGIN(name, start, end)			\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start)
+#define EXC_VIRT_BEGIN(name, start, size)			\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
 
-#define EXC_VIRT_END(name, start, end)				\
-	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, end)
+#define EXC_VIRT_END(name, start, size)				\
+	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size)
 
 #define EXC_COMMON_BEGIN(name)					\
 	USE_TEXT_SECTION();					\
@@ -223,140 +229,140 @@ name:
 #define TRAMP_KVM_BEGIN(name)
 #endif
 
-#define EXC_REAL_NONE(start, end)				\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start); \
-	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, end)
+#define EXC_REAL_NONE(start, size)				\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \
+	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size)
 
-#define EXC_VIRT_NONE(start, end)				\
-	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start); \
-	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, end);
+#define EXC_VIRT_NONE(start, size)				\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \
+	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size);
 
 
-#define EXC_REAL(name, start, end)					\
-	EXC_REAL_BEGIN(name, start, end);				\
+#define EXC_REAL(name, start, size)					\
+	EXC_REAL_BEGIN(name, start, size);				\
 	STD_EXCEPTION_PSERIES(start, name##_common);			\
-	EXC_REAL_END(name, start, end);
+	EXC_REAL_END(name, start, size);
 
-#define EXC_VIRT(name, start, end, realvec)				\
-	EXC_VIRT_BEGIN(name, start, end);				\
+#define EXC_VIRT(name, start, size, realvec)				\
+	EXC_VIRT_BEGIN(name, start, size);				\
 	STD_RELON_EXCEPTION_PSERIES(start, realvec, name##_common);	\
-	EXC_VIRT_END(name, start, end);
+	EXC_VIRT_END(name, start, size);
 
-#define EXC_REAL_MASKABLE(name, start, end)				\
-	EXC_REAL_BEGIN(name, start, end);				\
+#define EXC_REAL_MASKABLE(name, start, size)				\
+	EXC_REAL_BEGIN(name, start, size);				\
 	MASKABLE_EXCEPTION_PSERIES(start, start, name##_common);	\
-	EXC_REAL_END(name, start, end);
+	EXC_REAL_END(name, start, size);
 
-#define EXC_VIRT_MASKABLE(name, start, end, realvec)			\
-	EXC_VIRT_BEGIN(name, start, end);				\
+#define EXC_VIRT_MASKABLE(name, start, size, realvec)			\
+	EXC_VIRT_BEGIN(name, start, size);				\
 	MASKABLE_RELON_EXCEPTION_PSERIES(start, realvec, name##_common); \
-	EXC_VIRT_END(name, start, end);
+	EXC_VIRT_END(name, start, size);
 
-#define EXC_REAL_HV(name, start, end)					\
-	EXC_REAL_BEGIN(name, start, end);				\
+#define EXC_REAL_HV(name, start, size)					\
+	EXC_REAL_BEGIN(name, start, size);				\
 	STD_EXCEPTION_HV(start, start, name##_common);			\
-	EXC_REAL_END(name, start, end);
+	EXC_REAL_END(name, start, size);
 
-#define EXC_VIRT_HV(name, start, end, realvec)				\
-	EXC_VIRT_BEGIN(name, start, end);				\
+#define EXC_VIRT_HV(name, start, size, realvec)				\
+	EXC_VIRT_BEGIN(name, start, size);				\
 	STD_RELON_EXCEPTION_HV(start, realvec, name##_common);		\
-	EXC_VIRT_END(name, start, end);
+	EXC_VIRT_END(name, start, size);
 
-#define __EXC_REAL_OOL(name, start, end)				\
-	EXC_REAL_BEGIN(name, start, end);				\
+#define __EXC_REAL_OOL(name, start, size)				\
+	EXC_REAL_BEGIN(name, start, size);				\
 	__OOL_EXCEPTION(start, label, tramp_real_##name);		\
-	EXC_REAL_END(name, start, end);
+	EXC_REAL_END(name, start, size);
 
 #define __TRAMP_REAL_OOL(name, vec)					\
 	TRAMP_REAL_BEGIN(tramp_real_##name);				\
 	STD_EXCEPTION_PSERIES_OOL(vec, name##_common);			\
 
-#define EXC_REAL_OOL(name, start, end)					\
-	__EXC_REAL_OOL(name, start, end);				\
+#define EXC_REAL_OOL(name, start, size)					\
+	__EXC_REAL_OOL(name, start, size);				\
 	__TRAMP_REAL_OOL(name, start);
 
-#define __EXC_REAL_OOL_MASKABLE(name, start, end)			\
-	__EXC_REAL_OOL(name, start, end);
+#define __EXC_REAL_OOL_MASKABLE(name, start, size)			\
+	__EXC_REAL_OOL(name, start, size);
 
 #define __TRAMP_REAL_OOL_MASKABLE(name, vec)				\
 	TRAMP_REAL_BEGIN(tramp_real_##name);				\
 	MASKABLE_EXCEPTION_PSERIES_OOL(vec, name##_common);		\
 
-#define EXC_REAL_OOL_MASKABLE(name, start, end)				\
-	__EXC_REAL_OOL_MASKABLE(name, start, end);			\
+#define EXC_REAL_OOL_MASKABLE(name, start, size)			\
+	__EXC_REAL_OOL_MASKABLE(name, start, size);			\
 	__TRAMP_REAL_OOL_MASKABLE(name, start);
 
-#define __EXC_REAL_OOL_HV_DIRECT(name, start, end, handler)		\
-	EXC_REAL_BEGIN(name, start, end);				\
+#define __EXC_REAL_OOL_HV_DIRECT(name, start, size, handler)		\
+	EXC_REAL_BEGIN(name, start, size);				\
 	__OOL_EXCEPTION(start, label, handler);				\
-	EXC_REAL_END(name, start, end);
+	EXC_REAL_END(name, start, size);
 
-#define __EXC_REAL_OOL_HV(name, start, end)				\
-	__EXC_REAL_OOL(name, start, end);
+#define __EXC_REAL_OOL_HV(name, start, size)				\
+	__EXC_REAL_OOL(name, start, size);
 
 #define __TRAMP_REAL_OOL_HV(name, vec)					\
 	TRAMP_REAL_BEGIN(tramp_real_##name);				\
 	STD_EXCEPTION_HV_OOL(vec, name##_common);			\
 
-#define EXC_REAL_OOL_HV(name, start, end)				\
-	__EXC_REAL_OOL_HV(name, start, end);				\
+#define EXC_REAL_OOL_HV(name, start, size)				\
+	__EXC_REAL_OOL_HV(name, start, size);				\
 	__TRAMP_REAL_OOL_HV(name, start);
 
-#define __EXC_REAL_OOL_MASKABLE_HV(name, start, end)			\
-	__EXC_REAL_OOL(name, start, end);
+#define __EXC_REAL_OOL_MASKABLE_HV(name, start, size)			\
+	__EXC_REAL_OOL(name, start, size);
 
 #define __TRAMP_REAL_OOL_MASKABLE_HV(name, vec)				\
 	TRAMP_REAL_BEGIN(tramp_real_##name);				\
 	MASKABLE_EXCEPTION_HV_OOL(vec, name##_common);			\
 
-#define EXC_REAL_OOL_MASKABLE_HV(name, start, end)			\
-	__EXC_REAL_OOL_MASKABLE_HV(name, start, end);			\
+#define EXC_REAL_OOL_MASKABLE_HV(name, start, size)			\
+	__EXC_REAL_OOL_MASKABLE_HV(name, start, size);			\
 	__TRAMP_REAL_OOL_MASKABLE_HV(name, start);
 
-#define __EXC_VIRT_OOL(name, start, end)				\
-	EXC_VIRT_BEGIN(name, start, end);				\
+#define __EXC_VIRT_OOL(name, start, size)				\
+	EXC_VIRT_BEGIN(name, start, size);				\
 	__OOL_EXCEPTION(start, label, tramp_virt_##name);		\
-	EXC_VIRT_END(name, start, end);
+	EXC_VIRT_END(name, start, size);
 
 #define __TRAMP_VIRT_OOL(name, realvec)					\
 	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
 	STD_RELON_EXCEPTION_PSERIES_OOL(realvec, name##_common);	\
 
-#define EXC_VIRT_OOL(name, start, end, realvec)				\
-	__EXC_VIRT_OOL(name, start, end);				\
+#define EXC_VIRT_OOL(name, start, size, realvec)			\
+	__EXC_VIRT_OOL(name, start, size);				\
 	__TRAMP_VIRT_OOL(name, realvec);
 
-#define __EXC_VIRT_OOL_MASKABLE(name, start, end)			\
-	__EXC_VIRT_OOL(name, start, end);
+#define __EXC_VIRT_OOL_MASKABLE(name, start, size)			\
+	__EXC_VIRT_OOL(name, start, size);
 
 #define __TRAMP_VIRT_OOL_MASKABLE(name, realvec)			\
 	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
 	MASKABLE_RELON_EXCEPTION_PSERIES_OOL(realvec, name##_common);	\
 
-#define EXC_VIRT_OOL_MASKABLE(name, start, end, realvec)		\
-	__EXC_VIRT_OOL_MASKABLE(name, start, end);			\
+#define EXC_VIRT_OOL_MASKABLE(name, start, size, realvec)		\
+	__EXC_VIRT_OOL_MASKABLE(name, start, size);			\
 	__TRAMP_VIRT_OOL_MASKABLE(name, realvec);
 
-#define __EXC_VIRT_OOL_HV(name, start, end)				\
-	__EXC_VIRT_OOL(name, start, end);
+#define __EXC_VIRT_OOL_HV(name, start, size)				\
+	__EXC_VIRT_OOL(name, start, size);
 
 #define __TRAMP_VIRT_OOL_HV(name, realvec)				\
 	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
 	STD_RELON_EXCEPTION_HV_OOL(realvec, name##_common);		\
 
-#define EXC_VIRT_OOL_HV(name, start, end, realvec)			\
-	__EXC_VIRT_OOL_HV(name, start, end);				\
+#define EXC_VIRT_OOL_HV(name, start, size, realvec)			\
+	__EXC_VIRT_OOL_HV(name, start, size);				\
 	__TRAMP_VIRT_OOL_HV(name, realvec);
 
-#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, end)			\
-	__EXC_VIRT_OOL(name, start, end);
+#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, size)			\
+	__EXC_VIRT_OOL(name, start, size);
 
 #define __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec)			\
 	TRAMP_VIRT_BEGIN(tramp_virt_##name);				\
 	MASKABLE_RELON_EXCEPTION_HV_OOL(realvec, name##_common);	\
 
-#define EXC_VIRT_OOL_MASKABLE_HV(name, start, end, realvec)		\
-	__EXC_VIRT_OOL_MASKABLE_HV(name, start, end);			\
+#define EXC_VIRT_OOL_MASKABLE_HV(name, start, size, realvec)		\
+	__EXC_VIRT_OOL_MASKABLE_HV(name, start, size);			\
 	__TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec);
 
 #define TRAMP_KVM(area, n)						\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index bc1c2479a04c..a6205a4a3574 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -93,7 +93,7 @@ USE_FIXED_SECTION(real_vectors)
 __start_interrupts:
 
 /* No virt vectors corresponding with 0x0..0x100 */
-EXC_VIRT_NONE(0x4000, 0x4100)
+EXC_VIRT_NONE(0x4000, 0x100)
 
 
 #ifdef CONFIG_PPC_P7_NAP
@@ -114,15 +114,15 @@ EXC_VIRT_NONE(0x4000, 0x4100)
 #define IDLETEST NOTEST
 #endif
 
-EXC_REAL_BEGIN(system_reset, 0x100, 0x200)
+EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
 	SET_SCRATCH0(r13)
 	GET_PACA(r13)
 	clrrdi	r13,r13,1 /* Last bit of HSPRG0 is set if waking from winkle */
 	EXCEPTION_PROLOG_PSERIES_PACA(PACA_EXGEN, system_reset_common, EXC_STD,
 				 IDLETEST, 0x100)
 
-EXC_REAL_END(system_reset, 0x100, 0x200)
-EXC_VIRT_NONE(0x4100, 0x4200)
+EXC_REAL_END(system_reset, 0x100, 0x100)
+EXC_VIRT_NONE(0x4100, 0x100)
 
 #ifdef CONFIG_PPC_P7_NAP
 EXC_COMMON_BEGIN(system_reset_idle_common)
@@ -166,7 +166,7 @@ TRAMP_REAL_BEGIN(system_reset_fwnmi)
 #endif /* CONFIG_PPC_PSERIES */
 
 
-EXC_REAL_BEGIN(machine_check, 0x200, 0x300)
+EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
 	/* This is moved out of line as it can be patched by FW, but
 	 * some code path might still want to branch into the original
 	 * vector
@@ -186,8 +186,8 @@ BEGIN_FTR_SECTION
 FTR_SECTION_ELSE
 	b	machine_check_pSeries_0
 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
-EXC_REAL_END(machine_check, 0x200, 0x300)
-EXC_VIRT_NONE(0x4200, 0x4300)
+EXC_REAL_END(machine_check, 0x200, 0x100)
+EXC_VIRT_NONE(0x4200, 0x100)
 TRAMP_REAL_BEGIN(machine_check_powernv_early)
 BEGIN_FTR_SECTION
 	EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
@@ -483,8 +483,8 @@ EXC_COMMON_BEGIN(unrecover_mce)
 	b	1b
 
 
-EXC_REAL(data_access, 0x300, 0x380)
-EXC_VIRT(data_access, 0x4300, 0x4380, 0x300)
+EXC_REAL(data_access, 0x300, 0x80)
+EXC_VIRT(data_access, 0x4300, 0x80, 0x300)
 TRAMP_KVM_SKIP(PACA_EXGEN, 0x300)
 
 EXC_COMMON_BEGIN(data_access_common)
@@ -512,7 +512,7 @@ MMU_FTR_SECTION_ELSE
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
-EXC_REAL_BEGIN(data_access_slb, 0x380, 0x400)
+EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXSLB)
 	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
@@ -533,9 +533,9 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x400)
 	mtctr	r10
 	bctr
 #endif
-EXC_REAL_END(data_access_slb, 0x380, 0x400)
+EXC_REAL_END(data_access_slb, 0x380, 0x80)
 
-EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x4400)
+EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXSLB)
 	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
@@ -556,12 +556,12 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x4400)
 	mtctr	r10
 	bctr
 #endif
-EXC_VIRT_END(data_access_slb, 0x4380, 0x4400)
+EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
 TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
 
 
-EXC_REAL(instruction_access, 0x400, 0x480)
-EXC_VIRT(instruction_access, 0x4400, 0x4480, 0x400)
+EXC_REAL(instruction_access, 0x400, 0x80)
+EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400)
 TRAMP_KVM(PACA_EXGEN, 0x400)
 
 EXC_COMMON_BEGIN(instruction_access_common)
@@ -580,7 +580,7 @@ MMU_FTR_SECTION_ELSE
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
-EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x500)
+EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80)
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXSLB)
 	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
@@ -596,9 +596,9 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x500)
 	mtctr	r10
 	bctr
 #endif
-EXC_REAL_END(instruction_access_slb, 0x480, 0x500)
+EXC_REAL_END(instruction_access_slb, 0x480, 0x80)
 
-EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x4500)
+EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXSLB)
 	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480)
@@ -614,7 +614,7 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x4500)
 	mtctr	r10
 	bctr
 #endif
-EXC_VIRT_END(instruction_access_slb, 0x4480, 0x4500)
+EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
 TRAMP_KVM(PACA_EXSLB, 0x480)
 
 
@@ -711,7 +711,7 @@ EXC_COMMON_BEGIN(bad_addr_slb)
 	bl	slb_miss_bad_addr
 	b	ret_from_except
 
-EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x600)
+EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100)
 	.globl hardware_interrupt_hv;
 hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
@@ -725,9 +725,9 @@ do_kvm_H0x500:
 do_kvm_0x500:
 		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-EXC_REAL_END(hardware_interrupt, 0x500, 0x600)
+EXC_REAL_END(hardware_interrupt, 0x500, 0x100)
 
-EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x4600)
+EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
 	.globl hardware_interrupt_relon_hv;
 hardware_interrupt_relon_hv:
 	BEGIN_FTR_SECTION
@@ -735,13 +735,13 @@ hardware_interrupt_relon_hv:
 	FTR_SECTION_ELSE
 		_MASKABLE_RELON_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_STD, SOFTEN_TEST_PR)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
-EXC_VIRT_END(hardware_interrupt, 0x4500, 0x4600)
+EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
 
 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
 
-EXC_REAL(alignment, 0x600, 0x700)
-EXC_VIRT(alignment, 0x4600, 0x4700, 0x600)
+EXC_REAL(alignment, 0x600, 0x100)
+EXC_VIRT(alignment, 0x4600, 0x100, 0x600)
 TRAMP_KVM(PACA_EXGEN, 0x600)
 EXC_COMMON_BEGIN(alignment_common)
 	mfspr	r10,SPRN_DAR
@@ -760,8 +760,8 @@ EXC_COMMON_BEGIN(alignment_common)
 	b	ret_from_except
 
 
-EXC_REAL(program_check, 0x700, 0x800)
-EXC_VIRT(program_check, 0x4700, 0x4800, 0x700)
+EXC_REAL(program_check, 0x700, 0x100)
+EXC_VIRT(program_check, 0x4700, 0x100, 0x700)
 TRAMP_KVM(PACA_EXGEN, 0x700)
 EXC_COMMON_BEGIN(program_check_common)
 	EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
@@ -772,8 +772,8 @@ EXC_COMMON_BEGIN(program_check_common)
 	b	ret_from_except
 
 
-EXC_REAL(fp_unavailable, 0x800, 0x900)
-EXC_VIRT(fp_unavailable, 0x4800, 0x4900, 0x800)
+EXC_REAL(fp_unavailable, 0x800, 0x100)
+EXC_VIRT(fp_unavailable, 0x4800, 0x100, 0x800)
 TRAMP_KVM(PACA_EXGEN, 0x800)
 EXC_COMMON_BEGIN(fp_unavailable_common)
 	EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN)
@@ -805,20 +805,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
 
-EXC_REAL_MASKABLE(decrementer, 0x900, 0x980)
-EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x4980, 0x900)
+EXC_REAL_MASKABLE(decrementer, 0x900, 0x80)
+EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x80, 0x900)
 TRAMP_KVM(PACA_EXGEN, 0x900)
 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt)
 
 
-EXC_REAL_HV(hdecrementer, 0x980, 0xa00)
-EXC_VIRT_HV(hdecrementer, 0x4980, 0x4a00, 0x980)
+EXC_REAL_HV(hdecrementer, 0x980, 0x80)
+EXC_VIRT_HV(hdecrementer, 0x4980, 0x80, 0x980)
 TRAMP_KVM_HV(PACA_EXGEN, 0x980)
 EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt)
 
 
-EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0xb00)
-EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x4b00, 0xa00)
+EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0x100)
+EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x100, 0xa00)
 TRAMP_KVM(PACA_EXGEN, 0xa00)
 #ifdef CONFIG_PPC_DOORBELL
 EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception)
@@ -827,8 +827,8 @@ EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception)
 #endif
 
 
-EXC_REAL(trap_0b, 0xb00, 0xc00)
-EXC_VIRT(trap_0b, 0x4b00, 0x4c00, 0xb00)
+EXC_REAL(trap_0b, 0xb00, 0x100)
+EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00)
 TRAMP_KVM(PACA_EXGEN, 0xb00)
 EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 
@@ -884,7 +884,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 	b	system_call_common ;
 #endif
 
-EXC_REAL_BEGIN(system_call, 0xc00, 0xd00)
+EXC_REAL_BEGIN(system_call, 0xc00, 0x100)
 	 /*
 	  * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
 	  * that support it) before changing to HMT_MEDIUM. That allows the KVM
@@ -909,25 +909,25 @@ EXC_REAL_BEGIN(system_call, 0xc00, 0xd00)
 	SYSCALL_PSERIES_1
 	SYSCALL_PSERIES_2_RFID
 	SYSCALL_PSERIES_3
-EXC_REAL_END(system_call, 0xc00, 0xd00)
+EXC_REAL_END(system_call, 0xc00, 0x100)
 
-EXC_VIRT_BEGIN(system_call, 0x4c00, 0x4d00)
+EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100)
 	HMT_MEDIUM
 	SYSCALL_PSERIES_1
 	SYSCALL_PSERIES_2_DIRECT
 	SYSCALL_PSERIES_3
-EXC_VIRT_END(system_call, 0x4c00, 0x4d00)
+EXC_VIRT_END(system_call, 0x4c00, 0x100)
 
 TRAMP_KVM(PACA_EXGEN, 0xc00)
 
 
-EXC_REAL(single_step, 0xd00, 0xe00)
-EXC_VIRT(single_step, 0x4d00, 0x4e00, 0xd00)
+EXC_REAL(single_step, 0xd00, 0x100)
+EXC_VIRT(single_step, 0x4d00, 0x100, 0xd00)
 TRAMP_KVM(PACA_EXGEN, 0xd00)
 EXC_COMMON(single_step_common, 0xd00, single_step_exception)
 
-EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0xe20)
-EXC_VIRT_NONE(0x4e00, 0x4e20)
+EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0x20)
+EXC_VIRT_NONE(0x4e00, 0x20)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
 EXC_COMMON_BEGIN(h_data_storage_common)
 	mfspr   r10,SPRN_HDAR
@@ -942,14 +942,14 @@ EXC_COMMON_BEGIN(h_data_storage_common)
 	b       ret_from_except
 
 
-EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0xe40)
-EXC_VIRT_NONE(0x4e20, 0x4e40)
+EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0x20)
+EXC_VIRT_NONE(0x4e20, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe20)
 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
 
 
-EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0xe60)
-EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x4e60, 0xe40)
+EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0x20)
+EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x20, 0xe40)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe40)
 EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
 
@@ -959,9 +959,9 @@ EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
  * first, and then eventaully from there to the trampoline to get into virtual
  * mode.
  */
-__EXC_REAL_OOL_HV_DIRECT(hmi_exception, 0xe60, 0xe80, hmi_exception_early)
+__EXC_REAL_OOL_HV_DIRECT(hmi_exception, 0xe60, 0x20, hmi_exception_early)
 __TRAMP_REAL_OOL_MASKABLE_HV(hmi_exception, 0xe60)
-EXC_VIRT_NONE(0x4e60, 0x4e80)
+EXC_VIRT_NONE(0x4e60, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
 TRAMP_REAL_BEGIN(hmi_exception_early)
 	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60)
@@ -1015,8 +1015,8 @@ hmi_exception_after_realmode:
 EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception)
 
 
-EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0xea0)
-EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x4ea0, 0xe80)
+EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20)
+EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe80)
 #ifdef CONFIG_PPC_DOORBELL
 EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception)
@@ -1025,24 +1025,26 @@ EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception)
 #endif
 
 
-EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0xec0)
-EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x4ec0, 0xea0)
+EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0x20)
+EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x20, 0xea0)
 TRAMP_KVM_HV(PACA_EXGEN, 0xea0)
 EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ)
 
 
-EXC_REAL_NONE(0xec0, 0xf00)
-EXC_VIRT_NONE(0x4ec0, 0x4f00)
+EXC_REAL_NONE(0xec0, 0x20)
+EXC_VIRT_NONE(0x4ec0, 0x20)
+EXC_REAL_NONE(0xee0, 0x20)
+EXC_VIRT_NONE(0x4ee0, 0x20)
 
 
-EXC_REAL_OOL(performance_monitor, 0xf00, 0xf20)
-EXC_VIRT_OOL(performance_monitor, 0x4f00, 0x4f20, 0xf00)
+EXC_REAL_OOL(performance_monitor, 0xf00, 0x20)
+EXC_VIRT_OOL(performance_monitor, 0x4f00, 0x20, 0xf00)
 TRAMP_KVM(PACA_EXGEN, 0xf00)
 EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception)
 
 
-EXC_REAL_OOL(altivec_unavailable, 0xf20, 0xf40)
-EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x4f40, 0xf20)
+EXC_REAL_OOL(altivec_unavailable, 0xf20, 0x20)
+EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x20, 0xf20)
 TRAMP_KVM(PACA_EXGEN, 0xf20)
 EXC_COMMON_BEGIN(altivec_unavailable_common)
 	EXCEPTION_PROLOG_COMMON(0xf20, PACA_EXGEN)
@@ -1078,8 +1080,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	b	ret_from_except
 
 
-EXC_REAL_OOL(vsx_unavailable, 0xf40, 0xf60)
-EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x4f60, 0xf40)
+EXC_REAL_OOL(vsx_unavailable, 0xf40, 0x20)
+EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x20, 0xf40)
 TRAMP_KVM(PACA_EXGEN, 0xf40)
 EXC_COMMON_BEGIN(vsx_unavailable_common)
 	EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN)
@@ -1114,41 +1116,50 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	b	ret_from_except
 
 
-EXC_REAL_OOL(facility_unavailable, 0xf60, 0xf80)
-EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x4f80, 0xf60)
+EXC_REAL_OOL(facility_unavailable, 0xf60, 0x20)
+EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x20, 0xf60)
 TRAMP_KVM(PACA_EXGEN, 0xf60)
 EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception)
 
 
-EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0xfa0)
-EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x4fa0, 0xf80)
+EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0x20)
+EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x20, 0xf80)
 TRAMP_KVM_HV(PACA_EXGEN, 0xf80)
 EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception)
 
 
-EXC_REAL_NONE(0xfa0, 0x1200)
-EXC_VIRT_NONE(0x4fa0, 0x5200)
+EXC_REAL_NONE(0xfa0, 0x20)
+EXC_VIRT_NONE(0x4fa0, 0x20)
+EXC_REAL_NONE(0xfc0, 0x20)
+EXC_VIRT_NONE(0x4fc0, 0x20)
+EXC_REAL_NONE(0xfe0, 0x20)
+EXC_VIRT_NONE(0x4fe0, 0x20)
+
+EXC_REAL_NONE(0x1000, 0x100)
+EXC_VIRT_NONE(0x5000, 0x100)
+EXC_REAL_NONE(0x1100, 0x100)
+EXC_VIRT_NONE(0x5100, 0x100)
 
 #ifdef CONFIG_CBE_RAS
-EXC_REAL_HV(cbe_system_error, 0x1200, 0x1300)
-EXC_VIRT_NONE(0x5200, 0x5300)
+EXC_REAL_HV(cbe_system_error, 0x1200, 0x100)
+EXC_VIRT_NONE(0x5200, 0x100)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200)
 EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception)
 #else /* CONFIG_CBE_RAS */
-EXC_REAL_NONE(0x1200, 0x1300)
-EXC_VIRT_NONE(0x5200, 0x5300)
+EXC_REAL_NONE(0x1200, 0x100)
+EXC_VIRT_NONE(0x5200, 0x100)
 #endif
 
 
-EXC_REAL(instruction_breakpoint, 0x1300, 0x1400)
-EXC_VIRT(instruction_breakpoint, 0x5300, 0x5400, 0x1300)
+EXC_REAL(instruction_breakpoint, 0x1300, 0x100)
+EXC_VIRT(instruction_breakpoint, 0x5300, 0x100, 0x1300)
 TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300)
 EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception)
 
-EXC_REAL_NONE(0x1400, 0x1500)
-EXC_VIRT_NONE(0x5400, 0x5500)
+EXC_REAL_NONE(0x1400, 0x100)
+EXC_VIRT_NONE(0x5400, 0x100)
 
-EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x1600)
+EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100)
 	mtspr	SPRN_SPRG_HSCRATCH0,r13
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500)
@@ -1163,14 +1174,14 @@ EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x1600)
 
 	KVMTEST_PR(0x1500)
 	EXCEPTION_PROLOG_PSERIES_1(denorm_common, EXC_HV)
-EXC_REAL_END(denorm_exception_hv, 0x1500, 0x1600)
+EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100)
 
 #ifdef CONFIG_PPC_DENORMALISATION
-EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x5600)
+EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100)
 	b	exc_real_0x1500_denorm_exception_hv
-EXC_VIRT_END(denorm_exception, 0x5500, 0x5600)
+EXC_VIRT_END(denorm_exception, 0x5500, 0x100)
 #else
-EXC_VIRT_NONE(0x5500, 0x5600)
+EXC_VIRT_NONE(0x5500, 0x100)
 #endif
 
 TRAMP_KVM_SKIP(PACA_EXGEN, 0x1500)
@@ -1243,18 +1254,18 @@ EXC_COMMON_HV(denorm_common, 0x1500, unknown_exception)
 
 
 #ifdef CONFIG_CBE_RAS
-EXC_REAL_HV(cbe_maintenance, 0x1600, 0x1700)
-EXC_VIRT_NONE(0x5600, 0x5700)
+EXC_REAL_HV(cbe_maintenance, 0x1600, 0x100)
+EXC_VIRT_NONE(0x5600, 0x100)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600)
 EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception)
 #else /* CONFIG_CBE_RAS */
-EXC_REAL_NONE(0x1600, 0x1700)
-EXC_VIRT_NONE(0x5600, 0x5700)
+EXC_REAL_NONE(0x1600, 0x100)
+EXC_VIRT_NONE(0x5600, 0x100)
 #endif
 
 
-EXC_REAL(altivec_assist, 0x1700, 0x1800)
-EXC_VIRT(altivec_assist, 0x5700, 0x5800, 0x1700)
+EXC_REAL(altivec_assist, 0x1700, 0x100)
+EXC_VIRT(altivec_assist, 0x5700, 0x100, 0x1700)
 TRAMP_KVM(PACA_EXGEN, 0x1700)
 #ifdef CONFIG_ALTIVEC
 EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception)
@@ -1264,13 +1275,13 @@ EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception)
 
 
 #ifdef CONFIG_CBE_RAS
-EXC_REAL_HV(cbe_thermal, 0x1800, 0x1900)
-EXC_VIRT_NONE(0x5800, 0x5900)
+EXC_REAL_HV(cbe_thermal, 0x1800, 0x100)
+EXC_VIRT_NONE(0x5800, 0x100)
 TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800)
 EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception)
 #else /* CONFIG_CBE_RAS */
-EXC_REAL_NONE(0x1800, 0x1900)
-EXC_VIRT_NONE(0x5800, 0x5900)
+EXC_REAL_NONE(0x1800, 0x100)
+EXC_VIRT_NONE(0x5800, 0x100)
 #endif
 
 
-- 
cgit v1.2.3


From a5ecdad4847897007399d7a14c9109b65ce4c9b7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 7 Feb 2017 00:09:27 +0530
Subject: powerpc/mm: Add MMU_FTR_KERNEL_RO to possible feature mask

Without this we will always find the feature disabled.

Fixes: 984d7a1ec6 ("powerpc/mm: Fixup kernel read only mapping")
Cc: stable@vger.kernel.org # v4.7+
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mmu.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index a34c764ca8dd..e5616bf83623 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -136,6 +136,7 @@ enum {
 		MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL |
 		MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE |
 		MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA |
+		MMU_FTR_KERNEL_RO |
 #ifdef CONFIG_PPC_RADIX_MMU
 		MMU_FTR_TYPE_RADIX |
 #endif
-- 
cgit v1.2.3


From 2337d207288f163e10bd8d4d7eeb0c1c75046a0c Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 27 Jan 2017 14:24:33 +1000
Subject: powerpc/64: CONFIG_RELOCATABLE support for hmi interrupts

The branch from hmi_exception_early to hmi_exception_realmode must use
a "relocatable-style" branch, because it is branching from unrelocated
exception code to beyond __end_interrupts.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 8 ++++++++
 arch/powerpc/kernel/exceptions-64s.S     | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 8fa09fa500f0..14752eee3d0c 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -236,6 +236,11 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtctr	reg;							\
 	bctr
 
+#define BRANCH_LINK_TO_FAR(reg, label)					\
+	__LOAD_FAR_HANDLER(reg, label);					\
+	mtctr	reg;							\
+	bctrl
+
 /*
  * KVM requires __LOAD_FAR_HANDLER.
  *
@@ -260,6 +265,9 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define BRANCH_TO_COMMON(reg, label)					\
 	b	label
 
+#define BRANCH_LINK_TO_FAR(reg, label)					\
+	bl	label
+
 #define BRANCH_TO_KVM(reg, label)					\
 	b	label
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 34a04a5fa468..76dd7738c122 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -982,7 +982,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
 	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
 	EXCEPTION_PROLOG_COMMON_3(0xe60)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	hmi_exception_realmode
+	BRANCH_LINK_TO_FAR(r4, hmi_exception_realmode)
 	/* Windup the stack. */
 	/* Move original HSRR0 and HSRR1 into the respective regs */
 	ld	r9,_MSR(r1)
-- 
cgit v1.2.3


From ab9bad0ead9ab179ace09988a3f1cfca122eb7c2 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 7 Feb 2017 16:03:17 +1100
Subject: powerpc/powernv: Remove separate entry for OPAL real mode calls

All entry points already read the MSR so they can easily do
the right thing.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/opal.h                |  7 ---
 arch/powerpc/kernel/idle_book3s.S              |  6 +--
 arch/powerpc/kvm/book3s_hv_builtin.c           | 34 ++++---------
 arch/powerpc/kvm/book3s_hv_rm_xics.c           | 10 ++--
 arch/powerpc/platforms/powernv/opal-wrappers.S | 70 +++++++++++---------------
 arch/powerpc/platforms/powernv/pci-ioda.c      |  5 --
 6 files changed, 46 insertions(+), 86 deletions(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 5c7db0f1a708..16efe7406776 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -67,7 +67,6 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func,
 int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func,
 				   uint64_t offset, uint32_t data);
 int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
-int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
 int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority);
 int64_t opal_register_exception_handler(uint64_t opal_exception,
 					uint64_t handler_address,
@@ -220,18 +219,12 @@ int64_t opal_pci_set_power_state(uint64_t async_token, uint64_t id,
 int64_t opal_pci_poll2(uint64_t id, uint64_t data);
 
 int64_t opal_int_get_xirr(uint32_t *out_xirr, bool just_poll);
-int64_t opal_rm_int_get_xirr(__be32 *out_xirr, bool just_poll);
 int64_t opal_int_set_cppr(uint8_t cppr);
 int64_t opal_int_eoi(uint32_t xirr);
-int64_t opal_rm_int_eoi(uint32_t xirr);
 int64_t opal_int_set_mfrr(uint32_t cpu, uint8_t mfrr);
-int64_t opal_rm_int_set_mfrr(uint32_t cpu, uint8_t mfrr);
 int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
 			  uint32_t pe_num, uint32_t tce_size,
 			  uint64_t dma_addr, uint32_t npages);
-int64_t opal_rm_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
-			     uint32_t pe_num, uint32_t tce_size,
-			     uint64_t dma_addr, uint32_t npages);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 72dac0b58061..5302e1ad82c2 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -250,7 +250,7 @@ fastsleep_workaround_at_entry:
 	/* Fast sleep workaround */
 	li	r3,1
 	li	r4,1
-	bl	opal_rm_config_cpu_idle_state
+	bl	opal_config_cpu_idle_state
 
 	/* Clear Lock bit */
 	li	r0,0
@@ -544,7 +544,7 @@ timebase_resync:
 	 */
 	ble	cr3,clear_lock
 	/* Time base re-sync */
-	bl	opal_rm_resync_timebase;
+	bl	opal_resync_timebase;
 	/*
 	 * If waking up from sleep, per core state is not lost, skip to
 	 * clear_lock.
@@ -633,7 +633,7 @@ hypervisor_state_restored:
 fastsleep_workaround_at_exit:
 	li	r3,1
 	li	r4,0
-	bl	opal_rm_config_cpu_idle_state
+	bl	opal_config_cpu_idle_state
 	b	timebase_resync
 
 /*
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fe08fea54b70..2f69fbc19bb0 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -29,11 +29,6 @@
 #include <asm/opal.h>
 #include <asm/smp.h>
 
-static bool in_realmode(void)
-{
-	return !(mfmsr() & MSR_IR);
-}
-
 #define KVM_CMA_CHUNK_ORDER	18
 
 /*
@@ -230,13 +225,10 @@ void kvmhv_rm_send_ipi(int cpu)
 
 	/* Else poke the target with an IPI */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
-	if (!in_realmode())
-		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
-	else if (xics_phys)
+	if (xics_phys)
 		rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
 	else
-		opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu),
-				     IPI_PRIORITY);
+		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
 }
 
 /*
@@ -419,10 +411,8 @@ static long kvmppc_read_one_intr(bool *again)
 	/* Now read the interrupt from the ICP */
 	xics_phys = local_paca->kvm_hstate.xics_phys;
 	rc = 0;
-	if (!in_realmode())
+	if (!xics_phys)
 		rc = opal_int_get_xirr(&xirr, false);
-	else if (!xics_phys)
-		rc = opal_rm_int_get_xirr(&xirr, false);
 	else
 		xirr = _lwzcix(xics_phys + XICS_XIRR);
 	if (rc < 0)
@@ -453,15 +443,12 @@ static long kvmppc_read_one_intr(bool *again)
 	 */
 	if (xisr == XICS_IPI) {
 		rc = 0;
-		if (!in_realmode()) {
-			opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
-			rc = opal_int_eoi(h_xirr);
-		} else if (xics_phys) {
+		if (xics_phys) {
 			_stbcix(xics_phys + XICS_MFRR, 0xff);
 			_stwcix(xics_phys + XICS_XIRR, xirr);
 		} else {
-			opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff);
-			rc = opal_rm_int_eoi(h_xirr);
+			opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
+			rc = opal_int_eoi(h_xirr);
 		}
 		/* If rc > 0, there is another interrupt pending */
 		*again = rc > 0;
@@ -482,14 +469,11 @@ static long kvmppc_read_one_intr(bool *again)
 			/* We raced with the host,
 			 * we need to resend that IPI, bummer
 			 */
-			if (!in_realmode())
-				opal_int_set_mfrr(hard_smp_processor_id(),
-						  IPI_PRIORITY);
-			else if (xics_phys)
+			if (xics_phys)
 				_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
 			else
-				opal_rm_int_set_mfrr(hard_smp_processor_id(),
-						     IPI_PRIORITY);
+				opal_int_set_mfrr(hard_smp_processor_id(),
+						  IPI_PRIORITY);
 			/* Let side effects complete */
 			smp_mb();
 			return 1;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 7e2eb3e865b3..29f43ed6d5eb 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(kvm_irq_bypass);
 
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			    u32 new_irq);
-static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
+static int xics_opal_set_server(unsigned int hw_irq, int server_cpu);
 
 /* -- ICS routines -- */
 static void ics_rm_check_resend(struct kvmppc_xics *xics,
@@ -728,7 +728,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 			++vcpu->stat.pthru_host;
 			if (state->intr_cpu != pcpu) {
 				++vcpu->stat.pthru_bad_aff;
-				xics_opal_rm_set_server(state->host_irq, pcpu);
+				xics_opal_set_server(state->host_irq, pcpu);
 			}
 			state->intr_cpu = -1;
 		}
@@ -756,16 +756,16 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 	if (xics_phys) {
 		_stwcix(xics_phys + XICS_XIRR, xirr);
 	} else {
-		rc = opal_rm_int_eoi(be32_to_cpu(xirr));
+		rc = opal_int_eoi(be32_to_cpu(xirr));
 		*again = rc > 0;
 	}
 }
 
-static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
+static int xics_opal_set_server(unsigned int hw_irq, int server_cpu)
 {
 	unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;
 
-	return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
+	return opal_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
 }
 
 /*
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 3aa40f1b20f5..28799e557348 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -58,14 +58,16 @@ END_FTR_SECTION(0, 1);						\
 
 #define OPAL_CALL(name, token)		\
  _GLOBAL_TOC(name);			\
+	mfmsr	r12;			\
 	mflr	r0;			\
+	andi.	r11,r12,MSR_IR|MSR_DR; 	\
 	std	r0,PPC_LR_STKOFF(r1);	\
 	li	r0,token;		\
+	beq	opal_real_call;         \
 	OPAL_BRANCH(opal_tracepoint_entry) \
-	mfcr	r12;			\
-	stw	r12,8(r1);		\
+	mfcr	r11;			\
+	stw	r11,8(r1);		\
 	li	r11,0;			\
-	mfmsr	r12;			\
 	ori	r11,r11,MSR_EE;		\
 	std	r12,PACASAVEDMSR(r13);	\
 	andc	r12,r12,r11;		\
@@ -98,6 +100,30 @@ opal_return:
 	mtcr	r4;
 	rfid
 
+opal_real_call:
+	mfcr	r11
+	stw	r11,8(r1)
+	/* Set opal return address */
+	LOAD_REG_ADDR(r11, opal_return_realmode)
+	mtlr	r11
+	li	r11,MSR_LE
+	andc	r12,r12,r11
+	mtspr	SPRN_HSRR1,r12
+	LOAD_REG_ADDR(r11,opal)
+	ld	r12,8(r11)
+	ld	r2,0(r11)
+	mtspr	SPRN_HSRR0,r12
+	hrfid
+
+opal_return_realmode:
+	FIXUP_ENDIAN
+	ld	r2,PACATOC(r13);
+	lwz	r11,8(r1);
+	ld	r12,PPC_LR_STKOFF(r1)
+	mtcr	r11;
+	mtlr	r12
+	blr
+
 #ifdef CONFIG_TRACEPOINTS
 opal_tracepoint_entry:
 	stdu	r1,-STACKFRAMESIZE(r1)
@@ -155,36 +181,6 @@ opal_tracepoint_return:
 	blr
 #endif
 
-#define OPAL_CALL_REAL(name, token)			\
- _GLOBAL_TOC(name);					\
-	mflr	r0;					\
-	std	r0,PPC_LR_STKOFF(r1);			\
-	li	r0,token;				\
-	mfcr	r12;					\
-	stw	r12,8(r1);				\
-							\
-	/* Set opal return address */			\
-	LOAD_REG_ADDR(r11, opal_return_realmode);	\
-	mtlr	r11;					\
-	mfmsr	r12;					\
-	li	r11,MSR_LE;				\
-	andc	r12,r12,r11;				\
-	mtspr	SPRN_HSRR1,r12;				\
-	LOAD_REG_ADDR(r11,opal);			\
-	ld	r12,8(r11);				\
-	ld	r2,0(r11);				\
-	mtspr	SPRN_HSRR0,r12;				\
-	hrfid
-
-opal_return_realmode:
-	FIXUP_ENDIAN
-	ld	r2,PACATOC(r13);
-	lwz	r11,8(r1);
-	ld	r12,PPC_LR_STKOFF(r1)
-	mtcr	r11;
-	mtlr	r12
-	blr
-
 
 OPAL_CALL(opal_invalid_call,			OPAL_INVALID_CALL);
 OPAL_CALL(opal_console_write,			OPAL_CONSOLE_WRITE);
@@ -208,7 +204,6 @@ OPAL_CALL(opal_pci_config_write_byte,		OPAL_PCI_CONFIG_WRITE_BYTE);
 OPAL_CALL(opal_pci_config_write_half_word,	OPAL_PCI_CONFIG_WRITE_HALF_WORD);
 OPAL_CALL(opal_pci_config_write_word,		OPAL_PCI_CONFIG_WRITE_WORD);
 OPAL_CALL(opal_set_xive,			OPAL_SET_XIVE);
-OPAL_CALL_REAL(opal_rm_set_xive,		OPAL_SET_XIVE);
 OPAL_CALL(opal_get_xive,			OPAL_GET_XIVE);
 OPAL_CALL(opal_register_exception_handler,	OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
 OPAL_CALL(opal_pci_eeh_freeze_status,		OPAL_PCI_EEH_FREEZE_STATUS);
@@ -264,7 +259,6 @@ OPAL_CALL(opal_validate_flash,			OPAL_FLASH_VALIDATE);
 OPAL_CALL(opal_manage_flash,			OPAL_FLASH_MANAGE);
 OPAL_CALL(opal_update_flash,			OPAL_FLASH_UPDATE);
 OPAL_CALL(opal_resync_timebase,			OPAL_RESYNC_TIMEBASE);
-OPAL_CALL_REAL(opal_rm_resync_timebase,		OPAL_RESYNC_TIMEBASE);
 OPAL_CALL(opal_check_token,			OPAL_CHECK_TOKEN);
 OPAL_CALL(opal_dump_init,			OPAL_DUMP_INIT);
 OPAL_CALL(opal_dump_info,			OPAL_DUMP_INFO);
@@ -280,9 +274,7 @@ OPAL_CALL(opal_sensor_read,			OPAL_SENSOR_READ);
 OPAL_CALL(opal_get_param,			OPAL_GET_PARAM);
 OPAL_CALL(opal_set_param,			OPAL_SET_PARAM);
 OPAL_CALL(opal_handle_hmi,			OPAL_HANDLE_HMI);
-OPAL_CALL_REAL(opal_rm_handle_hmi,		OPAL_HANDLE_HMI);
 OPAL_CALL(opal_config_cpu_idle_state,		OPAL_CONFIG_CPU_IDLE_STATE);
-OPAL_CALL_REAL(opal_rm_config_cpu_idle_state,	OPAL_CONFIG_CPU_IDLE_STATE);
 OPAL_CALL(opal_slw_set_reg,			OPAL_SLW_SET_REG);
 OPAL_CALL(opal_register_dump_region,		OPAL_REGISTER_DUMP_REGION);
 OPAL_CALL(opal_unregister_dump_region,		OPAL_UNREGISTER_DUMP_REGION);
@@ -304,11 +296,7 @@ OPAL_CALL(opal_pci_get_presence_state,		OPAL_PCI_GET_PRESENCE_STATE);
 OPAL_CALL(opal_pci_get_power_state,		OPAL_PCI_GET_POWER_STATE);
 OPAL_CALL(opal_pci_set_power_state,		OPAL_PCI_SET_POWER_STATE);
 OPAL_CALL(opal_int_get_xirr,			OPAL_INT_GET_XIRR);
-OPAL_CALL_REAL(opal_rm_int_get_xirr,		OPAL_INT_GET_XIRR);
 OPAL_CALL(opal_int_set_cppr,			OPAL_INT_SET_CPPR);
 OPAL_CALL(opal_int_eoi,				OPAL_INT_EOI);
-OPAL_CALL_REAL(opal_rm_int_eoi,			OPAL_INT_EOI);
 OPAL_CALL(opal_int_set_mfrr,			OPAL_INT_SET_MFRR);
-OPAL_CALL_REAL(opal_rm_int_set_mfrr,		OPAL_INT_SET_MFRR);
 OPAL_CALL(opal_pci_tce_kill,			OPAL_PCI_TCE_KILL);
-OPAL_CALL_REAL(opal_rm_pci_tce_kill,		OPAL_PCI_TCE_KILL);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b07680cd2518..a897958edb88 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1962,11 +1962,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 		if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
 			pnv_pci_phb3_tce_invalidate(pe, rm, shift,
 						    index, npages);
-		else if (rm)
-			opal_rm_pci_tce_kill(phb->opal_id,
-					     OPAL_PCI_TCE_KILL_PAGES,
-					     pe->pe_number, 1u << shift,
-					     index << shift, npages);
 		else
 			opal_pci_tce_kill(phb->opal_id,
 					  OPAL_PCI_TCE_KILL_PAGES,
-- 
cgit v1.2.3


From 523717d1496c794e8380d0e3de5ca6a8c1887dab Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: via-cuda: Cleanup printk calls

Add missing log message severity, remove old debug messages and
replace printk() loop with print_hex_dump() call.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-cuda.c | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index 2088e23a8002..06175126c986 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -221,7 +221,7 @@ static int __init via_cuda_start(void)
 	return -EAGAIN;
     }
 
-    printk("Macintosh CUDA driver v0.5 for Unified ADB.\n");
+    pr_info("Macintosh CUDA driver v0.5 for Unified ADB.\n");
 
     cuda_fully_inited = 1;
     return 0;
@@ -251,7 +251,7 @@ cuda_probe(void)
     	int x;							\
 	for (x = 1000; !(cond); --x) {				\
 	    if (x == 0) {					\
-		printk("Timeout waiting for " what "\n");	\
+		pr_err("Timeout waiting for " what "\n");	\
 		return -ENXIO;					\
 	    }							\
 	    udelay(100);					\
@@ -357,6 +357,7 @@ cuda_reset_adb_bus(void)
     return 0;
 }
 #endif /* CONFIG_ADB */
+
 /* Construct and send a cuda request */
 int
 cuda_request(struct adb_request *req, void (*done)(struct adb_request *),
@@ -474,12 +475,9 @@ cuda_interrupt(int irq, void *arg)
     }
     
     status = (~in_8(&via[B]) & (TIP|TREQ)) | (in_8(&via[ACR]) & SR_OUT);
-    /* printk("cuda_interrupt: state=%d status=%x\n", cuda_state, status); */
     switch (cuda_state) {
     case idle:
 	/* CUDA has sent us the first byte of data - unsolicited */
-	if (status != TREQ)
-	    printk("cuda: state=idle, status=%x\n", status);
 	(void)in_8(&via[SR]);
 	out_8(&via[B], in_8(&via[B]) & ~TIP);
 	cuda_state = reading;
@@ -489,8 +487,6 @@ cuda_interrupt(int irq, void *arg)
 
     case awaiting_reply:
 	/* CUDA has sent us the first byte of data of a reply */
-	if (status != TREQ)
-	    printk("cuda: state=awaiting_reply, status=%x\n", status);
 	(void)in_8(&via[SR]);
 	out_8(&via[B], in_8(&via[B]) & ~TIP);
 	cuda_state = reading;
@@ -506,9 +502,6 @@ cuda_interrupt(int irq, void *arg)
 	    out_8(&via[B], in_8(&via[B]) | TIP | TACK);
 	    cuda_state = idle;
 	} else {
-	    /* assert status == TIP + SR_OUT */
-	    if (status != TIP + SR_OUT)
-		printk("cuda: state=sent_first_byte status=%x\n", status);
 	    out_8(&via[SR], current_req->data[1]);
 	    out_8(&via[B], in_8(&via[B]) ^ TACK);
 	    data_index = 2;
@@ -545,9 +538,6 @@ cuda_interrupt(int irq, void *arg)
 	    out_8(&via[B], in_8(&via[B]) | TACK | TIP);
 	    cuda_state = read_done;
 	} else {
-	    /* assert status == TIP | TREQ */
-	    if (status != TIP + TREQ)
-		printk("cuda: state=reading status=%x\n", status);
 	    out_8(&via[B], in_8(&via[B]) ^ TACK);
 	}
 	break;
@@ -593,7 +583,7 @@ cuda_interrupt(int irq, void *arg)
 	break;
 
     default:
-	printk("cuda_interrupt: unknown cuda_state %d?\n", cuda_state);
+	pr_err("cuda_interrupt: unknown cuda_state %d?\n", cuda_state);
     }
     spin_unlock(&cuda_lock);
     if (complete && req) {
@@ -614,8 +604,6 @@ cuda_interrupt(int irq, void *arg)
 static void
 cuda_input(unsigned char *buf, int nb)
 {
-    int i;
-
     switch (buf[0]) {
     case ADB_PACKET:
 #ifdef CONFIG_XMON
@@ -633,9 +621,7 @@ cuda_input(unsigned char *buf, int nb)
 	break;
 
     default:
-	printk("data from cuda (%d bytes):", nb);
-	for (i = 0; i < nb; ++i)
-	    printk(" %.2x", buf[i]);
-	printk("\n");
+	print_hex_dump(KERN_INFO, "cuda_input: ", DUMP_PREFIX_NONE, 32, 1,
+	               buf, nb, false);
     }
 }
-- 
cgit v1.2.3


From 06d7e99408acd2faa099d8af1a57cb1f6624062a Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: via-cuda: Remove redundant temporary variable

There is no possibility that current_req can change during execution of
cuda_start(). This can be confirmed by inspection: cuda_lock is always
held whenever cuda_start() is called or current_req is modified.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-cuda.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index 06175126c986..dd51df5a6ec0 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -414,19 +414,15 @@ cuda_write(struct adb_request *req)
 static void
 cuda_start(void)
 {
-    struct adb_request *req;
-
     /* assert cuda_state == idle */
-    /* get the packet to send */
-    req = current_req;
-    if (req == 0)
+    if (current_req == NULL)
 	return;
     if ((in_8(&via[B]) & TREQ) == 0)
 	return;			/* a byte is coming in from the CUDA */
 
     /* set the shift register to shift out and send a byte */
     out_8(&via[ACR], in_8(&via[ACR]) | SR_OUT);
-    out_8(&via[SR], req->data[0]);
+    out_8(&via[SR], current_req->data[0]);
     out_8(&via[B], in_8(&via[B]) & ~TIP);
     cuda_state = sent_first_byte;
 }
-- 
cgit v1.2.3


From fd7a65a27c6cb9b0920130d9402b95695168092d Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: via-cuda: Add TREQ, TIP and TACK signal helpers

Introduce some helpers for handling the signalling between VIA and
Cuda. This abstraction will be used to add support for Egret devices,
which utilize slightly different signalling.

Don't invert the sense of the Cuda's active-low signals when storing
them in the 'status' variable. Just assert, negate and test those
signals using the helpers.

The state machine does not need to test its own output signals to
figure out what to do next: the next state depends on the Cuda's TREQ
output. Just call the TREQ_asserted() helper function to test for that.

Similarly, there is no need to store pin directions in the 'status'
variable. That was only useful for debugging messages.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-cuda.c | 76 ++++++++++++++++++++++++++++++--------------
 1 file changed, 53 insertions(+), 23 deletions(-)

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index dd51df5a6ec0..64a04af248a1 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -65,6 +65,36 @@ static DEFINE_SPINLOCK(cuda_lock);
 #define IER_CLR		0		/* clear bits in IER */
 #define SR_INT		0x04		/* Shift register full/empty */
 
+static inline bool TREQ_asserted(u8 portb)
+{
+	return !(portb & TREQ);
+}
+
+static inline void assert_TIP(void)
+{
+	out_8(&via[B], in_8(&via[B]) & ~TIP);
+}
+
+static inline void assert_TACK(void)
+{
+	out_8(&via[B], in_8(&via[B]) & ~TACK);
+}
+
+static inline void toggle_TACK(void)
+{
+	out_8(&via[B], in_8(&via[B]) ^ TACK);
+}
+
+static inline void negate_TACK(void)
+{
+	out_8(&via[B], in_8(&via[B]) | TACK);
+}
+
+static inline void negate_TIP_and_TACK(void)
+{
+	out_8(&via[B], in_8(&via[B]) | TIP | TACK);
+}
+
 static enum cuda_state {
     idle,
     sent_first_byte,
@@ -262,7 +292,7 @@ static int
 __init cuda_init_via(void)
 {
     out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ);	/* TACK & TIP out */
-    out_8(&via[B], in_8(&via[B]) | TACK | TIP);			/* negate them */
+    negate_TIP_and_TACK();
     out_8(&via[ACR] ,(in_8(&via[ACR]) & ~SR_CTRL) | SR_EXT);	/* SR data in */
     (void)in_8(&via[SR]);						/* clear any left-over data */
 #ifdef CONFIG_PPC
@@ -278,10 +308,10 @@ __init cuda_init_via(void)
     out_8(&via[IFR], SR_INT);
 
     /* sync with the CUDA - assert TACK without TIP */
-    out_8(&via[B], in_8(&via[B]) & ~TACK);
+    assert_TACK();
 
     /* wait for the CUDA to assert TREQ in response */
-    WAIT_FOR((in_8(&via[B]) & TREQ) == 0, "CUDA response to sync");
+    WAIT_FOR(TREQ_asserted(in_8(&via[B])), "CUDA response to sync");
 
     /* wait for the interrupt and then clear it */
     WAIT_FOR(in_8(&via[IFR]) & SR_INT, "CUDA response to sync (2)");
@@ -289,14 +319,13 @@ __init cuda_init_via(void)
     out_8(&via[IFR], SR_INT);
 
     /* finish the sync by negating TACK */
-    out_8(&via[B], in_8(&via[B]) | TACK);
+    negate_TACK();
 
     /* wait for the CUDA to negate TREQ and the corresponding interrupt */
-    WAIT_FOR(in_8(&via[B]) & TREQ, "CUDA response to sync (3)");
+    WAIT_FOR(!TREQ_asserted(in_8(&via[B])), "CUDA response to sync (3)");
     WAIT_FOR(in_8(&via[IFR]) & SR_INT, "CUDA response to sync (4)");
     (void)in_8(&via[SR]);
     out_8(&via[IFR], SR_INT);
-    out_8(&via[B], in_8(&via[B]) | TIP);	/* should be unnecessary */
 
     return 0;
 }
@@ -417,13 +446,13 @@ cuda_start(void)
     /* assert cuda_state == idle */
     if (current_req == NULL)
 	return;
-    if ((in_8(&via[B]) & TREQ) == 0)
+    if (TREQ_asserted(in_8(&via[B])))
 	return;			/* a byte is coming in from the CUDA */
 
     /* set the shift register to shift out and send a byte */
     out_8(&via[ACR], in_8(&via[ACR]) | SR_OUT);
     out_8(&via[SR], current_req->data[0]);
-    out_8(&via[B], in_8(&via[B]) & ~TIP);
+    assert_TIP();
     cuda_state = sent_first_byte;
 }
 
@@ -444,7 +473,7 @@ EXPORT_SYMBOL(cuda_poll);
 static irqreturn_t
 cuda_interrupt(int irq, void *arg)
 {
-    int status;
+    u8 status;
     struct adb_request *req = NULL;
     unsigned char ibuf[16];
     int ibuf_len = 0;
@@ -469,13 +498,14 @@ cuda_interrupt(int irq, void *arg)
             out_8(&via[IFR], SR_INT);
         }
     }
-    
-    status = (~in_8(&via[B]) & (TIP|TREQ)) | (in_8(&via[ACR]) & SR_OUT);
+
+    status = in_8(&via[B]) & (TIP | TACK | TREQ);
+
     switch (cuda_state) {
     case idle:
 	/* CUDA has sent us the first byte of data - unsolicited */
 	(void)in_8(&via[SR]);
-	out_8(&via[B], in_8(&via[B]) & ~TIP);
+	assert_TIP();
 	cuda_state = reading;
 	reply_ptr = cuda_rbuf;
 	reading_reply = 0;
@@ -484,22 +514,22 @@ cuda_interrupt(int irq, void *arg)
     case awaiting_reply:
 	/* CUDA has sent us the first byte of data of a reply */
 	(void)in_8(&via[SR]);
-	out_8(&via[B], in_8(&via[B]) & ~TIP);
+	assert_TIP();
 	cuda_state = reading;
 	reply_ptr = current_req->reply;
 	reading_reply = 1;
 	break;
 
     case sent_first_byte:
-	if (status == TREQ + TIP + SR_OUT) {
+	if (TREQ_asserted(status)) {
 	    /* collision */
 	    out_8(&via[ACR], in_8(&via[ACR]) & ~SR_OUT);
 	    (void)in_8(&via[SR]);
-	    out_8(&via[B], in_8(&via[B]) | TIP | TACK);
+	    negate_TIP_and_TACK();
 	    cuda_state = idle;
 	} else {
 	    out_8(&via[SR], current_req->data[1]);
-	    out_8(&via[B], in_8(&via[B]) ^ TACK);
+	    toggle_TACK();
 	    data_index = 2;
 	    cuda_state = sending;
 	}
@@ -510,7 +540,7 @@ cuda_interrupt(int irq, void *arg)
 	if (data_index >= req->nbytes) {
 	    out_8(&via[ACR], in_8(&via[ACR]) & ~SR_OUT);
 	    (void)in_8(&via[SR]);
-	    out_8(&via[B], in_8(&via[B]) | TACK | TIP);
+	    negate_TIP_and_TACK();
 	    req->sent = 1;
 	    if (req->reply_expected) {
 		cuda_state = awaiting_reply;
@@ -523,18 +553,18 @@ cuda_interrupt(int irq, void *arg)
 	    }
 	} else {
 	    out_8(&via[SR], req->data[data_index++]);
-	    out_8(&via[B], in_8(&via[B]) ^ TACK);
+	    toggle_TACK();
 	}
 	break;
 
     case reading:
 	*reply_ptr++ = in_8(&via[SR]);
-	if (status == TIP) {
+	if (!TREQ_asserted(status)) {
 	    /* that's all folks */
-	    out_8(&via[B], in_8(&via[B]) | TACK | TIP);
+	    negate_TIP_and_TACK();
 	    cuda_state = read_done;
 	} else {
-	    out_8(&via[B], in_8(&via[B]) ^ TACK);
+	    toggle_TACK();
 	}
 	break;
 
@@ -567,8 +597,8 @@ cuda_interrupt(int irq, void *arg)
 	    ibuf_len = reply_ptr - cuda_rbuf;
 	    memcpy(ibuf, cuda_rbuf, ibuf_len);
 	}
-	if (status == TREQ) {
-	    out_8(&via[B], in_8(&via[B]) & ~TIP);
+	if (TREQ_asserted(status)) {
+	    assert_TIP();
 	    cuda_state = reading;
 	    reply_ptr = cuda_rbuf;
 	    reading_reply = 0;
-- 
cgit v1.2.3


From fe73b582f179354e233e5deddbd274efe8d3bbb9 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: via-cuda: Prevent read buffer overflow

If the Cuda driver does not enter the 'read_done' state for some
reason, it may continue in the 'reading' state until the buffer
overflows. Add a bounds check to prevent this.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-cuda.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index 64a04af248a1..1cf1467cf6e5 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -470,6 +470,8 @@ cuda_poll(void)
 }
 EXPORT_SYMBOL(cuda_poll);
 
+#define ARRAY_FULL(a, p)	((p) - (a) == ARRAY_SIZE(a))
+
 static irqreturn_t
 cuda_interrupt(int irq, void *arg)
 {
@@ -558,7 +560,11 @@ cuda_interrupt(int irq, void *arg)
 	break;
 
     case reading:
-	*reply_ptr++ = in_8(&via[SR]);
+	if (reading_reply ? ARRAY_FULL(current_req->reply, reply_ptr)
+	                  : ARRAY_FULL(cuda_rbuf, reply_ptr))
+	    (void)in_8(&via[SR]);
+	else
+	    *reply_ptr++ = in_8(&via[SR]);
 	if (!TREQ_asserted(status)) {
 	    /* that's all folks */
 	    negate_TIP_and_TACK();
-- 
cgit v1.2.3


From cfbf99801bcaf8398492ebc16af72259ad7aa146 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: via-cuda: Fix re-initialization of reply_ptr and reading_reply

When reading_reply is set, reply_ptr points into an adb_request struct.
Conversely, when reply_ptr instead points into the global cuda_rbuf,
reading_reply must be false.

Unfortunately, this rule can be violated because re-initialization
of reply_ptr and reading_reply presently depends on the TREQ input.

Fix this by re-initializing reply_ptr and reading_reply as soon as they
are known to be invalid.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-cuda.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index 1cf1467cf6e5..ae3da6b95229 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -592,6 +592,7 @@ cuda_interrupt(int irq, void *arg)
 	    }
 	    current_req = req->next;
 	    complete = 1;
+	    reading_reply = 0;
 	} else {
 	    /* This is tricky. We must break the spinlock to call
 	     * cuda_input. However, doing so means we might get
@@ -603,11 +604,10 @@ cuda_interrupt(int irq, void *arg)
 	    ibuf_len = reply_ptr - cuda_rbuf;
 	    memcpy(ibuf, cuda_rbuf, ibuf_len);
 	}
+	reply_ptr = cuda_rbuf;
 	if (TREQ_asserted(status)) {
 	    assert_TIP();
 	    cuda_state = reading;
-	    reply_ptr = cuda_rbuf;
-	    reading_reply = 0;
 	} else {
 	    cuda_state = idle;
 	    cuda_start();
-- 
cgit v1.2.3


From a64662432200f8af6f67cd3664885a323f6a2f2d Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: via-cuda: Avoid TREQ race condition

When a read transaction completes, one of several things will happen:
a new transfer is started by the driver, a new transfer request
is raised by the Cuda (i.e. TREQ asserted), or both happen at once.

When both happen at once, there is a race condition between the TREQ test
in the read_done state and the same test in cuda_start(). Moreover, the
former test uses a stale TREQ value.

Theoretically, this can result in the undesirable outcome that the
interrupt handler completes with the state machine 'idle' when it should
instead start the next transaction.

Avoid this race by calling cuda_start() first and then confirming that it
succeeded. If not, test the current TREQ value before entering the
'reading' state.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-cuda.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index ae3da6b95229..32126958ac66 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -605,12 +605,11 @@ cuda_interrupt(int irq, void *arg)
 	    memcpy(ibuf, cuda_rbuf, ibuf_len);
 	}
 	reply_ptr = cuda_rbuf;
-	if (TREQ_asserted(status)) {
+	cuda_state = idle;
+	cuda_start();
+	if (cuda_state == idle && TREQ_asserted(in_8(&via[B]))) {
 	    assert_TIP();
 	    cuda_state = reading;
-	} else {
-	    cuda_state = idle;
-	    cuda_start();
 	}
 	break;
 
-- 
cgit v1.2.3


From ac39452e942af6a212e8f89e8a36b71354323845 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: via-cuda: Use spinlock_irq_save/restore instead of enable/disable_irq

The cuda_start() function uses spin_lock_irqsave/spin_unlock_irqrestore
for mutual exclusion. Let's have cuda_poll() do the same when polling
the VIA interrupt.

The benefit to disabling local irqs when the interrupt is being polled
is that the interrupt handler now has the same timing properties
regardless of whether it is invoked normally or from cuda_poll().

This driver was written back when local irqs remained enabled during
execution of interrupt handlers and cuda_poll() was probably trying
to achieve the same effect by use of enable/disable_irq.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-cuda.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index 32126958ac66..e3763cb4184b 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -459,14 +459,7 @@ cuda_start(void)
 void
 cuda_poll(void)
 {
-    /* cuda_interrupt only takes a normal lock, we disable
-     * interrupts here to avoid re-entering and thus deadlocking.
-     */
-    if (cuda_irq)
-	disable_irq(cuda_irq);
-    cuda_interrupt(0, NULL);
-    if (cuda_irq)
-	enable_irq(cuda_irq);
+	cuda_interrupt(0, NULL);
 }
 EXPORT_SYMBOL(cuda_poll);
 
@@ -475,13 +468,14 @@ EXPORT_SYMBOL(cuda_poll);
 static irqreturn_t
 cuda_interrupt(int irq, void *arg)
 {
+    unsigned long flags;
     u8 status;
     struct adb_request *req = NULL;
     unsigned char ibuf[16];
     int ibuf_len = 0;
     int complete = 0;
     
-    spin_lock(&cuda_lock);
+    spin_lock_irqsave(&cuda_lock, flags);
 
     /* On powermacs, this handler is registered for the VIA IRQ. But they use
      * just the shift register IRQ -- other VIA interrupt sources are disabled.
@@ -494,7 +488,7 @@ cuda_interrupt(int irq, void *arg)
 #endif
     {
         if ((in_8(&via[IFR]) & SR_INT) == 0) {
-            spin_unlock(&cuda_lock);
+            spin_unlock_irqrestore(&cuda_lock, flags);
             return IRQ_NONE;
         } else {
             out_8(&via[IFR], SR_INT);
@@ -616,7 +610,7 @@ cuda_interrupt(int irq, void *arg)
     default:
 	pr_err("cuda_interrupt: unknown cuda_state %d?\n", cuda_state);
     }
-    spin_unlock(&cuda_lock);
+    spin_unlock_irqrestore(&cuda_lock, flags);
     if (complete && req) {
     	void (*done)(struct adb_request *) = req->done;
     	mb();
-- 
cgit v1.2.3


From 97ced1aac07e7b5348a560512b287af69f863917 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: via-cuda: Initialize data_index early and increment consistently

Initialize data_index where appropriate to improve readability and
assist debugging. This change doesn't affect driver behaviour.

I prefer to see
	current_req->data[data_index++]
in place of
	current_req->data[0]
or
	current_req->data[1]
inasmuch as it becomes obvious what the data_index variable does.

Moreover, the actual value of data_index when examined at any given moment
tells me something about prior events, which did prove helpful.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-cuda.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index e3763cb4184b..57fb20dcb9dd 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -446,12 +446,13 @@ cuda_start(void)
     /* assert cuda_state == idle */
     if (current_req == NULL)
 	return;
+    data_index = 0;
     if (TREQ_asserted(in_8(&via[B])))
 	return;			/* a byte is coming in from the CUDA */
 
     /* set the shift register to shift out and send a byte */
     out_8(&via[ACR], in_8(&via[ACR]) | SR_OUT);
-    out_8(&via[SR], current_req->data[0]);
+    out_8(&via[SR], current_req->data[data_index++]);
     assert_TIP();
     cuda_state = sent_first_byte;
 }
@@ -524,9 +525,8 @@ cuda_interrupt(int irq, void *arg)
 	    negate_TIP_and_TACK();
 	    cuda_state = idle;
 	} else {
-	    out_8(&via[SR], current_req->data[1]);
+	    out_8(&via[SR], current_req->data[data_index++]);
 	    toggle_TACK();
-	    data_index = 2;
 	    cuda_state = sending;
 	}
 	break;
-- 
cgit v1.2.3


From d23eee88b56921a0bccd3b2355fc6feb4b5d343b Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: via-cuda: Add support for Egret system controller

The Egret system controller was the predecessor to the Cuda and the
differences are minor.

On Cuda, byte acknowledgement requires one transition of the TACK
signal; on Egret two are needed. On Cuda, TIP is active low; on Egret
it is active high. And Cuda raises certain interrupts that Egret omits.

Accommodating these differences complicates the Cuda driver slightly
but avoids a lot of duplication (see next patch).

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/macintosh/via-cuda.c | 155 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 134 insertions(+), 21 deletions(-)

diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index 57fb20dcb9dd..1a742bd9f612 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -1,10 +1,10 @@
 /*
- * Device driver for the via-cuda on Apple Powermacs.
+ * Device driver for the Cuda and Egret system controllers found on PowerMacs
+ * and 68k Macs.
  *
- * The VIA (versatile interface adapter) interfaces to the CUDA,
- * a 6805 microprocessor core which controls the ADB (Apple Desktop
- * Bus) which connects to the keyboard and mouse.  The CUDA also
- * controls system power and the RTC (real time clock) chip.
+ * The Cuda or Egret is a 6805 microcontroller interfaced to the 6522 VIA.
+ * This MCU controls system power, Parameter RAM, Real Time Clock and the
+ * Apple Desktop Bus (ADB) that connects to the keyboard and mouse.
  *
  * Copyright (C) 1996 Paul Mackerras.
  */
@@ -50,10 +50,27 @@ static DEFINE_SPINLOCK(cuda_lock);
 #define IER		(14*RS)		/* Interrupt enable register */
 #define ANH		(15*RS)		/* A-side data, no handshake */
 
-/* Bits in B data register: all active low */
-#define TREQ		0x08		/* Transfer request (input) */
-#define TACK		0x10		/* Transfer acknowledge (output) */
-#define TIP		0x20		/* Transfer in progress (output) */
+/*
+ * When the Cuda design replaced the Egret, some signal names and
+ * logic sense changed. They all serve the same purposes, however.
+ *
+ *   VIA pin       |  Egret pin
+ * ----------------+------------------------------------------
+ *   PB3 (input)   |  Transceiver session   (active low)
+ *   PB4 (output)  |  VIA full              (active high)
+ *   PB5 (output)  |  System session        (active high)
+ *
+ *   VIA pin       |  Cuda pin
+ * ----------------+------------------------------------------
+ *   PB3 (input)   |  Transfer request      (active low)
+ *   PB4 (output)  |  Byte acknowledge      (active low)
+ *   PB5 (output)  |  Transfer in progress  (active low)
+ */
+
+/* Bits in Port B data register */
+#define TREQ		0x08		/* Transfer request */
+#define TACK		0x10		/* Transfer acknowledge */
+#define TIP		0x20		/* Transfer in progress */
 
 /* Bits in ACR */
 #define SR_CTRL		0x1c		/* Shift register control bits */
@@ -65,6 +82,19 @@ static DEFINE_SPINLOCK(cuda_lock);
 #define IER_CLR		0		/* clear bits in IER */
 #define SR_INT		0x04		/* Shift register full/empty */
 
+/* Duration of byte acknowledgement pulse (us) */
+#define EGRET_TACK_ASSERTED_DELAY	300
+#define EGRET_TACK_NEGATED_DELAY	400
+
+/* Interval from interrupt to start of session (us) */
+#define EGRET_SESSION_DELAY		450
+
+#ifdef CONFIG_PPC
+#define mcu_is_egret	false
+#else
+static bool mcu_is_egret;
+#endif
+
 static inline bool TREQ_asserted(u8 portb)
 {
 	return !(portb & TREQ);
@@ -72,12 +102,29 @@ static inline bool TREQ_asserted(u8 portb)
 
 static inline void assert_TIP(void)
 {
-	out_8(&via[B], in_8(&via[B]) & ~TIP);
+	if (mcu_is_egret) {
+		udelay(EGRET_SESSION_DELAY);
+		out_8(&via[B], in_8(&via[B]) | TIP);
+	} else
+		out_8(&via[B], in_8(&via[B]) & ~TIP);
+}
+
+static inline void assert_TIP_and_TACK(void)
+{
+	if (mcu_is_egret) {
+		udelay(EGRET_SESSION_DELAY);
+		out_8(&via[B], in_8(&via[B]) | TIP | TACK);
+	} else
+		out_8(&via[B], in_8(&via[B]) & ~(TIP | TACK));
 }
 
 static inline void assert_TACK(void)
 {
-	out_8(&via[B], in_8(&via[B]) & ~TACK);
+	if (mcu_is_egret) {
+		udelay(EGRET_TACK_NEGATED_DELAY);
+		out_8(&via[B], in_8(&via[B]) | TACK);
+	} else
+		out_8(&via[B], in_8(&via[B]) & ~TACK);
 }
 
 static inline void toggle_TACK(void)
@@ -87,12 +134,20 @@ static inline void toggle_TACK(void)
 
 static inline void negate_TACK(void)
 {
-	out_8(&via[B], in_8(&via[B]) | TACK);
+	if (mcu_is_egret) {
+		udelay(EGRET_TACK_ASSERTED_DELAY);
+		out_8(&via[B], in_8(&via[B]) & ~TACK);
+	} else
+		out_8(&via[B], in_8(&via[B]) | TACK);
 }
 
 static inline void negate_TIP_and_TACK(void)
 {
-	out_8(&via[B], in_8(&via[B]) | TIP | TACK);
+	if (mcu_is_egret) {
+		udelay(EGRET_TACK_ASSERTED_DELAY);
+		out_8(&via[B], in_8(&via[B]) & ~(TIP | TACK));
+	} else
+		out_8(&via[B], in_8(&via[B]) | TIP | TACK);
 }
 
 static enum cuda_state {
@@ -155,6 +210,7 @@ int __init find_via_cuda(void)
 
     via = via1;
     cuda_state = idle;
+    mcu_is_egret = false;
 
     err = cuda_init_via();
     if (err) {
@@ -251,7 +307,7 @@ static int __init via_cuda_start(void)
 	return -EAGAIN;
     }
 
-    pr_info("Macintosh CUDA driver v0.5 for Unified ADB.\n");
+    pr_info("Macintosh Cuda and Egret driver.\n");
 
     cuda_fully_inited = 1;
     return 0;
@@ -276,6 +332,33 @@ cuda_probe(void)
 }
 #endif /* CONFIG_ADB */
 
+static int __init sync_egret(void)
+{
+	if (TREQ_asserted(in_8(&via[B]))) {
+		/* Complete the inbound transfer */
+		assert_TIP_and_TACK();
+		while (1) {
+			negate_TACK();
+			mdelay(1);
+			(void)in_8(&via[SR]);
+			assert_TACK();
+			if (!TREQ_asserted(in_8(&via[B])))
+				break;
+		}
+		negate_TIP_and_TACK();
+	} else if (in_8(&via[B]) & TIP) {
+		/* Terminate the outbound transfer */
+		negate_TACK();
+		assert_TACK();
+		mdelay(1);
+		negate_TIP_and_TACK();
+	}
+	/* Clear shift register interrupt */
+	if (in_8(&via[IFR]) & SR_INT)
+		(void)in_8(&via[SR]);
+	return 0;
+}
+
 #define WAIT_FOR(cond, what)					\
     do {                                                        \
     	int x;							\
@@ -291,10 +374,6 @@ cuda_probe(void)
 static int
 __init cuda_init_via(void)
 {
-    out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ);	/* TACK & TIP out */
-    negate_TIP_and_TACK();
-    out_8(&via[ACR] ,(in_8(&via[ACR]) & ~SR_CTRL) | SR_EXT);	/* SR data in */
-    (void)in_8(&via[SR]);						/* clear any left-over data */
 #ifdef CONFIG_PPC
     out_8(&via[IER], 0x7f);					/* disable interrupts from VIA */
     (void)in_8(&via[IER]);
@@ -302,6 +381,15 @@ __init cuda_init_via(void)
     out_8(&via[IER], SR_INT);					/* disable SR interrupt from VIA */
 #endif
 
+    out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ);	/* TACK & TIP out */
+    out_8(&via[ACR], (in_8(&via[ACR]) & ~SR_CTRL) | SR_EXT);	/* SR data in */
+    (void)in_8(&via[SR]);					/* clear any left-over data */
+
+    if (mcu_is_egret)
+	return sync_egret();
+
+    negate_TIP_and_TACK();
+
     /* delay 4ms and then clear any pending interrupt */
     mdelay(4);
     (void)in_8(&via[SR]);
@@ -453,7 +541,10 @@ cuda_start(void)
     /* set the shift register to shift out and send a byte */
     out_8(&via[ACR], in_8(&via[ACR]) | SR_OUT);
     out_8(&via[SR], current_req->data[data_index++]);
-    assert_TIP();
+    if (mcu_is_egret)
+	assert_TIP_and_TACK();
+    else
+	assert_TIP();
     cuda_state = sent_first_byte;
 }
 
@@ -500,8 +591,9 @@ cuda_interrupt(int irq, void *arg)
 
     switch (cuda_state) {
     case idle:
-	/* CUDA has sent us the first byte of data - unsolicited */
+	/* System controller has unsolicited data for us */
 	(void)in_8(&via[SR]);
+idle_state:
 	assert_TIP();
 	cuda_state = reading;
 	reply_ptr = cuda_rbuf;
@@ -509,7 +601,7 @@ cuda_interrupt(int irq, void *arg)
 	break;
 
     case awaiting_reply:
-	/* CUDA has sent us the first byte of data of a reply */
+	/* System controller has reply data for us */
 	(void)in_8(&via[SR]);
 	assert_TIP();
 	cuda_state = reading;
@@ -524,9 +616,14 @@ cuda_interrupt(int irq, void *arg)
 	    (void)in_8(&via[SR]);
 	    negate_TIP_and_TACK();
 	    cuda_state = idle;
+	    /* Egret does not raise an "aborted" interrupt */
+	    if (mcu_is_egret)
+		goto idle_state;
 	} else {
 	    out_8(&via[SR], current_req->data[data_index++]);
 	    toggle_TACK();
+	    if (mcu_is_egret)
+		assert_TACK();
 	    cuda_state = sending;
 	}
 	break;
@@ -550,6 +647,8 @@ cuda_interrupt(int irq, void *arg)
 	} else {
 	    out_8(&via[SR], req->data[data_index++]);
 	    toggle_TACK();
+	    if (mcu_is_egret)
+		assert_TACK();
 	}
 	break;
 
@@ -560,16 +659,24 @@ cuda_interrupt(int irq, void *arg)
 	else
 	    *reply_ptr++ = in_8(&via[SR]);
 	if (!TREQ_asserted(status)) {
+	    if (mcu_is_egret)
+		assert_TACK();
 	    /* that's all folks */
 	    negate_TIP_and_TACK();
 	    cuda_state = read_done;
+	    /* Egret does not raise a "read done" interrupt */
+	    if (mcu_is_egret)
+		goto read_done_state;
 	} else {
 	    toggle_TACK();
+	    if (mcu_is_egret)
+		negate_TACK();
 	}
 	break;
 
     case read_done:
 	(void)in_8(&via[SR]);
+read_done_state:
 	if (reading_reply) {
 	    req = current_req;
 	    req->reply_len = reply_ptr - req->reply;
@@ -645,6 +752,12 @@ cuda_input(unsigned char *buf, int nb)
 #endif /* CONFIG_ADB */
 	break;
 
+    case TIMER_PACKET:
+	/* Egret sends these periodically. Might be useful as a 'heartbeat'
+	 * to trigger a recovery for the VIA shift register errata.
+	 */
+	break;
+
     default:
 	print_hex_dump(KERN_INFO, "cuda_input: ", DUMP_PREFIX_NONE, 32, 1,
 	               buf, nb, false);
-- 
cgit v1.2.3


From f74faec6b3af9d88943a33ccd08de63b0dab8bc7 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sat, 31 Dec 2016 19:56:26 -0500
Subject: m68k/mac: Replace via-maciisi driver with via-cuda driver

Change the device probe test in the via-cuda.c driver so it will load on
Egret-based machines too. Remove the now redundant via-maciisi.c driver.

Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/m68k/include/asm/macintosh.h |   2 +-
 arch/m68k/mac/config.c            |  18 +-
 arch/m68k/mac/misc.c              |  72 +---
 drivers/macintosh/Kconfig         |  24 +-
 drivers/macintosh/Makefile        |   1 -
 drivers/macintosh/adb.c           |   4 -
 drivers/macintosh/via-cuda.c      |   8 +-
 drivers/macintosh/via-maciisi.c   | 677 --------------------------------------
 8 files changed, 30 insertions(+), 776 deletions(-)
 delete mode 100644 drivers/macintosh/via-maciisi.c

diff --git a/arch/m68k/include/asm/macintosh.h b/arch/m68k/include/asm/macintosh.h
index 42235e7fbeed..5b81ab188aa5 100644
--- a/arch/m68k/include/asm/macintosh.h
+++ b/arch/m68k/include/asm/macintosh.h
@@ -38,7 +38,7 @@ struct mac_model
 
 #define MAC_ADB_NONE		0
 #define MAC_ADB_II		1
-#define MAC_ADB_IISI		2
+#define MAC_ADB_EGRET		2
 #define MAC_ADB_CUDA		3
 #define MAC_ADB_PB1		4
 #define MAC_ADB_PB2		5
diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
index e46895316eb0..9dc65a4c28d2 100644
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -286,7 +286,7 @@ static struct mac_model mac_data_table[] = {
 	}, {
 		.ident		= MAC_MODEL_IISI,
 		.name		= "IIsi",
-		.adb_type	= MAC_ADB_IISI,
+		.adb_type	= MAC_ADB_EGRET,
 		.via_type	= MAC_VIA_IICI,
 		.scsi_type	= MAC_SCSI_OLD,
 		.scc_type	= MAC_SCC_II,
@@ -295,7 +295,7 @@ static struct mac_model mac_data_table[] = {
 	}, {
 		.ident		= MAC_MODEL_IIVI,
 		.name		= "IIvi",
-		.adb_type	= MAC_ADB_IISI,
+		.adb_type	= MAC_ADB_EGRET,
 		.via_type	= MAC_VIA_IICI,
 		.scsi_type	= MAC_SCSI_LC,
 		.scc_type	= MAC_SCC_II,
@@ -304,7 +304,7 @@ static struct mac_model mac_data_table[] = {
 	}, {
 		.ident		= MAC_MODEL_IIVX,
 		.name		= "IIvx",
-		.adb_type	= MAC_ADB_IISI,
+		.adb_type	= MAC_ADB_EGRET,
 		.via_type	= MAC_VIA_IICI,
 		.scsi_type	= MAC_SCSI_LC,
 		.scc_type	= MAC_SCC_II,
@@ -319,7 +319,7 @@ static struct mac_model mac_data_table[] = {
 	{
 		.ident		= MAC_MODEL_CLII,
 		.name		= "Classic II",
-		.adb_type	= MAC_ADB_IISI,
+		.adb_type	= MAC_ADB_EGRET,
 		.via_type	= MAC_VIA_IICI,
 		.scsi_type	= MAC_SCSI_LC,
 		.scc_type	= MAC_SCC_II,
@@ -352,7 +352,7 @@ static struct mac_model mac_data_table[] = {
 	{
 		.ident		= MAC_MODEL_LC,
 		.name		= "LC",
-		.adb_type	= MAC_ADB_IISI,
+		.adb_type	= MAC_ADB_EGRET,
 		.via_type	= MAC_VIA_IICI,
 		.scsi_type	= MAC_SCSI_LC,
 		.scc_type	= MAC_SCC_II,
@@ -361,7 +361,7 @@ static struct mac_model mac_data_table[] = {
 	}, {
 		.ident		= MAC_MODEL_LCII,
 		.name		= "LC II",
-		.adb_type	= MAC_ADB_IISI,
+		.adb_type	= MAC_ADB_EGRET,
 		.via_type	= MAC_VIA_IICI,
 		.scsi_type	= MAC_SCSI_LC,
 		.scc_type	= MAC_SCC_II,
@@ -370,7 +370,7 @@ static struct mac_model mac_data_table[] = {
 	}, {
 		.ident		= MAC_MODEL_LCIII,
 		.name		= "LC III",
-		.adb_type	= MAC_ADB_IISI,
+		.adb_type	= MAC_ADB_EGRET,
 		.via_type	= MAC_VIA_IICI,
 		.scsi_type	= MAC_SCSI_LC,
 		.scc_type	= MAC_SCC_II,
@@ -498,7 +498,7 @@ static struct mac_model mac_data_table[] = {
 	{
 		.ident		= MAC_MODEL_P460,
 		.name		= "Performa 460",
-		.adb_type	= MAC_ADB_IISI,
+		.adb_type	= MAC_ADB_EGRET,
 		.via_type	= MAC_VIA_IICI,
 		.scsi_type	= MAC_SCSI_LC,
 		.scc_type	= MAC_SCC_II,
@@ -575,7 +575,7 @@ static struct mac_model mac_data_table[] = {
 	}, {
 		.ident		= MAC_MODEL_P600,
 		.name		= "Performa 600",
-		.adb_type	= MAC_ADB_IISI,
+		.adb_type	= MAC_ADB_EGRET,
 		.via_type	= MAC_VIA_IICI,
 		.scsi_type	= MAC_SCSI_LC,
 		.scc_type	= MAC_SCC_II,
diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c
index c6d351f5bd79..f4bb73fcb67a 100644
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -142,54 +142,6 @@ static void pmu_write_pram(int offset, __u8 data)
 #define pmu_write_pram NULL
 #endif
 
-#if 0 /* def CONFIG_ADB_MACIISI */
-extern int maciisi_request(struct adb_request *req,
-			void (*done)(struct adb_request *), int nbytes, ...);
-
-static long maciisi_read_time(void)
-{
-	struct adb_request req;
-	long time;
-
-	if (maciisi_request(&req, NULL, 2, CUDA_PACKET, CUDA_GET_TIME))
-		return 0;
-
-	time = (req.reply[3] << 24) | (req.reply[4] << 16)
-		| (req.reply[5] << 8) | req.reply[6];
-	return time - RTC_OFFSET;
-}
-
-static void maciisi_write_time(long data)
-{
-	struct adb_request req;
-	data += RTC_OFFSET;
-	maciisi_request(&req, NULL, 6, CUDA_PACKET, CUDA_SET_TIME,
-			(data >> 24) & 0xFF, (data >> 16) & 0xFF,
-			(data >> 8) & 0xFF, data & 0xFF);
-}
-
-static __u8 maciisi_read_pram(int offset)
-{
-	struct adb_request req;
-	if (maciisi_request(&req, NULL, 4, CUDA_PACKET, CUDA_GET_PRAM,
-			(offset >> 8) & 0xFF, offset & 0xFF))
-		return 0;
-	return req.reply[3];
-}
-
-static void maciisi_write_pram(int offset, __u8 data)
-{
-	struct adb_request req;
-	maciisi_request(&req, NULL, 5, CUDA_PACKET, CUDA_SET_PRAM,
-			(offset >> 8) & 0xFF, offset & 0xFF, data);
-}
-#else
-#define maciisi_read_time() 0
-#define maciisi_write_time(n)
-#define maciisi_read_pram NULL
-#define maciisi_write_pram NULL
-#endif
-
 /*
  * VIA PRAM/RTC access routines
  *
@@ -458,11 +410,10 @@ void mac_pram_read(int offset, __u8 *buffer, int len)
 	int i;
 
 	switch(macintosh_config->adb_type) {
-	case MAC_ADB_IISI:
-		func = maciisi_read_pram; break;
 	case MAC_ADB_PB1:
 	case MAC_ADB_PB2:
 		func = pmu_read_pram; break;
+	case MAC_ADB_EGRET:
 	case MAC_ADB_CUDA:
 		func = cuda_read_pram; break;
 	default:
@@ -481,11 +432,10 @@ void mac_pram_write(int offset, __u8 *buffer, int len)
 	int i;
 
 	switch(macintosh_config->adb_type) {
-	case MAC_ADB_IISI:
-		func = maciisi_write_pram; break;
 	case MAC_ADB_PB1:
 	case MAC_ADB_PB2:
 		func = pmu_write_pram; break;
+	case MAC_ADB_EGRET:
 	case MAC_ADB_CUDA:
 		func = cuda_write_pram; break;
 	default:
@@ -500,17 +450,13 @@ void mac_pram_write(int offset, __u8 *buffer, int len)
 
 void mac_poweroff(void)
 {
-	/*
-	 * MAC_ADB_IISI may need to be moved up here if it doesn't actually
-	 * work using the ADB packet method.  --David Kilzer
-	 */
-
 	if (oss_present) {
 		oss_shutdown();
 	} else if (macintosh_config->adb_type == MAC_ADB_II) {
 		via_shutdown();
 #ifdef CONFIG_ADB_CUDA
-	} else if (macintosh_config->adb_type == MAC_ADB_CUDA) {
+	} else if (macintosh_config->adb_type == MAC_ADB_EGRET ||
+	           macintosh_config->adb_type == MAC_ADB_CUDA) {
 		cuda_shutdown();
 #endif
 #ifdef CONFIG_ADB_PMU68K
@@ -550,7 +496,8 @@ void mac_reset(void)
 			local_irq_restore(flags);
 		}
 #ifdef CONFIG_ADB_CUDA
-	} else if (macintosh_config->adb_type == MAC_ADB_CUDA) {
+	} else if (macintosh_config->adb_type == MAC_ADB_EGRET ||
+	           macintosh_config->adb_type == MAC_ADB_CUDA) {
 		cuda_restart();
 #endif
 #ifdef CONFIG_ADB_PMU68K
@@ -699,13 +646,11 @@ int mac_hwclk(int op, struct rtc_time *t)
 		case MAC_ADB_IOP:
 			now = via_read_time();
 			break;
-		case MAC_ADB_IISI:
-			now = maciisi_read_time();
-			break;
 		case MAC_ADB_PB1:
 		case MAC_ADB_PB2:
 			now = pmu_read_time();
 			break;
+		case MAC_ADB_EGRET:
 		case MAC_ADB_CUDA:
 			now = cuda_read_time();
 			break;
@@ -737,6 +682,7 @@ int mac_hwclk(int op, struct rtc_time *t)
 		case MAC_ADB_IOP:
 			via_write_time(now);
 			break;
+		case MAC_ADB_EGRET:
 		case MAC_ADB_CUDA:
 			cuda_write_time(now);
 			break;
@@ -744,8 +690,6 @@ int mac_hwclk(int op, struct rtc_time *t)
 		case MAC_ADB_PB2:
 			pmu_write_time(now);
 			break;
-		case MAC_ADB_IISI:
-			maciisi_write_time(now);
 		}
 	}
 	return 0;
diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index 5d80810934df..97a420c11eed 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -30,14 +30,6 @@ config ADB_MACII
 	  Quadra 610, Quadra 650, Quadra 700, Quadra 800, Centris 610 and
 	  Centris 650.
 
-config ADB_MACIISI
-	bool "Include Mac IIsi ADB driver"
-	depends on ADB && MAC && BROKEN
-	help
-	  Say Y here if want your kernel to support Macintosh systems that use
-	  the Mac IIsi style ADB.  This includes the IIsi, IIvi, IIvx, Classic
-	  II, LC, LC II, LC III, Performa 460, and the Performa 600.
-
 config ADB_IOP
 	bool "Include IOP (IIfx/Quadra 9x0) ADB driver"
 	depends on ADB && MAC
@@ -60,17 +52,15 @@ config ADB_PMU68K
 
 # we want to change this to something like CONFIG_SYSCTRL_CUDA/PMU
 config ADB_CUDA
-	bool "Support for CUDA based Macs and PowerMacs"
+	bool "Support for Cuda/Egret based Macs and PowerMacs"
 	depends on (ADB || PPC_PMAC) && !PPC_PMAC64
 	help
-	  This provides support for CUDA based Macintosh and Power Macintosh
-	  systems.  This includes many m68k based Macs (Color Classic, Mac TV,
-	  Performa 475, Performa 520, Performa 550, Performa 575,
-	  Performa 588, Quadra 605, Quadra 630, Quadra/Centris 660AV, and
-	  Quadra 840AV), most OldWorld PowerMacs, the first generation iMacs,
-	  the Blue&White G3 and the "Yikes" G4 (PCI Graphics).  All later
-	  models should use CONFIG_ADB_PMU instead.  It is safe to say Y here
-	  even if your machine doesn't have a CUDA.
+	  This provides support for Cuda/Egret based Macintosh and
+	  Power Macintosh systems. This includes most m68k based Macs,
+	  most Old World PowerMacs, the first generation iMacs, the
+	  Blue & White G3 and the "Yikes" G4 (PCI Graphics). All later
+	  models should use CONFIG_ADB_PMU instead. It is safe to say Y
+	  here even if your machine doesn't have a Cuda or Egret device.
 
 	  If unsure say Y.
 
diff --git a/drivers/macintosh/Makefile b/drivers/macintosh/Makefile
index 383ba920085b..516eb65bcacc 100644
--- a/drivers/macintosh/Makefile
+++ b/drivers/macintosh/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_PMAC_SMU)		+= smu.o
 
 obj-$(CONFIG_ADB)		+= adb.o
 obj-$(CONFIG_ADB_MACII)		+= via-macii.o
-obj-$(CONFIG_ADB_MACIISI)	+= via-maciisi.o
 obj-$(CONFIG_ADB_IOP)		+= adb-iop.o
 obj-$(CONFIG_ADB_PMU68K)	+= via-pmu68k.o
 obj-$(CONFIG_ADB_MACIO)		+= macio-adb.o
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index 226179b975a0..152414e6378a 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -48,7 +48,6 @@
 EXPORT_SYMBOL(adb_client_list);
 
 extern struct adb_driver via_macii_driver;
-extern struct adb_driver via_maciisi_driver;
 extern struct adb_driver via_cuda_driver;
 extern struct adb_driver adb_iop_driver;
 extern struct adb_driver via_pmu_driver;
@@ -59,9 +58,6 @@ static struct adb_driver *adb_driver_list[] = {
 #ifdef CONFIG_ADB_MACII
 	&via_macii_driver,
 #endif
-#ifdef CONFIG_ADB_MACIISI
-	&via_maciisi_driver,
-#endif
 #ifdef CONFIG_ADB_CUDA
 	&via_cuda_driver,
 #endif
diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c
index 1a742bd9f612..c60415958dfe 100644
--- a/drivers/macintosh/via-cuda.c
+++ b/drivers/macintosh/via-cuda.c
@@ -205,12 +205,13 @@ int __init find_via_cuda(void)
     struct adb_request req;
     int err;
 
-    if (macintosh_config->adb_type != MAC_ADB_CUDA)
+    if (macintosh_config->adb_type != MAC_ADB_CUDA &&
+        macintosh_config->adb_type != MAC_ADB_EGRET)
 	return 0;
 
     via = via1;
     cuda_state = idle;
-    mcu_is_egret = false;
+    mcu_is_egret = macintosh_config->adb_type == MAC_ADB_EGRET;
 
     err = cuda_init_via();
     if (err) {
@@ -323,7 +324,8 @@ cuda_probe(void)
     if (sys_ctrler != SYS_CTRLER_CUDA)
 	return -ENODEV;
 #else
-    if (macintosh_config->adb_type != MAC_ADB_CUDA)
+    if (macintosh_config->adb_type != MAC_ADB_CUDA &&
+        macintosh_config->adb_type != MAC_ADB_EGRET)
 	return -ENODEV;
 #endif
     if (via == NULL)
diff --git a/drivers/macintosh/via-maciisi.c b/drivers/macintosh/via-maciisi.c
deleted file mode 100644
index 34d02a91b29f..000000000000
--- a/drivers/macintosh/via-maciisi.c
+++ /dev/null
@@ -1,677 +0,0 @@
-/*
- * Device driver for the IIsi-style ADB on some Mac LC and II-class machines
- *
- * Based on via-cuda.c and via-macii.c, as well as the original
- * adb-bus.c, which in turn is somewhat influenced by (but uses no
- * code from) the NetBSD HWDIRECT ADB code.  Original IIsi driver work
- * was done by Robert Thompson and integrated into the old style
- * driver by Michael Schmitz.
- *
- * Original sources (c) Alan Cox, Paul Mackerras, and others.
- *
- * Rewritten for Unified ADB by David Huggins-Daines <dhd@debian.org>
- * 
- * 7/13/2000- extensive changes by Andrew McPherson <andrew@macduff.dhs.org>
- * Works about 30% of the time now.
- */
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/adb.h>
-#include <linux/cuda.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <asm/macintosh.h>
-#include <asm/macints.h>
-#include <asm/mac_via.h>
-
-static volatile unsigned char *via;
-
-/* VIA registers - spaced 0x200 bytes apart - only the ones we actually use */
-#define RS		0x200		/* skip between registers */
-#define B		0		/* B-side data */
-#define A		RS		/* A-side data */
-#define DIRB		(2*RS)		/* B-side direction (1=output) */
-#define DIRA		(3*RS)		/* A-side direction (1=output) */
-#define SR		(10*RS)		/* Shift register */
-#define ACR		(11*RS)		/* Auxiliary control register */
-#define IFR		(13*RS)		/* Interrupt flag register */
-#define IER		(14*RS)		/* Interrupt enable register */
-
-/* Bits in B data register: all active low */
-#define TREQ		0x08		/* Transfer request (input) */
-#define TACK		0x10		/* Transfer acknowledge (output) */
-#define TIP		0x20		/* Transfer in progress (output) */
-#define ST_MASK		0x30		/* mask for selecting ADB state bits */
-
-/* Bits in ACR */
-#define SR_CTRL		0x1c		/* Shift register control bits */
-#define SR_EXT		0x0c		/* Shift on external clock */
-#define SR_OUT		0x10		/* Shift out if 1 */
-
-/* Bits in IFR and IER */
-#define IER_SET		0x80		/* set bits in IER */
-#define IER_CLR		0		/* clear bits in IER */
-#define SR_INT		0x04		/* Shift register full/empty */
-#define SR_DATA		0x08		/* Shift register data */
-#define SR_CLOCK	0x10		/* Shift register clock */
-
-#define ADB_DELAY 150
-
-#undef DEBUG_MACIISI_ADB
-
-static struct adb_request* current_req;
-static struct adb_request* last_req;
-static unsigned char maciisi_rbuf[16];
-static unsigned char *reply_ptr;
-static int data_index;
-static int reading_reply;
-static int reply_len;
-static int tmp;
-static int need_sync;
-
-static enum maciisi_state {
-    idle,
-    sending,
-    reading,
-} maciisi_state;
-
-static int maciisi_probe(void);
-static int maciisi_init(void);
-static int maciisi_send_request(struct adb_request* req, int sync);
-static void maciisi_sync(struct adb_request *req);
-static int maciisi_write(struct adb_request* req);
-static irqreturn_t maciisi_interrupt(int irq, void* arg);
-static void maciisi_input(unsigned char *buf, int nb);
-static int maciisi_init_via(void);
-static void maciisi_poll(void);
-static int maciisi_start(void);
-
-struct adb_driver via_maciisi_driver = {
-	"Mac IIsi",
-	maciisi_probe,
-	maciisi_init,
-	maciisi_send_request,
-	NULL, /* maciisi_adb_autopoll, */
-	maciisi_poll,
-	NULL /* maciisi_reset_adb_bus */
-};
-
-static int
-maciisi_probe(void)
-{
-	if (macintosh_config->adb_type != MAC_ADB_IISI)
-		return -ENODEV;
-
-	via = via1;
-	return 0;
-}
-
-static int
-maciisi_init(void)
-{
-	int err;
-
-	if (via == NULL)
-		return -ENODEV;
-
-	if ((err = maciisi_init_via())) {
-		printk(KERN_ERR "maciisi_init: maciisi_init_via() failed, code %d\n", err);
-		via = NULL;
-		return err;
-	}
-
-	if (request_irq(IRQ_MAC_ADB, maciisi_interrupt, 0, "ADB",
-			maciisi_interrupt)) {
-		printk(KERN_ERR "maciisi_init: can't get irq %d\n", IRQ_MAC_ADB);
-		return -EAGAIN;
-	}
-
-	printk("adb: Mac IIsi driver v0.2 for Unified ADB.\n");
-	return 0;
-}
-
-/* Flush data from the ADB controller */
-static void
-maciisi_stfu(void)
-{
-	int status = via[B] & (TIP|TREQ);
-
-	if (status & TREQ) {
-#ifdef DEBUG_MACIISI_ADB
-		printk (KERN_DEBUG "maciisi_stfu called with TREQ high!\n");
-#endif
-		return;
-	}
-	
-	udelay(ADB_DELAY);
-	via[ACR] &= ~SR_OUT;
-	via[IER] = IER_CLR | SR_INT;
-
-	udelay(ADB_DELAY);
-
-	status = via[B] & (TIP|TREQ);
-
-	if (!(status & TREQ))
-	{
-		via[B] |= TIP;
-
-		while(1)
-		{
-			int poll_timeout = ADB_DELAY * 5;
-			/* Poll for SR interrupt */
-			while (!(via[IFR] & SR_INT) && poll_timeout-- > 0)
-				status = via[B] & (TIP|TREQ);
-
-			tmp = via[SR]; /* Clear shift register */
-#ifdef DEBUG_MACIISI_ADB
-			printk(KERN_DEBUG "maciisi_stfu: status %x timeout %d data %x\n",
-			       status, poll_timeout, tmp);
-#endif	
-			if(via[B] & TREQ)
-				break;
-	
-			/* ACK on-off */
-			via[B] |= TACK;
-			udelay(ADB_DELAY);
-			via[B] &= ~TACK;
-		}
-
-		/* end frame */
-		via[B] &= ~TIP;
-		udelay(ADB_DELAY);
-	}
-
-	via[IER] = IER_SET | SR_INT;	
-}
-
-/* All specifically VIA-related initialization goes here */
-static int
-maciisi_init_via(void)
-{
-	int	i;
-	
-	/* Set the lines up. We want TREQ as input TACK|TIP as output */
-	via[DIRB] = (via[DIRB] | TACK | TIP) & ~TREQ;
-	/* Shift register on input */
-	via[ACR]  = (via[ACR] & ~SR_CTRL) | SR_EXT;
-#ifdef DEBUG_MACIISI_ADB
-	printk(KERN_DEBUG "maciisi_init_via: initial status %x\n", via[B] & (TIP|TREQ));
-#endif
-	/* Wipe any pending data and int */
-	tmp = via[SR];
-	/* Enable keyboard interrupts */
-	via[IER] = IER_SET | SR_INT;
-	/* Set initial state: idle */
-	via[B] &= ~(TACK|TIP);
-	/* Clear interrupt bit */
-	via[IFR] = SR_INT;
-
-	for(i = 0; i < 60; i++) {
-		udelay(ADB_DELAY);
-		maciisi_stfu();
-		udelay(ADB_DELAY);
-		if(via[B] & TREQ)
-			break;
-	}
-	if (i == 60)
-		printk(KERN_ERR "maciisi_init_via: bus jam?\n");
-
-	maciisi_state = idle;
-	need_sync = 0;
-
-	return 0;
-}
-
-/* Send a request, possibly waiting for a reply */
-static int
-maciisi_send_request(struct adb_request* req, int sync)
-{
-	int i;
-
-#ifdef DEBUG_MACIISI_ADB
-	static int dump_packet = 0;
-#endif
-
-	if (via == NULL) {
-		req->complete = 1;
-		return -ENXIO;
-	}
-
-#ifdef DEBUG_MACIISI_ADB
-	if (dump_packet) {
-		printk(KERN_DEBUG "maciisi_send_request:");
-		for (i = 0; i < req->nbytes; i++) {
-			printk(" %.2x", req->data[i]);
-		}
-		printk(" sync %d\n", sync);
-	}
-#endif
-
-	req->reply_expected = 1;
-	
-	i = maciisi_write(req);
-	if (i)
-	{
-		/* Normally, if a packet requires syncing, that happens at the end of
-		 * maciisi_send_request. But if the transfer fails, it will be restarted
-		 * by maciisi_interrupt(). We use need_sync to tell maciisi_interrupt
-		 * when to sync a packet that it sends out.
-		 * 
-		 * Suggestions on a better way to do this are welcome.
-		 */
-		if(i == -EBUSY && sync)
-			need_sync = 1;
-		else
-			need_sync = 0;
-		return i;
-	}
-	if(sync)
-		maciisi_sync(req);
-	
-	return 0;
-}
-
-/* Poll the ADB chip until the request completes */
-static void maciisi_sync(struct adb_request *req)
-{
-	int count = 0; 
-
-#ifdef DEBUG_MACIISI_ADB
-	printk(KERN_DEBUG "maciisi_sync called\n");
-#endif
-
-	/* If for some reason the ADB chip shuts up on us, we want to avoid an endless loop. */
-	while (!req->complete && count++ < 50) {
-		maciisi_poll();
-	}
-	/* This could be BAD... when the ADB controller doesn't respond
-	 * for this long, it's probably not coming back :-( */
-	if (count > 50) /* Hopefully shouldn't happen */
-		printk(KERN_ERR "maciisi_send_request: poll timed out!\n");
-}
-
-int
-maciisi_request(struct adb_request *req, void (*done)(struct adb_request *),
-	    int nbytes, ...)
-{
-	va_list list;
-	int i;
-
-	req->nbytes = nbytes;
-	req->done = done;
-	req->reply_expected = 0;
-	va_start(list, nbytes);
-	for (i = 0; i < nbytes; i++)
-		req->data[i++] = va_arg(list, int);
-	va_end(list);
-
-	return maciisi_send_request(req, 1);
-}
-
-/* Enqueue a request, and run the queue if possible */
-static int
-maciisi_write(struct adb_request* req)
-{
-	unsigned long flags;
-	int i;
-
-	/* We will accept CUDA packets - the VIA sends them to us, so
-           it figures that we should be able to send them to it */
-	if (req->nbytes < 2 || req->data[0] > CUDA_PACKET) {
-		printk(KERN_ERR "maciisi_write: packet too small or not an ADB or CUDA packet\n");
-		req->complete = 1;
-		return -EINVAL;
-	}
-	req->next = NULL;
-	req->sent = 0;
-	req->complete = 0;
-	req->reply_len = 0;
-	
-	local_irq_save(flags);
-
-	if (current_req) {
-		last_req->next = req;
-		last_req = req;
-	} else {
-		current_req = req;
-		last_req = req;
-	}
-	if (maciisi_state == idle)
-	{
-		i = maciisi_start();
-		if(i != 0)
-		{
-			local_irq_restore(flags);
-			return i;
-		}
-	}
-	else
-	{
-#ifdef DEBUG_MACIISI_ADB
-		printk(KERN_DEBUG "maciisi_write: would start, but state is %d\n", maciisi_state);
-#endif
-		local_irq_restore(flags);
-		return -EBUSY;
-	}
-
-	local_irq_restore(flags);
-
-	return 0;
-}
-
-static int
-maciisi_start(void)
-{
-	struct adb_request* req;
-	int status;
-
-#ifdef DEBUG_MACIISI_ADB
-	status = via[B] & (TIP | TREQ);
-
-	printk(KERN_DEBUG "maciisi_start called, state=%d, status=%x, ifr=%x\n", maciisi_state, status, via[IFR]);
-#endif
-
-	if (maciisi_state != idle) {
-		/* shouldn't happen */
-		printk(KERN_ERR "maciisi_start: maciisi_start called when driver busy!\n");
-		return -EBUSY;
-	}
-
-	req = current_req;
-	if (req == NULL)
-		return -EINVAL;
-
-	status = via[B] & (TIP|TREQ);
-	if (!(status & TREQ)) {
-#ifdef DEBUG_MACIISI_ADB
-		printk(KERN_DEBUG "maciisi_start: bus busy - aborting\n");
-#endif
-		return -EBUSY;
-	}
-
-	/* Okay, send */
-#ifdef DEBUG_MACIISI_ADB
-	printk(KERN_DEBUG "maciisi_start: sending\n");
-#endif
-	/* Set state to active */
-	via[B] |= TIP;
-	/* ACK off */
-	via[B] &= ~TACK;
-	/* Delay */
-	udelay(ADB_DELAY);
-	/* Shift out and send */
-	via[ACR] |= SR_OUT;
-	via[SR] = req->data[0];
-	data_index = 1;
-	/* ACK on */
-	via[B] |= TACK;
-	maciisi_state = sending;
-
-	return 0;
-}
-
-void
-maciisi_poll(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	if (via[IFR] & SR_INT) {
-		maciisi_interrupt(0, NULL);
-	}
-	else /* avoid calling this function too quickly in a loop */
-		udelay(ADB_DELAY);
-
-	local_irq_restore(flags);
-}
-
-/* Shift register interrupt - this is *supposed* to mean that the
-   register is either full or empty. In practice, I have no idea what
-   it means :( */
-static irqreturn_t
-maciisi_interrupt(int irq, void* arg)
-{
-	int status;
-	struct adb_request *req;
-#ifdef DEBUG_MACIISI_ADB
-	static int dump_reply = 0;
-#endif
-	int i;
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	status = via[B] & (TIP|TREQ);
-#ifdef DEBUG_MACIISI_ADB
-	printk(KERN_DEBUG "state %d status %x ifr %x\n", maciisi_state, status, via[IFR]);
-#endif
-
-	if (!(via[IFR] & SR_INT)) {
-		/* Shouldn't happen, we hope */
-		printk(KERN_ERR "maciisi_interrupt: called without interrupt flag set\n");
-		local_irq_restore(flags);
-		return IRQ_NONE;
-	}
-
-	/* Clear the interrupt */
-	/* via[IFR] = SR_INT; */
-
- switch_start:
-	switch (maciisi_state) {
-	case idle:
-		if (status & TIP)
-			printk(KERN_ERR "maciisi_interrupt: state is idle but TIP asserted!\n");
-
-		if(!reading_reply)
-			udelay(ADB_DELAY);
-		/* Shift in */
-		via[ACR] &= ~SR_OUT;
- 		/* Signal start of frame */
-		via[B] |= TIP;
-		/* Clear the interrupt (throw this value on the floor, it's useless) */
-		tmp = via[SR];
-		/* ACK adb chip, high-low */
-		via[B] |= TACK;
-		udelay(ADB_DELAY);
-		via[B] &= ~TACK;
-		reply_len = 0;
-		maciisi_state = reading;
-		if (reading_reply) {
-			reply_ptr = current_req->reply;
-		} else {
-			reply_ptr = maciisi_rbuf;
-		}
-		break;
-
-	case sending:
-		/* via[SR]; */
-		/* Set ACK off */
-		via[B] &= ~TACK;
-		req = current_req;
-
-		if (!(status & TREQ)) {
-			/* collision */
-			printk(KERN_ERR "maciisi_interrupt: send collision\n");
-			/* Set idle and input */
-			via[ACR] &= ~SR_OUT;
-			tmp = via[SR];
-			via[B] &= ~TIP;
-			/* Must re-send */
-			reading_reply = 0;
-			reply_len = 0;
-			maciisi_state = idle;
-			udelay(ADB_DELAY);
-			/* process this now, because the IFR has been cleared */
-			goto switch_start;
-		}
-
-		udelay(ADB_DELAY);
-
-		if (data_index >= req->nbytes) {
-			/* Sent the whole packet, put the bus back in idle state */
-			/* Shift in, we are about to read a reply (hopefully) */
-			via[ACR] &= ~SR_OUT;
-			tmp = via[SR];
-			/* End of frame */
-			via[B] &= ~TIP;
-			req->sent = 1;
-			maciisi_state = idle;
-			if (req->reply_expected) {
-				/* Note: only set this once we've
-                                   successfully sent the packet */
-				reading_reply = 1;
-			} else {
-				current_req = req->next;
-				if (req->done)
-					(*req->done)(req);
-				/* Do any queued requests now */
-				i = maciisi_start();
-				if(i == 0 && need_sync) {
-					/* Packet needs to be synced */
-					maciisi_sync(current_req);
-				}
-				if(i != -EBUSY)
-					need_sync = 0;
-			}
-		} else {
-			/* Sending more stuff */
-			/* Shift out */
-			via[ACR] |= SR_OUT;
-			/* Write */
-			via[SR] = req->data[data_index++];
-			/* Signal 'byte ready' */
-			via[B] |= TACK;
-		}
-		break;
-
-	case reading:
-		/* Shift in */
-		/* via[ACR] &= ~SR_OUT; */ /* Not in 2.2 */
-		if (reply_len++ > 16) {
-			printk(KERN_ERR "maciisi_interrupt: reply too long, aborting read\n");
-			via[B] |= TACK;
-			udelay(ADB_DELAY);
-			via[B] &= ~(TACK|TIP);
-			maciisi_state = idle;
-			i = maciisi_start();
-			if(i == 0 && need_sync) {
-				/* Packet needs to be synced */
-				maciisi_sync(current_req);
-			}
-			if(i != -EBUSY)
-				need_sync = 0;
-			break;
-		}
-		/* Read data */
-		*reply_ptr++ = via[SR];
-		status = via[B] & (TIP|TREQ);
-		/* ACK on/off */
-		via[B] |= TACK;
-		udelay(ADB_DELAY);
-		via[B] &= ~TACK;	
-		if (!(status & TREQ))
-			break; /* more stuff to deal with */
-		
-		/* end of frame */
-		via[B] &= ~TIP;
-		tmp = via[SR]; /* That's what happens in 2.2 */
-		udelay(ADB_DELAY); /* Give controller time to recover */
-
-		/* end of packet, deal with it */
-		if (reading_reply) {
-			req = current_req;
-			req->reply_len = reply_ptr - req->reply;
-			if (req->data[0] == ADB_PACKET) {
-				/* Have to adjust the reply from ADB commands */
-				if (req->reply_len <= 2 || (req->reply[1] & 2) != 0) {
-					/* the 0x2 bit indicates no response */
-					req->reply_len = 0;
-				} else {
-					/* leave just the command and result bytes in the reply */
-					req->reply_len -= 2;
-					memmove(req->reply, req->reply + 2, req->reply_len);
-				}
-			}
-#ifdef DEBUG_MACIISI_ADB
-			if (dump_reply) {
-				int i;
-				printk(KERN_DEBUG "maciisi_interrupt: reply is ");
-				for (i = 0; i < req->reply_len; ++i)
-					printk(" %.2x", req->reply[i]);
-				printk("\n");
-			}
-#endif
-			req->complete = 1;
-			current_req = req->next;
-			if (req->done)
-				(*req->done)(req);
-			/* Obviously, we got it */
-			reading_reply = 0;
-		} else {
-			maciisi_input(maciisi_rbuf, reply_ptr - maciisi_rbuf);
-		}
-		maciisi_state = idle;
-		status = via[B] & (TIP|TREQ);
-		if (!(status & TREQ)) {
-			/* Timeout?! More likely, another packet coming in already */
-#ifdef DEBUG_MACIISI_ADB
-			printk(KERN_DEBUG "extra data after packet: status %x ifr %x\n",
-			       status, via[IFR]);
-#endif
-#if 0
-			udelay(ADB_DELAY);
-			via[B] |= TIP;
-
-			maciisi_state = reading;
-			reading_reply = 0;
-			reply_ptr = maciisi_rbuf;
-#else
-			/* Process the packet now */
-			reading_reply = 0;
-			goto switch_start;
-#endif
-			/* We used to do this... but the controller might actually have data for us */
-			/* maciisi_stfu(); */
-		}
-		else {
-			/* Do any queued requests now if possible */
-			i = maciisi_start();
-			if(i == 0 && need_sync) {
-				/* Packet needs to be synced */
-				maciisi_sync(current_req);
-			}
-			if(i != -EBUSY)
-				need_sync = 0;
-		}
-		break;
-
-	default:
-		printk("maciisi_interrupt: unknown maciisi_state %d?\n", maciisi_state);
-	}
-	local_irq_restore(flags);
-	return IRQ_HANDLED;
-}
-
-static void
-maciisi_input(unsigned char *buf, int nb)
-{
-#ifdef DEBUG_MACIISI_ADB
-    int i;
-#endif
-
-    switch (buf[0]) {
-    case ADB_PACKET:
-	    adb_input(buf+2, nb-2, buf[1] & 0x40);
-	    break;
-    default:
-#ifdef DEBUG_MACIISI_ADB
-	    printk(KERN_DEBUG "data from IIsi ADB (%d bytes):", nb);
-	    for (i = 0; i < nb; ++i)
-		    printk(" %.2x", buf[i]);
-	    printk("\n");
-#endif
-	    break;
-    }
-}
-- 
cgit v1.2.3


From 63b2547b155de2c41d40ea750155749232dde511 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Tue, 7 Feb 2017 23:36:41 +0530
Subject: powerpc/64: Include bpf/bcc related config options in defconfigs

Specifically:
 - CONFIG_BPF_SYSCALL
 - CONFIG_NET_SCHED
 - CONFIG_NET_CLS_BPF
 - CONFIG_NET_CLS_ACT
 - CONFIG_NET_ACT_BPF
 - CONFIG_CGROUP_BPF
 - CONFIG_UPROBE_EVENT

... in pseries, ppc64 and powernv defconfigs.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/powernv_defconfig | 7 +++++++
 arch/powerpc/configs/ppc64_defconfig   | 7 +++++++
 arch/powerpc/configs/pseries_defconfig | 7 +++++++
 3 files changed, 21 insertions(+)

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index b793550fac91..7a60b026247a 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -26,9 +26,11 @@ CONFIG_CGROUP_FREEZER=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_DEVICE=y
 CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_BPF=y
 CONFIG_CGROUP_PERF=y
 CONFIG_USER_NS=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
@@ -79,6 +81,10 @@ CONFIG_NETFILTER=y
 # CONFIG_NETFILTER_ADVANCED is not set
 CONFIG_BRIDGE=m
 CONFIG_VLAN_8021Q=m
+CONFIG_NET_SCHED=y
+CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_BPF=m
 CONFIG_BPF_JIT=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -290,6 +296,7 @@ CONFIG_LOCKUP_DETECTOR=y
 CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_UPROBE_EVENT=y
 CONFIG_CODE_PATCHING_SELFTEST=y
 CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 0396126ba6a8..4f1288b04303 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -14,7 +14,9 @@ CONFIG_LOG_BUF_SHIFT=18
 CONFIG_LOG_CPU_MAX_BUF_SHIFT=13
 CONFIG_CGROUPS=y
 CONFIG_CPUSETS=y
+CONFIG_CGROUP_BPF=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
@@ -76,6 +78,10 @@ CONFIG_INET_IPCOMP=m
 CONFIG_NETFILTER=y
 # CONFIG_NETFILTER_ADVANCED is not set
 CONFIG_BRIDGE=m
+CONFIG_NET_SCHED=y
+CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_BPF=m
 CONFIG_BPF_JIT=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -324,6 +330,7 @@ CONFIG_DEBUG_MUTEXES=y
 CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_UPROBE_EVENT=y
 CONFIG_CODE_PATCHING_SELFTEST=y
 CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index d99734f3b868..6d0eb02fefa4 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -24,12 +24,14 @@ CONFIG_CGROUP_FREEZER=y
 CONFIG_CGROUP_DEVICE=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_BPF=y
 CONFIG_MEMCG=y
 CONFIG_MEMCG_SWAP=y
 CONFIG_CGROUP_PERF=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_USER_NS=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
@@ -82,6 +84,10 @@ CONFIG_NETFILTER=y
 # CONFIG_NETFILTER_ADVANCED is not set
 CONFIG_BRIDGE=m
 CONFIG_VLAN_8021Q=m
+CONFIG_NET_SCHED=y
+CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_BPF=m
 CONFIG_BPF_JIT=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -290,6 +296,7 @@ CONFIG_LOCKUP_DETECTOR=y
 CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_UPROBE_EVENT=y
 CONFIG_CODE_PATCHING_SELFTEST=y
 CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
-- 
cgit v1.2.3


From e623c54ec995899cf4179873070861dd438b8a9c Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Mon, 6 Feb 2017 13:55:43 +1100
Subject: powerpc/powernv: Add XHCI and USB storage to defconfig

These are common on bare metal machines, so put them in the defconfig.

This adds 216KB to the vmlinux size

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/powernv_defconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 7a60b026247a..ac8b8332ed82 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -220,10 +220,11 @@ CONFIG_HID_SUNPLUS=y
 CONFIG_USB_HIDDEV=y
 CONFIG_USB=y
 CONFIG_USB_MON=m
+CONFIG_USB_XHCI_HCD=y
 CONFIG_USB_EHCI_HCD=y
 # CONFIG_USB_EHCI_HCD_PPC_OF is not set
 CONFIG_USB_OHCI_HCD=y
-CONFIG_USB_STORAGE=m
+CONFIG_USB_STORAGE=y
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=m
 CONFIG_LEDS_POWERNV=m
-- 
cgit v1.2.3


From 470a36a8c014e5cac7bb2df382948597f7ec1b2c Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 16 Nov 2016 10:58:02 +0530
Subject: powerpc/powernv: Display the correct error info for CAPP errors.

On some CAPP errors we see console messages that print unknown HMIs for
which CAPI recovery is in progress. This patch fixes this by printing
correct error info for HMI generated due to CAPP recovery.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Tested-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-hmi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index c0a8201cb4d9..88f3c61eec95 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -180,7 +180,8 @@ static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
 		"An XSCOM operation completed",
 		"SCOM has set a reserved FIR bit to cause recovery",
 		"Debug trigger has set a reserved FIR bit to cause recovery",
-		"A hypervisor resource error occurred"
+		"A hypervisor resource error occurred",
+		"CAPP recovery process is in progress",
 	};
 
 	/* Print things out */
-- 
cgit v1.2.3


From 2717a33d60745f2f72e521cdaedf79b00f66f8ca Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 6 Feb 2017 16:07:36 +1100
Subject: powerpc/opal-irqchip: Use interrupt names if present

Recent versions of OPAL can provide names for the various OPAL interrupts,
so let's use them. This also modernises the code that fetches the
interrupt array to use the helpers provided by the generic code instead
of hand-parsing the property.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Free irqs on error, check allocation of names, consolidate error
      handling, whitespace.]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-irqchip.c | 55 ++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
index 998316bf2dad..ecdcba9d1220 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -183,8 +183,9 @@ void opal_event_shutdown(void)
 int __init opal_event_init(void)
 {
 	struct device_node *dn, *opal_node;
-	const __be32 *irqs;
-	int i, irqlen, rc = 0;
+	const char **names;
+	u32 *irqs;
+	int i, rc;
 
 	opal_node = of_find_node_by_path("/ibm,opal");
 	if (!opal_node) {
@@ -209,31 +210,56 @@ int __init opal_event_init(void)
 		goto out;
 	}
 
-	/* Get interrupt property */
-	irqs = of_get_property(opal_node, "opal-interrupts", &irqlen);
-	opal_irq_count = irqs ? (irqlen / 4) : 0;
+	/* Get opal-interrupts property and names if present */
+	rc = of_property_count_u32_elems(opal_node, "opal-interrupts");
+	if (rc < 0)
+		goto out;
+
+	opal_irq_count = rc;
 	pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count);
 
-	/* Install interrupt handlers */
+	irqs = kcalloc(opal_irq_count, sizeof(*irqs), GFP_KERNEL);
+	names = kcalloc(opal_irq_count, sizeof(*names), GFP_KERNEL);
 	opal_irqs = kcalloc(opal_irq_count, sizeof(*opal_irqs), GFP_KERNEL);
-	for (i = 0; irqs && i < opal_irq_count; i++, irqs++) {
-		unsigned int irq, virq;
+
+	if (WARN_ON(!irqs || !names || !opal_irqs))
+		goto out_free;
+
+	rc = of_property_read_u32_array(opal_node, "opal-interrupts",
+					irqs, opal_irq_count);
+	if (rc < 0) {
+		pr_err("Error %d reading opal-interrupts array\n", rc);
+		goto out_free;
+	}
+
+	/* It's not an error for the names to be missing */
+	of_property_read_string_array(opal_node, "opal-interrupts-names",
+				      names, opal_irq_count);
+
+	/* Install interrupt handlers */
+	for (i = 0; i < opal_irq_count; i++) {
+		unsigned int virq;
+		char *name;
 
 		/* Get hardware and virtual IRQ */
-		irq = be32_to_cpup(irqs);
-		virq = irq_create_mapping(NULL, irq);
+		virq = irq_create_mapping(NULL, irqs[i]);
 		if (!virq) {
-			pr_warn("Failed to map irq 0x%x\n", irq);
+			pr_warn("Failed to map irq 0x%x\n", irqs[i]);
 			continue;
 		}
 
+		if (names[i] && strlen(names[i]))
+			name = kasprintf(GFP_KERNEL, "opal-%s", names[i]);
+		else
+			name = kasprintf(GFP_KERNEL, "opal");
+
 		/* Install interrupt handler */
 		rc = request_irq(virq, opal_interrupt, IRQF_TRIGGER_LOW,
-				 "opal", NULL);
+				 name, NULL);
 		if (rc) {
 			irq_dispose_mapping(virq);
 			pr_warn("Error %d requesting irq %d (0x%x)\n",
-				 rc, virq, irq);
+				 rc, virq, irqs[i]);
 			continue;
 		}
 
@@ -241,6 +267,9 @@ int __init opal_event_init(void)
 		opal_irqs[i] = virq;
 	}
 
+out_free:
+	kfree(irqs);
+	kfree(names);
 out:
 	of_node_put(opal_node);
 	return rc;
-- 
cgit v1.2.3


From 86f7ce4b4751aaf51ebcff6dc21e38b09894a915 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 7 Feb 2017 11:37:28 +1100
Subject: powerpc/opal-lpc: Remove unneeded include

We don't need asm/xics.h

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-lpc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index 399908bd9954..a91d7876fae2 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -18,7 +18,6 @@
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
-#include <asm/xics.h>
 #include <asm/opal.h>
 #include <asm/prom.h>
 #include <linux/uaccess.h>
-- 
cgit v1.2.3


From de55ce0de94b5daa804f69aa6ede793928900614 Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Fri, 3 Feb 2017 13:43:16 +1300
Subject: Documentation: powerpc/fsl: Update compatible for l2cache binding

List all the current valid compatible strings for the l2cache binding.
This should stop checkpatch.pl from complaining and will hopefully save
someone from having to debug a typo in their dts.

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../devicetree/bindings/powerpc/fsl/l2cache.txt    | 42 ++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt b/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt
index c41b2187eaa8..dc9bb3182525 100644
--- a/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt
+++ b/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt
@@ -5,8 +5,46 @@ The cache bindings explained below are ePAPR compliant
 
 Required Properties:
 
-- compatible	: Should include "fsl,chip-l2-cache-controller" and "cache"
-		  where chip is the processor (bsc9132, npc8572 etc.)
+- compatible	: Should include one of the following:
+		  "fsl,8540-l2-cache-controller"
+		  "fsl,8541-l2-cache-controller"
+		  "fsl,8544-l2-cache-controller"
+		  "fsl,8548-l2-cache-controller"
+		  "fsl,8555-l2-cache-controller"
+		  "fsl,8568-l2-cache-controller"
+		  "fsl,b4420-l2-cache-controller"
+		  "fsl,b4860-l2-cache-controller"
+		  "fsl,bsc9131-l2-cache-controller"
+		  "fsl,bsc9132-l2-cache-controller"
+		  "fsl,c293-l2-cache-controller"
+		  "fsl,mpc8536-l2-cache-controller"
+		  "fsl,mpc8540-l2-cache-controller"
+		  "fsl,mpc8541-l2-cache-controller"
+		  "fsl,mpc8544-l2-cache-controller"
+		  "fsl,mpc8548-l2-cache-controller"
+		  "fsl,mpc8555-l2-cache-controller"
+		  "fsl,mpc8560-l2-cache-controller"
+		  "fsl,mpc8568-l2-cache-controller"
+		  "fsl,mpc8569-l2-cache-controller"
+		  "fsl,mpc8572-l2-cache-controller"
+		  "fsl,p1010-l2-cache-controller"
+		  "fsl,p1011-l2-cache-controller"
+		  "fsl,p1012-l2-cache-controller"
+		  "fsl,p1013-l2-cache-controller"
+		  "fsl,p1014-l2-cache-controller"
+		  "fsl,p1015-l2-cache-controller"
+		  "fsl,p1016-l2-cache-controller"
+		  "fsl,p1020-l2-cache-controller"
+		  "fsl,p1021-l2-cache-controller"
+		  "fsl,p1022-l2-cache-controller"
+		  "fsl,p1023-l2-cache-controller"
+		  "fsl,p1024-l2-cache-controller"
+		  "fsl,p1025-l2-cache-controller"
+		  "fsl,p2010-l2-cache-controller"
+		  "fsl,p2020-l2-cache-controller"
+		  "fsl,t2080-l2-cache-controller"
+		  "fsl,t4240-l2-cache-controller"
+		  and "cache".
 - reg		: Address and size of L2 cache controller registers
 - cache-size	: Size of the entire L2 cache
 - interrupts	: Error interrupt of L2 controller
-- 
cgit v1.2.3


From 64b40ffbc83029f035571cad9727e34e69dbf6d0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 9 Dec 2016 11:07:35 +1100
Subject: powerpc/pseries: Add hypercall wrappers for hash page table resizing

This adds the hypercall numbers and wrapper functions for the hash page
table resizing hypercalls.

These hypercall numbers are defined in the PAPR ACR "HPT resizing
option".

It also adds a new firmware feature flag to track the presence of the
HPT resizing calls.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/firmware.h       |  5 +++--
 arch/powerpc/include/asm/hvcall.h         |  2 ++
 arch/powerpc/include/asm/plpar_wrappers.h | 12 ++++++++++++
 arch/powerpc/platforms/pseries/firmware.c |  1 +
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 1e0b5a5d660a..8645897472b1 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -42,7 +42,7 @@
 #define FW_FEATURE_SPLPAR	ASM_CONST(0x0000000000100000)
 #define FW_FEATURE_LPAR		ASM_CONST(0x0000000000400000)
 #define FW_FEATURE_PS3_LV1	ASM_CONST(0x0000000000800000)
-/* Free				ASM_CONST(0x0000000001000000) */
+#define FW_FEATURE_HPT_RESIZE	ASM_CONST(0x0000000001000000)
 #define FW_FEATURE_CMO		ASM_CONST(0x0000000002000000)
 #define FW_FEATURE_VPHN		ASM_CONST(0x0000000004000000)
 #define FW_FEATURE_XCMO		ASM_CONST(0x0000000008000000)
@@ -66,7 +66,8 @@ enum {
 		FW_FEATURE_MULTITCE | FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
 		FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
 		FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
-		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN,
+		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
+		FW_FEATURE_HPT_RESIZE,
 	FW_FEATURE_PSERIES_ALWAYS = 0,
 	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL,
 	FW_FEATURE_POWERNV_ALWAYS = 0,
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 77ff1ba99d1f..490c4b9f4e3a 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -276,6 +276,8 @@
 #define H_GET_MPP_X		0x314
 #define H_SET_MODE		0x31C
 #define H_CLEAR_HPT		0x358
+#define H_RESIZE_HPT_PREPARE	0x36C
+#define H_RESIZE_HPT_COMMIT	0x370
 #define H_SIGNAL_SYS_RESET	0x380
 #define MAX_HCALL_OPCODE	H_SIGNAL_SYS_RESET
 
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index 0bcc75e295e3..c7b164836bc3 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -210,6 +210,18 @@ static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex,
 	return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn);
 }
 
+static inline long plpar_resize_hpt_prepare(unsigned long flags,
+					    unsigned long shift)
+{
+	return plpar_hcall_norets(H_RESIZE_HPT_PREPARE, flags, shift);
+}
+
+static inline long plpar_resize_hpt_commit(unsigned long flags,
+					   unsigned long shift)
+{
+	return plpar_hcall_norets(H_RESIZE_HPT_COMMIT, flags, shift);
+}
+
 static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba,
 		unsigned long *tce_ret)
 {
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index ea7f09bd73b1..658c02df2f36 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -64,6 +64,7 @@ hypertas_fw_features_table[] = {
 	{FW_FEATURE_VPHN,		"hcall-vphn"},
 	{FW_FEATURE_SET_MODE,		"hcall-set-mode"},
 	{FW_FEATURE_BEST_ENERGY,	"hcall-best-energy-1*"},
+	{FW_FEATURE_HPT_RESIZE,		"hcall-hpt-resize"},
 };
 
 /* Build up the firmware features bitmask using the contents of
-- 
cgit v1.2.3


From dbcf929c0062b758fbb6312ccaf30716c0c0a608 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 9 Dec 2016 11:07:36 +1100
Subject: powerpc/pseries: Add support for hash table resizing

This adds support for using two hypercalls to change the size of the
main hash page table while running as a PAPR guest. For now these
hypercalls are only in experimental qemu versions.

The interface is two part: first H_RESIZE_HPT_PREPARE is used to
allocate and prepare the new hash table. This may be slow, but can be
done asynchronously. Then, H_RESIZE_HPT_COMMIT is used to switch to the
new hash table. This requires that no CPUs be concurrently updating the
HPT, and so must be run under stop_machine().

This also adds a debugfs file which can be used to manually control
HPT resizing for testing purposes.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Paul Mackerras <paulus@samba.org>
[mpe: Rename the debugfs file to "hpt_order"]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |   1 +
 arch/powerpc/mm/hash_utils_64.c               |  33 ++++++++
 arch/powerpc/platforms/pseries/lpar.c         | 109 ++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 823015cff149..52d8d1e4b772 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -157,6 +157,7 @@ struct mmu_hash_ops {
 					       unsigned long addr,
 					       unsigned char *hpte_slot_array,
 					       int psize, int ssize, int local);
+	int		(*resize_hpt)(unsigned long shift);
 	/*
 	 * Special for kexec.
 	 * To be called in real mode with interrupts disabled. No locks are
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 67e19a0821be..a3371d4e35b6 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,7 +35,9 @@
 #include <linux/memblock.h>
 #include <linux/context_tracking.h>
 #include <linux/libfdt.h>
+#include <linux/debugfs.h>
 
+#include <asm/debug.h>
 #include <asm/processor.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -1795,3 +1797,34 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
 	/* Finally limit subsequent allocations */
 	memblock_set_current_limit(ppc64_rma_size);
 }
+
+#ifdef CONFIG_DEBUG_FS
+
+static int hpt_order_get(void *data, u64 *val)
+{
+	*val = ppc64_pft_size;
+	return 0;
+}
+
+static int hpt_order_set(void *data, u64 val)
+{
+	if (!mmu_hash_ops.resize_hpt)
+		return -ENODEV;
+
+	return mmu_hash_ops.resize_hpt(val);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
+
+static int __init hash64_debugfs(void)
+{
+	if (!debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root,
+				 NULL, &fops_hpt_order)) {
+		pr_err("lpar: unable to create hpt_order debugsfs file\n");
+	}
+
+	return 0;
+}
+machine_device_initcall(pseries, hash64_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 5dc1c3c6e716..c2e13a51f369 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -27,6 +27,8 @@
 #include <linux/console.h>
 #include <linux/export.h>
 #include <linux/jump_label.h>
+#include <linux/delay.h>
+#include <linux/stop_machine.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/page.h>
@@ -609,6 +611,112 @@ static int __init disable_bulk_remove(char *str)
 
 __setup("bulk_remove=", disable_bulk_remove);
 
+#define HPT_RESIZE_TIMEOUT	10000 /* ms */
+
+struct hpt_resize_state {
+	unsigned long shift;
+	int commit_rc;
+};
+
+static int pseries_lpar_resize_hpt_commit(void *data)
+{
+	struct hpt_resize_state *state = data;
+
+	state->commit_rc = plpar_resize_hpt_commit(0, state->shift);
+	if (state->commit_rc != H_SUCCESS)
+		return -EIO;
+
+	/* Hypervisor has transitioned the HTAB, update our globals */
+	ppc64_pft_size = state->shift;
+	htab_size_bytes = 1UL << ppc64_pft_size;
+	htab_hash_mask = (htab_size_bytes >> 7) - 1;
+
+	return 0;
+}
+
+/* Must be called in user context */
+static int pseries_lpar_resize_hpt(unsigned long shift)
+{
+	struct hpt_resize_state state = {
+		.shift = shift,
+		.commit_rc = H_FUNCTION,
+	};
+	unsigned int delay, total_delay = 0;
+	int rc;
+	ktime_t t0, t1, t2;
+
+	might_sleep();
+
+	if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE))
+		return -ENODEV;
+
+	printk(KERN_INFO "lpar: Attempting to resize HPT to shift %lu\n",
+	       shift);
+
+	t0 = ktime_get();
+
+	rc = plpar_resize_hpt_prepare(0, shift);
+	while (H_IS_LONG_BUSY(rc)) {
+		delay = get_longbusy_msecs(rc);
+		total_delay += delay;
+		if (total_delay > HPT_RESIZE_TIMEOUT) {
+			/* prepare with shift==0 cancels an in-progress resize */
+			rc = plpar_resize_hpt_prepare(0, 0);
+			if (rc != H_SUCCESS)
+				printk(KERN_WARNING
+				       "lpar: Unexpected error %d cancelling timed out HPT resize\n",
+				       rc);
+			return -ETIMEDOUT;
+		}
+		msleep(delay);
+		rc = plpar_resize_hpt_prepare(0, shift);
+	};
+
+	switch (rc) {
+	case H_SUCCESS:
+		/* Continue on */
+		break;
+
+	case H_PARAMETER:
+		return -EINVAL;
+	case H_RESOURCE:
+		return -EPERM;
+	default:
+		printk(KERN_WARNING
+		       "lpar: Unexpected error %d from H_RESIZE_HPT_PREPARE\n",
+		       rc);
+		return -EIO;
+	}
+
+	t1 = ktime_get();
+
+	rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
+
+	t2 = ktime_get();
+
+	if (rc != 0) {
+		switch (state.commit_rc) {
+		case H_PTEG_FULL:
+			printk(KERN_WARNING
+			       "lpar: Hash collision while resizing HPT\n");
+			return -ENOSPC;
+
+		default:
+			printk(KERN_WARNING
+			       "lpar: Unexpected error %d from H_RESIZE_HPT_COMMIT\n",
+			       state.commit_rc);
+			return -EIO;
+		};
+	}
+
+	printk(KERN_INFO
+	       "lpar: HPT resize to shift %lu complete (%lld ms / %lld ms)\n",
+	       shift, (long long) ktime_ms_delta(t1, t0),
+	       (long long) ktime_ms_delta(t2, t1));
+
+	return 0;
+}
+
 void __init hpte_init_pseries(void)
 {
 	mmu_hash_ops.hpte_invalidate	 = pSeries_lpar_hpte_invalidate;
@@ -620,6 +728,7 @@ void __init hpte_init_pseries(void)
 	mmu_hash_ops.flush_hash_range	 = pSeries_lpar_flush_hash_range;
 	mmu_hash_ops.hpte_clear_all      = pseries_hpte_clear_all;
 	mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
+	mmu_hash_ops.resize_hpt		 = pseries_lpar_resize_hpt;
 }
 
 #ifdef CONFIG_PPC_SMLPAR
-- 
cgit v1.2.3


From 0de0fb09bbce1e1635a0d4c4781af6ec8cbfdb81 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 9 Dec 2016 11:07:37 +1100
Subject: powerpc/pseries: Advertise HPT resizing support via CAS

The hypervisor needs to know a guest is capable of using the HPT resizing
PAPR extension in order to take full advantage of it for memory hotplug.

If the hypervisor knows the guest is HPT resize aware, it can size the
initial HPT based on the initial guest RAM size, relying on the guest to
resize the HPT when more memory is hot-added. Without this, the hypervisor
must size the HPT for the maximum possible guest RAM, which can lead to
a huge waste of space if the guest never actually expands to that maximum
size.

This patch advertises the guest's support for HPT resizing via the
ibm,client-architecture-support OF interface. We use bit 5 of byte 6 of
option vector 5 for this purpose, as defined in the PAPR ACR "HPT
resizing option".

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Reviewed-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/prom.h | 1 +
 arch/powerpc/kernel/prom_init.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 5e57705b4759..00fcfcbdd053 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -151,6 +151,7 @@ struct of_drconf_cell {
 #define OV5_XCMO		0x0440	/* Page Coalescing */
 #define OV5_TYPE1_AFFINITY	0x0580	/* Type 1 NUMA affinity */
 #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
+#define OV5_RESIZE_HPT		0x0601	/* Hash Page Table resizing */
 #define OV5_PFO_HW_RNG		0x0E80	/* PFO Random Number Generator */
 #define OV5_PFO_HW_842		0x0E40	/* PFO Compression Accelerator */
 #define OV5_PFO_HW_ENCR		0x0E20	/* PFO Encryption Accelerator */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index ec47a939cbdd..d16b0f005290 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -826,7 +826,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		0,
 #endif
 		.associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
-		.bin_opts = 0,
+		.bin_opts = OV5_FEAT(OV5_RESIZE_HPT),
 		.micro_checkpoint = 0,
 		.reserved0 = 0,
 		.max_cpus = cpu_to_be32(NR_CPUS),	/* number of cores supported */
-- 
cgit v1.2.3


From 438cc81a41e8c2905d0f4f300d4690445b2ab240 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 9 Dec 2016 11:07:38 +1100
Subject: powerpc/pseries: Automatically resize HPT for memory hot add/remove

We've now implemented code in the pseries platform to use the new PAPR
interface to allow resizing the hash page table (HPT) at runtime.

This patch uses that interface to automatically attempt to resize the HPT
when memory is hot added or removed.  This tries to always keep the HPT at
a reasonable size for our current memory size.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/sparsemem.h |  1 +
 arch/powerpc/mm/hash_utils_64.c      | 29 +++++++++++++++++++++++++++++
 arch/powerpc/mm/mem.c                |  4 ++++
 3 files changed, 34 insertions(+)

diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
index f6fc0ee813d7..737335c891e4 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -16,6 +16,7 @@
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+extern void resize_hpt_for_hotplug(unsigned long new_mem_size);
 extern int create_section_mapping(unsigned long start, unsigned long end);
 extern int remove_section_mapping(unsigned long start, unsigned long end);
 #ifdef CONFIG_NUMA
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index a3371d4e35b6..12d679df50bd 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -749,6 +749,35 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+void resize_hpt_for_hotplug(unsigned long new_mem_size)
+{
+	unsigned target_hpt_shift;
+
+	if (!mmu_hash_ops.resize_hpt)
+		return;
+
+	target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
+
+	/*
+	 * To avoid lots of HPT resizes if memory size is fluctuating
+	 * across a boundary, we deliberately have some hysterisis
+	 * here: we immediately increase the HPT size if the target
+	 * shift exceeds the current shift, but we won't attempt to
+	 * reduce unless the target shift is at least 2 below the
+	 * current shift
+	 */
+	if ((target_hpt_shift > ppc64_pft_size)
+	    || (target_hpt_shift < (ppc64_pft_size - 1))) {
+		int rc;
+
+		rc = mmu_hash_ops.resize_hpt(target_hpt_shift);
+		if (rc)
+			printk(KERN_WARNING
+			       "Unable to resize hash page table to target order %d: %d\n",
+			       target_hpt_shift, rc);
+	}
+}
+
 int hash__create_section_mapping(unsigned long start, unsigned long end)
 {
 	int rc = htab_bolt_mapping(start, end, __pa(start),
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 5f844337de21..9ee536ec0739 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -134,6 +134,8 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int rc;
 
+	resize_hpt_for_hotplug(memblock_phys_mem_size());
+
 	pgdata = NODE_DATA(nid);
 
 	start = (unsigned long)__va(start);
@@ -174,6 +176,8 @@ int arch_remove_memory(u64 start, u64 size)
 	 */
 	vm_unmap_aliases();
 
+	resize_hpt_for_hotplug(memblock_phys_mem_size());
+
 	return ret;
 }
 #endif
-- 
cgit v1.2.3


From c233f5979b3dbb39a5b2473b5fcaf58baec8f1bd Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 8 Feb 2017 14:27:29 +0530
Subject: powerpc/bpf: Introduce __PPC_SH64()

Introduce __PPC_SH64() as a 64-bit variant to encode shift field in some
of the shift and rotate instructions operating on double-words. Convert
some of the BPF instruction macros to use the same.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/ppc-opcode.h |  1 +
 arch/powerpc/net/bpf_jit.h            | 11 +++++------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index c4ced1d01d57..d99bd442aacb 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -306,6 +306,7 @@
 #define __PPC_WC(w)	(((w) & 0x3) << 21)
 #define __PPC_WS(w)	(((w) & 0x1f) << 11)
 #define __PPC_SH(s)	__PPC_WS(s)
+#define __PPC_SH64(s)	(__PPC_SH(s) | (((s) & 0x20) >> 4))
 #define __PPC_MB(s)	(((s) & 0x1f) << 6)
 #define __PPC_ME(s)	(((s) & 0x1f) << 1)
 #define __PPC_MB64(s)	(__PPC_MB(s) | ((s) & 0x20))
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 89f70073dec8..30cf03f53428 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -157,8 +157,7 @@
 #define PPC_SRAD(d, a, s)	EMIT(PPC_INST_SRAD | ___PPC_RA(d) |	      \
 				     ___PPC_RS(a) | ___PPC_RB(s))
 #define PPC_SRADI(d, a, i)	EMIT(PPC_INST_SRADI | ___PPC_RA(d) |	      \
-				     ___PPC_RS(a) | __PPC_SH(i) |             \
-				     (((i) & 0x20) >> 4))
+				     ___PPC_RS(a) | __PPC_SH64(i))
 #define PPC_RLWINM(d, a, i, mb, me)	EMIT(PPC_INST_RLWINM | ___PPC_RA(d) | \
 					___PPC_RS(a) | __PPC_SH(i) |	      \
 					__PPC_MB(mb) | __PPC_ME(me))
@@ -166,11 +165,11 @@
 					___PPC_RS(a) | __PPC_SH(i) |	      \
 					__PPC_MB(mb) | __PPC_ME(me))
 #define PPC_RLDICL(d, a, i, mb)		EMIT(PPC_INST_RLDICL | ___PPC_RA(d) | \
-					___PPC_RS(a) | __PPC_SH(i) |	      \
-					__PPC_MB64(mb) | (((i) & 0x20) >> 4))
+					___PPC_RS(a) | __PPC_SH64(i) |	      \
+					__PPC_MB64(mb))
 #define PPC_RLDICR(d, a, i, me)		EMIT(PPC_INST_RLDICR | ___PPC_RA(d) | \
-					___PPC_RS(a) | __PPC_SH(i) |	      \
-					__PPC_ME64(me) | (((i) & 0x20) >> 4))
+					___PPC_RS(a) | __PPC_SH64(i) |	      \
+					__PPC_ME64(me))
 
 /* slwi = rlwinm Rx, Ry, n, 0, 31-n */
 #define PPC_SLWI(d, a, i)	PPC_RLWINM(d, a, i, 0, 31-(i))
-- 
cgit v1.2.3


From ebfa50df435eed18e1389a43e0596246228e7298 Mon Sep 17 00:00:00 2001
From: Anju T <anju@linux.vnet.ibm.com>
Date: Wed, 8 Feb 2017 14:27:30 +0530
Subject: powerpc: Add helper to check if offset is within relative branch
 range

To permit the use of relative branch instruction in powerpc, the target
address has to be relatively nearby, since the address is specified in an
immediate field (24-bit field) in the instruction opcode itself. Here
nearby refers to 32MB on either side of the current instruction.

This patch verifies whether the target address is within +/- 32MB
range or not.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/code-patching.h |  1 +
 arch/powerpc/lib/code-patching.c         | 24 +++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 2015b072422c..75ee4f4ac840 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -22,6 +22,7 @@
 #define BRANCH_SET_LINK	0x1
 #define BRANCH_ABSOLUTE	0x2
 
+bool is_offset_in_branch_range(long offset);
 unsigned int create_branch(const unsigned int *addr,
 			   unsigned long target, int flags);
 unsigned int create_cond_branch(const unsigned int *addr,
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index c1746df0f88e..4ccf16a822cc 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -32,6 +32,28 @@ int patch_branch(unsigned int *addr, unsigned long target, int flags)
 	return patch_instruction(addr, create_branch(addr, target, flags));
 }
 
+bool is_offset_in_branch_range(long offset)
+{
+	/*
+	 * Powerpc branch instruction is :
+	 *
+	 *  0         6                 30   31
+	 *  +---------+----------------+---+---+
+	 *  | opcode  |     LI         |AA |LK |
+	 *  +---------+----------------+---+---+
+	 *  Where AA = 0 and LK = 0
+	 *
+	 * LI is a signed 24 bits integer. The real branch offset is computed
+	 * by: imm32 = SignExtend(LI:'0b00', 32);
+	 *
+	 * So the maximum forward branch should be:
+	 *   (0x007fffff << 2) = 0x01fffffc =  0x1fffffc
+	 * The maximum backward branch should be:
+	 *   (0xff800000 << 2) = 0xfe000000 = -0x2000000
+	 */
+	return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3));
+}
+
 unsigned int create_branch(const unsigned int *addr,
 			   unsigned long target, int flags)
 {
@@ -43,7 +65,7 @@ unsigned int create_branch(const unsigned int *addr,
 		offset = offset - (unsigned long)addr;
 
 	/* Check we can represent the target in the instruction format */
-	if (offset < -0x2000000 || offset > 0x1fffffc || offset & 0x3)
+	if (!is_offset_in_branch_range(offset))
 		return 0;
 
 	/* Mask out the flags and target, so they don't step on each other. */
-- 
cgit v1.2.3


From 30176466e36aadba01e1a630cf42397a3438efa4 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 8 Feb 2017 14:27:31 +0530
Subject: powerpc/kprobes: Fixes for kprobe_lookup_name() on BE

Fix two issues with kprobes.h on BE which were exposed with the
optprobes work:
  - one, having to do with a missing include for linux/module.h for
    MODULE_NAME_LEN -- this didn't show up previously since the only
    users of kprobe_lookup_name were in kprobes.c, which included
    linux/module.h through other headers, and
  - two, with a missing const qualifier for a local variable which ends
    up referring to a string literal. Again, this is unique to how
    kprobe_lookup_name is being invoked in optprobes.c

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kprobes.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 97b8c1f83453..77885d89f548 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -29,6 +29,7 @@
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
+#include <linux/module.h>
 #include <asm/probes.h>
 #include <asm/code-patching.h>
 
@@ -61,7 +62,7 @@ typedef ppc_opcode_t kprobe_opcode_t;
 #define kprobe_lookup_name(name, addr)					\
 {									\
 	char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];		\
-	char *modsym;							\
+	const char *modsym;							\
 	bool dot_appended = false;					\
 	if ((modsym = strchr(name, ':')) != NULL) {			\
 		modsym++;						\
-- 
cgit v1.2.3


From 51c9c0843993528bffc920c54c2121d9e6f8b090 Mon Sep 17 00:00:00 2001
From: Anju T <anju@linux.vnet.ibm.com>
Date: Wed, 8 Feb 2017 15:20:51 +0530
Subject: powerpc/kprobes: Implement Optprobes

Current infrastructure of kprobe uses the unconditional trap instruction
to probe a running kernel. Optprobe allows kprobe to replace the trap
with a branch instruction to a detour buffer. Detour buffer contains
instructions to create an in memory pt_regs. Detour buffer also has a
call to optimized_callback() which in turn calls the pre_handler(). After
the execution of the pre-handler, a call is made for instruction
emulation. The NIP is determined in advance through dummy instruction
emulation and a branch instruction is created to the NIP at the end of
the trampoline.

To address the limitation of branch instruction in POWER architecture,
detour buffer slot is allocated from a reserved area. For the time
being, 64KB is reserved in memory for this purpose.

Instructions which can be emulated using analyse_instr() are the
candidates for optimization. Before optimization ensure that the address
range between the detour buffer allocated and the instruction being
probed is within +/- 32MB.

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig                     |   1 +
 arch/powerpc/include/asm/code-patching.h |   1 +
 arch/powerpc/include/asm/kprobes.h       |  24 ++-
 arch/powerpc/kernel/Makefile             |   1 +
 arch/powerpc/kernel/optprobes.c          | 348 +++++++++++++++++++++++++++++++
 arch/powerpc/kernel/optprobes_head.S     | 135 ++++++++++++
 arch/powerpc/lib/code-patching.c         |  21 ++
 7 files changed, 530 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/optprobes.c
 create mode 100644 arch/powerpc/kernel/optprobes_head.S

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index bfdd80e7754c..f26c2253fdf2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -100,6 +100,7 @@ config PPC
 	select HAVE_IOREMAP_PROT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU)
 	select HAVE_KPROBES
+	select HAVE_OPTPROBES if PPC64
 	select HAVE_ARCH_KGDB
 	select HAVE_KRETPROBES
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 75ee4f4ac840..8ab937771068 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -35,6 +35,7 @@ int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr);
 unsigned long branch_target(const unsigned int *instr);
 unsigned int translate_branch(const unsigned int *dest,
 			      const unsigned int *src);
+extern bool is_conditional_branch(unsigned int instr);
 #ifdef CONFIG_PPC_BOOK3E_64
 void __patch_exception(int exc, unsigned long addr);
 #define patch_exception(exc, name) do { \
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 77885d89f548..d821835ade86 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -40,7 +40,23 @@ struct pt_regs;
 struct kprobe;
 
 typedef ppc_opcode_t kprobe_opcode_t;
-#define MAX_INSN_SIZE 1
+
+extern kprobe_opcode_t optinsn_slot;
+
+/* Optinsn template address */
+extern kprobe_opcode_t optprobe_template_entry[];
+extern kprobe_opcode_t optprobe_template_op_address[];
+extern kprobe_opcode_t optprobe_template_call_handler[];
+extern kprobe_opcode_t optprobe_template_insn[];
+extern kprobe_opcode_t optprobe_template_call_emulate[];
+extern kprobe_opcode_t optprobe_template_ret[];
+extern kprobe_opcode_t optprobe_template_end[];
+
+/* Fixed instruction size for powerpc */
+#define MAX_INSN_SIZE		1
+#define MAX_OPTIMIZED_LENGTH	sizeof(kprobe_opcode_t)	/* 4 bytes */
+#define MAX_OPTINSN_SIZE	(optprobe_template_end - optprobe_template_entry)
+#define RELATIVEJUMP_SIZE	sizeof(kprobe_opcode_t)	/* 4 bytes */
 
 #ifdef PPC64_ELF_ABI_v2
 /* PPC64 ABIv2 needs local entry point */
@@ -126,6 +142,12 @@ struct kprobe_ctlblk {
 	struct prev_kprobe prev_kprobe;
 };
 
+struct arch_optimized_insn {
+	kprobe_opcode_t copied_insn[1];
+	/* detour buffer */
+	kprobe_opcode_t *insn;
+};
+
 extern int kprobe_exceptions_notify(struct notifier_block *self,
 					unsigned long val, void *data);
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index f4898e6ad18d..a048b37b9b27 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_OPTPROBES)		+= optprobes.o optprobes_head.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
 obj-$(CONFIG_PPC_UDBG_16550)	+= legacy_serial.o udbg_16550.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
new file mode 100644
index 000000000000..17f4c94142d3
--- /dev/null
+++ b/arch/powerpc/kernel/optprobes.c
@@ -0,0 +1,348 @@
+/*
+ * Code for Kernel probes Jump optimization.
+ *
+ * Copyright 2017, Anju T, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kprobes.h>
+#include <linux/jump_label.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <asm/kprobes.h>
+#include <asm/ptrace.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+#include <asm/sstep.h>
+#include <asm/ppc-opcode.h>
+
+#define TMPL_CALL_HDLR_IDX	\
+	(optprobe_template_call_handler - optprobe_template_entry)
+#define TMPL_EMULATE_IDX	\
+	(optprobe_template_call_emulate - optprobe_template_entry)
+#define TMPL_RET_IDX		\
+	(optprobe_template_ret - optprobe_template_entry)
+#define TMPL_OP_IDX		\
+	(optprobe_template_op_address - optprobe_template_entry)
+#define TMPL_INSN_IDX		\
+	(optprobe_template_insn - optprobe_template_entry)
+#define TMPL_END_IDX		\
+	(optprobe_template_end - optprobe_template_entry)
+
+DEFINE_INSN_CACHE_OPS(ppc_optinsn);
+
+static bool insn_page_in_use;
+
+static void *__ppc_alloc_insn_page(void)
+{
+	if (insn_page_in_use)
+		return NULL;
+	insn_page_in_use = true;
+	return &optinsn_slot;
+}
+
+static void __ppc_free_insn_page(void *page __maybe_unused)
+{
+	insn_page_in_use = false;
+}
+
+struct kprobe_insn_cache kprobe_ppc_optinsn_slots = {
+	.mutex = __MUTEX_INITIALIZER(kprobe_ppc_optinsn_slots.mutex),
+	.pages = LIST_HEAD_INIT(kprobe_ppc_optinsn_slots.pages),
+	/* insn_size initialized later */
+	.alloc = __ppc_alloc_insn_page,
+	.free = __ppc_free_insn_page,
+	.nr_garbage = 0,
+};
+
+/*
+ * Check if we can optimize this probe. Returns NIP post-emulation if this can
+ * be optimized and 0 otherwise.
+ */
+static unsigned long can_optimize(struct kprobe *p)
+{
+	struct pt_regs regs;
+	struct instruction_op op;
+	unsigned long nip = 0;
+
+	/*
+	 * kprobe placed for kretprobe during boot time
+	 * is not optimizing now.
+	 *
+	 * TODO: Optimize kprobe in kretprobe_trampoline
+	 */
+	if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
+		return 0;
+
+	/*
+	 * We only support optimizing kernel addresses, but not
+	 * module addresses.
+	 *
+	 * FIXME: Optimize kprobes placed in module addresses.
+	 */
+	if (!is_kernel_addr((unsigned long)p->addr))
+		return 0;
+
+	memset(&regs, 0, sizeof(struct pt_regs));
+	regs.nip = (unsigned long)p->addr;
+	regs.trap = 0x0;
+	regs.msr = MSR_KERNEL;
+
+	/*
+	 * Kprobe placed in conditional branch instructions are
+	 * not optimized, as we can't predict the nip prior with
+	 * dummy pt_regs and can not ensure that the return branch
+	 * from detour buffer falls in the range of address (i.e 32MB).
+	 * A branch back from trampoline is set up in the detour buffer
+	 * to the nip returned by the analyse_instr() here.
+	 *
+	 * Ensure that the instruction is not a conditional branch,
+	 * and that can be emulated.
+	 */
+	if (!is_conditional_branch(*p->ainsn.insn) &&
+			analyse_instr(&op, &regs, *p->ainsn.insn))
+		nip = regs.nip;
+
+	return nip;
+}
+
+static void optimized_callback(struct optimized_kprobe *op,
+			       struct pt_regs *regs)
+{
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	unsigned long flags;
+
+	/* This is possible if op is under delayed unoptimizing */
+	if (kprobe_disabled(&op->kp))
+		return;
+
+	local_irq_save(flags);
+	hard_irq_disable();
+
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(&op->kp);
+	} else {
+		__this_cpu_write(current_kprobe, &op->kp);
+		regs->nip = (unsigned long)op->kp.addr;
+		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		opt_pre_handler(&op->kp, regs);
+		__this_cpu_write(current_kprobe, NULL);
+	}
+
+	/*
+	 * No need for an explicit __hard_irq_enable() here.
+	 * local_irq_restore() will re-enable interrupts,
+	 * if they were hard disabled.
+	 */
+	local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(optimized_callback);
+
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+	if (op->optinsn.insn) {
+		free_ppc_optinsn_slot(op->optinsn.insn, 1);
+		op->optinsn.insn = NULL;
+	}
+}
+
+/*
+ * emulate_step() requires insn to be emulated as
+ * second parameter. Load register 'r4' with the
+ * instruction.
+ */
+void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr)
+{
+	/* addis r4,0,(insn)@h */
+	*addr++ = PPC_INST_ADDIS | ___PPC_RT(4) |
+		  ((val >> 16) & 0xffff);
+
+	/* ori r4,r4,(insn)@l */
+	*addr = PPC_INST_ORI | ___PPC_RA(4) | ___PPC_RS(4) |
+		(val & 0xffff);
+}
+
+/*
+ * Generate instructions to load provided immediate 64-bit value
+ * to register 'r3' and patch these instructions at 'addr'.
+ */
+void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr)
+{
+	/* lis r3,(op)@highest */
+	*addr++ = PPC_INST_ADDIS | ___PPC_RT(3) |
+		  ((val >> 48) & 0xffff);
+
+	/* ori r3,r3,(op)@higher */
+	*addr++ = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) |
+		  ((val >> 32) & 0xffff);
+
+	/* rldicr r3,r3,32,31 */
+	*addr++ = PPC_INST_RLDICR | ___PPC_RA(3) | ___PPC_RS(3) |
+		  __PPC_SH64(32) | __PPC_ME64(31);
+
+	/* oris r3,r3,(op)@h */
+	*addr++ = PPC_INST_ORIS | ___PPC_RA(3) | ___PPC_RS(3) |
+		  ((val >> 16) & 0xffff);
+
+	/* ori r3,r3,(op)@l */
+	*addr = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) |
+		(val & 0xffff);
+}
+
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
+{
+	kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step;
+	kprobe_opcode_t *op_callback_addr, *emulate_step_addr;
+	long b_offset;
+	unsigned long nip;
+
+	kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+
+	nip = can_optimize(p);
+	if (!nip)
+		return -EILSEQ;
+
+	/* Allocate instruction slot for detour buffer */
+	buff = get_ppc_optinsn_slot();
+	if (!buff)
+		return -ENOMEM;
+
+	/*
+	 * OPTPROBE uses 'b' instruction to branch to optinsn.insn.
+	 *
+	 * The target address has to be relatively nearby, to permit use
+	 * of branch instruction in powerpc, because the address is specified
+	 * in an immediate field in the instruction opcode itself, ie 24 bits
+	 * in the opcode specify the address. Therefore the address should
+	 * be within 32MB on either side of the current instruction.
+	 */
+	b_offset = (unsigned long)buff - (unsigned long)p->addr;
+	if (!is_offset_in_branch_range(b_offset))
+		goto error;
+
+	/* Check if the return address is also within 32MB range */
+	b_offset = (unsigned long)(buff + TMPL_RET_IDX) -
+			(unsigned long)nip;
+	if (!is_offset_in_branch_range(b_offset))
+		goto error;
+
+	/* Setup template */
+	memcpy(buff, optprobe_template_entry,
+			TMPL_END_IDX * sizeof(kprobe_opcode_t));
+
+	/*
+	 * Fixup the template with instructions to:
+	 * 1. load the address of the actual probepoint
+	 */
+	patch_imm64_load_insns((unsigned long)op, buff + TMPL_OP_IDX);
+
+	/*
+	 * 2. branch to optimized_callback() and emulate_step()
+	 */
+	kprobe_lookup_name("optimized_callback", op_callback_addr);
+	kprobe_lookup_name("emulate_step", emulate_step_addr);
+	if (!op_callback_addr || !emulate_step_addr) {
+		WARN(1, "kprobe_lookup_name() failed\n");
+		goto error;
+	}
+
+	branch_op_callback = create_branch((unsigned int *)buff + TMPL_CALL_HDLR_IDX,
+				(unsigned long)op_callback_addr,
+				BRANCH_SET_LINK);
+
+	branch_emulate_step = create_branch((unsigned int *)buff + TMPL_EMULATE_IDX,
+				(unsigned long)emulate_step_addr,
+				BRANCH_SET_LINK);
+
+	if (!branch_op_callback || !branch_emulate_step)
+		goto error;
+
+	buff[TMPL_CALL_HDLR_IDX] = branch_op_callback;
+	buff[TMPL_EMULATE_IDX] = branch_emulate_step;
+
+	/*
+	 * 3. load instruction to be emulated into relevant register, and
+	 */
+	patch_imm32_load_insns(*p->ainsn.insn, buff + TMPL_INSN_IDX);
+
+	/*
+	 * 4. branch back from trampoline
+	 */
+	buff[TMPL_RET_IDX] = create_branch((unsigned int *)buff + TMPL_RET_IDX,
+				(unsigned long)nip, 0);
+
+	flush_icache_range((unsigned long)buff,
+			   (unsigned long)(&buff[TMPL_END_IDX]));
+
+	op->optinsn.insn = buff;
+
+	return 0;
+
+error:
+	free_ppc_optinsn_slot(buff, 0);
+	return -ERANGE;
+
+}
+
+int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+	return optinsn->insn != NULL;
+}
+
+/*
+ * On powerpc, Optprobes always replaces one instruction (4 bytes
+ * aligned and 4 bytes long). It is impossible to encounter another
+ * kprobe in this address range. So always return 0.
+ */
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+	return 0;
+}
+
+void arch_optimize_kprobes(struct list_head *oplist)
+{
+	struct optimized_kprobe *op;
+	struct optimized_kprobe *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		/*
+		 * Backup instructions which will be replaced
+		 * by jump address
+		 */
+		memcpy(op->optinsn.copied_insn, op->kp.addr,
+					       RELATIVEJUMP_SIZE);
+		patch_instruction(op->kp.addr,
+			create_branch((unsigned int *)op->kp.addr,
+				      (unsigned long)op->optinsn.insn, 0));
+		list_del_init(&op->list);
+	}
+}
+
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+	arch_arm_kprobe(&op->kp);
+}
+
+void arch_unoptimize_kprobes(struct list_head *oplist,
+			     struct list_head *done_list)
+{
+	struct optimized_kprobe *op;
+	struct optimized_kprobe *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		arch_unoptimize_kprobe(op);
+		list_move(&op->list, done_list);
+	}
+}
+
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+				 unsigned long addr)
+{
+	return ((unsigned long)op->kp.addr <= addr &&
+		(unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr);
+}
diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S
new file mode 100644
index 000000000000..53e429b5a29d
--- /dev/null
+++ b/arch/powerpc/kernel/optprobes_head.S
@@ -0,0 +1,135 @@
+/*
+ * Code to prepare detour buffer for optprobes in Kernel.
+ *
+ * Copyright 2017, Anju T, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/ptrace.h>
+#include <asm/asm-offsets.h>
+
+#define	OPT_SLOT_SIZE	65536
+
+	.balign	4
+
+	/*
+	 * Reserve an area to allocate slots for detour buffer.
+	 * This is part of .text section (rather than vmalloc area)
+	 * as this needs to be within 32MB of the probed address.
+	 */
+	.global optinsn_slot
+optinsn_slot:
+	.space	OPT_SLOT_SIZE
+
+	/*
+	 * Optprobe template:
+	 * This template gets copied into one of the slots in optinsn_slot
+	 * and gets fixed up with real optprobe structures et al.
+	 */
+	.global optprobe_template_entry
+optprobe_template_entry:
+	/* Create an in-memory pt_regs */
+	stdu	r1,-INT_FRAME_SIZE(r1)
+	SAVE_GPR(0,r1)
+	/* Save the previous SP into stack */
+	addi	r0,r1,INT_FRAME_SIZE
+	std	r0,GPR1(r1)
+	SAVE_10GPRS(2,r1)
+	SAVE_10GPRS(12,r1)
+	SAVE_10GPRS(22,r1)
+	/* Save SPRS */
+	mfmsr	r5
+	std	r5,_MSR(r1)
+	li	r5,0x700
+	std	r5,_TRAP(r1)
+	li	r5,0
+	std	r5,ORIG_GPR3(r1)
+	std	r5,RESULT(r1)
+	mfctr	r5
+	std	r5,_CTR(r1)
+	mflr	r5
+	std	r5,_LINK(r1)
+	mfspr	r5,SPRN_XER
+	std	r5,_XER(r1)
+	mfcr	r5
+	std	r5,_CCR(r1)
+	lbz     r5,PACASOFTIRQEN(r13)
+	std     r5,SOFTE(r1)
+	mfdar	r5
+	std	r5,_DAR(r1)
+	mfdsisr	r5
+	std	r5,_DSISR(r1)
+
+	.global optprobe_template_op_address
+optprobe_template_op_address:
+	/*
+	 * Parameters to optimized_callback():
+	 * 1. optimized_kprobe structure in r3
+	 */
+	nop
+	nop
+	nop
+	nop
+	nop
+	/* 2. pt_regs pointer in r4 */
+	addi	r4,r1,STACK_FRAME_OVERHEAD
+
+	.global optprobe_template_call_handler
+optprobe_template_call_handler:
+	/* Branch to optimized_callback() */
+	nop
+
+	/*
+	 * Parameters for instruction emulation:
+	 * 1. Pass SP in register r3.
+	 */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+
+	.global optprobe_template_insn
+optprobe_template_insn:
+	/* 2, Pass instruction to be emulated in r4 */
+	nop
+	nop
+
+	.global optprobe_template_call_emulate
+optprobe_template_call_emulate:
+	/* Branch to emulate_step()  */
+	nop
+
+	/*
+	 * All done.
+	 * Now, restore the registers...
+	 */
+	ld	r5,_MSR(r1)
+	mtmsr	r5
+	ld	r5,_CTR(r1)
+	mtctr	r5
+	ld	r5,_LINK(r1)
+	mtlr	r5
+	ld	r5,_XER(r1)
+	mtxer	r5
+	ld	r5,_CCR(r1)
+	mtcr	r5
+	ld	r5,_DAR(r1)
+	mtdar	r5
+	ld	r5,_DSISR(r1)
+	mtdsisr	r5
+	REST_GPR(0,r1)
+	REST_10GPRS(2,r1)
+	REST_10GPRS(12,r1)
+	REST_10GPRS(22,r1)
+	/* Restore the previous SP */
+	addi	r1,r1,INT_FRAME_SIZE
+
+	.global optprobe_template_ret
+optprobe_template_ret:
+	/* ... and jump back from trampoline */
+	nop
+
+	.global optprobe_template_end
+optprobe_template_end:
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 4ccf16a822cc..0899315e1434 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -54,6 +54,27 @@ bool is_offset_in_branch_range(long offset)
 	return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3));
 }
 
+/*
+ * Helper to check if a given instruction is a conditional branch
+ * Derived from the conditional checks in analyse_instr()
+ */
+bool __kprobes is_conditional_branch(unsigned int instr)
+{
+	unsigned int opcode = instr >> 26;
+
+	if (opcode == 16)       /* bc, bca, bcl, bcla */
+		return true;
+	if (opcode == 19) {
+		switch ((instr >> 1) & 0x3ff) {
+		case 16:        /* bclr, bclrl */
+		case 528:       /* bcctr, bcctrl */
+		case 560:       /* bctar, bctarl */
+			return true;
+		}
+	}
+	return false;
+}
+
 unsigned int create_branch(const unsigned int *addr,
 			   unsigned long target, int flags)
 {
-- 
cgit v1.2.3


From 762df10bad6954b353ee649c387a8ffacf6dc347 Mon Sep 17 00:00:00 2001
From: Anju T <anju@linux.vnet.ibm.com>
Date: Wed, 8 Feb 2017 15:20:52 +0530
Subject: powerpc/kprobes: Optimize kprobe in kretprobe_trampoline()

Kprobe placed on the kretprobe_trampoline() during boot time can be
optimized, since the instruction at probe point is a 'nop'.

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c   | 8 ++++++++
 arch/powerpc/kernel/optprobes.c | 7 +++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 735ff3d3f77d..45e4f82b230d 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -285,6 +285,7 @@ asm(".global kretprobe_trampoline\n"
 	".type kretprobe_trampoline, @function\n"
 	"kretprobe_trampoline:\n"
 	"nop\n"
+	"blr\n"
 	".size kretprobe_trampoline, .-kretprobe_trampoline\n");
 
 /*
@@ -337,6 +338,13 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 
 	kretprobe_assert(ri, orig_ret_address, trampoline_address);
 	regs->nip = orig_ret_address;
+	/*
+	 * Make LR point to the orig_ret_address.
+	 * When the 'nop' inside the kretprobe_trampoline
+	 * is optimized, we can do a 'blr' after executing the
+	 * detour buffer code.
+	 */
+	regs->link = orig_ret_address;
 
 	reset_current_kprobe();
 	kretprobe_hash_unlock(current, &flags);
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 17f4c94142d3..2282bf4e63cd 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -72,12 +72,11 @@ static unsigned long can_optimize(struct kprobe *p)
 
 	/*
 	 * kprobe placed for kretprobe during boot time
-	 * is not optimizing now.
-	 *
-	 * TODO: Optimize kprobe in kretprobe_trampoline
+	 * has a 'nop' instruction, which can be emulated.
+	 * So further checks can be skipped.
 	 */
 	if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
-		return 0;
+		return (unsigned long)p->addr + sizeof(kprobe_opcode_t);
 
 	/*
 	 * We only support optimizing kernel addresses, but not
-- 
cgit v1.2.3


From be9ba9ff93cc3e44dc46da9ed25655780069411a Mon Sep 17 00:00:00 2001
From: Shailendra Singh <shailendras@nvidia.com>
Date: Wed, 1 Feb 2017 14:52:42 -0800
Subject: powerpc: Drop GPL from of_node_to_nid() export to match other arches

The generic implementation of of_node_to_nid() is EXPORT_SYMBOL, added
in commit 298535c00a2c ("of, numa: Add NUMA of binding
implementation.").

The powerpc implementation added in commit 953039c8df7b ("[PATCH]
powerpc: Allow devices to register with numa topology") is
EXPORT_SYMBOL_GPL.

This creates an inconsistency for of_node_to_nid() callers across
architectures.

Update the powerpc implementation to be exported consistently with the
generic implementation.

Signed-off-by: Shailendra Singh <shailendras@nvidia.com>
Reviewed-by: Andy Ritger <aritger@nvidia.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/numa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 16267ff8c86c..9befaee237d6 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -290,7 +290,7 @@ int of_node_to_nid(struct device_node *device)
 
 	return nid;
 }
-EXPORT_SYMBOL_GPL(of_node_to_nid);
+EXPORT_SYMBOL(of_node_to_nid);
 
 static int __init find_min_common_depth(void)
 {
-- 
cgit v1.2.3


From 99ad503287daf78e19e64e0e51f1d60a2a592217 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 9 Feb 2017 23:55:43 +1100
Subject: powerpc: Add a prototype for mcount() so it can be versioned

Currently we get a warning that _mcount() can't be versioned:

  WARNING: EXPORT symbol "_mcount" [vmlinux] version generation failed, symbol will not be versioned.

Add a prototype to asm-prototypes.h to fix it.

The prototype is not really correct, mcount() is not a normal function,
it has a special ABI. But for the purpose of versioning it doesn't
matter.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/asm-prototypes.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index ba47c70712f9..f6c5264287e5 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -120,4 +120,6 @@ extern s64 __ashrdi3(s64, int);
 extern int __cmpdi2(s64, s64);
 extern int __ucmpdi2(u64, u64);
 
+void _mcount(void);
+
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
-- 
cgit v1.2.3


From a7e0fb6c2029a780444d09560f739e020d54fe4d Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 7 Feb 2017 21:01:01 +1100
Subject: powerpc/powernv: Fix opal_exit tracepoint opcode

Currently the opal_exit tracepoint usually shows the opcode as 0:

  <idle>-0     [047] d.h.   635.654292: opal_entry: opcode=63
  <idle>-0     [047] d.h.   635.654296: opal_exit: opcode=0 retval=0
  kopald-1209  [019] d...   636.420943: opal_entry: opcode=10
  kopald-1209  [019] d...   636.420959: opal_exit: opcode=0 retval=0

This is because we incorrectly load the opcode into r0 before calling
__trace_opal_exit(), whereas it expects the opcode in r3 (first function
parameter). In fact we are leaving the retval in r3, so opcode and
retval will always show the same value.

Instead load the opcode into r3, resulting in:

  <idle>-0     [040] d.h.   636.618625: opal_entry: opcode=63
  <idle>-0     [040] d.h.   636.618627: opal_exit: opcode=63 retval=0

Fixes: c49f63530bb6 ("powernv: Add OPAL tracepoints")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-wrappers.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index f7c19c9c57ed..63fe1b2b1175 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -146,7 +146,7 @@ opal_tracepoint_entry:
 opal_tracepoint_return:
 	std	r3,STK_REG(R31)(r1)
 	mr	r4,r3
-	ld	r0,STK_REG(R23)(r1)
+	ld	r3,STK_REG(R23)(r1)
 	bl	__trace_opal_exit
 	ld	r3,STK_REG(R31)(r1)
 	addi	r1,r1,STACKFRAMESIZE
-- 
cgit v1.2.3


From 496e9cb5b2aa2ba303d2bbd08518f9be2219ab4b Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Fri, 10 Feb 2017 12:16:59 +1100
Subject: powerpc/ftrace: Fix confusing help text for DISABLE_MPROFILE_KERNEL

The final paragraph of the help text is reversed. We want to enable
this option by default, and disable it if the toolchain has a working
-mprofile-kernel.

Fixes: 8c50b72a3b4f ("powerpc/ftrace: Add Kconfig & Make glue for mprofile-kernel")
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f26c2253fdf2..260dd6a371e0 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -394,8 +394,8 @@ config DISABLE_MPROFILE_KERNEL
 	  be disabled also.
 
 	  If you have a toolchain which supports mprofile-kernel, then you can
-	  enable this. Otherwise leave it disabled. If you're not sure, say
-	  "N".
+	  disable this. Otherwise leave it enabled. If you're not sure, say
+	  "Y".
 
 config MPROFILE_KERNEL
 	depends on PPC64 && CPU_LITTLE_ENDIAN
-- 
cgit v1.2.3


From fc62d0207ae0ebc9d19df68394c0dc925b4a92d1 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 8 Feb 2017 01:24:14 +0530
Subject: kprobes: Introduce weak variant of kprobe_exceptions_notify()

kprobe_exceptions_notify() is not used on some of the architectures such
as arm[64] and powerpc anymore. Introduce a weak variant for such
architectures.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 kernel/kprobes.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 43460104f119..60a702a05684 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1705,6 +1705,12 @@ void unregister_kprobes(struct kprobe **kps, int num)
 }
 EXPORT_SYMBOL_GPL(unregister_kprobes);
 
+int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+					      unsigned long val, void *data)
+{
+	return NOTIFY_DONE;
+}
+
 static struct notifier_block kprobe_exceptions_nb = {
 	.notifier_call = kprobe_exceptions_notify,
 	.priority = 0x7fffffff /* we need to be notified first */
-- 
cgit v1.2.3


From 0ddde5004d26c483c9e67005b2be5b4d389e8db2 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 8 Feb 2017 01:24:16 +0530
Subject: powerpc/kprobes: Remove kprobe_exceptions_notify()

... as the generic weak variant will do.

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 45e4f82b230d..fce05a38851c 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -475,15 +475,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 	return 0;
 }
 
-/*
- * Wrapper routine to for handling exceptions.
- */
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-				       unsigned long val, void *data)
-{
-	return NOTIFY_DONE;
-}
-
 unsigned long arch_deref_entry_point(void *entry)
 {
 	return ppc_global_function_entry(entry);
-- 
cgit v1.2.3


From b0b5a76579ea62a9eeb720e71fdaa9d37e80f8fd Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 12 Jan 2017 15:09:21 +0000
Subject: powerpc/pseries: Fix typo in parameter description

Fix typo in "hotplug_delay" parameter description. This allows modinfo
to match the help text to the parameter.

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/cmm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 4839db385bb0..4ac419c7eb4c 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -76,7 +76,7 @@ module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
 		 "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
 module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
+MODULE_PARM_DESC(hotplug_delay, "Delay (in seconds) after memory hotplug remove "
 		 "before loaning resumes. "
 		 "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
 module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
-- 
cgit v1.2.3


From aad71e3928bec48639a70facd21814ea28d27dcb Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 14 Feb 2017 13:11:38 +1100
Subject: powerpc/mm: Fix build break with RADIX=y & HUGETLBFS=n

If we enable RADIX but disable HUGETLBFS, the build breaks with:

  arch/powerpc/mm/pgtable-radix.c:557:7: error: implicit declaration of function 'pmd_huge'
  arch/powerpc/mm/pgtable-radix.c:588:7: error: implicit declaration of function 'pud_huge'

Fix it by stubbing those functions when HUGETLBFS=n.

Fixes: 4b5d62ca17a1 ("powerpc/mm: add radix__remove_section_mapping()")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/pgtable-4k.h  | 5 +++++
 arch/powerpc/include/asm/book3s/64/pgtable-64k.h | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
index 9db83b4e017d..8708a0239a56 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
@@ -47,7 +47,12 @@ static inline int hugepd_ok(hugepd_t hpd)
 	return hash__hugepd_ok(hpd);
 }
 #define is_hugepd(hpd)		(hugepd_ok(hpd))
+
+#else /* !CONFIG_HUGETLB_PAGE */
+static inline int pmd_huge(pmd_t pmd) { return 0; }
+static inline int pud_huge(pud_t pud) { return 0; }
 #endif /* CONFIG_HUGETLB_PAGE */
+
 #endif /* __ASSEMBLY__ */
 
 #endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
index 198aff33c380..2ce4209399ed 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -46,6 +46,9 @@ static inline int hugepd_ok(hugepd_t hpd)
 }
 #define is_hugepd(pdep)			0
 
+#else /* !CONFIG_HUGETLB_PAGE */
+static inline int pmd_huge(pmd_t pmd) { return 0; }
+static inline int pud_huge(pud_t pud) { return 0; }
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
-- 
cgit v1.2.3


From a05ef161cdd22faccffe06f21fc8f1e249565385 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 14 Feb 2017 13:44:05 +1100
Subject: powerpc/mm: Fix build break when CMA=n && SPAPR_TCE_IOMMU=y
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the build breaks if CMA=n and SPAPR_TCE_IOMMU=y:

  arch/powerpc/mm/mmu_context_iommu.c: In function ‘mm_iommu_get’:
  arch/powerpc/mm/mmu_context_iommu.c:193:42: error: ‘MIGRATE_CMA’ undeclared (first use in this function)
  if (get_pageblock_migratetype(page) == MIGRATE_CMA) {
  ^~~~~~~~~~~

Fix it by using the existing is_migrate_cma_page(), which evaluates to
false when CMA=n.

Fixes: 2e5bbb5461f1 ("KVM: PPC: Book3S HV: Migrate pinned pages out of CMA")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/mmu_context_iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 104bad029ce9..7de7124ac91b 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -184,7 +184,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 		 * of the CMA zone if possible. NOTE: faulting in + migration
 		 * can be expensive. Batching can be considered later
 		 */
-		if (get_pageblock_migratetype(page) == MIGRATE_CMA) {
+		if (is_migrate_cma_page(page)) {
 			if (mm_iommu_move_page_from_cma(page))
 				goto populate;
 			if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
-- 
cgit v1.2.3


From 36b390fd624a7f3a64bda5de54fa58ebcf956ffc Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 15 Feb 2017 19:54:55 +1100
Subject: powerpc/mm: Fix build break with BOOK3S_64=n and MEMORY_HOTPLUG=y

The recently merged HPT (Hash Page Table) resize support broke the build
when BOOK3S_64=n (ie. 32-bit or 64-bit Book3E) and MEMORY_HOTPLUG=y:

  arch/powerpc/mm/mem.o: In function `.arch_add_memory':
  (.text+0x4e4): undefined reference to `.resize_hpt_for_hotplug'

Fix it by adding a dummy version.

Fixes: 438cc81a41e8 ("powerpc/pseries: Automatically resize HPT for memory hot add/remove")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/sparsemem.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
index 737335c891e4..c88930c9db7f 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -16,9 +16,15 @@
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-extern void resize_hpt_for_hotplug(unsigned long new_mem_size);
 extern int create_section_mapping(unsigned long start, unsigned long end);
 extern int remove_section_mapping(unsigned long start, unsigned long end);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+extern void resize_hpt_for_hotplug(unsigned long new_mem_size);
+#else
+static inline void resize_hpt_for_hotplug(unsigned long new_mem_size) { }
+#endif
+
 #ifdef CONFIG_NUMA
 extern int hot_add_scn_to_nid(unsigned long scn_addr);
 #else
-- 
cgit v1.2.3


From c21a493a2b44650707d06741601894329486f2ad Mon Sep 17 00:00:00 2001
From: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Date: Tue, 22 Nov 2016 14:55:59 +0530
Subject: powerpc/xmon: Fix data-breakpoint

Currently xmon data-breakpoint feature is broken.

Whenever a watchpoint match occurs, hw_breakpoint_handler will
be called by do_break via the notifier chain mechanism. If the watchpoint
is registered by xmon, hw_breakpoint_handler won't find any associated
perf_event and returns immediately with NOTIFY_STOP. Similarly, do_break
also returns without notifying xmon.

Solve this by returning NOTIFY_DONE when hw_breakpoint_handler does not
find any perf_event associated with matched watchpoint, rather than
NOTIFY_STOP, which tells the core code to continue calling the other
breakpoint handlers including the xmon one.

Cc: stable@vger.kernel.org
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/hw_breakpoint.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 4d3aa05e28be..53cc9270aac8 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -228,8 +228,10 @@ int hw_breakpoint_handler(struct die_args *args)
 	rcu_read_lock();
 
 	bp = __this_cpu_read(bp_per_reg);
-	if (!bp)
+	if (!bp) {
+		rc = NOTIFY_DONE;
 		goto out;
+	}
 	info = counter_arch_bp(bp);
 
 	/*
-- 
cgit v1.2.3


From 18061c17c8ecdbdbf1e7d1695ec44e7388b4f601 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 30 Jan 2017 21:42:59 +0530
Subject: powerpc/mm: Update PROTFAULT handling in the page fault path

With radix, we can get a page fault with DSISR_PROTFAULT set in case of a
PROT_NONE or autonuma mapping. The PROT_NONE case is handled by the vma check
where we consider the access bad. For autonuma we should fall through and fixup
the access mask correctly.

Without this patch we trigger the WARN_ON() on radix. This code moves that
WARN_ON() within a radix_enabled() check. I also moved the WARN_ON() outside
the if condition, making it apply to all types of faults (exec/write/read). It
is also conditionalized for book3s, because BOOK3E can also get a PROTFAULT to
handle the D/I cache sync.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/copro_fault.c | 10 ++++++----
 arch/powerpc/mm/fault.c       | 43 +++++++++++++++++++++++++++++++++----------
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index aaa7ec6788b9..697b70ad1195 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -67,11 +67,13 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto out_unlock;
 		/*
-		 * protfault should only happen due to us
-		 * mapping a region readonly temporarily. PROT_NONE
-		 * is also covered by the VMA check above.
+		 * PROT_NONE is covered by the VMA check above.
+		 * and hash should get a NOHPTE fault instead of
+		 * a PROTFAULT in case fixup is needed for things
+		 * like autonuma.
 		 */
-		WARN_ON_ONCE(dsisr & DSISR_PROTFAULT);
+		if (!radix_enabled())
+			WARN_ON_ONCE(dsisr & DSISR_PROTFAULT);
 	}
 
 	ret = 0;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 6fd30ac7d14a..c636137666c1 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -418,15 +418,6 @@ good_area:
 		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
 		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
 			goto bad_area;
-
-#ifdef CONFIG_PPC_STD_MMU
-		/*
-		 * protfault should only happen due to us
-		 * mapping a region readonly temporarily. PROT_NONE
-		 * is also covered by the VMA check above.
-		 */
-		WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
-#endif /* CONFIG_PPC_STD_MMU */
 	/* a write */
 	} else if (is_write) {
 		if (!(vma->vm_flags & VM_WRITE))
@@ -436,8 +427,40 @@ good_area:
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 			goto bad_area;
-		WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
 	}
+#ifdef CONFIG_PPC_STD_MMU
+	/*
+	 * For hash translation mode, we should never get a
+	 * PROTFAULT. Any update to pte to reduce access will result in us
+	 * removing the hash page table entry, thus resulting in a DSISR_NOHPTE
+	 * fault instead of DSISR_PROTFAULT.
+	 *
+	 * A pte update to relax the access will not result in a hash page table
+	 * entry invalidate and hence can result in DSISR_PROTFAULT.
+	 * ptep_set_access_flags() doesn't do a hpte flush. This is why we have
+	 * the special !is_write in the below conditional.
+	 *
+	 * For platforms that doesn't supports coherent icache and do support
+	 * per page noexec bit, we do setup things such that we do the
+	 * sync between D/I cache via fault. But that is handled via low level
+	 * hash fault code (hash_page_do_lazy_icache()) and we should not reach
+	 * here in such case.
+	 *
+	 * For wrong access that can result in PROTFAULT, the above vma->vm_flags
+	 * check should handle those and hence we should fall to the bad_area
+	 * handling correctly.
+	 *
+	 * For embedded with per page exec support that doesn't support coherent
+	 * icache we do get PROTFAULT and we handle that D/I cache sync in
+	 * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON
+	 * is conditional for server MMU.
+	 *
+	 * For radix, we can get prot fault for autonuma case, because radix
+	 * page table will have them marked noaccess for user.
+	 */
+	if (!radix_enabled() && !is_write)
+		WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
+#endif /* CONFIG_PPC_STD_MMU */
 
 	/*
 	 * If for any reason at all we couldn't handle the fault,
-- 
cgit v1.2.3


From ca94573b9c69d224e50e1084a2776772f4ea030d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 9 Feb 2017 08:28:19 +0530
Subject: powerpc/mm/radix: Update pte update sequence for pte clear case

In the kernel we do follow the below sequence in different code paths.
pte = ptep_get_clear(ptep)
....
set_pte_at(ptep, pte)

We do that for mremap, autonuma protection update and softdirty clearing. This
implies our optimization to skip a tlb flush when a pte update clears the pte
is not valid, because on a DD1 system the followup set_pte_at will be done
without doing the required tlbflush. Fix that by always doing the dd1 style pte
update irrespective of the new_pte value. In a later patch we will optimize the
application exit case.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Tested-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/radix.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 0032b662284c..98c9ab37593c 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -144,16 +144,10 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm,
 		 * new value of pte
 		 */
 		new_pte = (old_pte | set) & ~clr;
-		/*
-		 * If we are trying to clear the pte, we can skip
-		 * the below sequence and batch the tlb flush. The
-		 * tlb flush batching is done by mmu gather code
-		 */
-		if (new_pte) {
-			asm volatile("ptesync" : : : "memory");
-			radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr);
+		asm volatile("ptesync" : : : "memory");
+		radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr);
+		if (new_pte)
 			__radix_pte_update(ptep, 0, new_pte);
-		}
 	} else
 		old_pte = __radix_pte_update(ptep, clr, set);
 	asm volatile("ptesync" : : : "memory");
-- 
cgit v1.2.3


From f4894b80b1ddfef00d4d2e5c58613ccef358a1b2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 9 Feb 2017 08:28:20 +0530
Subject: powerpc/mm/radix: Use ptep_get_and_clear_full when clearing pte for
 full mm

This helps us to do some optimization for application exit case, where we can
skip the DD1 style pte update sequence.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Tested-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 17 +++++++++++++++++
 arch/powerpc/include/asm/book3s/64/radix.h   | 23 ++++++++++++++++++++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 5905f0ff57d1..fef738229a68 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -371,6 +371,23 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return __pte(old);
 }
 
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+					    unsigned long addr,
+					    pte_t *ptep, int full)
+{
+	if (full && radix_enabled()) {
+		/*
+		 * Let's skip the DD1 style pte update here. We know that
+		 * this is a full mm pte clear and hence can be sure there is
+		 * no parallel set_pte.
+		 */
+		return radix__ptep_get_and_clear_full(mm, addr, ptep, full);
+	}
+	return ptep_get_and_clear(mm, addr, ptep);
+}
+
+
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 			     pte_t * ptep)
 {
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 98c9ab37593c..10a290094af4 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -139,7 +139,7 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm,
 
 		unsigned long new_pte;
 
-		old_pte = __radix_pte_update(ptep, ~0, 0);
+		old_pte = __radix_pte_update(ptep, ~0ul, 0);
 		/*
 		 * new value of pte
 		 */
@@ -157,6 +157,27 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm,
 	return old_pte;
 }
 
+static inline pte_t radix__ptep_get_and_clear_full(struct mm_struct *mm,
+						   unsigned long addr,
+						   pte_t *ptep, int full)
+{
+	unsigned long old_pte;
+
+	if (full) {
+		/*
+		 * If we are trying to clear the pte, we can skip
+		 * the DD1 pte update sequence and batch the tlb flush. The
+		 * tlb flush batching is done by mmu gather code. We
+		 * still keep the cmp_xchg update to make sure we get
+		 * correct R/C bit which might be updated via Nest MMU.
+		 */
+		old_pte = __radix_pte_update(ptep, ~0ul, 0);
+	} else
+		old_pte = radix__pte_update(mm, addr, ptep, ~0ul, 0, 0);
+
+	return __pte(old_pte);
+}
+
 /*
  * Set the dirty and/or accessed bits atomically in a linux PTE, this
  * function doesn't need to invalidate tlb.
-- 
cgit v1.2.3


From 438e69b52be776c035aa2a851ccc1709033d729b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 9 Feb 2017 08:28:21 +0530
Subject: powerpc/mm/radix: Skip ptesync in pte update helpers

We do a ptesync at the start of the tlb flush, and we are sure a pte update will
be followed by a tlbflush. Hence we can skip the ptesync in pte update helpers.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Tested-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/radix.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 10a290094af4..9e0bb7cd6e22 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -144,13 +144,11 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm,
 		 * new value of pte
 		 */
 		new_pte = (old_pte | set) & ~clr;
-		asm volatile("ptesync" : : : "memory");
 		radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr);
 		if (new_pte)
 			__radix_pte_update(ptep, 0, new_pte);
 	} else
 		old_pte = __radix_pte_update(ptep, clr, set);
-	asm volatile("ptesync" : : : "memory");
 	if (!huge)
 		assert_pte_locked(mm, addr);
 
@@ -195,7 +193,6 @@ static inline void radix__ptep_set_access_flags(struct mm_struct *mm,
 		unsigned long old_pte, new_pte;
 
 		old_pte = __radix_pte_update(ptep, ~0, 0);
-		asm volatile("ptesync" : : : "memory");
 		/*
 		 * new value of pte
 		 */
-- 
cgit v1.2.3