Merge tag 'powerpc-4.16-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc updates from Michael Ellerman:
 "Highlights:

   - Enable support for memory protection keys aka "pkeys" on Power7/8/9
     when using the hash table MMU.

   - Extend our interrupt soft masking to support masking PMU interrupts
     as well as "normal" interrupts, and then use that to implement
     local_t for a ~4x speedup vs the current atomics-based
     implementation.

   - A new driver "ocxl" for "Open Coherent Accelerator Processor
     Interface (OpenCAPI)" devices.

   - Support for new device tree properties on PowerVM to describe
     hotpluggable memory and devices.

   - Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE to the 64-bit
     VDSO.

   - Freescale updates from Scott: fixes for CPM GPIO and an FSL PCI
     erratum workaround, plus a minor cleanup patch.

  As well as quite a lot of other changes all over the place, and small
  fixes and cleanups as always.

  Thanks to: Alan Modra, Alastair D'Silva, Alexey Kardashevskiy,
  Alistair Popple, Andreas Schwab, Andrew Donnellan, Aneesh Kumar K.V,
  Anju T Sudhakar, Anshuman Khandual, Anton Blanchard, Arnd Bergmann,
  Balbir Singh, Benjamin Herrenschmidt, Bhaktipriya Shridhar, Bryant G.
  Ly, Cédric Le Goater, Christophe Leroy, Christophe Lombard, Cyril Bur,
  David Gibson, Desnes A. Nunes do Rosario, Dmitry Torokhov, Frederic
  Barrat, Geert Uytterhoeven, Guilherme G. Piccoli, Gustavo A. R. Silva,
  Gustavo Romero, Ivan Mikhaylov, Joakim Tjernlund, Joe Perches, Josh
  Poimboeuf, Juan J. Alvarez, Julia Cartwright, Kamalesh Babulal,
  Madhavan Srinivasan, Mahesh Salgaonkar, Mathieu Malaterre, Michael
  Bringmann, Michael Hanselmann, Michael Neuling, Nathan Fontenot,
  Naveen N. Rao, Nicholas Piggin, Paul Mackerras, Philippe Bergheaud,
  Ram Pai, Russell Currey, Santosh Sivaraj, Scott Wood, Seth Forshee,
  Simon Guo, Stewart Smith, Sukadev Bhattiprolu, Thiago Jung Bauermann,
  Vaibhav Jain, Vasyl Gomonovych"

* tag 'powerpc-4.16-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (199 commits)
  powerpc/mm/radix: Fix build error when RADIX_MMU=n
  macintosh/ams-input: Use true and false for boolean values
  macintosh: change some data types from int to bool
  powerpc/watchdog: Print the NIP in soft_nmi_interrupt()
  powerpc/watchdog: regs can't be null in soft_nmi_interrupt()
  powerpc/watchdog: Tweak watchdog printks
  powerpc/cell: Remove axonram driver
  rtc-opal: Fix handling of firmware error codes, prevent busy loops
  powerpc/mpc52xx_gpt: make use of raw_spinlock variants
  macintosh/adb: Properly mark continued kernel messages
  powerpc/pseries: Fix cpu hotplug crash with memoryless nodes
  powerpc/numa: Ensure nodes initialized for hotplug
  powerpc/numa: Use ibm,max-associativity-domains to discover possible nodes
  powerpc/kernel: Block interrupts when updating TIDR
  powerpc/powernv/idoa: Remove unnecessary pcidev from pci_dn
  powerpc/mm/nohash: do not flush the entire mm when range is a single page
  powerpc/pseries: Add Initialization of VF Bars
  powerpc/pseries/pci: Associate PEs to VFs in configure SR-IOV
  powerpc/eeh: Add EEH notify resume sysfs
  powerpc/eeh: Add EEH operations to notify resume
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2018-02-02 10:01:04 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-02-02 10:01:04 -0800
commit: 03f51d4efa2287cc628bb20b0c032036d2a9e66a (patch)
tree: ec7fb3b6624d53092e2768578f3ef887c8d77f22 /drivers/misc
parent: 367b0df173b0ebea5d18b6971c244e260b5feb17 (diff)
parent: 015eb1b89e959c9349f0a01803fb8ed1ced36f09 (diff)
22 files changed, 3546 insertions, 7 deletions
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 6722073e339b..03605f8fc0dc 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -512,5 +512,6 @@ source "drivers/misc/mic/Kconfig"
 source "drivers/misc/genwqe/Kconfig"
 source "drivers/misc/echo/Kconfig"
 source "drivers/misc/cxl/Kconfig"
+source "drivers/misc/ocxl/Kconfig"
 source "drivers/misc/cardreader/Kconfig"
 endmenu
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 8d8cc096063b..c3c8624f4d95 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -55,7 +55,8 @@ obj-$(CONFIG_CXL_BASE)		+= cxl/
 obj-$(CONFIG_ASPEED_LPC_CTRL)	+= aspeed-lpc-ctrl.o
 obj-$(CONFIG_ASPEED_LPC_SNOOP)	+= aspeed-lpc-snoop.o
 obj-$(CONFIG_PCI_ENDPOINT_TEST)	+= pci_endpoint_test.o
-obj-$(CONFIG_MISC_RTSX)	+= cardreader/
+obj-$(CONFIG_OCXL)		+= ocxl/
+obj-$(CONFIG_MISC_RTSX)		+= cardreader/
 
 lkdtm-$(CONFIG_LKDTM)		+= lkdtm_core.o
 lkdtm-$(CONFIG_LKDTM)		+= lkdtm_bugs.o
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 12a41b2753f0..7ff315ad3692 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -45,6 +45,8 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master)
 	ctx->pid = NULL; /* Set in start work ioctl */
 	mutex_init(&ctx->mapping_lock);
 	ctx->mapping = NULL;
+	ctx->tidr = 0;
+	ctx->assign_tidr = false;
 
 	if (cxl_is_power8()) {
 		spin_lock_init(&ctx->sste_lock);
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index a798c2ccd67d..4f015da78f28 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -630,6 +630,9 @@ struct cxl_context {
 	struct list_head extra_irq_contexts;
 
 	struct mm_struct *mm;
+
+	u16 tidr;
+	bool assign_tidr;
 };
 
 struct cxl_irq_info;
diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c
index dc9bc1807fdf..30ccba436b3b 100644
--- a/drivers/misc/cxl/cxllib.c
+++ b/drivers/misc/cxl/cxllib.c
@@ -199,10 +199,11 @@ int cxllib_get_PE_attributes(struct task_struct *task,
 		 */
 		attr->pid = mm->context.id;
 		mmput(mm);
+		attr->tid = task->thread.tidr;
 	} else {
 		attr->pid = 0;
+		attr->tid = 0;
 	}
-	attr->tid = 0;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(cxllib_get_PE_attributes);
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 90341ccda9bd..0162516f5e57 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -173,7 +173,7 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 	 * flags are set it's invalid
 	 */
 	if (work.reserved1 || work.reserved2 || work.reserved3 ||
-	    work.reserved4 || work.reserved5 || work.reserved6 ||
+	    work.reserved4 || work.reserved5 ||
 	    (work.flags & ~CXL_START_WORK_ALL)) {
 		rc = -EINVAL;
 		goto out;
@@ -186,12 +186,16 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 		rc =  -EINVAL;
 		goto out;
 	}
+
 	if ((rc = afu_register_irqs(ctx, work.num_interrupts)))
 		goto out;
 
 	if (work.flags & CXL_START_WORK_AMR)
 		amr = work.amr & mfspr(SPRN_UAMOR);
 
+	if (work.flags & CXL_START_WORK_TID)
+		ctx->assign_tidr = true;
+
 	ctx->mmio_err_ff = !!(work.flags & CXL_START_WORK_ERR_FF);
 
 	/*
@@ -263,8 +267,15 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 		goto out;
 	}
 
-	ctx->status = STARTED;
 	rc = 0;
+	if (work.flags & CXL_START_WORK_TID) {
+		work.tid = ctx->tidr;
+		if (copy_to_user(uwork, &work, sizeof(work)))
+			rc = -EFAULT;
+	}
+
+	ctx->status = STARTED;
+
 out:
 	mutex_unlock(&ctx->status_mutex);
 	return rc;
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 02b6b45b4c20..1b3d7c65ea3f 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -16,6 +16,7 @@
 #include <linux/uaccess.h>
 #include <linux/delay.h>
 #include <asm/synch.h>
+#include <asm/switch_to.h>
 #include <misc/cxl-base.h>
 
 #include "cxl.h"
@@ -655,6 +656,7 @@ static void update_ivtes_directed(struct cxl_context *ctx)
 static int process_element_entry_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
 {
 	u32 pid;
+	int rc;
 
 	cxl_assign_psn_space(ctx);
 
@@ -673,7 +675,16 @@ static int process_element_entry_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
 		pid = ctx->mm->context.id;
 	}
 
-	ctx->elem->common.tid = 0;
+	/* Assign a unique TIDR (thread id) for the current thread */
+	if (!(ctx->tidr) && (ctx->assign_tidr)) {
+		rc = set_thread_tidr(current);
+		if (rc)
+			return -ENODEV;
+		ctx->tidr = current->thread.tidr;
+		pr_devel("%s: current tidr: %d\n", __func__, ctx->tidr);
+	}
+
+	ctx->elem->common.tid = cpu_to_be32(ctx->tidr);
 	ctx->elem->common.pid = cpu_to_be32(pid);
 
 	ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 19969ee86d6f..758842f65a1b 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -125,8 +125,6 @@ static const struct pci_device_id cxl_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0601), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0623), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0628), },
-	{ PCI_DEVICE_CLASS(0x120000, ~0), },
-
 	{ }
 };
 MODULE_DEVICE_TABLE(pci, cxl_pci_tbl);
diff --git a/drivers/misc/ocxl/Kconfig b/drivers/misc/ocxl/Kconfig
new file mode 100644
index 000000000000..4bbdb0d3c8ee
--- /dev/null
+++ b/drivers/misc/ocxl/Kconfig
@@ -0,0 +1,31 @@
+#
+# Open Coherent Accelerator (OCXL) compatible devices
+#
+
+config OCXL_BASE
+	bool
+	default n
+	select PPC_COPRO_BASE
+
+config OCXL
+	tristate "OpenCAPI coherent accelerator support"
+	depends on PPC_POWERNV && PCI && EEH
+	select OCXL_BASE
+	default m
+	help
+	  Select this option to enable the ocxl driver for Open
+	  Coherent Accelerator Processor Interface (OpenCAPI) devices.
+
+	  OpenCAPI allows FPGA and ASIC accelerators to be coherently
+	  attached to a CPU over an OpenCAPI link.
+
+	  The ocxl driver enables userspace programs to access these
+	  accelerators through devices in /dev/ocxl/.
+
+	  For more information, see http://opencapi.org.
+
+	  This is not to be confused with the support for IBM CAPI
+	  accelerators (CONFIG_CXL), which are PCI-based instead of a
+	  dedicated OpenCAPI link, and don't follow the same protocol.
+
+	  If unsure, say N.
diff --git a/drivers/misc/ocxl/Makefile b/drivers/misc/ocxl/Makefile
new file mode 100644
index 000000000000..5229dcda8297
--- /dev/null
+++ b/drivers/misc/ocxl/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0+
+ccflags-$(CONFIG_PPC_WERROR)	+= -Werror
+
+ocxl-y				+= main.o pci.o config.o file.o pasid.o
+ocxl-y				+= link.o context.o afu_irq.o sysfs.o trace.o
+obj-$(CONFIG_OCXL)		+= ocxl.o
+
+# For tracepoints to include our trace.h from tracepoint infrastructure:
+CFLAGS_trace.o := -I$(src)
+
+# ccflags-y += -DDEBUG
diff --git a/drivers/misc/ocxl/afu_irq.c b/drivers/misc/ocxl/afu_irq.c
new file mode 100644
index 000000000000..e70cfa24577f
--- /dev/null
+++ b/drivers/misc/ocxl/afu_irq.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <linux/interrupt.h>
+#include <linux/eventfd.h>
+#include <asm/pnv-ocxl.h>
+#include "ocxl_internal.h"
+#include "trace.h"
+
+struct afu_irq {
+	int id;
+	int hw_irq;
+	unsigned int virq;
+	char *name;
+	u64 trigger_page;
+	struct eventfd_ctx *ev_ctx;
+};
+
+static int irq_offset_to_id(struct ocxl_context *ctx, u64 offset)
+{
+	return (offset - ctx->afu->irq_base_offset) >> PAGE_SHIFT;
+}
+
+static u64 irq_id_to_offset(struct ocxl_context *ctx, int id)
+{
+	return ctx->afu->irq_base_offset + (id << PAGE_SHIFT);
+}
+
+static irqreturn_t afu_irq_handler(int virq, void *data)
+{
+	struct afu_irq *irq = (struct afu_irq *) data;
+
+	trace_ocxl_afu_irq_receive(virq);
+	if (irq->ev_ctx)
+		eventfd_signal(irq->ev_ctx, 1);
+	return IRQ_HANDLED;
+}
+
+static int setup_afu_irq(struct ocxl_context *ctx, struct afu_irq *irq)
+{
+	int rc;
+
+	irq->virq = irq_create_mapping(NULL, irq->hw_irq);
+	if (!irq->virq) {
+		pr_err("irq_create_mapping failed\n");
+		return -ENOMEM;
+	}
+	pr_debug("hw_irq %d mapped to virq %u\n", irq->hw_irq, irq->virq);
+
+	irq->name = kasprintf(GFP_KERNEL, "ocxl-afu-%u", irq->virq);
+	if (!irq->name) {
+		irq_dispose_mapping(irq->virq);
+		return -ENOMEM;
+	}
+
+	rc = request_irq(irq->virq, afu_irq_handler, 0, irq->name, irq);
+	if (rc) {
+		kfree(irq->name);
+		irq->name = NULL;
+		irq_dispose_mapping(irq->virq);
+		pr_err("request_irq failed: %d\n", rc);
+		return rc;
+	}
+	return 0;
+}
+
+static void release_afu_irq(struct afu_irq *irq)
+{
+	free_irq(irq->virq, irq);
+	irq_dispose_mapping(irq->virq);
+	kfree(irq->name);
+}
+
+int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset)
+{
+	struct afu_irq *irq;
+	int rc;
+
+	irq = kzalloc(sizeof(struct afu_irq), GFP_KERNEL);
+	if (!irq)
+		return -ENOMEM;
+
+	/*
+	 * We limit the number of afu irqs per context and per link to
+	 * avoid a single process or user depleting the pool of IPIs
+	 */
+
+	mutex_lock(&ctx->irq_lock);
+
+	irq->id = idr_alloc(&ctx->irq_idr, irq, 0, MAX_IRQ_PER_CONTEXT,
+			GFP_KERNEL);
+	if (irq->id < 0) {
+		rc = -ENOSPC;
+		goto err_unlock;
+	}
+
+	rc = ocxl_link_irq_alloc(ctx->afu->fn->link, &irq->hw_irq,
+				&irq->trigger_page);
+	if (rc)
+		goto err_idr;
+
+	rc = setup_afu_irq(ctx, irq);
+	if (rc)
+		goto err_alloc;
+
+	*irq_offset = irq_id_to_offset(ctx, irq->id);
+
+	trace_ocxl_afu_irq_alloc(ctx->pasid, irq->id, irq->virq, irq->hw_irq,
+				*irq_offset);
+	mutex_unlock(&ctx->irq_lock);
+	return 0;
+
+err_alloc:
+	ocxl_link_free_irq(ctx->afu->fn->link, irq->hw_irq);
+err_idr:
+	idr_remove(&ctx->irq_idr, irq->id);
+err_unlock:
+	mutex_unlock(&ctx->irq_lock);
+	kfree(irq);
+	return rc;
+}
+
+static void afu_irq_free(struct afu_irq *irq, struct ocxl_context *ctx)
+{
+	trace_ocxl_afu_irq_free(ctx->pasid, irq->id);
+	if (ctx->mapping)
+		unmap_mapping_range(ctx->mapping,
+				irq_id_to_offset(ctx, irq->id),
+				1 << PAGE_SHIFT, 1);
+	release_afu_irq(irq);
+	if (irq->ev_ctx)
+		eventfd_ctx_put(irq->ev_ctx);
+	ocxl_link_free_irq(ctx->afu->fn->link, irq->hw_irq);
+	kfree(irq);
+}
+
+int ocxl_afu_irq_free(struct ocxl_context *ctx, u64 irq_offset)
+{
+	struct afu_irq *irq;
+	int id = irq_offset_to_id(ctx, irq_offset);
+
+	mutex_lock(&ctx->irq_lock);
+
+	irq = idr_find(&ctx->irq_idr, id);
+	if (!irq) {
+		mutex_unlock(&ctx->irq_lock);
+		return -EINVAL;
+	}
+	idr_remove(&ctx->irq_idr, irq->id);
+	afu_irq_free(irq, ctx);
+	mutex_unlock(&ctx->irq_lock);
+	return 0;
+}
+
+void ocxl_afu_irq_free_all(struct ocxl_context *ctx)
+{
+	struct afu_irq *irq;
+	int id;
+
+	mutex_lock(&ctx->irq_lock);
+	idr_for_each_entry(&ctx->irq_idr, irq, id)
+		afu_irq_free(irq, ctx);
+	mutex_unlock(&ctx->irq_lock);
+}
+
+int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset, int eventfd)
+{
+	struct afu_irq *irq;
+	struct eventfd_ctx *ev_ctx;
+	int rc = 0, id = irq_offset_to_id(ctx, irq_offset);
+
+	mutex_lock(&ctx->irq_lock);
+	irq = idr_find(&ctx->irq_idr, id);
+	if (!irq) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+
+	ev_ctx = eventfd_ctx_fdget(eventfd);
+	if (IS_ERR(ev_ctx)) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+
+	irq->ev_ctx = ev_ctx;
+unlock:
+	mutex_unlock(&ctx->irq_lock);
+	return rc;
+}
+
+u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset)
+{
+	struct afu_irq *irq;
+	int id = irq_offset_to_id(ctx, irq_offset);
+	u64 addr = 0;
+
+	mutex_lock(&ctx->irq_lock);
+	irq = idr_find(&ctx->irq_idr, id);
+	if (irq)
+		addr = irq->trigger_page;
+	mutex_unlock(&ctx->irq_lock);
+	return addr;
+}
diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
new file mode 100644
index 000000000000..2e30de9c694a
--- /dev/null
+++ b/drivers/misc/ocxl/config.c
@@ -0,0 +1,723 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <linux/pci.h>
+#include <asm/pnv-ocxl.h>
+#include <misc/ocxl.h>
+#include <misc/ocxl-config.h>
+
+#define EXTRACT_BIT(val, bit) (!!(val & BIT(bit)))
+#define EXTRACT_BITS(val, s, e) ((val & GENMASK(e, s)) >> s)
+
+#define OCXL_DVSEC_AFU_IDX_MASK              GENMASK(5, 0)
+#define OCXL_DVSEC_ACTAG_MASK                GENMASK(11, 0)
+#define OCXL_DVSEC_PASID_MASK                GENMASK(19, 0)
+#define OCXL_DVSEC_PASID_LOG_MASK            GENMASK(4, 0)
+
+#define OCXL_DVSEC_TEMPL_VERSION         0x0
+#define OCXL_DVSEC_TEMPL_NAME            0x4
+#define OCXL_DVSEC_TEMPL_AFU_VERSION     0x1C
+#define OCXL_DVSEC_TEMPL_MMIO_GLOBAL     0x20
+#define OCXL_DVSEC_TEMPL_MMIO_GLOBAL_SZ  0x28
+#define OCXL_DVSEC_TEMPL_MMIO_PP         0x30
+#define OCXL_DVSEC_TEMPL_MMIO_PP_SZ      0x38
+#define OCXL_DVSEC_TEMPL_MEM_SZ          0x3C
+#define OCXL_DVSEC_TEMPL_WWID            0x40
+
+#define OCXL_MAX_AFU_PER_FUNCTION 64
+#define OCXL_TEMPL_LEN            0x58
+#define OCXL_TEMPL_NAME_LEN       24
+#define OCXL_CFG_TIMEOUT     3
+
+static int find_dvsec(struct pci_dev *dev, int dvsec_id)
+{
+	int vsec = 0;
+	u16 vendor, id;
+
+	while ((vsec = pci_find_next_ext_capability(dev, vsec,
+						    OCXL_EXT_CAP_ID_DVSEC))) {
+		pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
+				&vendor);
+		pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, &id);
+		if (vendor == PCI_VENDOR_ID_IBM && id == dvsec_id)
+			return vsec;
+	}
+	return 0;
+}
+
+static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)
+{
+	int vsec = 0;
+	u16 vendor, id;
+	u8 idx;
+
+	while ((vsec = pci_find_next_ext_capability(dev, vsec,
+						    OCXL_EXT_CAP_ID_DVSEC))) {
+		pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
+				&vendor);
+		pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, &id);
+
+		if (vendor == PCI_VENDOR_ID_IBM &&
+			id == OCXL_DVSEC_AFU_CTRL_ID) {
+			pci_read_config_byte(dev,
+					vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
+					&idx);
+			if (idx == afu_idx)
+				return vsec;
+		}
+	}
+	return 0;
+}
+
+static int read_pasid(struct pci_dev *dev, struct ocxl_fn_config *fn)
+{
+	u16 val;
+	int pos;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_PASID);
+	if (!pos) {
+		/*
+		 * PASID capability is not mandatory, but there
+		 * shouldn't be any AFU
+		 */
+		dev_dbg(&dev->dev, "Function doesn't require any PASID\n");
+		fn->max_pasid_log = -1;
+		goto out;
+	}
+	pci_read_config_word(dev, pos + PCI_PASID_CAP, &val);
+	fn->max_pasid_log = EXTRACT_BITS(val, 8, 12);
+
+out:
+	dev_dbg(&dev->dev, "PASID capability:\n");
+	dev_dbg(&dev->dev, "  Max PASID log = %d\n", fn->max_pasid_log);
+	return 0;
+}
+
+static int read_dvsec_tl(struct pci_dev *dev, struct ocxl_fn_config *fn)
+{
+	int pos;
+
+	pos = find_dvsec(dev, OCXL_DVSEC_TL_ID);
+	if (!pos && PCI_FUNC(dev->devfn) == 0) {
+		dev_err(&dev->dev, "Can't find TL DVSEC\n");
+		return -ENODEV;
+	}
+	if (pos && PCI_FUNC(dev->devfn) != 0) {
+		dev_err(&dev->dev, "TL DVSEC is only allowed on function 0\n");
+		return -ENODEV;
+	}
+	fn->dvsec_tl_pos = pos;
+	return 0;
+}
+
+static int read_dvsec_function(struct pci_dev *dev, struct ocxl_fn_config *fn)
+{
+	int pos, afu_present;
+	u32 val;
+
+	pos = find_dvsec(dev, OCXL_DVSEC_FUNC_ID);
+	if (!pos) {
+		dev_err(&dev->dev, "Can't find function DVSEC\n");
+		return -ENODEV;
+	}
+	fn->dvsec_function_pos = pos;
+
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_FUNC_OFF_INDEX, &val);
+	afu_present = EXTRACT_BIT(val, 31);
+	if (!afu_present) {
+		fn->max_afu_index = -1;
+		dev_dbg(&dev->dev, "Function doesn't define any AFU\n");
+		goto out;
+	}
+	fn->max_afu_index = EXTRACT_BITS(val, 24, 29);
+
+out:
+	dev_dbg(&dev->dev, "Function DVSEC:\n");
+	dev_dbg(&dev->dev, "  Max AFU index = %d\n", fn->max_afu_index);
+	return 0;
+}
+
+static int read_dvsec_afu_info(struct pci_dev *dev, struct ocxl_fn_config *fn)
+{
+	int pos;
+
+	if (fn->max_afu_index < 0) {
+		fn->dvsec_afu_info_pos = -1;
+		return 0;
+	}
+
+	pos = find_dvsec(dev, OCXL_DVSEC_AFU_INFO_ID);
+	if (!pos) {
+		dev_err(&dev->dev, "Can't find AFU information DVSEC\n");
+		return -ENODEV;
+	}
+	fn->dvsec_afu_info_pos = pos;
+	return 0;
+}
+
+static int read_dvsec_vendor(struct pci_dev *dev)
+{
+	int pos;
+	u32 cfg, tlx, dlx;
+
+	/*
+	 * vendor specific DVSEC is optional
+	 *
+	 * It's currently only used on function 0 to specify the
+	 * version of some logic blocks. Some older images may not
+	 * even have it so we ignore any errors
+	 */
+	if (PCI_FUNC(dev->devfn) != 0)
+		return 0;
+
+	pos = find_dvsec(dev, OCXL_DVSEC_VENDOR_ID);
+	if (!pos)
+		return 0;
+
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_CFG_VERS, &cfg);
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_TLX_VERS, &tlx);
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_VENDOR_DLX_VERS, &dlx);
+
+	dev_dbg(&dev->dev, "Vendor specific DVSEC:\n");
+	dev_dbg(&dev->dev, "  CFG version = 0x%x\n", cfg);
+	dev_dbg(&dev->dev, "  TLX version = 0x%x\n", tlx);
+	dev_dbg(&dev->dev, "  DLX version = 0x%x\n", dlx);
+	return 0;
+}
+
+static int validate_function(struct pci_dev *dev, struct ocxl_fn_config *fn)
+{
+	if (fn->max_pasid_log == -1 && fn->max_afu_index >= 0) {
+		dev_err(&dev->dev,
+			"AFUs are defined but no PASIDs are requested\n");
+		return -EINVAL;
+	}
+
+	if (fn->max_afu_index > OCXL_MAX_AFU_PER_FUNCTION) {
+		dev_err(&dev->dev,
+			"Max AFU index out of architectural limit (%d vs %d)\n",
+			fn->max_afu_index, OCXL_MAX_AFU_PER_FUNCTION);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int ocxl_config_read_function(struct pci_dev *dev, struct ocxl_fn_config *fn)
+{
+	int rc;
+
+	rc = read_pasid(dev, fn);
+	if (rc) {
+		dev_err(&dev->dev, "Invalid PASID configuration: %d\n", rc);
+		return -ENODEV;
+	}
+
+	rc = read_dvsec_tl(dev, fn);
+	if (rc) {
+		dev_err(&dev->dev,
+			"Invalid Transaction Layer DVSEC configuration: %d\n",
+			rc);
+		return -ENODEV;
+	}
+
+	rc = read_dvsec_function(dev, fn);
+	if (rc) {
+		dev_err(&dev->dev,
+			"Invalid Function DVSEC configuration: %d\n", rc);
+		return -ENODEV;
+	}
+
+	rc = read_dvsec_afu_info(dev, fn);
+	if (rc) {
+		dev_err(&dev->dev, "Invalid AFU configuration: %d\n", rc);
+		return -ENODEV;
+	}
+
+	rc = read_dvsec_vendor(dev);
+	if (rc) {
+		dev_err(&dev->dev,
+			"Invalid vendor specific DVSEC configuration: %d\n",
+			rc);
+		return -ENODEV;
+	}
+
+	rc = validate_function(dev, fn);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocxl_config_read_function);
+
+static int read_afu_info(struct pci_dev *dev, struct ocxl_fn_config *fn,
+			int offset, u32 *data)
+{
+	u32 val;
+	unsigned long timeout = jiffies + (HZ * OCXL_CFG_TIMEOUT);
+	int pos = fn->dvsec_afu_info_pos;
+
+	/* Protect 'data valid' bit */
+	if (EXTRACT_BIT(offset, 31)) {
+		dev_err(&dev->dev, "Invalid offset in AFU info DVSEC\n");
+		return -EINVAL;
+	}
+
+	pci_write_config_dword(dev, pos + OCXL_DVSEC_AFU_INFO_OFF, offset);
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_AFU_INFO_OFF, &val);
+	while (!EXTRACT_BIT(val, 31)) {
+		if (time_after_eq(jiffies, timeout)) {
+			dev_err(&dev->dev,
+				"Timeout while reading AFU info DVSEC (offset=%d)\n",
+				offset);
+			return -EBUSY;
+		}
+		cpu_relax();
+		pci_read_config_dword(dev, pos + OCXL_DVSEC_AFU_INFO_OFF, &val);
+	}
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_AFU_INFO_DATA, data);
+	return 0;
+}
+
+int ocxl_config_check_afu_index(struct pci_dev *dev,
+				struct ocxl_fn_config *fn, int afu_idx)
+{
+	u32 val;
+	int rc, templ_major, templ_minor, len;
+
+	pci_write_config_word(dev, fn->dvsec_afu_info_pos, afu_idx);
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_VERSION, &val);
+	if (rc)
+		return rc;
+
+	/* AFU index map can have holes */
+	if (!val)
+		return 0;
+
+	templ_major = EXTRACT_BITS(val, 8, 15);
+	templ_minor = EXTRACT_BITS(val, 0, 7);
+	dev_dbg(&dev->dev, "AFU descriptor template version %d.%d\n",
+		templ_major, templ_minor);
+
+	len = EXTRACT_BITS(val, 16, 31);
+	if (len != OCXL_TEMPL_LEN) {
+		dev_warn(&dev->dev,
+			"Unexpected template length in AFU information (%#x)\n",
+			len);
+	}
+	return 1;
+}
+EXPORT_SYMBOL_GPL(ocxl_config_check_afu_index);
+
+static int read_afu_name(struct pci_dev *dev, struct ocxl_fn_config *fn,
+			struct ocxl_afu_config *afu)
+{
+	int i, rc;
+	u32 val, *ptr;
+
+	BUILD_BUG_ON(OCXL_AFU_NAME_SZ < OCXL_TEMPL_NAME_LEN);
+	for (i = 0; i < OCXL_TEMPL_NAME_LEN; i += 4) {
+		rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_NAME + i, &val);
+		if (rc)
+			return rc;
+		ptr = (u32 *) &afu->name[i];
+		*ptr = val;
+	}
+	afu->name[OCXL_AFU_NAME_SZ - 1] = '\0'; /* play safe */
+	return 0;
+}
+
+static int read_afu_mmio(struct pci_dev *dev, struct ocxl_fn_config *fn,
+			struct ocxl_afu_config *afu)
+{
+	int rc;
+	u32 val;
+
+	/*
+	 * Global MMIO
+	 */
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_MMIO_GLOBAL, &val);
+	if (rc)
+		return rc;
+	afu->global_mmio_bar = EXTRACT_BITS(val, 0, 2);
+	afu->global_mmio_offset = EXTRACT_BITS(val, 16, 31) << 16;
+
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_MMIO_GLOBAL + 4, &val);
+	if (rc)
+		return rc;
+	afu->global_mmio_offset += (u64) val << 32;
+
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_MMIO_GLOBAL_SZ, &val);
+	if (rc)
+		return rc;
+	afu->global_mmio_size = val;
+
+	/*
+	 * Per-process MMIO
+	 */
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_MMIO_PP, &val);
+	if (rc)
+		return rc;
+	afu->pp_mmio_bar = EXTRACT_BITS(val, 0, 2);
+	afu->pp_mmio_offset = EXTRACT_BITS(val, 16, 31) << 16;
+
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_MMIO_PP + 4, &val);
+	if (rc)
+		return rc;
+	afu->pp_mmio_offset += (u64) val << 32;
+
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_MMIO_PP_SZ, &val);
+	if (rc)
+		return rc;
+	afu->pp_mmio_stride = val;
+
+	return 0;
+}
+
+static int read_afu_control(struct pci_dev *dev, struct ocxl_afu_config *afu)
+{
+	int pos;
+	u8 val8;
+	u16 val16;
+
+	pos = find_dvsec_afu_ctrl(dev, afu->idx);
+	if (!pos) {
+		dev_err(&dev->dev, "Can't find AFU control DVSEC for AFU %d\n",
+			afu->idx);
+		return -ENODEV;
+	}
+	afu->dvsec_afu_control_pos = pos;
+
+	pci_read_config_byte(dev, pos + OCXL_DVSEC_AFU_CTRL_PASID_SUP, &val8);
+	afu->pasid_supported_log = EXTRACT_BITS(val8, 0, 4);
+
+	pci_read_config_word(dev, pos + OCXL_DVSEC_AFU_CTRL_ACTAG_SUP, &val16);
+	afu->actag_supported = EXTRACT_BITS(val16, 0, 11);
+	return 0;
+}
+
+static bool char_allowed(int c)
+{
+	/*
+	 * Permitted Characters : Alphanumeric, hyphen, underscore, comma
+	 */
+	if ((c >= 0x30 && c <= 0x39) /* digits */ ||
+		(c >= 0x41 && c <= 0x5A) /* upper case */ ||
+		(c >= 0x61 && c <= 0x7A) /* lower case */ ||
+		c == 0 /* NULL */ ||
+		c == 0x2D /* - */ ||
+		c == 0x5F /* _ */ ||
+		c == 0x2C /* , */)
+		return true;
+	return false;
+}
+
+static int validate_afu(struct pci_dev *dev, struct ocxl_afu_config *afu)
+{
+	int i;
+
+	if (!afu->name[0]) {
+		dev_err(&dev->dev, "Empty AFU name\n");
+		return -EINVAL;
+	}
+	for (i = 0; i < OCXL_TEMPL_NAME_LEN; i++) {
+		if (!char_allowed(afu->name[i])) {
+			dev_err(&dev->dev,
+				"Invalid character in AFU name\n");
+			return -EINVAL;
+		}
+	}
+
+	if (afu->global_mmio_bar != 0 &&
+		afu->global_mmio_bar != 2 &&
+		afu->global_mmio_bar != 4) {
+		dev_err(&dev->dev, "Invalid global MMIO bar number\n");
+		return -EINVAL;
+	}
+	if (afu->pp_mmio_bar != 0 &&
+		afu->pp_mmio_bar != 2 &&
+		afu->pp_mmio_bar != 4) {
+		dev_err(&dev->dev, "Invalid per-process MMIO bar number\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int ocxl_config_read_afu(struct pci_dev *dev, struct ocxl_fn_config *fn,
+			struct ocxl_afu_config *afu, u8 afu_idx)
+{
+	int rc;
+	u32 val32;
+
+	/*
+	 * First, we need to write the AFU idx for the AFU we want to
+	 * access.
+	 */
+	WARN_ON((afu_idx & OCXL_DVSEC_AFU_IDX_MASK) != afu_idx);
+	afu->idx = afu_idx;
+	pci_write_config_byte(dev,
+			fn->dvsec_afu_info_pos + OCXL_DVSEC_AFU_INFO_AFU_IDX,
+			afu->idx);
+
+	rc = read_afu_name(dev, fn, afu);
+	if (rc)
+		return rc;
+
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_AFU_VERSION, &val32);
+	if (rc)
+		return rc;
+	afu->version_major = EXTRACT_BITS(val32, 24, 31);
+	afu->version_minor = EXTRACT_BITS(val32, 16, 23);
+	afu->afuc_type = EXTRACT_BITS(val32, 14, 15);
+	afu->afum_type = EXTRACT_BITS(val32, 12, 13);
+	afu->profile = EXTRACT_BITS(val32, 0, 7);
+
+	rc = read_afu_mmio(dev, fn, afu);
+	if (rc)
+		return rc;
+
+	rc = read_afu_info(dev, fn, OCXL_DVSEC_TEMPL_MEM_SZ, &val32);
+	if (rc)
+		return rc;
+	afu->log_mem_size = EXTRACT_BITS(val32, 0, 7);
+
+	rc = read_afu_control(dev, afu);
+	if (rc)
+		return rc;
+
+	dev_dbg(&dev->dev, "AFU configuration:\n");
+	dev_dbg(&dev->dev, "  name = %s\n", afu->name);
+	dev_dbg(&dev->dev, "  version = %d.%d\n", afu->version_major,
+		afu->version_minor);
+	dev_dbg(&dev->dev, "  global mmio bar = %hhu\n", afu->global_mmio_bar);
+	dev_dbg(&dev->dev, "  global mmio offset = %#llx\n",
+		afu->global_mmio_offset);
+	dev_dbg(&dev->dev, "  global mmio size = %#x\n", afu->global_mmio_size);
+	dev_dbg(&dev->dev, "  pp mmio bar = %hhu\n", afu->pp_mmio_bar);
+	dev_dbg(&dev->dev, "  pp mmio offset = %#llx\n", afu->pp_mmio_offset);
+	dev_dbg(&dev->dev, "  pp mmio stride = %#x\n", afu->pp_mmio_stride);
+	dev_dbg(&dev->dev, "  mem size (log) = %hhu\n", afu->log_mem_size);
+	dev_dbg(&dev->dev, "  pasid supported (log) = %u\n",
+		afu->pasid_supported_log);
+	dev_dbg(&dev->dev, "  actag supported = %u\n",
+		afu->actag_supported);
+
+	rc = validate_afu(dev, afu);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocxl_config_read_afu);
+
+int ocxl_config_get_actag_info(struct pci_dev *dev, u16 *base, u16 *enabled,
+			u16 *supported)
+{
+	int rc;
+
+	/*
+	 * This is really a simple wrapper for the kernel API, to
+	 * avoid an external driver using ocxl as a library to call
+	 * platform-dependent code
+	 */
+	rc = pnv_ocxl_get_actag(dev, base, enabled, supported);
+	if (rc) {
+		dev_err(&dev->dev, "Can't get actag for device: %d\n", rc);
+		return rc;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ocxl_config_get_actag_info);
+
+void ocxl_config_set_afu_actag(struct pci_dev *dev, int pos, int actag_base,
+			int actag_count)
+{
+	u16 val;
+
+	val = actag_count & OCXL_DVSEC_ACTAG_MASK;
+	pci_write_config_byte(dev, pos + OCXL_DVSEC_AFU_CTRL_ACTAG_EN, val);
+
+	val = actag_base & OCXL_DVSEC_ACTAG_MASK;
+	pci_write_config_dword(dev, pos + OCXL_DVSEC_AFU_CTRL_ACTAG_BASE, val);
+}
+EXPORT_SYMBOL_GPL(ocxl_config_set_afu_actag);
+
+int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count)
+{
+	return pnv_ocxl_get_pasid_count(dev, count);
+}
+EXPORT_SYMBOL_GPL(ocxl_config_get_pasid_info);
+
+void ocxl_config_set_afu_pasid(struct pci_dev *dev, int pos, int pasid_base,
+			u32 pasid_count_log)
+{
+	u8 val8;
+	u32 val32;
+
+	val8 = pasid_count_log & OCXL_DVSEC_PASID_LOG_MASK;
+	pci_write_config_byte(dev, pos + OCXL_DVSEC_AFU_CTRL_PASID_EN, val8);
+
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_AFU_CTRL_PASID_BASE,
+			&val32);
+	val32 &= ~OCXL_DVSEC_PASID_MASK;
+	val32 |= pasid_base & OCXL_DVSEC_PASID_MASK;
+	pci_write_config_dword(dev, pos + OCXL_DVSEC_AFU_CTRL_PASID_BASE,
+			val32);
+}
+EXPORT_SYMBOL_GPL(ocxl_config_set_afu_pasid);
+
+void ocxl_config_set_afu_state(struct pci_dev *dev, int pos, int enable)
+{
+	u8 val;
+
+	pci_read_config_byte(dev, pos + OCXL_DVSEC_AFU_CTRL_ENABLE, &val);
+	if (enable)
+		val |= 1;
+	else
+		val &= 0xFE;
+	pci_write_config_byte(dev, pos + OCXL_DVSEC_AFU_CTRL_ENABLE, val);
+}
+EXPORT_SYMBOL_GPL(ocxl_config_set_afu_state);
+
+int ocxl_config_set_TL(struct pci_dev *dev, int tl_dvsec)
+{
+	u32 val;
+	__be32 *be32ptr;
+	u8 timers;
+	int i, rc;
+	long recv_cap;
+	char *recv_rate;
+
+	/*
+	 * Skip on function != 0, as the TL can only be defined on 0
+	 */
+	if (PCI_FUNC(dev->devfn) != 0)
+		return 0;
+
+	recv_rate = kzalloc(PNV_OCXL_TL_RATE_BUF_SIZE, GFP_KERNEL);
+	if (!recv_rate)
+		return -ENOMEM;
+	/*
+	 * The spec defines 64 templates for messages in the
+	 * Transaction Layer (TL).
+	 *
+	 * The host and device each support a subset, so we need to
+	 * configure the transmitters on each side to send only
+	 * templates the receiver understands, at a rate the receiver
+	 * can process.  Per the spec, template 0 must be supported by
+	 * everybody. That's the template which has been used by the
+	 * host and device so far.
+	 *
+	 * The sending rate limit must be set before the template is
+	 * enabled.
+	 */
+
+	/*
+	 * Device -> host
+	 */
+	rc = pnv_ocxl_get_tl_cap(dev, &recv_cap, recv_rate,
+				PNV_OCXL_TL_RATE_BUF_SIZE);
+	if (rc)
+		goto out;
+
+	for (i = 0; i < PNV_OCXL_TL_RATE_BUF_SIZE; i += 4) {
+		be32ptr = (__be32 *) &recv_rate[i];
+		pci_write_config_dword(dev,
+				tl_dvsec + OCXL_DVSEC_TL_SEND_RATE + i,
+				be32_to_cpu(*be32ptr));
+	}
+	val = recv_cap >> 32;
+	pci_write_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_SEND_CAP, val);
+	val = recv_cap & GENMASK(31, 0);
+	pci_write_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_SEND_CAP + 4, val);
+
+	/*
+	 * Host -> device
+	 */
+	for (i = 0; i < PNV_OCXL_TL_RATE_BUF_SIZE; i += 4) {
+		pci_read_config_dword(dev,
+				tl_dvsec + OCXL_DVSEC_TL_RECV_RATE + i,
+				&val);
+		be32ptr = (__be32 *) &recv_rate[i];
+		*be32ptr = cpu_to_be32(val);
+	}
+	pci_read_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_RECV_CAP, &val);
+	recv_cap = (long) val << 32;
+	pci_read_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_RECV_CAP + 4, &val);
+	recv_cap |= val;
+
+	rc = pnv_ocxl_set_tl_conf(dev, recv_cap, __pa(recv_rate),
+				PNV_OCXL_TL_RATE_BUF_SIZE);
+	if (rc)
+		goto out;
+
+	/*
+	 * Opencapi commands needing to be retried are classified per
+	 * the TL in 2 groups: short and long commands.
+	 *
+	 * The short back off timer is not used for now. It will be
+	 * for opencapi 4.0.
+	 *
+	 * The long back off timer is typically used when an AFU hits
+	 * a page fault but the NPU is already processing one. So the
+	 * AFU needs to wait before it can resubmit. Having a value
+	 * too low doesn't break anything, but can generate extra
+	 * traffic on the link.
+	 * We set it to 1.6 us for now. It's shorter than, but in the
+	 * same order of magnitude as the time spent to process a page
+	 * fault.
+	 */
+	timers = 0x2 << 4; /* long timer = 1.6 us */
+	pci_write_config_byte(dev, tl_dvsec + OCXL_DVSEC_TL_BACKOFF_TIMERS,
+			timers);
+
+	rc = 0;
+out:
+	kfree(recv_rate);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocxl_config_set_TL);
+
+int ocxl_config_terminate_pasid(struct pci_dev *dev, int afu_control, int pasid)
+{
+	u32 val;
+	unsigned long timeout;
+
+	pci_read_config_dword(dev, afu_control + OCXL_DVSEC_AFU_CTRL_TERM_PASID,
+			&val);
+	if (EXTRACT_BIT(val, 20)) {
+		dev_err(&dev->dev,
+			"Can't terminate PASID %#x, previous termination didn't complete\n",
+			pasid);
+		return -EBUSY;
+	}
+
+	val &= ~OCXL_DVSEC_PASID_MASK;
+	val |= pasid & OCXL_DVSEC_PASID_MASK;
+	val |= BIT(20);
+	pci_write_config_dword(dev,
+			afu_control + OCXL_DVSEC_AFU_CTRL_TERM_PASID,
+			val);
+
+	timeout = jiffies + (HZ * OCXL_CFG_TIMEOUT);
+	pci_read_config_dword(dev, afu_control + OCXL_DVSEC_AFU_CTRL_TERM_PASID,
+			&val);
+	while (EXTRACT_BIT(val, 20)) {
+		if (time_after_eq(jiffies, timeout)) {
+			dev_err(&dev->dev,
+				"Timeout while waiting for AFU to terminate PASID %#x\n",
+				pasid);
+			return -EBUSY;
+		}
+		cpu_relax();
+		pci_read_config_dword(dev,
+				afu_control + OCXL_DVSEC_AFU_CTRL_TERM_PASID,
+				&val);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ocxl_config_terminate_pasid);
+
+void ocxl_config_set_actag(struct pci_dev *dev, int func_dvsec, u32 tag_first,
+			u32 tag_count)
+{
+	u32 val;
+
+	val = (tag_first & OCXL_DVSEC_ACTAG_MASK) << 16;
+	val |= tag_count & OCXL_DVSEC_ACTAG_MASK;
+	pci_write_config_dword(dev, func_dvsec + OCXL_DVSEC_FUNC_OFF_ACTAG,
+			val);
+}
+EXPORT_SYMBOL_GPL(ocxl_config_set_actag);
diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
new file mode 100644
index 000000000000..909e8807824a
--- /dev/null
+++ b/drivers/misc/ocxl/context.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <linux/sched/mm.h>
+#include "trace.h"
+#include "ocxl_internal.h"
+
+/* Allocate a zero-filled context. Returns NULL if the allocation fails. */
+struct ocxl_context *ocxl_context_alloc(void)
+{
+	struct ocxl_context *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	return ctx;
+}
+
+/*
+ * Initialize a freshly allocated context and reserve a PASID for it.
+ *
+ * ctx:		context returned by ocxl_context_alloc()
+ * afu:		AFU the context belongs to
+ * mapping:	address space used to tear down the mmio mappings
+ *
+ * Returns 0 on success, or the negative error returned by idr_alloc()
+ * when no PASID could be reserved. On failure no reference on the AFU
+ * is held and the caller still owns ctx.
+ */
+int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu,
+		struct address_space *mapping)
+{
+	int pasid;
+
+	/*
+	 * Fully set up the context before publishing it in the idr:
+	 * once inserted, ocxl_context_detach_all() can find it and
+	 * take its locks.
+	 */
+	ctx->afu = afu;
+	ctx->status = OPENED;
+	mutex_init(&ctx->status_mutex);
+	ctx->mapping = mapping;
+	mutex_init(&ctx->mapping_lock);
+	init_waitqueue_head(&ctx->events_wq);
+	mutex_init(&ctx->xsl_error_lock);
+	mutex_init(&ctx->irq_lock);
+	idr_init(&ctx->irq_idr);
+
+	mutex_lock(&afu->contexts_lock);
+	pasid = idr_alloc(&afu->contexts_idr, ctx, afu->pasid_base,
+			afu->pasid_base + afu->pasid_max, GFP_KERNEL);
+	if (pasid < 0) {
+		mutex_unlock(&afu->contexts_lock);
+		return pasid;
+	}
+	/* set under the lock so that idr walkers see a valid pasid */
+	ctx->pasid = pasid;
+	afu->pasid_count++;
+	mutex_unlock(&afu->contexts_lock);
+
+	/*
+	 * Keep a reference on the AFU to make sure it's valid for the
+	 * duration of the life of the context
+	 */
+	ocxl_afu_get(afu);
+	return 0;
+}
+
+/*
+ * Translation fault error callback, registered with ocxl_link_add_pe().
+ * data:	the context whose access faulted
+ * addr:	the faulting address
+ * dsisr:	the value of the PPC64 dsisr register
+ *
+ * Records the error in the context, bumps the error counter and wakes
+ * up anybody waiting on the context's event queue.
+ */
+static void xsl_fault_error(void *data, u64 addr, u64 dsisr)
+{
+	struct ocxl_context *ctx = data;
+
+	mutex_lock(&ctx->xsl_error_lock);
+	ctx->xsl_error.count++;
+	ctx->xsl_error.dsisr = dsisr;
+	ctx->xsl_error.addr = addr;
+	mutex_unlock(&ctx->xsl_error_lock);
+
+	wake_up_all(&ctx->events_wq);
+}
+
+/*
+ * Attach the context to the current process by adding a Process
+ * Element for its PASID on the link.
+ *
+ * Returns 0 on success, -EIO if the context is not in the OPENED
+ * state, or the error returned by ocxl_link_add_pe().
+ */
+int ocxl_context_attach(struct ocxl_context *ctx, u64 amr)
+{
+	int rc;
+
+	mutex_lock(&ctx->status_mutex);
+	if (ctx->status == OPENED) {
+		rc = ocxl_link_add_pe(ctx->afu->fn->link, ctx->pasid,
+				current->mm->context.id, 0, amr, current->mm,
+				xsl_fault_error, ctx);
+		if (!rc)
+			ctx->status = ATTACHED;
+	} else {
+		rc = -EIO;
+	}
+	mutex_unlock(&ctx->status_mutex);
+	return rc;
+}
+
+/*
+ * Fault handler helper: map the trigger page of an AFU interrupt.
+ *
+ * offset identifies the interrupt; ocxl_afu_irq_get_addr() returns 0
+ * when it doesn't match one.
+ *
+ * Returns a VM_FAULT_* code: SIGBUS for an unknown offset or a failed
+ * insertion, OOM when the page tables couldn't be allocated, NOPAGE
+ * once the pfn is installed.
+ */
+static int map_afu_irq(struct vm_area_struct *vma, unsigned long address,
+		u64 offset, struct ocxl_context *ctx)
+{
+	u64 trigger_addr;
+	int rc;
+
+	trigger_addr = ocxl_afu_irq_get_addr(ctx, offset);
+	if (!trigger_addr)
+		return VM_FAULT_SIGBUS;
+
+	rc = vm_insert_pfn(vma, address, trigger_addr >> PAGE_SHIFT);
+	if (rc == -ENOMEM)
+		return VM_FAULT_OOM;
+	/* -EBUSY means the pfn was already installed by a racing fault */
+	if (rc < 0 && rc != -EBUSY)
+		return VM_FAULT_SIGBUS;
+	return VM_FAULT_NOPAGE;
+}
+
+/*
+ * Fault handler helper: map a page of the per-process mmio area of
+ * the context.
+ *
+ * The area of a context starts at pp_mmio_start plus its index (pasid
+ * relative to the AFU's pasid_base) times pp_mmio_stride. The mapping
+ * is only allowed while the context is ATTACHED.
+ *
+ * Returns a VM_FAULT_* code: SIGBUS if the offset is out of range, the
+ * context is not attached or the insertion failed, OOM when the page
+ * tables couldn't be allocated, NOPAGE once the pfn is installed.
+ */
+static int map_pp_mmio(struct vm_area_struct *vma, unsigned long address,
+		u64 offset, struct ocxl_context *ctx)
+{
+	u64 pp_mmio_addr;
+	int pasid_off;
+	int rc;
+
+	if (offset >= ctx->afu->config.pp_mmio_stride)
+		return VM_FAULT_SIGBUS;
+
+	mutex_lock(&ctx->status_mutex);
+	if (ctx->status != ATTACHED) {
+		mutex_unlock(&ctx->status_mutex);
+		pr_debug("%s: Context not attached, failing mmio mmap\n",
+			__func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pasid_off = ctx->pasid - ctx->afu->pasid_base;
+	pp_mmio_addr = ctx->afu->pp_mmio_start +
+		pasid_off * ctx->afu->config.pp_mmio_stride +
+		offset;
+
+	rc = vm_insert_pfn(vma, address, pp_mmio_addr >> PAGE_SHIFT);
+	mutex_unlock(&ctx->status_mutex);
+
+	if (rc == -ENOMEM)
+		return VM_FAULT_OOM;
+	/* -EBUSY means the pfn was already installed by a racing fault */
+	if (rc < 0 && rc != -EBUSY)
+		return VM_FAULT_SIGBUS;
+	return VM_FAULT_NOPAGE;
+}
+
+/*
+ * Page fault handler for the mappings of a context. Offsets below the
+ * AFU's irq_base_offset are per-process mmio, the others are interrupt
+ * trigger pages.
+ */
+static int ocxl_mmap_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct ocxl_context *ctx = vma->vm_file->private_data;
+	u64 offset = vmf->pgoff << PAGE_SHIFT;
+
+	pr_debug("%s: pasid %d address 0x%lx offset 0x%llx\n", __func__,
+		ctx->pasid, vmf->address, offset);
+
+	if (offset >= ctx->afu->irq_base_offset)
+		return map_afu_irq(vma, vmf->address, offset, ctx);
+
+	return map_pp_mmio(vma, vmf->address, offset, ctx);
+}
+
+/*
+ * vm operations installed by ocxl_context_mmap(): pages are populated
+ * lazily, on fault.
+ */
+static const struct vm_operations_struct ocxl_vmops = {
+	.fault = ocxl_mmap_fault,
+};
+
+/*
+ * Validate a request to mmap an interrupt trigger page.
+ *
+ * Returns 0 if the mapping is acceptable, -EINVAL otherwise. On
+ * success, the vma can no longer be made readable or executable.
+ */
+static int check_mmap_afu_irq(struct ocxl_context *ctx,
+			struct vm_area_struct *vma)
+{
+	const unsigned long rwx = VM_READ | VM_WRITE | VM_EXEC;
+
+	/* a trigger page is exactly one page */
+	if (vma_pages(vma) != 1)
+		return -EINVAL;
+
+	/* the offset must match an interrupt of the context */
+	if (!ocxl_afu_irq_get_addr(ctx, vma->vm_pgoff << PAGE_SHIFT))
+		return -EINVAL;
+
+	/*
+	 * trigger page should only be accessible in write mode.
+	 *
+	 * It's a bit theoretical, as a page mmaped with only
+	 * PROT_WRITE is currently readable, but it doesn't hurt.
+	 */
+	if ((vma->vm_flags & rwx) != VM_WRITE)
+		return -EINVAL;
+
+	vma->vm_flags &= ~(VM_MAYREAD | VM_MAYEXEC);
+	return 0;
+}
+
+/*
+ * Validate a request to mmap per-process mmio: the mapping must not
+ * extend past the per-process mmio stride of the AFU.
+ *
+ * Returns 0 if the range fits, -EINVAL otherwise.
+ */
+static int check_mmap_mmio(struct ocxl_context *ctx,
+			struct vm_area_struct *vma)
+{
+	unsigned long last_page = vma_pages(vma) + vma->vm_pgoff;
+
+	if (last_page > (ctx->afu->config.pp_mmio_stride >> PAGE_SHIFT))
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * mmap handler for a context. The page offset selects what is mapped:
+ * below the AFU's irq_base_offset it is per-process mmio, above it is
+ * an interrupt trigger page. Pages are inserted on fault, non-cached.
+ *
+ * Returns 0 on success, -EINVAL if the request is not acceptable.
+ */
+int ocxl_context_mmap(struct ocxl_context *ctx, struct vm_area_struct *vma)
+{
+	bool is_mmio;
+	int rc;
+
+	is_mmio = (vma->vm_pgoff << PAGE_SHIFT) < ctx->afu->irq_base_offset;
+	rc = is_mmio ? check_mmap_mmio(ctx, vma) :
+			check_mmap_afu_irq(ctx, vma);
+	if (rc)
+		return rc;
+
+	vma->vm_flags |= VM_IO | VM_PFNMAP;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_ops = &ocxl_vmops;
+	return 0;
+}
+
+/*
+ * Detach a context: mark it CLOSED and, if it was attached, ask the
+ * AFU to terminate the PASID and remove the Process Element.
+ *
+ * Returns 0, or -EBUSY if the AFU didn't acknowledge the PASID
+ * termination. In that case the Process Element is left in place and
+ * the caller must not free the context.
+ */
+int ocxl_context_detach(struct ocxl_context *ctx)
+{
+	enum ocxl_context_status prev;
+	struct pci_dev *pdev;
+	int ctrl_pos;
+	int rc;
+
+	/* Whatever the previous state, the context ends up CLOSED */
+	mutex_lock(&ctx->status_mutex);
+	prev = ctx->status;
+	ctx->status = CLOSED;
+	mutex_unlock(&ctx->status_mutex);
+
+	/* Nothing to undo unless the context was attached */
+	if (prev != ATTACHED)
+		return 0;
+
+	pdev = to_pci_dev(ctx->afu->fn->dev.parent);
+	ctrl_pos = ctx->afu->config.dvsec_afu_control_pos;
+
+	mutex_lock(&ctx->afu->afu_control_lock);
+	rc = ocxl_config_terminate_pasid(pdev, ctrl_pos, ctx->pasid);
+	mutex_unlock(&ctx->afu->afu_control_lock);
+	trace_ocxl_terminate_pasid(ctx->pasid, rc);
+
+	/*
+	 * If we timeout waiting for the AFU to terminate the
+	 * pasid, then it's dangerous to clean up the Process
+	 * Element entry in the SPA, as it may be referenced
+	 * in the future by the AFU. In which case, we would
+	 * checkstop because of an invalid PE access (FIR
+	 * register 2, bit 42). So leave the PE
+	 * defined. Caller shouldn't free the context so that
+	 * PASID remains allocated.
+	 *
+	 * A link reset will be required to cleanup the AFU
+	 * and the SPA.
+	 */
+	if (rc == -EBUSY)
+		return rc;
+
+	rc = ocxl_link_remove_pe(ctx->afu->fn->link, ctx->pasid);
+	if (rc)
+		dev_warn(&ctx->afu->dev,
+			"Couldn't remove PE entry cleanly: %d\n", rc);
+	return 0;
+}
+
+/*
+ * Force-detach every context of an AFU and drop their mmio mappings.
+ * The contexts themselves are not freed here.
+ */
+void ocxl_context_detach_all(struct ocxl_afu *afu)
+{
+	struct ocxl_context *ctx;
+	int id;
+
+	mutex_lock(&afu->contexts_lock);
+	idr_for_each_entry(&afu->contexts_idr, ctx, id) {
+		ocxl_context_detach(ctx);
+
+		/*
+		 * We are force detaching - remove any active mmio
+		 * mappings so userspace cannot interfere with the
+		 * card if it comes back.  Easiest way to exercise
+		 * this is to unbind and rebind the driver via sysfs
+		 * while it is in use.
+		 */
+		mutex_lock(&ctx->mapping_lock);
+		if (ctx->mapping != NULL)
+			unmap_mapping_range(ctx->mapping, 0, 0, 1);
+		mutex_unlock(&ctx->mapping_lock);
+	}
+	mutex_unlock(&afu->contexts_lock);
+}
+
+/*
+ * Release a context: give back its PASID, free its interrupts, drop
+ * the reference on the AFU and free the memory.
+ */
+void ocxl_context_free(struct ocxl_context *ctx)
+{
+	struct ocxl_afu *afu = ctx->afu;
+
+	mutex_lock(&afu->contexts_lock);
+	afu->pasid_count--;
+	idr_remove(&afu->contexts_idr, ctx->pasid);
+	mutex_unlock(&afu->contexts_lock);
+
+	ocxl_afu_irq_free_all(ctx);
+	idr_destroy(&ctx->irq_idr);
+
+	/* pairs with the ocxl_afu_get() done in ocxl_context_init() */
+	ocxl_afu_put(ctx->afu);
+	kfree(ctx);
+}
diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
new file mode 100644
index 000000000000..c90c1a578d2f
--- /dev/null
+++ b/drivers/misc/ocxl/file.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <uapi/misc/ocxl.h>
+#include "ocxl_internal.h"
+
+
+#define OCXL_NUM_MINORS 256 /* Total to reserve */
+
+/* char device region reserved in ocxl_file_init() */
+static dev_t ocxl_dev;
+/* device class of the AFU devices, created in ocxl_file_init() */
+static struct class *ocxl_class;
+/* serializes allocation and removal of minors in minors_idr */
+static struct mutex minors_idr_lock;
+/* maps a minor number to its struct ocxl_afu */
+static struct idr minors_idr;
+
+/*
+ * Look up the AFU registered for a device number and take a reference
+ * on it. Returns NULL if no AFU uses that minor.
+ */
+static struct ocxl_afu *find_and_get_afu(dev_t devno)
+{
+	struct ocxl_afu *afu;
+
+	/*
+	 * We don't declare an RCU critical section here, as our AFU
+	 * is protected by a reference counter on the device. By the time the
+	 * minor number of a device is removed from the idr, the ref count of
+	 * the device is already at 0, so no user API will access that AFU and
+	 * this function can't return it.
+	 */
+	afu = idr_find(&minors_idr, MINOR(devno));
+	if (!afu)
+		return NULL;
+
+	ocxl_afu_get(afu);
+	return afu;
+}
+
+/*
+ * Reserve a minor number in [0, OCXL_NUM_MINORS) for an AFU.
+ * Returns the minor, or the negative error from idr_alloc().
+ */
+static int allocate_afu_minor(struct ocxl_afu *afu)
+{
+	int id;
+
+	mutex_lock(&minors_idr_lock);
+	id = idr_alloc(&minors_idr, afu, 0, OCXL_NUM_MINORS, GFP_KERNEL);
+	mutex_unlock(&minors_idr_lock);
+
+	return id;
+}
+
+/* Give back the minor number used by an AFU */
+static void free_afu_minor(struct ocxl_afu *afu)
+{
+	int id = MINOR(afu->dev.devt);
+
+	mutex_lock(&minors_idr_lock);
+	idr_remove(&minors_idr, id);
+	mutex_unlock(&minors_idr_lock);
+}
+
+/*
+ * open() handler: allocate and initialize a context for the AFU
+ * matching the device number, and attach it to the file.
+ *
+ * Returns 0 on success, -ENODEV if no AFU matches the device number,
+ * -ENOMEM if the context can't be allocated, or the error returned by
+ * ocxl_context_init().
+ */
+static int afu_open(struct inode *inode, struct file *file)
+{
+	struct ocxl_afu *afu;
+	struct ocxl_context *ctx;
+	int rc;
+
+	pr_debug("%s for device %x\n", __func__, inode->i_rdev);
+
+	afu = find_and_get_afu(inode->i_rdev);
+	if (!afu)
+		return -ENODEV;
+
+	ctx = ocxl_context_alloc();
+	if (!ctx) {
+		rc = -ENOMEM;
+		goto put_afu;
+	}
+
+	rc = ocxl_context_init(ctx, afu, inode->i_mapping);
+	if (rc)
+		goto free_ctx;
+	file->private_data = ctx;
+	/* the context now holds its own reference on the AFU */
+	ocxl_afu_put(afu);
+	return 0;
+
+free_ctx:
+	/*
+	 * A failed init didn't reserve a PASID nor take a reference on
+	 * the AFU, so only the memory needs to be released.
+	 */
+	kfree(ctx);
+put_afu:
+	ocxl_afu_put(afu);
+	return rc;
+}
+
+/*
+ * OCXL_IOCTL_ATTACH: attach the context to the calling process.
+ *
+ * Returns -EFAULT if the argument can't be read, -EINVAL if a
+ * reserved field is set, otherwise the result of
+ * ocxl_context_attach().
+ */
+static long afu_ioctl_attach(struct ocxl_context *ctx,
+			struct ocxl_ioctl_attach __user *uarg)
+{
+	struct ocxl_ioctl_attach arg;
+	u64 amr;
+
+	pr_debug("%s for context %d\n", __func__, ctx->pasid);
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	/* Make sure reserved fields are not set for forward compatibility */
+	if (arg.reserved1 || arg.reserved2 || arg.reserved3)
+		return -EINVAL;
+
+	/* only keep the AMR bits allowed by the UAMOR register */
+	amr = arg.amr & mfspr(SPRN_UAMOR);
+	return ocxl_context_attach(ctx, amr);
+}
+
+/*
+ * Name of an ioctl command, for debug messages. The argument is
+ * evaluated several times: don't pass an expression with side effects.
+ */
+#define CMD_STR(x) ((x) == OCXL_IOCTL_ATTACH ? "ATTACH" :		\
+			(x) == OCXL_IOCTL_IRQ_ALLOC ? "IRQ_ALLOC" :	\
+			(x) == OCXL_IOCTL_IRQ_FREE ? "IRQ_FREE" :	\
+			(x) == OCXL_IOCTL_IRQ_SET_FD ? "IRQ_SET_FD" :	\
+			"UNKNOWN")
+
+/*
+ * ioctl handler of an AFU context.
+ *
+ * Returns -EIO if the context is closed, -EINVAL for an unknown
+ * command or a non-zero reserved field, -EFAULT if the user buffer
+ * can't be accessed, otherwise the result of the requested operation.
+ */
+static long afu_ioctl(struct file *file, unsigned int cmd,
+		unsigned long args)
+{
+	struct ocxl_context *ctx = file->private_data;
+	struct ocxl_ioctl_irq_fd irq_fd;
+	u64 irq_offset;
+	long rc;
+
+	pr_debug("%s for context %d, command %s\n", __func__, ctx->pasid,
+		CMD_STR(cmd));
+
+	if (ctx->status == CLOSED)
+		return -EIO;
+
+	switch (cmd) {
+	case OCXL_IOCTL_ATTACH:
+		rc = afu_ioctl_attach(ctx,
+				(struct ocxl_ioctl_attach __user *) args);
+		break;
+
+	case OCXL_IOCTL_IRQ_ALLOC:
+		rc = ocxl_afu_irq_alloc(ctx, &irq_offset);
+		if (!rc) {
+			/*
+			 * copy_to_user() returns the number of bytes
+			 * left to copy, which is not an errno: release
+			 * the interrupt and report -EFAULT instead.
+			 */
+			if (copy_to_user((u64 __user *) args, &irq_offset,
+					sizeof(irq_offset))) {
+				ocxl_afu_irq_free(ctx, irq_offset);
+				return -EFAULT;
+			}
+		}
+		break;
+
+	case OCXL_IOCTL_IRQ_FREE:
+		if (copy_from_user(&irq_offset, (u64 __user *) args,
+				sizeof(irq_offset)))
+			return -EFAULT;
+		rc = ocxl_afu_irq_free(ctx, irq_offset);
+		break;
+
+	case OCXL_IOCTL_IRQ_SET_FD:
+		if (copy_from_user(&irq_fd,
+				(struct ocxl_ioctl_irq_fd __user *) args,
+				sizeof(irq_fd)))
+			return -EFAULT;
+		/* reserved field must be 0, for forward compatibility */
+		if (irq_fd.reserved)
+			return -EINVAL;
+		rc = ocxl_afu_irq_set_fd(ctx, irq_fd.irq_offset,
+					irq_fd.eventfd);
+		break;
+
+	default:
+		rc = -EINVAL;
+	}
+	return rc;
+}
+
+/* compat ioctl entry point: forwards to afu_ioctl() unchanged */
+static long afu_compat_ioctl(struct file *file, unsigned int cmd,
+			unsigned long args)
+{
+	return afu_ioctl(file, cmd, args);
+}
+
+/* mmap handler: delegates to ocxl_context_mmap() for the file's context */
+static int afu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct ocxl_context *ctx = file->private_data;
+
+	pr_debug("%s for context %d\n", __func__, ctx->pasid);
+	return ocxl_context_mmap(ctx, vma);
+}
+
+/*
+ * Tell whether a translation fault error is recorded for the context.
+ * A non-zero recorded address is what marks an error as pending.
+ */
+static bool has_xsl_error(struct ocxl_context *ctx)
+{
+	bool pending;
+
+	mutex_lock(&ctx->xsl_error_lock);
+	pending = (ctx->xsl_error.addr != 0);
+	mutex_unlock(&ctx->xsl_error_lock);
+
+	return pending;
+}
+
+/*
+ * Are there any events pending on the AFU
+ * ctx: The AFU context
+ * Returns: true if there are events pending
+ *
+ * Translation fault errors are the only kind of event for now.
+ */
+static bool afu_events_pending(struct ocxl_context *ctx)
+{
+	return has_xsl_error(ctx);
+}
+
+/*
+ * poll handler: readable when an event is pending, POLLERR when the
+ * context is closed and has nothing left to report.
+ */
+static unsigned int afu_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct ocxl_context *ctx = file->private_data;
+	bool closed;
+
+	pr_debug("%s for context %d\n", __func__, ctx->pasid);
+
+	poll_wait(file, &ctx->events_wq, wait);
+
+	mutex_lock(&ctx->status_mutex);
+	closed = (ctx->status == CLOSED);
+	mutex_unlock(&ctx->status_mutex);
+
+	/* pending events take precedence over the closed state */
+	if (afu_events_pending(ctx))
+		return POLLIN | POLLRDNORM;
+
+	return closed ? POLLERR : 0;
+}
+
+/*
+ * Populate the supplied buffer with a single XSL error
+ * ctx:	The AFU context to report the error from
+ * header: the event header to populate
+ * buf: The user buffer to write the body into (should be at least
+ *      sizeof(struct ocxl_kernel_event_xsl_fault_error))
+ * Return: the amount of buffer that was populated, 0 if no error is
+ *         recorded, -EFAULT if the buffer can't be written
+ *
+ * The recorded error is consumed, even if the copy to user space
+ * fails.
+ */
+static ssize_t append_xsl_error(struct ocxl_context *ctx,
+				struct ocxl_kernel_event_header *header,
+				char __user *buf)
+{
+	struct ocxl_kernel_event_xsl_fault_error body;
+	bool found = false;
+
+	/* memset rather than an initializer: the struct goes to user space */
+	memset(&body, 0, sizeof(body));
+
+	mutex_lock(&ctx->xsl_error_lock);
+	if (ctx->xsl_error.addr) {
+		found = true;
+
+		body.addr = ctx->xsl_error.addr;
+		body.dsisr = ctx->xsl_error.dsisr;
+		body.count = ctx->xsl_error.count;
+
+		ctx->xsl_error.addr = 0;
+		ctx->xsl_error.dsisr = 0;
+		ctx->xsl_error.count = 0;
+	}
+	mutex_unlock(&ctx->xsl_error_lock);
+
+	if (!found)
+		return 0;
+
+	header->type = OCXL_AFU_EVENT_XSL_FAULT_ERROR;
+
+	if (copy_to_user(buf, &body, sizeof(body)))
+		return -EFAULT;
+
+	return sizeof(body);
+}
+
+/* Size of the largest event body which can follow a header */
+#define AFU_EVENT_BODY_MAX_SIZE sizeof(struct ocxl_kernel_event_xsl_fault_error)
+
+/*
+ * Reports events on the AFU
+ * Format:
+ *	Header (struct ocxl_kernel_event_header)
+ *	Body (struct ocxl_kernel_event_*)
+ *	Header...
+ *
+ * Blocks until an event is pending or the context is closed, unless
+ * the file is in non-blocking mode.
+ *
+ * Returns the number of bytes written to buf, or:
+ *	-EINVAL if the offset is not 0 or the buffer is too small for
+ *		a header followed by the largest body
+ *	-EAGAIN if nothing is pending in non-blocking mode
+ *	-ERESTARTSYS if interrupted by a signal
+ *	-EFAULT if buf can't be written
+ */
+static ssize_t afu_read(struct file *file, char __user *buf, size_t count,
+			loff_t *off)
+{
+	struct ocxl_context *ctx = file->private_data;
+	struct ocxl_kernel_event_header header;
+	/* signed, so that errors from append_xsl_error() are detected */
+	ssize_t used = 0;
+	DEFINE_WAIT(event_wait);
+
+	memset(&header, 0, sizeof(header));
+
+	/* Require offset to be 0 */
+	if (*off != 0)
+		return -EINVAL;
+
+	if (count < (sizeof(struct ocxl_kernel_event_header) +
+			AFU_EVENT_BODY_MAX_SIZE))
+		return -EINVAL;
+
+	for (;;) {
+		prepare_to_wait(&ctx->events_wq, &event_wait,
+				TASK_INTERRUPTIBLE);
+
+		if (afu_events_pending(ctx))
+			break;
+
+		if (ctx->status == CLOSED)
+			break;
+
+		if (file->f_flags & O_NONBLOCK) {
+			finish_wait(&ctx->events_wq, &event_wait);
+			return -EAGAIN;
+		}
+
+		if (signal_pending(current)) {
+			finish_wait(&ctx->events_wq, &event_wait);
+			return -ERESTARTSYS;
+		}
+
+		schedule();
+	}
+
+	finish_wait(&ctx->events_wq, &event_wait);
+
+	if (has_xsl_error(ctx)) {
+		/* the body is written right after the header */
+		used = append_xsl_error(ctx, &header, buf + sizeof(header));
+		if (used < 0)
+			return used;
+	}
+
+	if (!afu_events_pending(ctx))
+		header.flags |= OCXL_KERNEL_EVENT_FLAG_LAST;
+
+	if (copy_to_user(buf, &header, sizeof(header)))
+		return -EFAULT;
+
+	used += sizeof(header);
+
+	return used;
+}
+
+/*
+ * release handler: detach the context, forget its mapping, wake up
+ * any waiter and free the context, unless the AFU failed to terminate
+ * the PASID (-EBUSY), in which case the context is kept allocated.
+ */
+static int afu_release(struct inode *inode, struct file *file)
+{
+	struct ocxl_context *ctx = file->private_data;
+	bool pasid_busy;
+
+	pr_debug("%s for device %x\n", __func__, inode->i_rdev);
+	pasid_busy = (ocxl_context_detach(ctx) == -EBUSY);
+
+	mutex_lock(&ctx->mapping_lock);
+	ctx->mapping = NULL;
+	mutex_unlock(&ctx->mapping_lock);
+
+	wake_up_all(&ctx->events_wq);
+
+	if (!pasid_busy)
+		ocxl_context_free(ctx);
+	return 0;
+}
+
+/* File operations of the AFU character device, see ocxl_create_cdev() */
+static const struct file_operations ocxl_afu_fops = {
+	.owner		= THIS_MODULE,
+	.open           = afu_open,
+	.unlocked_ioctl = afu_ioctl,
+	.compat_ioctl   = afu_compat_ioctl,
+	.mmap           = afu_mmap,
+	.poll           = afu_poll,
+	.read           = afu_read,
+	.release        = afu_release,
+};
+
+/*
+ * Create the character device of an AFU, using the device number
+ * already stored in afu->dev.devt.
+ *
+ * Returns 0 on success, or the error returned by cdev_add().
+ */
+int ocxl_create_cdev(struct ocxl_afu *afu)
+{
+	int err;
+
+	cdev_init(&afu->cdev, &ocxl_afu_fops);
+	err = cdev_add(&afu->cdev, afu->dev.devt, 1);
+	if (err)
+		dev_err(&afu->dev, "Unable to add afu char device: %d\n", err);
+
+	return err;
+}
+
+/* Remove the character device added by ocxl_create_cdev() */
+void ocxl_destroy_cdev(struct ocxl_afu *afu)
+{
+	cdev_del(&afu->cdev);
+}
+
+/*
+ * Allocate a minor number for the AFU and register its device in the
+ * ocxl class.
+ *
+ * Returns 0 on success, or the error from the minor allocation or
+ * from device_register(). The minor is not released here if
+ * device_register() fails.
+ */
+int ocxl_register_afu(struct ocxl_afu *afu)
+{
+	int id = allocate_afu_minor(afu);
+
+	if (id < 0)
+		return id;
+
+	afu->dev.class = ocxl_class;
+	afu->dev.devt = MKDEV(MAJOR(ocxl_dev), id);
+	return device_register(&afu->dev);
+}
+
+/*
+ * Release the minor number of the AFU. Only the minor is handled
+ * here; the device itself is not unregistered by this function.
+ */
+void ocxl_unregister_afu(struct ocxl_afu *afu)
+{
+	free_afu_minor(afu);
+}
+
+/*
+ * devnode callback of the ocxl class: name the device node
+ * "ocxl/<device name>". mode is left untouched. Returns an allocated
+ * string, or NULL if the allocation fails.
+ */
+static char *ocxl_devnode(struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "ocxl/%s", dev_name(dev));
+}
+
+/*
+ * Set up the char device infrastructure: the minors idr, a region of
+ * OCXL_NUM_MINORS device numbers and the "ocxl" device class.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ocxl_file_init(void)
+{
+	int err;
+
+	mutex_init(&minors_idr_lock);
+	idr_init(&minors_idr);
+
+	err = alloc_chrdev_region(&ocxl_dev, 0, OCXL_NUM_MINORS, "ocxl");
+	if (err) {
+		pr_err("Unable to allocate ocxl major number: %d\n", err);
+		return err;
+	}
+
+	ocxl_class = class_create(THIS_MODULE, "ocxl");
+	if (!IS_ERR(ocxl_class)) {
+		ocxl_class->devnode = ocxl_devnode;
+		return 0;
+	}
+
+	pr_err("Unable to create ocxl class\n");
+	unregister_chrdev_region(ocxl_dev, OCXL_NUM_MINORS);
+	return PTR_ERR(ocxl_class);
+}
+
+/* Undo ocxl_file_init(): destroy the class, the region and the idr */
+void ocxl_file_exit(void)
+{
+	class_destroy(ocxl_class);
+	unregister_chrdev_region(ocxl_dev, OCXL_NUM_MINORS);
+	idr_destroy(&minors_idr);
+}
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
new file mode 100644
index 000000000000..f30790582dc0
--- /dev/null
+++ b/drivers/misc/ocxl/link.c
@@ -0,0 +1,647 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <linux/sched/mm.h>
+#include <linux/mutex.h>
+#include <linux/mmu_context.h>
+#include <asm/copro.h>
+#include <asm/pnv-ocxl.h>
+#include <misc/ocxl.h>
+#include "ocxl_internal.h"
+#include "trace.h"
+
+
+/* A PE handle is the PASID truncated to SPA_PASID_BITS bits */
+#define SPA_PASID_BITS		15
+#define SPA_PASID_MAX		((1 << SPA_PASID_BITS) - 1)
+#define SPA_PE_MASK		SPA_PASID_MAX
+#define SPA_SPA_SIZE_LOG	22 /* Each SPA is 4 MB */
+
+/*
+ * Bits of the config_state field of a Process Element, see
+ * calculate_cfg_state(). Written as (63 - n), n being the IBM
+ * (big-endian) bit number.
+ */
+#define SPA_CFG_SF		(1ull << (63-0))
+#define SPA_CFG_TA		(1ull << (63-1))
+#define SPA_CFG_HV		(1ull << (63-3))
+#define SPA_CFG_UV		(1ull << (63-4))
+#define SPA_CFG_XLAT_hpt	(0ull << (63-6)) /* Hashed page table (HPT) mode */
+#define SPA_CFG_XLAT_roh	(2ull << (63-6)) /* Radix on HPT mode */
+#define SPA_CFG_XLAT_ror	(3ull << (63-6)) /* Radix on Radix mode */
+#define SPA_CFG_PR		(1ull << (63-49))
+#define SPA_CFG_TC		(1ull << (63-54))
+#define SPA_CFG_DR		(1ull << (63-59))
+
+/* Bits of the dsisr value read on a translation fault interrupt */
+#define SPA_XSL_TF		(1ull << (63-3))  /* Translation fault */
+#define SPA_XSL_S		(1ull << (63-38)) /* Store operation */
+
+/* Set in the software_state field of a Process Element in use */
+#define SPA_PE_VALID		0x80000000
+
+
+/*
+ * Per Process Element bookkeeping, stored in the pe_tree of the SPA
+ * and indexed by PE handle. Freed with kfree_rcu(), as it is looked
+ * up under rcu_read_lock() by the fault interrupt handler.
+ */
+struct pe_data {
+	/* memory context of the process the PE was added for */
+	struct mm_struct *mm;
+	/* callback to trigger when a translation fault occurs */
+	void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);
+	/* opaque pointer to be passed to the above callback */
+	void *xsl_err_data;
+	struct rcu_head rcu;
+};
+
+/*
+ * Shared Process Area of a link: the table of Process Elements, plus
+ * what is needed to handle the translation fault interrupt.
+ */
+struct spa {
+	/* table of Process Elements, indexed by PE handle */
+	struct ocxl_process_element *spa_mem;
+	/* page allocation order of spa_mem */
+	int spa_order;
+	/* serializes addition and removal of Process Elements */
+	struct mutex spa_lock;
+	struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */
+	/* name of the translation fault interrupt, allocated */
+	char *irq_name;
+	/* virtual irq number of the translation fault interrupt */
+	int virq;
+	/* registers describing a fault, mapped by map_irq_registers() */
+	void __iomem *reg_dsisr;
+	void __iomem *reg_dar;
+	void __iomem *reg_tfc;
+	void __iomem *reg_pe_handle;
+	/*
+	 * The following fields are used by the memory fault
+	 * interrupt handler. We can only have one interrupt at a
+	 * time. The NPU won't raise another interrupt until the
+	 * previous one has been ack'd by writing to the TFC register
+	 */
+	struct xsl_fault {
+		struct work_struct fault_work;
+		u64 pe;
+		u64 dsisr;
+		u64 dar;
+		/* copy of the pe_data found when the interrupt fired */
+		struct pe_data pe_data;
+	} xsl_fault;
+};
+
+/*
+ * An opencapi link can be used by several PCI functions. We have
+ * one link per device slot.
+ *
+ * A linked list of opencapi links should suffice, as there's a
+ * limited number of opencapi slots on a system and lookup is only
+ * done when the device is probed
+ */
+struct link {
+	/* entry in links_list */
+	struct list_head list;
+	/* one reference per user of the link, see ocxl_link_setup() */
+	struct kref ref;
+	/* PCI domain, bus and slot identifying the link */
+	int domain;
+	int bus;
+	int dev;
+	/* number of interrupts which can still be allocated */
+	atomic_t irq_available;
+	struct spa *spa;
+	/* opaque data owned by the platform code (pnv_ocxl_*) */
+	void *platform_data;
+};
+/* all the links in use, protected by links_list_lock */
+static struct list_head links_list = LIST_HEAD_INIT(links_list);
+static DEFINE_MUTEX(links_list_lock);
+
+/*
+ * Possible answers to a translation fault, see ack_irq(). CONTINUE is
+ * not supported by ack_irq().
+ */
+enum xsl_response {
+	CONTINUE,
+	ADDRESS_ERROR,
+	RESTART,
+};
+
+
+/*
+ * Read the registers describing a translation fault: dsisr, faulting
+ * address (dar) and PE handle (masked with SPA_PE_MASK).
+ */
+static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe)
+{
+	u64 handle;
+
+	*dsisr = in_be64(spa->reg_dsisr);
+	*dar = in_be64(spa->reg_dar);
+
+	handle = in_be64(spa->reg_pe_handle);
+	*pe = handle & SPA_PE_MASK;
+}
+
+/*
+ * Acknowledge a translation fault by writing the response to the TFC
+ * register. Nothing is written for an unsupported response, which
+ * triggers a warning instead.
+ */
+static void ack_irq(struct spa *spa, enum xsl_response r)
+{
+	u64 reg = 0;
+
+	switch (r) {
+	case RESTART:
+		reg = PPC_BIT(31);
+		break;
+	case ADDRESS_ERROR:
+		reg = PPC_BIT(30);
+		break;
+	default:
+		/* continue is not supported */
+		WARN(1, "Invalid irq response %d\n", r);
+		break;
+	}
+
+	if (!reg)
+		return;
+
+	trace_ocxl_fault_ack(spa->spa_mem, spa->xsl_fault.pe,
+			spa->xsl_fault.dsisr, spa->xsl_fault.dar, reg);
+	out_be64(spa->reg_tfc, reg);
+}
+
+/*
+ * Bottom half of the translation fault interrupt: try to resolve the
+ * fault saved in spa->xsl_fault and acknowledge the interrupt.
+ *
+ * Runs with a reference on mm_users of the faulting context, taken by
+ * the interrupt handler, and releases it before returning.
+ */
+static void xsl_fault_handler_bh(struct work_struct *fault_work)
+{
+	unsigned int flt = 0;
+	unsigned long access, flags, inv_flags = 0;
+	enum xsl_response r;
+	struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
+					fault_work);
+	struct spa *spa = container_of(fault, struct spa, xsl_fault);
+
+	int rc;
+
+	/*
+	 * We must release a reference on mm_users whenever exiting this
+	 * function (taken in the memory fault interrupt handler)
+	 */
+	rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
+				&flt);
+	if (rc) {
+		pr_debug("copro_handle_mm_fault failed: %d\n", rc);
+		/* let the owner of the context know about the error */
+		if (fault->pe_data.xsl_err_cb) {
+			fault->pe_data.xsl_err_cb(
+				fault->pe_data.xsl_err_data,
+				fault->dar, fault->dsisr);
+		}
+		r = ADDRESS_ERROR;
+		goto ack;
+	}
+
+	if (!radix_enabled()) {
+		/*
+		 * update_mmu_cache() will not have loaded the hash
+		 * since current->trap is not a 0x400 or 0x300, so
+		 * just call hash_page_mm() here.
+		 */
+		access = _PAGE_PRESENT | _PAGE_READ;
+		if (fault->dsisr & SPA_XSL_S)
+			access |= _PAGE_WRITE;
+
+		if (REGION_ID(fault->dar) != USER_REGION_ID)
+			access |= _PAGE_PRIVILEGED;
+
+		local_irq_save(flags);
+		hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300,
+			inv_flags);
+		local_irq_restore(flags);
+	}
+	r = RESTART;
+ack:
+	mmput(fault->pe_data.mm);
+	ack_irq(spa, r);
+}
+
+/*
+ * Translation fault interrupt handler.
+ *
+ * Reads the fault registers and finds the memory context of the
+ * faulting Process Element. If the address space of that context is
+ * still alive, the fault is handed over to the bottom half; otherwise
+ * the interrupt is acknowledged right away with an address error.
+ */
+static irqreturn_t xsl_fault_handler(int irq, void *data)
+{
+	struct link *link = (struct link *) data;
+	struct spa *spa = link->spa;
+	u64 dsisr, dar, pe_handle;
+	struct pe_data *pe_data;
+	struct ocxl_process_element *pe;
+	int pid;
+	bool run_bh = false;
+
+	read_irq(spa, &dsisr, &dar, &pe_handle);
+	trace_ocxl_fault(spa->spa_mem, pe_handle, dsisr, dar, -1);
+
+	WARN_ON(pe_handle > SPA_PE_MASK);
+	pe = spa->spa_mem + pe_handle;
+	pid = be32_to_cpu(pe->pid);
+	/* We could be reading all null values here if the PE is being
+	 * removed while an interrupt kicks in. It's not supposed to
+	 * happen if the driver notified the AFU to terminate the
+	 * PASID, and the AFU waited for pending operations before
+	 * acknowledging. But even if it happens, we won't find a
+	 * memory context below and fail silently, so it should be ok.
+	 */
+	if (!(dsisr & SPA_XSL_TF)) {
+		WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
+		ack_irq(spa, ADDRESS_ERROR);
+		return IRQ_HANDLED;
+	}
+
+	rcu_read_lock();
+	pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle);
+	if (!pe_data) {
+		/*
+		 * Could only happen if the driver didn't notify the
+		 * AFU about PASID termination before removing the PE,
+		 * or the AFU didn't wait for all memory access to
+		 * have completed.
+		 *
+		 * Either way, we fail early, but we shouldn't log an
+		 * error message, as it is a valid (if unexpected)
+		 * scenario
+		 */
+		rcu_read_unlock();
+		pr_debug("Unknown mm context for xsl interrupt\n");
+		ack_irq(spa, ADDRESS_ERROR);
+		return IRQ_HANDLED;
+	}
+	WARN_ON(pe_data->mm->context.id != pid);
+
+	/*
+	 * The reference held on mm_count only keeps the mm_struct
+	 * itself around. Resolving the fault walks the address space,
+	 * which requires mm_users to be non-zero: if the process is
+	 * already gone, don't even try and fail the access.
+	 */
+	if (mmget_not_zero(pe_data->mm)) {
+		spa->xsl_fault.pe = pe_handle;
+		spa->xsl_fault.dar = dar;
+		spa->xsl_fault.dsisr = dsisr;
+		spa->xsl_fault.pe_data = *pe_data;
+		/* mm_users count is released by bottom half */
+		run_bh = true;
+	}
+	rcu_read_unlock();
+
+	if (run_bh)
+		schedule_work(&spa->xsl_fault.fault_work);
+	else
+		ack_irq(spa, ADDRESS_ERROR);
+	return IRQ_HANDLED;
+}
+
+/* Unmap the fault registers mapped by map_irq_registers() */
+static void unmap_irq_registers(struct spa *spa)
+{
+	pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr, spa->reg_dar, spa->reg_tfc,
+				spa->reg_pe_handle);
+}
+
+/*
+ * Have the platform code map the dsisr, dar, tfc and pe_handle fault
+ * registers. Returns what pnv_ocxl_map_xsl_regs() returns.
+ */
+static int map_irq_registers(struct pci_dev *dev, struct spa *spa)
+{
+	return pnv_ocxl_map_xsl_regs(dev, &spa->reg_dsisr, &spa->reg_dar,
+				&spa->reg_tfc, &spa->reg_pe_handle);
+}
+
+/*
+ * Set up the translation fault interrupt of a link: map the fault
+ * registers, allocate the interrupt name, map the hardware interrupt
+ * and install xsl_fault_handler().
+ *
+ * Every failure path releases what was set up before it.
+ *
+ * Returns 0 on success, the error of the platform helpers, -ENOMEM
+ * if the name can't be allocated, -EINVAL if the interrupt can't be
+ * mapped or requested.
+ */
+static int setup_xsl_irq(struct pci_dev *dev, struct link *link)
+{
+	struct spa *spa = link->spa;
+	int rc;
+	int hwirq;
+
+	rc = pnv_ocxl_get_xsl_irq(dev, &hwirq);
+	if (rc)
+		return rc;
+
+	rc = map_irq_registers(dev, spa);
+	if (rc)
+		return rc;
+
+	spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
+				link->domain, link->bus, link->dev);
+	if (!spa->irq_name) {
+		unmap_irq_registers(spa);
+		dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
+		return -ENOMEM;
+	}
+	/*
+	 * At some point, we'll need to look into allowing a higher
+	 * number of interrupts. Could we have an IRQ domain per link?
+	 */
+	spa->virq = irq_create_mapping(NULL, hwirq);
+	if (!spa->virq) {
+		kfree(spa->irq_name);
+		unmap_irq_registers(spa);
+		dev_err(&dev->dev,
+			"irq_create_mapping failed for translation interrupt\n");
+		return -EINVAL;
+	}
+
+	dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);
+
+	/* the link is the cookie handed to the interrupt handler */
+	rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
+			link);
+	if (rc) {
+		irq_dispose_mapping(spa->virq);
+		kfree(spa->irq_name);
+		unmap_irq_registers(spa);
+		dev_err(&dev->dev,
+			"request_irq failed for translation interrupt: %d\n",
+			rc);
+		/* NOTE(review): rc is logged but -EINVAL is returned */
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Undo setup_xsl_irq(): free the interrupt and its mapping, if any,
+ * then the interrupt name and the register mappings.
+ */
+static void release_xsl_irq(struct link *link)
+{
+	struct spa *spa = link->spa;
+	int virq = spa->virq;
+
+	if (virq) {
+		free_irq(virq, link);
+		irq_dispose_mapping(virq);
+	}
+
+	kfree(spa->irq_name);
+	unmap_irq_registers(spa);
+}
+
+/*
+ * Allocate the Shared Process Area of a link: the bookkeeping
+ * structure and the zeroed table of Process Elements.
+ *
+ * On success, the SPA is stored in link->spa. Returns 0 or -ENOMEM.
+ */
+static int alloc_spa(struct pci_dev *dev, struct link *link)
+{
+	struct spa *spa = kzalloc(sizeof(*spa), GFP_KERNEL);
+	unsigned long mem;
+
+	if (!spa)
+		return -ENOMEM;
+
+	mutex_init(&spa->spa_lock);
+	INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL);
+	INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh);
+
+	spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT;
+	mem = __get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order);
+	spa->spa_mem = (struct ocxl_process_element *) mem;
+	if (!spa->spa_mem) {
+		dev_err(&dev->dev, "Can't allocate Shared Process Area\n");
+		kfree(spa);
+		return -ENOMEM;
+	}
+	pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus,
+		link->dev, spa->spa_mem);
+
+	link->spa = spa;
+	return 0;
+}
+
+/*
+ * Free the Shared Process Area of a link, if there's one with its
+ * table allocated, and clear link->spa.
+ */
+static void free_spa(struct link *link)
+{
+	struct spa *spa = link->spa;
+
+	pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus,
+		link->dev);
+
+	if (!spa || !spa->spa_mem)
+		return;
+
+	free_pages((unsigned long) spa->spa_mem, spa->spa_order);
+	kfree(spa);
+	link->spa = NULL;
+}
+
+/*
+ * Allocate and set up a link for the slot of a PCI device: the SPA,
+ * the translation fault interrupt and the platform specific part.
+ *
+ * The link is returned in out_link with a reference count of 1.
+ * Returns 0 on success or a negative error code, in which case
+ * everything allocated here has been released.
+ */
+static int alloc_link(struct pci_dev *dev, int PE_mask, struct link **out_link)
+{
+	struct link *new_link;
+	int rc;
+
+	new_link = kzalloc(sizeof(*new_link), GFP_KERNEL);
+	if (!new_link)
+		return -ENOMEM;
+
+	kref_init(&new_link->ref);
+	new_link->domain = pci_domain_nr(dev->bus);
+	new_link->bus = dev->bus->number;
+	new_link->dev = PCI_SLOT(dev->devfn);
+	atomic_set(&new_link->irq_available, MAX_IRQ_PER_LINK);
+
+	rc = alloc_spa(dev, new_link);
+	if (rc)
+		goto free_mem;
+
+	rc = setup_xsl_irq(dev, new_link);
+	if (rc)
+		goto release_spa;
+
+	/* platform specific hook */
+	rc = pnv_ocxl_spa_setup(dev, new_link->spa->spa_mem, PE_mask,
+				&new_link->platform_data);
+	if (rc)
+		goto release_irq;
+
+	*out_link = new_link;
+	return 0;
+
+release_irq:
+	release_xsl_irq(new_link);
+release_spa:
+	free_spa(new_link);
+free_mem:
+	kfree(new_link);
+	return rc;
+}
+
+/*
+ * Release the interrupt, the SPA and the memory of a link. The
+ * platform data is not released here, see release_xsl().
+ */
+static void free_link(struct link *link)
+{
+	release_xsl_irq(link);
+	free_spa(link);
+	kfree(link);
+}
+
+/*
+ * Get the link of the slot of a PCI device, creating it if needed.
+ *
+ * An existing link is reused, with an extra reference, when its
+ * domain, bus and slot match the device. The link is returned through
+ * link_handle and must be released with ocxl_link_release().
+ *
+ * Returns 0 on success or the error returned by alloc_link().
+ */
+int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle)
+{
+	struct link *link;
+	int rc = 0;
+
+	mutex_lock(&links_list_lock);
+	list_for_each_entry(link, &links_list, list) {
+		/* The functions of a device all share the same link */
+		if (link->domain != pci_domain_nr(dev->bus) ||
+			link->bus != dev->bus->number ||
+			link->dev != PCI_SLOT(dev->devfn))
+			continue;
+
+		kref_get(&link->ref);
+		*link_handle = link;
+		goto unlock;
+	}
+
+	rc = alloc_link(dev, PE_mask, &link);
+	if (!rc) {
+		list_add(&link->list, &links_list);
+		*link_handle = link;
+	}
+unlock:
+	mutex_unlock(&links_list_lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocxl_link_setup);
+
+/*
+ * kref release callback of a link: called from ocxl_link_release(),
+ * with links_list_lock held, when the last reference is dropped.
+ */
+static void release_xsl(struct kref *ref)
+{
+	struct link *link = container_of(ref, struct link, ref);
+
+	list_del(&link->list);
+	/* call platform code before releasing data */
+	pnv_ocxl_spa_release(link->platform_data);
+	free_link(link);
+}
+
+/*
+ * Drop a reference on a link obtained with ocxl_link_setup(). The
+ * link is destroyed when the last reference goes away.
+ */
+void ocxl_link_release(struct pci_dev *dev, void *link_handle)
+{
+	struct link *link = link_handle;
+
+	mutex_lock(&links_list_lock);
+	kref_put(&link->ref, release_xsl);
+	mutex_unlock(&links_list_lock);
+}
+EXPORT_SYMBOL_GPL(ocxl_link_release);
+
+/*
+ * Compute the config_state of a Process Element.
+ *
+ * kernel: true for a kernel context, false for a user context, for
+ *	   which the state is derived from the current task
+ */
+static u64 calculate_cfg_state(bool kernel)
+{
+	u64 state = SPA_CFG_DR;
+	bool is_64bit;
+
+	if (mfspr(SPRN_LPCR) & LPCR_TC)
+		state |= SPA_CFG_TC;
+
+	state |= radix_enabled() ? SPA_CFG_XLAT_ror : SPA_CFG_XLAT_hpt;
+	state |= SPA_CFG_HV;
+
+	if (kernel) {
+		is_64bit = (mfmsr() & MSR_SF) != 0;
+	} else {
+		state |= SPA_CFG_PR;
+		is_64bit = !test_tsk_thread_flag(current, TIF_32BIT);
+	}
+
+	if (is_64bit)
+		state |= SPA_CFG_SF;
+	return state;
+}
+
+/*
+ * Add a Process Element to the Shared Process Area of a link.
+ *
+ * link_handle:	the link, as returned by ocxl_link_setup()
+ * pasid:	PASID of the context, used as PE handle
+ * pidr:	PID of the context; 0 means a kernel context
+ * tidr:	thread ID stored in the PE
+ * amr:		value of the AMR stored in the PE
+ * mm:		memory context the device will access
+ * xsl_err_cb:	callback invoked when a translation fault can't be
+ *		resolved (may be NULL)
+ * xsl_err_data: opaque pointer passed to the callback
+ *
+ * Returns 0 on success, -EINVAL if the pasid is out of range, -EBUSY
+ * if the PE is already in use, -ENOMEM or the error returned by
+ * radix_tree_insert() if the bookkeeping can't be set up.
+ */
+int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
+		u64 amr, struct mm_struct *mm,
+		void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr),
+		void *xsl_err_data)
+{
+	struct link *link = (struct link *) link_handle;
+	struct spa *spa = link->spa;
+	struct ocxl_process_element *pe;
+	int pe_handle, rc = 0;
+	struct pe_data *pe_data;
+
+	BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128);
+	/* a negative pasid would alias a valid handle once masked */
+	if (pasid < 0 || pasid > SPA_PASID_MAX)
+		return -EINVAL;
+
+	mutex_lock(&spa->spa_lock);
+	pe_handle = pasid & SPA_PE_MASK;
+	pe = spa->spa_mem + pe_handle;
+
+	if (pe->software_state) {
+		rc = -EBUSY;
+		goto unlock;
+	}
+
+	pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
+	if (!pe_data) {
+		rc = -ENOMEM;
+		goto unlock;
+	}
+
+	pe_data->mm = mm;
+	pe_data->xsl_err_cb = xsl_err_cb;
+	pe_data->xsl_err_data = xsl_err_data;
+
+	memset(pe, 0, sizeof(struct ocxl_process_element));
+	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
+	pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
+	pe->pid = cpu_to_be32(pidr);
+	pe->tid = cpu_to_be32(tidr);
+	pe->amr = cpu_to_be64(amr);
+	pe->software_state = cpu_to_be32(SPA_PE_VALID);
+
+	mm_context_add_copro(mm);
+	/*
+	 * Barrier is to make sure PE is visible in the SPA before it
+	 * is used by the device. It also helps with the global TLBI
+	 * invalidation
+	 */
+	mb();
+	rc = radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);
+	if (rc) {
+		/*
+		 * Without an entry in the tree, faults on this PE
+		 * couldn't be handled and the PE couldn't be removed
+		 * cleanly. The caller only hands the PASID to the
+		 * device after we return, so the entry can still be
+		 * withdrawn.
+		 */
+		memset(pe, 0, sizeof(struct ocxl_process_element));
+		mb();
+		mm_context_remove_copro(mm);
+		kfree(pe_data);
+		goto unlock;
+	}
+
+	/*
+	 * The mm must stay valid for as long as the device uses it. We
+	 * lower the count when the context is removed from the SPA.
+	 *
+	 * We grab mm_count (and not mm_users), as we don't want to
+	 * end up in a circular dependency if a process mmaps its
+	 * mmio, therefore incrementing the file ref count when
+	 * calling mmap(), and forgets to unmap before exiting. In
+	 * that scenario, when the kernel handles the death of the
+	 * process, the file is not cleaned because unmap was not
+	 * called, and the mm wouldn't be freed because we would still
+	 * have a reference on mm_users. Incrementing mm_count solves
+	 * the problem.
+	 */
+	mmgrab(mm);
+	trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
+unlock:
+	mutex_unlock(&spa->spa_lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocxl_link_add_pe);
+
+/*
+ * Remove a Process Element from the Shared Process Area of a link
+ * and drop the references taken when it was added.
+ *
+ * Returns -EINVAL if the pasid is too large or the PE is not valid,
+ * otherwise the value returned by pnv_ocxl_spa_remove_pe().
+ */
+int ocxl_link_remove_pe(void *link_handle, int pasid)
+{
+	struct link *link = link_handle;
+	struct spa *spa = link->spa;
+	struct ocxl_process_element *pe;
+	struct pe_data *pe_data;
+	int pe_handle, rc;
+
+	if (pasid > SPA_PASID_MAX)
+		return -EINVAL;
+
+	/*
+	 * About synchronization with our memory fault handler:
+	 *
+	 * Before removing the PE, the driver is supposed to have
+	 * notified the AFU, which should have cleaned up and make
+	 * sure the PASID is no longer in use, including pending
+	 * interrupts. However, there's no way to be sure...
+	 *
+	 * We clear the PE and remove the context from our radix
+	 * tree. From that point on, any new interrupt for that
+	 * context will fail silently, which is ok. As mentioned
+	 * above, that's not expected, but it could happen if the
+	 * driver or AFU didn't do the right thing.
+	 *
+	 * There could still be a bottom half running, but we don't
+	 * need to wait/flush, as it is managing a reference count on
+	 * the mm it reads from the radix tree.
+	 */
+	pe_handle = pasid & SPA_PE_MASK;
+	pe = &spa->spa_mem[pe_handle];
+
+	mutex_lock(&spa->spa_lock);
+
+	if ((be32_to_cpu(pe->software_state) & SPA_PE_VALID) == 0) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+
+	trace_ocxl_context_remove(current->pid, spa->spa_mem, pasid,
+				be32_to_cpu(pe->pid), be32_to_cpu(pe->tid));
+
+	memset(pe, 0, sizeof(*pe));
+	/*
+	 * The barrier makes sure the PE is removed from the SPA
+	 * before we clear the NPU context cache below, so that the
+	 * old PE cannot be reloaded erroneously.
+	 */
+	mb();
+
+	/*
+	 * hook to platform code
+	 * On powerpc, the entry needs to be cleared from the context
+	 * cache of the NPU.
+	 */
+	rc = pnv_ocxl_spa_remove_pe(link->platform_data, pe_handle);
+	WARN_ON(rc);
+
+	pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
+	if (pe_data) {
+		mm_context_remove_copro(pe_data->mm);
+		mmdrop(pe_data->mm);
+		/* the interrupt handler may still be reading it */
+		kfree_rcu(pe_data, rcu);
+	} else {
+		WARN(1, "Couldn't find pe data when removing PE\n");
+	}
+unlock:
+	mutex_unlock(&spa->spa_lock);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocxl_link_remove_pe);
+
+/*
+ * Allocate an AFU interrupt on a link.
+ *
+ * On success, the hardware interrupt number and the address of its
+ * trigger page are returned through hw_irq and trigger_addr, which
+ * are left untouched on failure.
+ *
+ * Returns 0 on success, -ENOSPC if the link has no interrupt left,
+ * or the error returned by pnv_ocxl_alloc_xive_irq().
+ */
+int ocxl_link_irq_alloc(void *link_handle, int *hw_irq, u64 *trigger_addr)
+{
+	struct link *link = link_handle;
+	u64 trigger;
+	int hwirq;
+	int rc;
+
+	/* take one interrupt out of the budget of the link */
+	if (atomic_dec_if_positive(&link->irq_available) < 0)
+		return -ENOSPC;
+
+	rc = pnv_ocxl_alloc_xive_irq(&hwirq, &trigger);
+	if (!rc) {
+		*hw_irq = hwirq;
+		*trigger_addr = trigger;
+		return 0;
+	}
+
+	/* allocation failed: give the interrupt back to the budget */
+	atomic_inc(&link->irq_available);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc);
+
+/*
+ * Free an interrupt allocated with ocxl_link_irq_alloc() and give it
+ * back to the budget of the link.
+ */
+void ocxl_link_free_irq(void *link_handle, int hw_irq)
+{
+	struct link *link = link_handle;
+
+	pnv_ocxl_free_xive_irq(hw_irq);
+	atomic_inc(&link->irq_available);
+}
+EXPORT_SYMBOL_GPL(ocxl_link_free_irq);
diff --git a/drivers/misc/ocxl/main.c b/drivers/misc/ocxl/main.c
new file mode 100644
index 000000000000..7210d9e059be
--- /dev/null
+++ b/drivers/misc/ocxl/main.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <linux/module.h>
+#include <linux/pci.h>
+#include "ocxl_internal.h"
+
+/*
+ * Module entry point: set up the file layer first, then register the
+ * PCI driver. If the PCI registration fails the file layer is torn
+ * down again and the error is returned.
+ */
+static int __init init_ocxl(void)
+{
+	int err;
+
+	err = ocxl_file_init();
+	if (err)
+		return err;
+
+	err = pci_register_driver(&ocxl_pci_driver);
+	if (err)
+		ocxl_file_exit();
+	return err;
+}
+
+static void exit_ocxl(void) /* module exit: undo init_ocxl() in reverse order */
+{
+	pci_unregister_driver(&ocxl_pci_driver);
+	ocxl_file_exit();
+}
+
+module_init(init_ocxl);
+module_exit(exit_ocxl);
+
+MODULE_DESCRIPTION("Open Coherent Accelerator");
+MODULE_LICENSE("GPL");
diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h
new file mode 100644
index 000000000000..5d421824afd9
--- /dev/null
+++ b/drivers/misc/ocxl/ocxl_internal.h
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#ifndef _OCXL_INTERNAL_H_
+#define _OCXL_INTERNAL_H_
+
+#include <linux/pci.h>
+#include <linux/cdev.h>
+#include <linux/list.h>
+#include <misc/ocxl.h>
+
+#define MAX_IRQ_PER_LINK	2000
+#define MAX_IRQ_PER_CONTEXT	MAX_IRQ_PER_LINK
+
+#define to_ocxl_function(d) container_of(d, struct ocxl_fn, dev)
+#define to_ocxl_afu(d) container_of(d, struct ocxl_afu, dev)
+
+extern struct pci_driver ocxl_pci_driver;
+
+
+struct ocxl_fn { /* per PCI function state, allocated zeroed by alloc_function() */
+	struct device dev; /* child of the PCI device; its release callback frees this struct */
+	int bar_used[3]; /* use counts for BARs 0, 2 and 4 (index is bar >> 1) */
+	struct ocxl_fn_config config; /* filled by ocxl_config_read_function() */
+	struct list_head afu_list; /* AFUs successfully initialized by init_afu() */
+	int pasid_base; /* set to 0 by set_function_pasid() */
+	int actag_base; /* as reported by ocxl_config_get_actag_info() */
+	int actag_enabled; /* as reported by ocxl_config_get_actag_info() */
+	int actag_supported; /* as reported by ocxl_config_get_actag_info() */
+	struct list_head pasid_list; /* id ranges handed out by ocxl_pasid_afu_alloc() */
+	struct list_head actag_list; /* id ranges handed out by ocxl_actag_afu_alloc() */
+	void *link; /* opaque handle filled in by ocxl_link_setup() */
+};
+
+struct ocxl_afu { /* per AFU state, allocated zeroed by alloc_afu() */
+	struct ocxl_fn *fn; /* owning function; alloc_afu() takes a reference, free_afu() drops it */
+	struct list_head list; /* entry in fn->afu_list */
+	struct device dev; /* child of fn->dev; its release callback frees this struct */
+	struct cdev cdev;
+	struct ocxl_afu_config config; /* filled by ocxl_config_read_afu() */
+	int pasid_base; /* fn->pasid_base plus the offset allocated for this AFU */
+	int pasid_count; /* opened contexts */
+	int pasid_max; /* maximum number of contexts */
+	int actag_base; /* fn->actag_base plus the offset allocated for this AFU */
+	int actag_enabled; /* number of actags assigned to this AFU */
+	struct mutex contexts_lock;
+	struct idr contexts_idr;
+	struct mutex afu_control_lock;
+	u64 global_mmio_start; /* pci_resource_start() of the global MMIO BAR plus its offset */
+	u64 irq_base_offset; /* pp_mmio_stride + PAGE_SIZE, see map_mmio_areas() */
+	void __iomem *global_mmio_ptr; /* ioremap() of the global MMIO area, NULL when unmapped */
+	u64 pp_mmio_start; /* pci_resource_start() of the per-process MMIO BAR plus its offset */
+	struct bin_attribute attr_global_mmio; /* sysfs "global_mmio_area" binary file */
+};
+
+enum ocxl_context_status { /* values stored in ocxl_context.status */
+	CLOSED,
+	OPENED,
+	ATTACHED,
+};
+
+// Contains metadata about a translation fault
+struct ocxl_xsl_error {
+	u64 addr; // The address that triggered the fault
+	u64 dsisr; // The value of the DSISR register
+	u64 count; // The number of times this fault has been triggered
+};
+
+struct ocxl_context { /* one context on an AFU */
+	struct ocxl_afu *afu; /* AFU this context belongs to */
+	int pasid;
+	struct mutex status_mutex; /* presumably protects status - TODO confirm */
+	enum ocxl_context_status status;
+	struct address_space *mapping; /* passed in through ocxl_context_init() */
+	struct mutex mapping_lock;
+	wait_queue_head_t events_wq;
+	struct mutex xsl_error_lock; /* presumably protects xsl_error - TODO confirm */
+	struct ocxl_xsl_error xsl_error;
+	struct mutex irq_lock; /* presumably protects irq_idr - TODO confirm */
+	struct idr irq_idr;
+};
+
+struct ocxl_process_element { /* one entry of the SPA; big-endian fields, 128 bytes in total */
+	__be64 config_state;
+	__be32 reserved1[11];
+	__be32 lpid;
+	__be32 tid;
+	__be32 pid;
+	__be32 reserved2[10];
+	__be64 amr;
+	__be32 reserved3[3];
+	__be32 software_state; /* tested against SPA_PE_VALID when removing a PE */
+};
+
+
+extern struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu);
+extern void ocxl_afu_put(struct ocxl_afu *afu);
+
+extern int ocxl_create_cdev(struct ocxl_afu *afu);
+extern void ocxl_destroy_cdev(struct ocxl_afu *afu);
+extern int ocxl_register_afu(struct ocxl_afu *afu);
+extern void ocxl_unregister_afu(struct ocxl_afu *afu);
+
+extern int ocxl_file_init(void);
+extern void ocxl_file_exit(void);
+
+extern int ocxl_pasid_afu_alloc(struct ocxl_fn *fn, u32 size);
+extern void ocxl_pasid_afu_free(struct ocxl_fn *fn, u32 start, u32 size);
+extern int ocxl_actag_afu_alloc(struct ocxl_fn *fn, u32 size);
+extern void ocxl_actag_afu_free(struct ocxl_fn *fn, u32 start, u32 size);
+
+extern struct ocxl_context *ocxl_context_alloc(void);
+extern int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu,
+			struct address_space *mapping);
+extern int ocxl_context_attach(struct ocxl_context *ctx, u64 amr);
+extern int ocxl_context_mmap(struct ocxl_context *ctx,
+			struct vm_area_struct *vma);
+extern int ocxl_context_detach(struct ocxl_context *ctx);
+extern void ocxl_context_detach_all(struct ocxl_afu *afu);
+extern void ocxl_context_free(struct ocxl_context *ctx);
+
+extern int ocxl_sysfs_add_afu(struct ocxl_afu *afu);
+extern void ocxl_sysfs_remove_afu(struct ocxl_afu *afu);
+
+extern int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset);
+extern int ocxl_afu_irq_free(struct ocxl_context *ctx, u64 irq_offset);
+extern void ocxl_afu_irq_free_all(struct ocxl_context *ctx);
+extern int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset,
+			int eventfd);
+extern u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset);
+
+#endif /* _OCXL_INTERNAL_H_ */
diff --git a/drivers/misc/ocxl/pasid.c b/drivers/misc/ocxl/pasid.c
new file mode 100644
index 000000000000..d14cb56e6920
--- /dev/null
+++ b/drivers/misc/ocxl/pasid.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include "ocxl_internal.h"
+
+
+struct id_range { /* one allocated run of consecutive ids */
+	struct list_head list; /* entry in the per-function pasid or actag list */
+	u32 start; /* first id of the range */
+	u32 end; /* last id of the range, inclusive (start + size - 1) */
+};
+
+#ifdef DEBUG
+/*
+ * Debug helper: print every allocated range of the list 'head'.
+ * 'type_str' names the kind of id being tracked and is only used in
+ * the header line.
+ */
+static void dump_list(struct list_head *head, char *type_str)
+{
+	struct id_range *cur;
+
+	pr_debug("%s ranges allocated:\n", type_str);
+	list_for_each_entry(cur, head, list) {
+		/* start and end are u32, so print them as unsigned */
+		pr_debug("Range %u->%u\n", cur->start, cur->end);
+	}
+}
+#endif
+
+/*
+ * Reserve 'size' consecutive ids in the range list 'head'.
+ *
+ * The list is walked from the front and the new range is placed in the
+ * first gap large enough to hold it, or after the last range when no
+ * such gap exists. New ranges are linked right after the range that
+ * precedes them, so the list stays in ascending order.
+ *
+ * Returns the first id of the new range, -ENOMEM if the list node
+ * cannot be allocated, or -ENOSPC if the range would end past max_id.
+ *
+ * NOTE(review): a range ending exactly at max_id is accepted, i.e.
+ * max_id is treated as an inclusive bound - confirm against callers.
+ */
+static int range_alloc(struct list_head *head, u32 size, int max_id,
+		char *type_str)
+{
+	struct list_head *insert_after = head;
+	struct id_range *entry, *range;
+	int prev_end = -1;
+	int rc;
+
+	range = kmalloc(sizeof(*range), GFP_KERNEL);
+	if (!range)
+		return -ENOMEM;
+
+	list_for_each_entry(entry, head, list) {
+		/* Stop at the first gap that can hold the request. */
+		if ((entry->start - prev_end) > size)
+			break;
+		prev_end = entry->end;
+		insert_after = &entry->list;
+	}
+
+	range->start = prev_end + 1;
+	range->end = range->start + size - 1;
+
+	if (range->end <= max_id) {
+		list_add(&range->list, insert_after);
+		rc = range->start;
+	} else {
+		kfree(range);
+		rc = -ENOSPC;
+	}
+
+#ifdef DEBUG
+	dump_list(head, type_str);
+#endif
+	return rc;
+}
+
+/*
+ * Release the range [start, start + size - 1] from the list 'head'.
+ *
+ * Only an entry matching both bounds exactly is removed and freed; a
+ * warning is raised when no such entry exists.
+ */
+static void range_free(struct list_head *head, u32 start, u32 size,
+		char *type_str)
+{
+	struct id_range *entry, *next;
+	u32 end = start + size - 1;
+	bool found = false;
+
+	list_for_each_entry_safe(entry, next, head, list) {
+		if (entry->start != start || entry->end != end)
+			continue;
+
+		list_del(&entry->list);
+		kfree(entry);
+		found = true;
+		break;
+	}
+	WARN_ON(!found);
+#ifdef DEBUG
+	dump_list(head, type_str);
+#endif
+}
+
+/*
+ * Allocate 'size' consecutive PASIDs for an AFU of function 'fn'.
+ *
+ * Returns the offset of the first PASID, or a negative error code;
+ * -ENOSPC is returned right away when max_pasid_log is negative.
+ */
+int ocxl_pasid_afu_alloc(struct ocxl_fn *fn, u32 size)
+{
+	if (fn->config.max_pasid_log < 0)
+		return -ENOSPC;
+
+	return range_alloc(&fn->pasid_list, size,
+			1 << fn->config.max_pasid_log, "afu pasid");
+}
+
+/*
+ * Give back a PASID range previously obtained from
+ * ocxl_pasid_afu_alloc(). 'start' and 'size' must match the values of
+ * the allocation; range_free() warns otherwise.
+ */
+void ocxl_pasid_afu_free(struct ocxl_fn *fn, u32 start, u32 size)
+{
+	/* no "return <expr>" here: the function returns void */
+	range_free(&fn->pasid_list, start, size, "afu pasid");
+}
+
+/*
+ * Allocate 'size' consecutive actags for an AFU of function 'fn',
+ * bounded by the number of actags enabled on the function.
+ *
+ * Returns the offset of the first actag, or a negative error code.
+ */
+int ocxl_actag_afu_alloc(struct ocxl_fn *fn, u32 size)
+{
+	return range_alloc(&fn->actag_list, size, fn->actag_enabled,
+			"afu actag");
+}
+
+/*
+ * Give back an actag range previously obtained from
+ * ocxl_actag_afu_alloc(). 'start' and 'size' must match the values of
+ * the allocation; range_free() warns otherwise.
+ */
+void ocxl_actag_afu_free(struct ocxl_fn *fn, u32 start, u32 size)
+{
+	/* no "return <expr>" here: the function returns void */
+	range_free(&fn->actag_list, start, size, "afu actag");
+}
diff --git a/drivers/misc/ocxl/pci.c b/drivers/misc/ocxl/pci.c
new file mode 100644
index 000000000000..0051d9ec76cc
--- /dev/null
+++ b/drivers/misc/ocxl/pci.c
@@ -0,0 +1,585 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/idr.h>
+#include <asm/pnv-ocxl.h>
+#include "ocxl_internal.h"
+
+/*
+ * Any opencapi device which wants to use this 'generic' driver should
+ * use the 0x062B device ID. Vendors should define the subsystem
+ * vendor/device ID to help differentiate devices.
+ */
+static const struct pci_device_id ocxl_pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x062B), },
+	{ }
+};
+MODULE_DEVICE_TABLE(pci, ocxl_pci_tbl);
+
+
+/*
+ * Take a reference on the function's device. Returns 'fn', or NULL
+ * when get_device() returns NULL.
+ */
+static struct ocxl_fn *ocxl_fn_get(struct ocxl_fn *fn)
+{
+	if (!get_device(&fn->dev))
+		return NULL;
+	return fn;
+}
+
+static void ocxl_fn_put(struct ocxl_fn *fn) /* drop a reference taken with ocxl_fn_get() */
+{
+	put_device(&fn->dev);
+}
+
+/*
+ * Take a reference on the AFU's device. Returns 'afu', or NULL when
+ * get_device() returns NULL.
+ */
+struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu)
+{
+	if (!get_device(&afu->dev))
+		return NULL;
+	return afu;
+}
+
+void ocxl_afu_put(struct ocxl_afu *afu) /* drop a reference taken with ocxl_afu_get() */
+{
+	put_device(&afu->dev);
+}
+
+/*
+ * Allocate a zeroed AFU structure bound to 'fn' and initialize its
+ * locks and context idr. A reference is taken on 'fn'; free_afu()
+ * drops it. Returns NULL when the allocation fails.
+ */
+static struct ocxl_afu *alloc_afu(struct ocxl_fn *fn)
+{
+	struct ocxl_afu *new_afu = kzalloc(sizeof(*new_afu), GFP_KERNEL);
+
+	if (!new_afu)
+		return NULL;
+
+	new_afu->fn = fn;
+	ocxl_fn_get(fn);
+
+	idr_init(&new_afu->contexts_idr);
+	mutex_init(&new_afu->contexts_lock);
+	mutex_init(&new_afu->afu_control_lock);
+	return new_afu;
+}
+
+static void free_afu(struct ocxl_afu *afu) /* counterpart of alloc_afu() */
+{
+	idr_destroy(&afu->contexts_idr);
+	ocxl_fn_put(afu->fn); /* drops the function reference taken in alloc_afu() */
+	kfree(afu);
+}
+
+static void free_afu_dev(struct device *dev) /* installed as afu->dev.release by set_afu_device() */
+{
+	struct ocxl_afu *afu = to_ocxl_afu(dev);
+
+	ocxl_unregister_afu(afu);
+	free_afu(afu);
+}
+
+/*
+ * Prepare the AFU's struct device: parent it to the function device,
+ * install the release callback and name it
+ * "<afu name>.<location>.<afu index>".
+ *
+ * Returns the result of dev_set_name().
+ */
+static int set_afu_device(struct ocxl_afu *afu, const char *location)
+{
+	struct device *afu_dev = &afu->dev;
+
+	afu_dev->parent = &afu->fn->dev;
+	afu_dev->release = free_afu_dev;
+	return dev_set_name(afu_dev, "%s.%s.%hhu", afu->config.name,
+			location, afu->config.idx);
+}
+
+/*
+ * Reserve actags for the AFU and program them in its config space.
+ *
+ * The AFU's supported count is scaled by the ratio of actags enabled
+ * versus supported on the function, so that an AFU shrinks its share
+ * when the function did not get everything it asked for.
+ *
+ * Returns 0, or the negative error from ocxl_actag_afu_alloc().
+ */
+static int assign_afu_actag(struct ocxl_afu *afu, struct pci_dev *dev)
+{
+	struct ocxl_fn *fn = afu->fn;
+	int count, offset;
+
+	count = afu->config.actag_supported *
+		fn->actag_enabled / fn->actag_supported;
+
+	offset = ocxl_actag_afu_alloc(fn, count);
+	if (offset < 0) {
+		dev_err(&afu->dev, "Can't allocate %d actags for AFU: %d\n",
+			count, offset);
+		return offset;
+	}
+
+	afu->actag_enabled = count;
+	afu->actag_base = fn->actag_base + offset;
+
+	ocxl_config_set_afu_actag(dev, afu->config.dvsec_afu_control_pos,
+				afu->actag_base, afu->actag_enabled);
+	dev_dbg(&afu->dev, "actag base=%d enabled=%d\n",
+		afu->actag_base, afu->actag_enabled);
+	return 0;
+}
+
+/*
+ * Return the AFU's actag range to its function. The range is
+ * identified by its offset relative to the function's actag base and
+ * by the number of actags enabled for the AFU.
+ */
+static void reclaim_afu_actag(struct ocxl_afu *afu)
+{
+	int offset = afu->actag_base - afu->fn->actag_base;
+
+	ocxl_actag_afu_free(afu->fn, offset, afu->actag_enabled);
+}
+
+/*
+ * Reserve PASIDs for the AFU and program them in its config space.
+ *
+ * Only the case where the function configuration requested enough
+ * PASIDs to cover all its AFUs is supported: the AFU always asks for
+ * 2^pasid_supported_log PASIDs.
+ *
+ * Returns 0, or the negative error from ocxl_pasid_afu_alloc().
+ */
+static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev)
+{
+	struct ocxl_fn *fn = afu->fn;
+	int count = 1 << afu->config.pasid_supported_log;
+	int offset;
+
+	offset = ocxl_pasid_afu_alloc(fn, count);
+	if (offset < 0) {
+		dev_err(&afu->dev, "Can't allocate %d PASIDs for AFU: %d\n",
+			count, offset);
+		return offset;
+	}
+
+	afu->pasid_max = count;
+	afu->pasid_count = 0;
+	afu->pasid_base = fn->pasid_base + offset;
+
+	ocxl_config_set_afu_pasid(dev, afu->config.dvsec_afu_control_pos,
+				afu->pasid_base,
+				afu->config.pasid_supported_log);
+	dev_dbg(&afu->dev, "PASID base=%d, enabled=%d\n",
+		afu->pasid_base, count);
+	return 0;
+}
+
+/*
+ * Return the AFU's PASID range to its function. The range is
+ * identified by its offset relative to the function's PASID base and
+ * spans 2^pasid_supported_log entries.
+ */
+static void reclaim_afu_pasid(struct ocxl_afu *afu)
+{
+	int offset = afu->pasid_base - afu->fn->pasid_base;
+	int count = 1 << afu->config.pasid_supported_log;
+
+	ocxl_pasid_afu_free(afu->fn, offset, count);
+}
+
+/*
+ * Take a reference on one of the function's BARs (0, 2 or 4),
+ * requesting the PCI region when the first user shows up.
+ *
+ * Returns 0 on success, -EINVAL for any other BAR number, or the
+ * error from pci_request_region(). The use count is only bumped once
+ * the region is owned, so a failed request leaves it untouched and a
+ * later call tries to request the region again.
+ */
+static int reserve_fn_bar(struct ocxl_fn *fn, int bar)
+{
+	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
+	int rc, idx;
+
+	if (bar != 0 && bar != 2 && bar != 4)
+		return -EINVAL;
+
+	idx = bar >> 1;
+	if (fn->bar_used[idx] == 0) {
+		rc = pci_request_region(dev, bar, "ocxl");
+		if (rc)
+			return rc;
+	}
+	fn->bar_used[idx]++;
+	return 0;
+}
+
+/*
+ * Drop a reference on one of the function's BARs (0, 2 or 4) and
+ * release the PCI region when the last user goes away. Other BAR
+ * numbers are ignored. Warns if the use count becomes negative.
+ */
+static void release_fn_bar(struct ocxl_fn *fn, int bar)
+{
+	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
+	int *users;
+
+	switch (bar) {
+	case 0:
+	case 2:
+	case 4:
+		break;
+	default:
+		return;
+	}
+
+	users = &fn->bar_used[bar >> 1];
+	(*users)--;
+	if (*users == 0)
+		pci_release_region(dev, bar);
+	WARN_ON(*users < 0);
+}
+
+/*
+ * Reserve the BARs holding the AFU's global and per-process MMIO
+ * areas, record where both areas start and ioremap the global one.
+ *
+ * Returns 0 on success, the error from reserve_fn_bar(), or -ENOMEM
+ * when the global area cannot be mapped. Nothing stays reserved on
+ * failure.
+ */
+static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev)
+{
+	const struct ocxl_afu_config *cfg = &afu->config;
+	struct ocxl_fn *fn = afu->fn;
+	int rc;
+
+	rc = reserve_fn_bar(fn, cfg->global_mmio_bar);
+	if (rc)
+		return rc;
+
+	rc = reserve_fn_bar(fn, cfg->pp_mmio_bar);
+	if (rc) {
+		release_fn_bar(fn, cfg->global_mmio_bar);
+		return rc;
+	}
+
+	afu->global_mmio_start = pci_resource_start(dev, cfg->global_mmio_bar) +
+		cfg->global_mmio_offset;
+	afu->pp_mmio_start = pci_resource_start(dev, cfg->pp_mmio_bar) +
+		cfg->pp_mmio_offset;
+
+	afu->global_mmio_ptr = ioremap(afu->global_mmio_start,
+				cfg->global_mmio_size);
+	if (!afu->global_mmio_ptr) {
+		release_fn_bar(fn, cfg->pp_mmio_bar);
+		release_fn_bar(fn, cfg->global_mmio_bar);
+		dev_err(&dev->dev, "Error mapping global mmio area\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * The AFU interrupt mappings start one empty page after the
+	 * per-process mmio area.
+	 */
+	afu->irq_base_offset = cfg->pp_mmio_stride + PAGE_SIZE;
+	return 0;
+}
+
+/*
+ * Undo map_mmio_areas(): unmap the global MMIO area if it is mapped,
+ * forget both area addresses and drop the two BAR references.
+ */
+static void unmap_mmio_areas(struct ocxl_afu *afu)
+{
+	void __iomem *regs = afu->global_mmio_ptr;
+
+	afu->global_mmio_ptr = NULL;
+	if (regs)
+		iounmap(regs);
+
+	afu->global_mmio_start = 0;
+	afu->pp_mmio_start = 0;
+	release_fn_bar(afu->fn, afu->config.pp_mmio_bar);
+	release_fn_bar(afu->fn, afu->config.global_mmio_bar);
+}
+
+/*
+ * Read the AFU configuration and set up everything the AFU needs
+ * before it can be registered: device naming, actags, PASIDs and MMIO
+ * areas. Resources acquired by earlier steps are released when a
+ * later step fails.
+ *
+ * Returns 0 on success or the error of the failing step.
+ */
+static int configure_afu(struct ocxl_afu *afu, u8 afu_idx, struct pci_dev *dev)
+{
+	int rc;
+
+	rc = ocxl_config_read_afu(dev, &afu->fn->config, &afu->config, afu_idx);
+	if (rc)
+		return rc;
+
+	rc = set_afu_device(afu, dev_name(&dev->dev));
+	if (rc)
+		return rc;
+
+	rc = assign_afu_actag(afu, dev);
+	if (rc)
+		return rc;
+
+	rc = assign_afu_pasid(afu, dev);
+	if (rc)
+		goto err_actag;
+
+	rc = map_mmio_areas(afu, dev);
+	if (rc)
+		goto err_pasid;
+
+	return 0;
+
+err_pasid:
+	reclaim_afu_pasid(afu);
+err_actag:
+	reclaim_afu_actag(afu);
+	return rc;
+}
+
+static void deconfigure_afu(struct ocxl_afu *afu) /* releases what configure_afu() acquired, in reverse order */
+{
+	unmap_mmio_areas(afu);
+	reclaim_afu_pasid(afu);
+	reclaim_afu_actag(afu);
+}
+
+/*
+ * Enable the AFU and expose it to user space.
+ *
+ * Returns 0 on success or the error from ocxl_create_cdev(), in which
+ * case the AFU is disabled again so that a failed activation does not
+ * leave it running while the caller tears it down.
+ */
+static int activate_afu(struct pci_dev *dev, struct ocxl_afu *afu)
+{
+	int rc;
+
+	ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 1);
+	/*
+	 * Char device creation is the last step, as processes can
+	 * call our driver immediately, so all our inits must be finished.
+	 */
+	rc = ocxl_create_cdev(afu);
+	if (rc)
+		ocxl_config_set_afu_state(dev,
+					afu->config.dvsec_afu_control_pos, 0);
+	return rc;
+}
+
+static void deactivate_afu(struct ocxl_afu *afu) /* reverse of activate_afu() */
+{
+	struct pci_dev *dev = to_pci_dev(afu->fn->dev.parent);
+
+	ocxl_destroy_cdev(afu); /* remove the char device before disabling the AFU */
+	ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 0);
+}
+
+static int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx) /* bring up AFU 'afu_idx' of 'fn'; 0 or a negative error */
+{
+	int rc;
+	struct ocxl_afu *afu;
+
+	afu = alloc_afu(fn);
+	if (!afu)
+		return -ENOMEM;
+
+	rc = configure_afu(afu, afu_idx, dev);
+	if (rc) {
+		free_afu(afu); /* not registered yet, so the structure is freed directly */
+		return rc;
+	}
+
+	rc = ocxl_register_afu(afu);
+	if (rc)
+		goto err; /* NOTE(review): 'err' calls device_unregister() even on this path; whether that is valid depends on ocxl_register_afu(), not visible here - confirm */
+
+	rc = ocxl_sysfs_add_afu(afu);
+	if (rc)
+		goto err;
+
+	rc = activate_afu(dev, afu);
+	if (rc)
+		goto err_sys;
+
+	list_add_tail(&afu->list, &fn->afu_list); /* only fully working AFUs are put on the function's list */
+	return 0;
+
+err_sys:
+	ocxl_sysfs_remove_afu(afu);
+err:
+	deconfigure_afu(afu);
+	device_unregister(&afu->dev); /* the structure itself is freed by free_afu_dev() on release */
+	return rc;
+}
+
+static void remove_afu(struct ocxl_afu *afu) /* tear down an AFU set up by init_afu() */
+{
+	list_del(&afu->list);
+	ocxl_context_detach_all(afu);
+	deactivate_afu(afu);
+	ocxl_sysfs_remove_afu(afu);
+	deconfigure_afu(afu);
+	device_unregister(&afu->dev); /* the structure itself is freed by free_afu_dev() on release */
+}
+
+/*
+ * Allocate a zeroed function structure with empty AFU, PASID and
+ * actag lists. 'dev' is currently unused. Returns NULL when the
+ * allocation fails.
+ */
+static struct ocxl_fn *alloc_function(struct pci_dev *dev)
+{
+	struct ocxl_fn *new_fn = kzalloc(sizeof(*new_fn), GFP_KERNEL);
+
+	if (new_fn) {
+		INIT_LIST_HEAD(&new_fn->afu_list);
+		INIT_LIST_HEAD(&new_fn->pasid_list);
+		INIT_LIST_HEAD(&new_fn->actag_list);
+	}
+	return new_fn;
+}
+
+/*
+ * Free a function structure. All AFUs must be gone by then, so every
+ * per-function list is expected to be empty; a warning is raised for
+ * any list that is not.
+ */
+static void free_function(struct ocxl_fn *fn)
+{
+	WARN_ON(!list_empty(&fn->afu_list));
+	WARN_ON(!list_empty(&fn->pasid_list));
+	WARN_ON(!list_empty(&fn->actag_list));
+	kfree(fn);
+}
+
+/*
+ * Release callback of the function's struct device: frees the
+ * ocxl_fn embedding 'dev'.
+ */
+static void free_function_dev(struct device *dev)
+{
+	free_function(to_ocxl_function(dev));
+}
+
+/*
+ * Prepare the function's struct device: parent it to the PCI device,
+ * install the release callback and name it "ocxlfn.<pci device name>".
+ * On success the function is also stored as the PCI driver data.
+ *
+ * Returns 0, or the error from dev_set_name().
+ */
+static int set_function_device(struct ocxl_fn *fn, struct pci_dev *dev)
+{
+	struct device *fn_dev = &fn->dev;
+	int rc;
+
+	fn_dev->parent = &dev->dev;
+	fn_dev->release = free_function_dev;
+
+	rc = dev_set_name(fn_dev, "ocxlfn.%s", dev_name(&dev->dev));
+	if (!rc)
+		pci_set_drvdata(dev, fn);
+	return rc;
+}
+
+/*
+ * Query the actag range available to the function, record it and
+ * program it in the function's config space.
+ *
+ * Returns 0, or the error from ocxl_config_get_actag_info().
+ */
+static int assign_function_actag(struct ocxl_fn *fn)
+{
+	struct pci_dev *pdev = to_pci_dev(fn->dev.parent);
+	u16 base, enabled, supported;
+	int rc;
+
+	rc = ocxl_config_get_actag_info(pdev, &base, &enabled, &supported);
+	if (rc)
+		return rc;
+
+	fn->actag_supported = supported;
+	fn->actag_enabled = enabled;
+	fn->actag_base = base;
+
+	ocxl_config_set_actag(pdev, fn->config.dvsec_function_pos,
+			fn->actag_base,	fn->actag_enabled);
+	dev_dbg(&fn->dev, "actag range starting at %d, enabled %d\n",
+		fn->actag_base, fn->actag_enabled);
+	return 0;
+}
+
+/*
+ * Check that the PASIDs required by the function (2^max_pasid_log)
+ * fit in what ocxl_config_get_pasid_info() reports as available, and
+ * set the function's PASID base to 0.
+ *
+ * A function with a negative max_pasid_log needs no PASID at all and
+ * is accepted as is. Returns 0, the error from
+ * ocxl_config_get_pasid_info(), or -ENOSPC when the function wants
+ * more PASIDs than available.
+ */
+static int set_function_pasid(struct ocxl_fn *fn)
+{
+	struct pci_dev *pdev = to_pci_dev(fn->dev.parent);
+	int rc, wanted, available;
+
+	if (fn->config.max_pasid_log < 0)
+		return 0;
+
+	rc = ocxl_config_get_pasid_info(pdev, &available);
+	if (rc)
+		return rc;
+
+	wanted = 1 << fn->config.max_pasid_log;
+	if (wanted <= available) {
+		fn->pasid_base = 0;
+		return 0;
+	}
+
+	dev_err(&fn->dev,
+		"Function requires more PASIDs than is available (%d vs. %d)\n",
+		wanted, available);
+	return -ENOSPC;
+}
+
+/*
+ * Enable the PCI device and configure the function: read its
+ * configuration, set up its struct device, its actags and PASIDs, the
+ * link and the transaction layer.
+ *
+ * Returns 0 on success or the error of the failing step. On failure
+ * the link is released if it had been set up, and the PCI device is
+ * disabled again so that the enable done here is always balanced.
+ */
+static int configure_function(struct ocxl_fn *fn, struct pci_dev *dev)
+{
+	int rc;
+
+	rc = pci_enable_device(dev);
+	if (rc) {
+		dev_err(&dev->dev, "pci_enable_device failed: %d\n", rc);
+		return rc;
+	}
+
+	/*
+	 * Once it has been confirmed to work on our hardware, we
+	 * should reset the function, to force the adapter to restart
+	 * from scratch.
+	 * A function reset would also reset all its AFUs.
+	 *
+	 * Some hints for implementation:
+	 *
+	 * - there's no status bit to know when the reset is done. We
+	 *   should try reading the config space to know when it's
+	 *   done.
+	 * - probably something like:
+	 *	Reset
+	 *	wait 100ms
+	 *	issue config read
+	 *	allow device up to 1 sec to return success on config
+	 *	read before declaring it broken
+	 *
+	 * Some shared logic on the card (CFG, TLX) won't be reset, so
+	 * there's no guarantee that it will be enough.
+	 */
+	rc = ocxl_config_read_function(dev, &fn->config);
+	if (rc)
+		goto err_disable;
+
+	rc = set_function_device(fn, dev);
+	if (rc)
+		goto err_disable;
+
+	rc = assign_function_actag(fn);
+	if (rc)
+		goto err_disable;
+
+	rc = set_function_pasid(fn);
+	if (rc)
+		goto err_disable;
+
+	rc = ocxl_link_setup(dev, 0, &fn->link);
+	if (rc)
+		goto err_disable;
+
+	rc = ocxl_config_set_TL(dev, fn->config.dvsec_tl_pos);
+	if (rc)
+		goto err_link;
+
+	return 0;
+
+err_link:
+	ocxl_link_release(dev, fn->link);
+err_disable:
+	pci_disable_device(dev);
+	return rc;
+}
+
+static void deconfigure_function(struct ocxl_fn *fn) /* undo a successful configure_function() */
+{
+	struct pci_dev *dev = to_pci_dev(fn->dev.parent);
+
+	ocxl_link_release(dev, fn->link);
+	pci_disable_device(dev);
+}
+
+/*
+ * Allocate, configure and register the ocxl function for a PCI device.
+ *
+ * Returns the new function, or an ERR_PTR() carrying -ENOMEM or the
+ * error of the failing step.
+ */
+static struct ocxl_fn *init_function(struct pci_dev *dev)
+{
+	struct ocxl_fn *fn;
+	int rc;
+
+	fn = alloc_function(dev);
+	if (!fn)
+		return ERR_PTR(-ENOMEM);
+
+	rc = configure_function(fn, dev);
+	if (rc) {
+		free_function(fn);
+		return ERR_PTR(rc);
+	}
+
+	rc = device_register(&fn->dev);
+	if (rc) {
+		deconfigure_function(fn);
+		/*
+		 * The device was initialized but never added, so it
+		 * must not go through device_unregister(). Dropping the
+		 * reference is enough: the release callback frees fn.
+		 */
+		put_device(&fn->dev);
+		return ERR_PTR(rc);
+	}
+	return fn;
+}
+
+static void remove_function(struct ocxl_fn *fn) /* tear down a function set up by init_function() */
+{
+	deconfigure_function(fn);
+	device_unregister(&fn->dev); /* fn itself is freed by free_function_dev() on release */
+}
+
+/*
+ * PCI probe callback: set up the function, then try to initialize
+ * every AFU index that ocxl_config_check_afu_index() reports with a
+ * positive value.
+ *
+ * An AFU that fails to initialize is logged and skipped; it does not
+ * make the probe fail. Returns -ENODEV when the radix MMU is not in
+ * use, the error from init_function(), or 0.
+ */
+static int ocxl_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	struct ocxl_fn *fn;
+	int configured = 0;
+	u8 idx;
+
+	if (!radix_enabled()) {
+		dev_err(&dev->dev, "Unsupported memory model (hash)\n");
+		return -ENODEV;
+	}
+
+	fn = init_function(dev);
+	if (IS_ERR(fn)) {
+		dev_err(&dev->dev, "function init failed: %li\n",
+			PTR_ERR(fn));
+		return PTR_ERR(fn);
+	}
+
+	for (idx = 0; idx <= fn->config.max_afu_index; idx++) {
+		if (ocxl_config_check_afu_index(dev, &fn->config, idx) <= 0)
+			continue;
+
+		if (init_afu(dev, fn, idx)) {
+			dev_err(&dev->dev,
+				"Can't initialize AFU index %d\n", idx);
+			continue;
+		}
+		configured++;
+	}
+	dev_info(&dev->dev, "%d AFU(s) configured\n", configured);
+	return 0;
+}
+
+/*
+ * PCI remove callback, also installed as the shutdown callback:
+ * remove every AFU on the function's list, then the function itself.
+ */
+static void ocxl_remove(struct pci_dev *dev)
+{
+	struct ocxl_fn *fn = pci_get_drvdata(dev);
+	struct ocxl_afu *cur, *next;
+
+	/* remove_afu() unlinks the entry, hence the _safe iterator */
+	list_for_each_entry_safe(cur, next, &fn->afu_list, list)
+		remove_afu(cur);
+
+	remove_function(fn);
+}
+
+struct pci_driver ocxl_pci_driver = { /* registered and unregistered from main.c */
+	.name = "ocxl",
+	.id_table = ocxl_pci_tbl,
+	.probe = ocxl_probe,
+	.remove = ocxl_remove,
+	.shutdown = ocxl_remove, /* shutdown does the same full teardown as remove */
+};
diff --git a/drivers/misc/ocxl/sysfs.c b/drivers/misc/ocxl/sysfs.c
new file mode 100644
index 000000000000..d9753a1db14b
--- /dev/null
+++ b/drivers/misc/ocxl/sysfs.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <linux/sysfs.h>
+#include "ocxl_internal.h"
+
+/* sysfs "global_mmio_size": size of the AFU's global MMIO area. */
+static ssize_t global_mmio_size_show(struct device *device,
+				struct device_attribute *attr,
+				char *buf)
+{
+	const struct ocxl_afu_config *cfg = &to_ocxl_afu(device)->config;
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", cfg->global_mmio_size);
+}
+
+/* sysfs "pp_mmio_size": stride of the AFU's per-process MMIO area. */
+static ssize_t pp_mmio_size_show(struct device *device,
+				struct device_attribute *attr,
+				char *buf)
+{
+	const struct ocxl_afu_config *cfg = &to_ocxl_afu(device)->config;
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", cfg->pp_mmio_stride);
+}
+
+/* sysfs "afu_version": AFU version printed as "<major>:<minor>". */
+static ssize_t afu_version_show(struct device *device,
+				struct device_attribute *attr,
+				char *buf)
+{
+	const struct ocxl_afu_config *cfg = &to_ocxl_afu(device)->config;
+
+	return scnprintf(buf, PAGE_SIZE, "%hhu:%hhu\n",
+			cfg->version_major, cfg->version_minor);
+}
+
+/* sysfs "contexts": "<opened contexts>/<maximum number of contexts>". */
+static ssize_t contexts_show(struct device *device,
+		struct device_attribute *attr,
+		char *buf)
+{
+	const struct ocxl_afu *afu = to_ocxl_afu(device);
+	int opened = afu->pasid_count;
+	int limit = afu->pasid_max;
+
+	return scnprintf(buf, PAGE_SIZE, "%d/%d\n", opened, limit);
+}
+
+static struct device_attribute afu_attrs[] = { /* read-only files created for each AFU by ocxl_sysfs_add_afu() */
+	__ATTR_RO(global_mmio_size),
+	__ATTR_RO(pp_mmio_size),
+	__ATTR_RO(afu_version),
+	__ATTR_RO(contexts),
+};
+
+/*
+ * Read handler of the "global_mmio_area" binary attribute: copy
+ * 'count' bytes starting at offset 'off' of the global MMIO area into
+ * 'buf'.
+ *
+ * Returns 'count', or 0 for an empty request or an offset outside of
+ * the area.
+ */
+static ssize_t global_mmio_read(struct file *filp, struct kobject *kobj,
+				struct bin_attribute *bin_attr, char *buf,
+				loff_t off, size_t count)
+{
+	struct ocxl_afu *afu = to_ocxl_afu(kobj_to_dev(kobj));
+
+	if (count == 0)
+		return 0;
+	if (off < 0 || off >= afu->config.global_mmio_size)
+		return 0;
+
+	memcpy_fromio(buf, afu->global_mmio_ptr + off, count);
+	return count;
+}
+
+/*
+ * Fault handler for user mappings of the global MMIO area: install
+ * the PFN backing the faulting page.
+ *
+ * Returns VM_FAULT_SIGBUS for a page past the end of the area or when
+ * the PFN cannot be inserted, VM_FAULT_OOM when the insertion runs out
+ * of memory, and VM_FAULT_NOPAGE once the page is mapped.
+ */
+static int global_mmio_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct ocxl_afu *afu = vma->vm_private_data;
+	unsigned long offset;
+	int rc;
+
+	if (vmf->pgoff >= (afu->config.global_mmio_size >> PAGE_SHIFT))
+		return VM_FAULT_SIGBUS;
+
+	offset = vmf->pgoff;
+	offset += (afu->global_mmio_start >> PAGE_SHIFT);
+	rc = vm_insert_pfn(vma, vmf->address, offset);
+	if (rc == -ENOMEM)
+		return VM_FAULT_OOM;
+	/* -EBUSY only means that the page is already mapped */
+	if (rc < 0 && rc != -EBUSY)
+		return VM_FAULT_SIGBUS;
+	return VM_FAULT_NOPAGE;
+}
+
+static const struct vm_operations_struct global_mmio_vmops = {
+	.fault = global_mmio_fault,
+};
+
+/*
+ * mmap handler of the "global_mmio_area" binary attribute.
+ *
+ * The requested window must fit inside the global MMIO area, else
+ * -EINVAL is returned. The mapping is set up as a non-cached PFN
+ * mapping whose pages are installed on demand by global_mmio_fault().
+ */
+static int global_mmio_mmap(struct file *filp, struct kobject *kobj,
+			struct bin_attribute *bin_attr,
+			struct vm_area_struct *vma)
+{
+	struct ocxl_afu *afu = to_ocxl_afu(kobj_to_dev(kobj));
+
+	if ((vma_pages(vma) + vma->vm_pgoff) >
+		(afu->config.global_mmio_size >> PAGE_SHIFT))
+		return -EINVAL;
+
+	vma->vm_private_data = afu;
+	vma->vm_ops = &global_mmio_vmops;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_flags |= VM_IO | VM_PFNMAP;
+	return 0;
+}
+
+/*
+ * Create the sysfs files of an AFU: the attributes listed in
+ * afu_attrs, then the "global_mmio_area" binary file.
+ *
+ * Returns 0 on success. On failure every attribute file created so
+ * far is removed again and the error is returned.
+ */
+int ocxl_sysfs_add_afu(struct ocxl_afu *afu)
+{
+	struct bin_attribute *mmio = &afu->attr_global_mmio;
+	int i, rc;
+
+	for (i = 0; i < ARRAY_SIZE(afu_attrs); i++) {
+		rc = device_create_file(&afu->dev, &afu_attrs[i]);
+		if (rc)
+			goto rollback;
+	}
+
+	sysfs_attr_init(&mmio->attr);
+	mmio->attr.name = "global_mmio_area";
+	mmio->attr.mode = 0600;
+	mmio->size = afu->config.global_mmio_size;
+	mmio->read = global_mmio_read;
+	mmio->mmap = global_mmio_mmap;
+
+	rc = device_create_bin_file(&afu->dev, mmio);
+	if (!rc)
+		return 0;
+
+	dev_err(&afu->dev,
+		"Unable to create global mmio attr for afu: %d\n",
+		rc);
+
+rollback:
+	/* i is the index of the first file that was not created */
+	while (--i >= 0)
+		device_remove_file(&afu->dev, &afu_attrs[i]);
+	return rc;
+}
+
+void ocxl_sysfs_remove_afu(struct ocxl_afu *afu) /* removes every file created by ocxl_sysfs_add_afu() */
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(afu_attrs); i++)
+		device_remove_file(&afu->dev, &afu_attrs[i]);
+	device_remove_bin_file(&afu->dev, &afu->attr_global_mmio);
+}
diff --git a/drivers/misc/ocxl/trace.c b/drivers/misc/ocxl/trace.c
new file mode 100644
index 000000000000..1e6947049697
--- /dev/null
+++ b/drivers/misc/ocxl/trace.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+#endif
diff --git a/drivers/misc/ocxl/trace.h b/drivers/misc/ocxl/trace.h
new file mode 100644
index 000000000000..bcb7ff330c1e
--- /dev/null
+++ b/drivers/misc/ocxl/trace.h
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ocxl
+
+#if !defined(_TRACE_OCXL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OCXL_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(ocxl_context, /* shared layout of the context add/remove events */
+	TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr),
+	TP_ARGS(pid, spa, pasid, pidr, tidr),
+
+	TP_STRUCT__entry(
+		__field(pid_t, pid)
+		__field(void*, spa)
+		__field(int, pasid)
+		__field(u32, pidr)
+		__field(u32, tidr)
+	),
+
+	TP_fast_assign(
+		__entry->pid = pid;
+		__entry->spa = spa;
+		__entry->pasid = pasid;
+		__entry->pidr = pidr;
+		__entry->tidr = tidr;
+	),
+
+	TP_printk("linux pid=%d spa=0x%p pasid=0x%x pidr=0x%x tidr=0x%x",
+		__entry->pid,
+		__entry->spa,
+		__entry->pasid,
+		__entry->pidr,
+		__entry->tidr
+	)
+);
+
+DEFINE_EVENT(ocxl_context, ocxl_context_add,
+	TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr),
+	TP_ARGS(pid, spa, pasid, pidr, tidr)
+);
+
+DEFINE_EVENT(ocxl_context, ocxl_context_remove, /* emitted just before a process element is cleared */
+	TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr),
+	TP_ARGS(pid, spa, pasid, pidr, tidr)
+);
+
+TRACE_EVENT(ocxl_terminate_pasid, /* records a PASID and the associated return code */
+	TP_PROTO(int pasid, int rc),
+	TP_ARGS(pasid, rc),
+
+	TP_STRUCT__entry(
+		__field(int, pasid)
+		__field(int, rc)
+	),
+
+	TP_fast_assign(
+		__entry->pasid = pasid;
+		__entry->rc = rc;
+	),
+
+	TP_printk("pasid=0x%x rc=%d",
+		__entry->pasid,
+		__entry->rc
+	)
+);
+
+DECLARE_EVENT_CLASS(ocxl_fault_handler, /* shared layout of the ocxl_fault and ocxl_fault_ack events */
+	TP_PROTO(void *spa, u64 pe, u64 dsisr, u64 dar, u64 tfc),
+	TP_ARGS(spa, pe, dsisr, dar, tfc),
+
+	TP_STRUCT__entry(
+		__field(void *, spa)
+		__field(u64, pe)
+		__field(u64, dsisr)
+		__field(u64, dar)
+		__field(u64, tfc)
+	),
+
+	TP_fast_assign(
+		__entry->spa = spa;
+		__entry->pe = pe;
+		__entry->dsisr = dsisr;
+		__entry->dar = dar;
+		__entry->tfc = tfc;
+	),
+
+	TP_printk("spa=%p pe=0x%llx dsisr=0x%llx dar=0x%llx tfc=0x%llx",
+		__entry->spa,
+		__entry->pe,
+		__entry->dsisr,
+		__entry->dar,
+		__entry->tfc
+	)
+);
+
+DEFINE_EVENT(ocxl_fault_handler, ocxl_fault,
+	TP_PROTO(void *spa, u64 pe, u64 dsisr, u64 dar, u64 tfc),
+	TP_ARGS(spa, pe, dsisr, dar, tfc)
+);
+
+DEFINE_EVENT(ocxl_fault_handler, ocxl_fault_ack,
+	TP_PROTO(void *spa, u64 pe, u64 dsisr, u64 dar, u64 tfc),
+	TP_ARGS(spa, pe, dsisr, dar, tfc)
+);
+
+TRACE_EVENT(ocxl_afu_irq_alloc, /* records the ids and offset of an AFU interrupt allocation */
+	TP_PROTO(int pasid, int irq_id, unsigned int virq, int hw_irq,
+		u64 irq_offset),
+	TP_ARGS(pasid, irq_id, virq, hw_irq, irq_offset),
+
+	TP_STRUCT__entry(
+		__field(int, pasid)
+		__field(int, irq_id)
+		__field(unsigned int, virq)
+		__field(int, hw_irq)
+		__field(u64, irq_offset)
+	),
+
+	TP_fast_assign(
+		__entry->pasid = pasid;
+		__entry->irq_id = irq_id;
+		__entry->virq = virq;
+		__entry->hw_irq = hw_irq;
+		__entry->irq_offset = irq_offset;
+	),
+
+	TP_printk("pasid=0x%x irq_id=%d virq=%u hw_irq=%d irq_offset=0x%llx",
+		__entry->pasid,
+		__entry->irq_id,
+		__entry->virq,
+		__entry->hw_irq,
+		__entry->irq_offset
+	)
+);
+
+TRACE_EVENT(ocxl_afu_irq_free, /* records the PASID and irq id of an AFU interrupt being freed */
+	TP_PROTO(int pasid, int irq_id),
+	TP_ARGS(pasid, irq_id),
+
+	TP_STRUCT__entry(
+		__field(int, pasid)
+		__field(int, irq_id)
+	),
+
+	TP_fast_assign(
+		__entry->pasid = pasid;
+		__entry->irq_id = irq_id;
+	),
+
+	TP_printk("pasid=0x%x irq_id=%d",
+		__entry->pasid,
+		__entry->irq_id
+	)
+);
+
+TRACE_EVENT(ocxl_afu_irq_receive, /* records the virq of an AFU interrupt; NOTE(review): int here, unsigned int in ocxl_afu_irq_alloc */
+	TP_PROTO(int virq),
+	TP_ARGS(virq),
+
+	TP_STRUCT__entry(
+		__field(int, virq)
+	),
+
+	TP_fast_assign(
+		__entry->virq = virq;
+	),
+
+	TP_printk("virq=%d",
+		__entry->virq
+	)
+);
+
+#endif /* _TRACE_OCXL_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
author	Linus Torvalds <torvalds@linux-foundation.org>	2018-02-02 10:01:04 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-02-02 10:01:04 -0800
commit	03f51d4efa2287cc628bb20b0c032036d2a9e66a (patch)
tree	ec7fb3b6624d53092e2768578f3ef887c8d77f22 /drivers/misc
parent	367b0df173b0ebea5d18b6971c244e260b5feb17 (diff)
parent	015eb1b89e959c9349f0a01803fb8ed1ced36f09 (diff)