summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristophe Lombard <clombard@linux.vnet.ibm.com>2016-04-22 15:39:22 +0200
committerMichael Ellerman <mpe@ellerman.id.au>2016-05-11 21:54:11 +1000
commit266eab8f32cc43b688c2e9aaab63c2565a3998c2 (patch)
tree3170b67c1bdd3484fa934695beb50fcea4dc3921
parent7a0d85d313c2066712e530e668bc02bb741a685c (diff)
cxl: Check periodically the coherent platform function's state
In the PowerVM environment, the PHYP CoherentAccel component manages the state of the Coherent Accelerator Processor Interface adapter and virtualizes CAPI resources, handles CAPP, PSL, PSL Slice errors - and interrupts - and provides a new set of hcalls for the OS APIs to utilize Accelerator Function Unit (AFU). During the course of operation, a coherent platform function can encounter errors. Some possible reason for errors are: • Hardware recoverable and unrecoverable errors • Transient and over-threshold correctable errors PHYP implements its own state model for the coherent platform function. The state of the AFU is available through a hcall. The current implementation of the cxl driver, for the PowerVM environment, checks this state of the AFU only when an action is requested - open a device, ioctl command, memory map, attach/detach a process - from an external driver - cxlflash, libcxl. If an error is detected the cxl driver handles the error according the content of the Power Architecture Platform Requirements document. But in case of low-level troubles (or error injection), the PHYP component may reset the card and change the AFU state. The PHYP interface doesn't provide any way to be notified when that happens thus implies that the cxl driver: • cannot handle immediatly the state change of the AFU. • cannot notify other drivers (cxlflash, ...) The purpose of this patch is to wake up the cpu periodically to check the current state of each AFU and to see if we need to enter an error recovery path. Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com> Acked-by: Ian Munsie <imunsie@au1.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-rw-r--r--drivers/misc/cxl/cxl.h4
-rw-r--r--drivers/misc/cxl/guest.c57
2 files changed, 33 insertions, 28 deletions
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 9dd3a0e7917f..d23a3a59a712 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -366,11 +366,13 @@ struct cxl_afu_native {
};
struct cxl_afu_guest {
+ struct cxl_afu *parent;
u64 handle;
phys_addr_t p2n_phys;
u64 p2n_size;
int max_ints;
- struct mutex recovery_lock;
+ bool handle_err;
+ struct delayed_work work_err;
int previous_state;
};
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
index c2815b97de2e..bc8d0b9870eb 100644
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -178,6 +178,9 @@ static int afu_read_error_state(struct cxl_afu *afu, int *state_out)
u64 state;
int rc = 0;
+ if (!afu)
+ return -EIO;
+
rc = cxl_h_read_error_state(afu->guest->handle, &state);
if (!rc) {
WARN_ON(state != H_STATE_NORMAL &&
@@ -833,7 +836,6 @@ static int afu_update_state(struct cxl_afu *afu)
switch (cur_state) {
case H_STATE_NORMAL:
afu->guest->previous_state = cur_state;
- rc = 1;
break;
case H_STATE_DISABLE:
@@ -849,7 +851,6 @@ static int afu_update_state(struct cxl_afu *afu)
pci_error_handlers(afu, CXL_SLOT_RESET_EVENT,
pci_channel_io_normal);
pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
- rc = 1;
}
afu->guest->previous_state = 0;
break;
@@ -874,39 +875,30 @@ static int afu_update_state(struct cxl_afu *afu)
return rc;
}
-static int afu_do_recovery(struct cxl_afu *afu)
+static void afu_handle_errstate(struct work_struct *work)
{
- int rc;
+ struct cxl_afu_guest *afu_guest =
+ container_of(to_delayed_work(work), struct cxl_afu_guest, work_err);
- /* many threads can arrive here, in case of detach_all for example.
- * Only one needs to drive the recovery
- */
- if (mutex_trylock(&afu->guest->recovery_lock)) {
- rc = afu_update_state(afu);
- mutex_unlock(&afu->guest->recovery_lock);
- return rc;
- }
- return 0;
+ if (!afu_update_state(afu_guest->parent) &&
+ afu_guest->previous_state == H_STATE_PERM_UNAVAILABLE)
+ return;
+
+ if (afu_guest->handle_err == true)
+ schedule_delayed_work(&afu_guest->work_err,
+ msecs_to_jiffies(3000));
}
static bool guest_link_ok(struct cxl *cxl, struct cxl_afu *afu)
{
int state;
- if (afu) {
- if (afu_read_error_state(afu, &state) ||
- state != H_STATE_NORMAL) {
- if (afu_do_recovery(afu) > 0) {
- /* check again in case we've just fixed it */
- if (!afu_read_error_state(afu, &state) &&
- state == H_STATE_NORMAL)
- return true;
- }
- return false;
- }
+ if (afu && (!afu_read_error_state(afu, &state))) {
+ if (state == H_STATE_NORMAL)
+ return true;
}
- return true;
+ return false;
}
static int afu_properties_look_ok(struct cxl_afu *afu)
@@ -944,8 +936,6 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n
return -ENOMEM;
}
- mutex_init(&afu->guest->recovery_lock);
-
if ((rc = dev_set_name(&afu->dev, "afu%i.%i",
adapter->adapter_num,
slice)))
@@ -1001,6 +991,15 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n
afu->enabled = true;
+ /*
+ * wake up the cpu periodically to check the state
+ * of the AFU using "afu" stored in the guest structure.
+ */
+ afu->guest->parent = afu;
+ afu->guest->handle_err = true;
+ INIT_DELAYED_WORK(&afu->guest->work_err, afu_handle_errstate);
+ schedule_delayed_work(&afu->guest->work_err, msecs_to_jiffies(1000));
+
if ((rc = cxl_pci_vphb_add(afu)))
dev_info(&afu->dev, "Can't register vPHB\n");
@@ -1029,6 +1028,10 @@ void cxl_guest_remove_afu(struct cxl_afu *afu)
if (!afu)
return;
+ /* flush and stop pending job */
+ afu->guest->handle_err = false;
+ flush_delayed_work(&afu->guest->work_err);
+
cxl_pci_vphb_remove(afu);
cxl_sysfs_afu_remove(afu);