diff options
-rw-r--r-- | arch/powerpc/include/asm/eeh.h | 11 | ||||
-rw-r--r-- | arch/powerpc/kernel/eeh.c | 15 | ||||
-rw-r--r-- | arch/powerpc/kernel/eeh_driver.c | 38 | ||||
-rw-r--r-- | arch/powerpc/kernel/eeh_event.c | 26 |
4 files changed, 64 insertions, 26 deletions
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index c13119a5e69b..9d0e1694a94d 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -88,6 +88,17 @@ struct eeh_pe { struct list_head child_list; /* List of PEs below this PE */ struct list_head child; /* Memb. child_list/eeh_phb_pe */ struct list_head edevs; /* List of eeh_dev in this PE */ + + /* + * Saved stack trace. When we find a PE freeze in eeh_dev_check_failure + * the stack trace is saved here so we can print it in the recovery + * thread if it turns out to due to a real problem rather than + * a hot-remove. + * + * A max of 64 entries might be overkill, but it also might not be. + */ + unsigned long stack_trace[64]; + int trace_entries; }; #define eeh_pe_for_each_dev(pe, edev, tmp) \ diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 7b2755f5c6fd..398def61f8a6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -420,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) eeh_pe_mark_isolated(phb_pe); eeh_serialize_unlock(flags); - pr_err("EEH: PHB#%x failure detected, location: %s\n", + pr_debug("EEH: PHB#%x failure detected, location: %s\n", phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe)); - dump_stack(); eeh_send_failure_event(phb_pe); - return 1; out: eeh_serialize_unlock(flags); @@ -451,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) unsigned long flags; struct device_node *dn; struct pci_dev *dev; - struct eeh_pe *pe, *parent_pe, *phb_pe; + struct eeh_pe *pe, *parent_pe; int rc = 0; const char *location = NULL; @@ -581,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * a stack trace will help the device-driver authors figure * out what happened. So print that out. */ - phb_pe = eeh_phb_pe_get(pe->phb); - pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", - pe->phb->global_number, pe->addr); - pr_err("EEH: PE location: %s, PHB location: %s\n", - eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); - dump_stack(); - + pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n", + __func__, pe->phb->global_number, pe->addr); eeh_send_failure_event(pe); return 1; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 52ce7584af43..0d34cc12c529 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -863,8 +863,44 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (eeh_slot_presence_check(edev->pdev)) devices++; - if (!devices) + if (!devices) { + pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", + pe->phb->global_number, pe->addr); goto out; /* nothing to recover */ + } + + /* Log the event */ + if (pe->type & EEH_PE_PHB) { + pr_err("EEH: PHB#%x failure detected, location: %s\n", + pe->phb->global_number, eeh_pe_loc_get(pe)); + } else { + struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb); + + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", + pe->phb->global_number, pe->addr); + pr_err("EEH: PE location: %s, PHB location: %s\n", + eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); + } + + /* + * Print the saved stack trace now that we've verified there's + * something to recover. + */ + if (pe->trace_entries) { + void **ptrs = (void **) pe->stack_trace; + int i; + + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", + pe->phb->global_number, pe->addr); + + /* FIXME: Use the same format as dump_stack() */ + pr_err("EEH: Call Trace:\n"); + for (i = 0; i < pe->trace_entries; i++) + pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]); + + pe->trace_entries = 0; + } + eeh_pe_update_time_stamp(pe); pe->freeze_count++; diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index e36653e5f76b..1d55486adb0f 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy) { unsigned long flags; struct eeh_event *event; - struct eeh_pe *pe; while (!kthread_should_stop()) { if (wait_for_completion_interruptible(&eeh_eventlist_event)) @@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy) continue; /* We might have event without binding PE */ - pe = event->pe; - if (pe) { - if (pe->type & EEH_PE_PHB) - pr_info("EEH: Detected error on PHB#%x\n", - pe->phb->global_number); - else - pr_info("EEH: Detected PCI bus error on " - "PHB#%x-PE#%x\n", - pe->phb->global_number, pe->addr); - eeh_handle_normal_event(pe); - } else { + if (event->pe) + eeh_handle_normal_event(event->pe); + else eeh_handle_special_event(); - } kfree(event); } @@ -126,8 +116,16 @@ int __eeh_send_failure_event(struct eeh_pe *pe) * This prevents the PE from being free()ed by a hotplug driver * while the PE is sitting in the event queue. */ - if (pe) + if (pe) { + /* + * Save the current stack trace so we can dump it from the + * event handler thread. + */ + pe->trace_entries = stack_trace_save(pe->stack_trace, + ARRAY_SIZE(pe->stack_trace), 0); + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + } /* We may or may not be called in an interrupt context */ spin_lock_irqsave(&eeh_eventlist_lock, flags); |