From 0d654a75c1a913afd606636b873601bd645a64ff Mon Sep 17 00:00:00 2001 From: Xiaofei Tan Date: Fri, 11 Jun 2021 20:37:07 +0800 Subject: [PATCH 1/2] ACPI: APEI: fix synchronous external aborts in user-mode [Upstream commit ccb5ecdc2ddeaff744ee075b54cdff8a689e8fa7] Before commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea() synchronise with APEI's irq work"), do_sea() would unconditionally signal the affected task from the arch code. Since that change, the GHES driver sends the signals. This exposes a problem as errors the GHES driver doesn't understand or doesn't handle effectively are silently ignored. It will cause the errors get taken again, and circulate endlessly. User-space task get stuck in this loop. Existing firmware on Kunpeng9xx systems reports cache errors with the 'ARM Processor Error' CPER records. Do memory failure handling for ARM Processor Error Section just like for Memory Error Section. Fixes: 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea() synchronise with APEI's irq work") Signed-off-by: Xiaofei Tan Reviewed-by: James Morse [ rjw: Subject edit ] Signed-off-by: Rafael J. Wysocki Signed-off-by: huwentao --- drivers/acpi/apei/ghes.c | 84 +++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 4adf9ef2b0e2..e6da36ab1473 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -434,29 +434,35 @@ static void ghes_kick_task_work(struct callback_head *head) gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len); } -static bool ghes_handle_memory_failure(struct ghes *ghes, - struct acpi_hest_generic_data *gdata, - int sev) +static bool ghes_do_memory_failure(u64 physical_addr, int flags) { unsigned long pfn; - int flags = -1; - int sec_sev = ghes_severity(gdata->error_severity); - struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE)) return false; - if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) - return false; - - pfn = mem_err->physical_addr >> PAGE_SHIFT; + pfn = PHYS_PFN(physical_addr); if (!pfn_valid(pfn)) { pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid address in generic error data: %#llx\n", - mem_err->physical_addr); + physical_addr); return false; } + memory_failure_queue(pfn, flags); + return true; +} + +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, + int sev) +{ + int flags = -1; + int sec_sev = ghes_severity(gdata->error_severity); + struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) + return false; + /* iff following two events can be handled properly by now */ if (sec_sev == GHES_SEV_CORRECTED && (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED)) @@ -464,14 +470,56 @@ static bool ghes_handle_memory_failure(struct ghes *ghes, if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE) flags = 0; - if (flags != -1) { - memory_failure_queue(pfn, flags); - return true; - } + if (flags != -1) + return ghes_do_memory_failure(mem_err->physical_addr, flags); return false; } +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev) +{ + struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); + bool queued = false; + int sec_sev, i; + char *p; + + log_arm_hw_error(err); + + sec_sev = ghes_severity(gdata->error_severity); + if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) + return false; + + p = (char *)(err + 1); + for (i = 0; i < err->err_info_num; i++) { + struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p; + bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR); + bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR); + const char *error_type = "unknown error"; + + /* + * The field (err_info->error_info & BIT(26)) is fixed to set to + * 1 in some old firmware of HiSilicon Kunpeng920. We assume that + * firmware won't mix corrected errors in an uncorrected section, + * and don't filter out 'corrected' error here. + */ + if (is_cache && has_pa) { + queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0); + p += err_info->length; + continue; + } + + if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs)) + error_type = cper_proc_error_type_strs[err_info->type]; + + pr_warn_ratelimited(FW_WARN GHES_PFX + "Unhandled processor error type: %s\n", + error_type); + p += err_info->length; + } + + return queued; +} + /* * PCIe AER errors need to be sent to the AER driver for reporting and * recovery. The GHES severities map to the following AER severities and @@ -543,16 +591,14 @@ static bool ghes_do_proc(struct ghes *ghes, ghes_edac_report_mem_error(sev, mem_err); arch_apei_report_mem_error(sev, mem_err); - if (ghes_handle_memory_failure(ghes, gdata, sev)) + if (ghes_handle_memory_failure(gdata, sev)) work_queued = true; } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { ghes_handle_aer(gdata); } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { - struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); - - log_arm_hw_error(err); + work_queued = ghes_handle_arm_hw_error(gdata, sev); } else { void *err = acpi_hest_get_payload(gdata); -- Gitee From ca4fde27365147853023a8146464116e9f114d49 Mon Sep 17 00:00:00 2001 From: Shengwei Luo Date: Wed, 23 Feb 2022 16:53:40 +0800 Subject: [PATCH 2/2] RAS: Report ARM processor information to userspace The original arm_event trace code only traces out ARM processor error information data. It's not enough for user to take appropriate action. According to UEFI_2_9 specification chapter N2.4.4, the ARM processor error section includes several ARM processor error information, several ARM processor context information and several vendor specific error information structures. In addition to these info, there are error severity and cpu logical index about the event. Report all of these information to userspace via perf i/f. So that the user can do cpu core isolation according to error severity and other info. Original-Author: Jason Tian Signed-off-by: huwentao --- drivers/acpi/apei/ghes.c | 3 +-- drivers/ras/ras.c | 46 ++++++++++++++++++++++++++++++++++++-- include/linux/ras.h | 15 +++++++++++-- include/ras/ras_event.h | 48 +++++++++++++++++++++++++++++++++++----- 4 files changed, 101 insertions(+), 11 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index e6da36ab1473..411fa703f937 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -483,9 +483,8 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s int sec_sev, i; char *p; - log_arm_hw_error(err); - sec_sev = ghes_severity(gdata->error_severity); + log_arm_hw_error(err, sec_sev); if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) return false; diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 95540ea8dd9d..2a7f424d59b9 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -21,9 +21,51 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len); } -void log_arm_hw_error(struct cper_sec_proc_arm *err) +void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { - trace_arm_event(err); + u32 pei_len; + u32 ctx_len = 0; + s32 vsei_len; + u8 *pei_err; + u8 *ctx_err; + u8 *ven_err_data; + struct cper_arm_err_info *err_info; + struct cper_arm_ctx_info *ctx_info; + int n, sz; + int cpu; + + pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num; + pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm); + + err_info = (struct cper_arm_err_info *)(err + 1); + ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num); + ctx_err = (u8 *)ctx_info; + for (n = 0; n < err->context_info_num; n++) { + sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size; + ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz); + ctx_len += sz; + } + + vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + + pei_len + ctx_len); + if (vsei_len < 0) { + pr_warn(FW_BUG + "section length: %d\n", err->section_length); + pr_warn(FW_BUG + "section length is too small\n"); + pr_warn(FW_BUG + "firmware-generated error record is incorrect\n"); + vsei_len = 0; + } + ven_err_data = (u8 *)ctx_info; + + cpu = GET_LOGICAL_INDEX(err->mpidr); + /* when return value is invalid, set cpu index to -1 */ + if (cpu < 0) + cpu = -1; + + trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len, + ven_err_data, (u32)vsei_len, sev, cpu); } static int __init ras_init(void) diff --git a/include/linux/ras.h b/include/linux/ras.h index 1f4048bf2674..4529775374d0 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -24,7 +24,7 @@ int __init parse_cec_param(char *str); void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len); -void log_arm_hw_error(struct cper_sec_proc_arm *err); +void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev); #else static inline void log_non_standard_event(const guid_t *sec_type, @@ -32,7 +32,18 @@ log_non_standard_event(const guid_t *sec_type, const u8 sev, const u8 *err, const u32 len) { return; } static inline void -log_arm_hw_error(struct cper_sec_proc_arm *err) { return; } +log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return; } #endif +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) +#include +/* + * Include ARM specific SMP header which provides a function mapping mpidr to + * cpu logical index. + */ +#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr & MPIDR_HWID_BITMASK) +#else +#define GET_LOGICAL_INDEX(mpidr) -EINVAL +#endif /* CONFIG_ARM || CONFIG_ARM64 */ + #endif /* __RAS_H__ */ diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 36c5c5e38c1d..3cc89c8d0e41 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -168,11 +168,24 @@ TRACE_EVENT(mc_event, * This event is generated when hardware detects an ARM processor error * has occurred. UEFI 2.6 spec section N.2.4.4. */ +#define APEIL "ARM Processor Err Info data len" +#define APEID "ARM Processor Err Info raw data" +#define APECIL "ARM Processor Err Context Info data len" +#define APECID "ARM Processor Err Context Info raw data" +#define VSEIL "Vendor Specific Err Info data len" +#define VSEID "Vendor Specific Err Info raw data" TRACE_EVENT(arm_event, - TP_PROTO(const struct cper_sec_proc_arm *proc), + TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err, + const u32 pei_len, + const u8 *ctx_err, + const u32 ctx_len, + const u8 *oem, + const u32 oem_len, + u8 sev, + int cpu), - TP_ARGS(proc), + TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu), TP_STRUCT__entry( __field(u64, mpidr) @@ -180,6 +193,14 @@ TRACE_EVENT(arm_event, __field(u32, running_state) __field(u32, psci_state) __field(u8, affinity) + __field(u32, pei_len) + __dynamic_array(u8, buf, pei_len) + __field(u32, ctx_len) + __dynamic_array(u8, buf1, ctx_len) + __field(u32, oem_len) + __dynamic_array(u8, buf2, oem_len) + __field(u8, sev) + __field(int, cpu) ), TP_fast_assign( @@ -199,12 +220,29 @@ TRACE_EVENT(arm_event, __entry->running_state = ~0; __entry->psci_state = ~0; } + __entry->pei_len = pei_len; + memcpy(__get_dynamic_array(buf), pei_err, pei_len); + __entry->ctx_len = ctx_len; + memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len); + __entry->oem_len = oem_len; + memcpy(__get_dynamic_array(buf2), oem, oem_len); + __entry->sev = sev; + __entry->cpu = cpu; ), - TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " - "running state: %d; PSCI state: %d", + TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " + "running state: %d; PSCI state: %d; " + "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s", + __entry->cpu, + __entry->sev, __entry->affinity, __entry->mpidr, __entry->midr, - __entry->running_state, __entry->psci_state) + __entry->running_state, __entry->psci_state, + APEIL, __entry->pei_len, APEID, + __print_hex(__get_dynamic_array(buf), __entry->pei_len), + APECIL, __entry->ctx_len, APECID, + __print_hex(__get_dynamic_array(buf1), __entry->ctx_len), + VSEIL, __entry->oem_len, VSEID, + __print_hex(__get_dynamic_array(buf2), __entry->oem_len)) ); /* -- Gitee