MFC r200033: mca: improve status checking, recording and reporting

This commit is contained in:
Andriy Gapon 2009-12-19 10:38:28 +00:00
parent 39fb869679
commit c9ac7946d7
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/stable/8/; revision=200711
4 changed files with 126 additions and 98 deletions

View file

@ -117,48 +117,6 @@ sysctl_mca_records(SYSCTL_HANDLER_ARGS)
return (SYSCTL_OUT(req, &record, sizeof(record)));
}
/*
 * Examine one machine check bank and, if it holds a valid event, queue a
 * freshly allocated record describing it.
 *
 * Returns a pointer to the queued record, or NULL when the bank's status
 * register does not have MC_STATUS_VAL set or when no memory could be
 * obtained for the record (in which case the event is not recorded).
 */
static struct mca_record *
mca_record_entry(int bank)
{
	struct mca_internal *entry;
	uint64_t mc_status;
	u_int regs[4];

	/* Nothing to do unless the bank reports a valid event. */
	mc_status = rdmsr(MSR_MC_STATUS(bank));
	if ((mc_status & MC_STATUS_VAL) == 0)
		return (NULL);

	/*
	 * M_ZERO leaves mr_addr and mr_misc at zero when the status word
	 * does not flag them as valid below.
	 */
	entry = malloc(sizeof(*entry), M_MCA, M_NOWAIT | M_ZERO);
	if (entry == NULL) {
		printf("MCA: Unable to allocate space for an event.\n");
		return (NULL);
	}

	/* Capture the state of the bank and the reporting CPU. */
	entry->rec.mr_status = mc_status;
	if ((mc_status & MC_STATUS_ADDRV) != 0)
		entry->rec.mr_addr = rdmsr(MSR_MC_ADDR(bank));
	if ((mc_status & MC_STATUS_MISCV) != 0)
		entry->rec.mr_misc = rdmsr(MSR_MC_MISC(bank));
	entry->rec.mr_tsc = rdtsc();
	entry->rec.mr_apic_id = PCPU_GET(apic_id);

	/*
	 * Only corrected events are cleared here; uncorrectable ones are
	 * left in the bank so that the BIOS can see them.
	 */
	if ((mc_status & (MC_STATUS_PCC | MC_STATUS_UC)) == 0) {
		wrmsr(MSR_MC_STATUS(bank), 0);
		/* NOTE(review): presumably CPUID is issued to serialize
		 * after the MSR write -- confirm. */
		do_cpuid(0, regs);
	}

	/* Append to the global list of records under the MCA spin lock. */
	mtx_lock_spin(&mca_lock);
	STAILQ_INSERT_TAIL(&mca_records, entry, link);
	mca_count++;
	mtx_unlock_spin(&mca_lock);

	return (&entry->rec);
}
static const char *
mca_error_ttype(uint16_t mca_error)
{
@ -219,11 +177,13 @@ mca_error_request(uint16_t mca_error)
}
/* Dump details about a single machine check. */
static void
mca_log(struct mca_record *rec)
static void __nonnull(1)
mca_log(const struct mca_record *rec)
{
uint16_t mca_error;
printf("MCA: bank %d, status 0x%016llx\n", rec->mr_bank,
(long long)rec->mr_status);
printf("MCA: CPU %d ", rec->mr_apic_id);
if (rec->mr_status & MC_STATUS_UC)
printf("UNCOR ");
@ -329,6 +289,59 @@ mca_log(struct mca_record *rec)
printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
}
/*
 * Read the status of one machine check bank into a caller-supplied record.
 *
 * Returns 0 and leaves *rec untouched when the bank's status register does
 * not have MC_STATUS_VAL set.  Otherwise fills in every field of *rec
 * (status, bank, address, misc, TSC and APIC id) and returns 1.  No memory
 * is allocated here.
 */
static int __nonnull(2)
mca_check_status(int bank, struct mca_record *rec)
{
	uint64_t mc_status;
	u_int regs[4];

	mc_status = rdmsr(MSR_MC_STATUS(bank));
	if ((mc_status & MC_STATUS_VAL) == 0)
		return (0);

	/* Capture the state of the bank and the reporting CPU. */
	rec->mr_status = mc_status;
	rec->mr_bank = bank;
	/* Address and misc registers are read only when flagged valid. */
	rec->mr_addr = (mc_status & MC_STATUS_ADDRV) != 0 ?
	    rdmsr(MSR_MC_ADDR(bank)) : 0;
	rec->mr_misc = (mc_status & MC_STATUS_MISCV) != 0 ?
	    rdmsr(MSR_MC_MISC(bank)) : 0;
	rec->mr_tsc = rdtsc();
	rec->mr_apic_id = PCPU_GET(apic_id);

	/*
	 * Only corrected events are cleared here; uncorrectable ones are
	 * left in the bank so that the BIOS can see them.
	 */
	if ((mc_status & (MC_STATUS_PCC | MC_STATUS_UC)) == 0) {
		wrmsr(MSR_MC_STATUS(bank), 0);
		/* NOTE(review): presumably CPUID is issued to serialize
		 * after the MSR write -- confirm. */
		do_cpuid(0, regs);
	}

	return (1);
}
/*
 * Queue a copy of an already captured machine check record.
 *
 * The copy is allocated with M_NOWAIT.  If that allocation fails the event
 * cannot be queued, so it is printed via mca_log() right away instead of
 * being dropped silently.
 */
static void __nonnull(1)
mca_record_entry(const struct mca_record *record)
{
	struct mca_internal *entry;

	entry = malloc(sizeof(*entry), M_MCA, M_NOWAIT);
	if (entry != NULL) {
		/*
		 * The allocation is not zeroed, so set the fields
		 * explicitly before publishing the entry.
		 */
		entry->rec = *record;
		entry->logged = 0;

		/* Append to the global list under the MCA spin lock. */
		mtx_lock_spin(&mca_lock);
		STAILQ_INSERT_TAIL(&mca_records, entry, link);
		mca_count++;
		mtx_unlock_spin(&mca_lock);
		return;
	}

	printf("MCA: Unable to allocate space for an event.\n");
	mca_log(record);
}
/*
* This scans all the machine check banks of the current CPU to see if
* there are any machine checks. Any non-recoverable errors are
@ -341,7 +354,7 @@ mca_log(struct mca_record *rec)
static int
mca_scan(int mcip)
{
struct mca_record *rec;
struct mca_record rec;
uint64_t mcg_cap, ucmask;
int count, i, recoverable;
@ -354,13 +367,13 @@ mca_scan(int mcip)
ucmask |= MC_STATUS_OVER;
mcg_cap = rdmsr(MSR_MCG_CAP);
for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
rec = mca_record_entry(i);
if (rec != NULL) {
if (mca_check_status(i, &rec)) {
count++;
if (rec->mr_status & ucmask) {
if (rec.mr_status & ucmask) {
recoverable = 0;
mca_log(rec);
mca_log(&rec);
}
mca_record_entry(&rec);
}
}
return (mcip ? recoverable : count);

View file

@ -36,6 +36,7 @@ struct mca_record {
uint64_t mr_misc;
uint64_t mr_tsc;
int mr_apic_id;
int mr_bank;
};
#ifdef _KERNEL

View file

@ -117,48 +117,6 @@ sysctl_mca_records(SYSCTL_HANDLER_ARGS)
return (SYSCTL_OUT(req, &record, sizeof(record)));
}
/*
 * Examine one machine check bank and, if it holds a valid event, queue a
 * freshly allocated record describing it.
 *
 * Returns a pointer to the queued record, or NULL when the bank's status
 * register does not have MC_STATUS_VAL set or when no memory could be
 * obtained for the record (in which case the event is not recorded).
 */
static struct mca_record *
mca_record_entry(int bank)
{
	struct mca_internal *entry;
	uint64_t mc_status;
	u_int regs[4];

	/* Nothing to do unless the bank reports a valid event. */
	mc_status = rdmsr(MSR_MC_STATUS(bank));
	if ((mc_status & MC_STATUS_VAL) == 0)
		return (NULL);

	/*
	 * M_ZERO leaves mr_addr and mr_misc at zero when the status word
	 * does not flag them as valid below.
	 */
	entry = malloc(sizeof(*entry), M_MCA, M_NOWAIT | M_ZERO);
	if (entry == NULL) {
		printf("MCA: Unable to allocate space for an event.\n");
		return (NULL);
	}

	/* Capture the state of the bank and the reporting CPU. */
	entry->rec.mr_status = mc_status;
	if ((mc_status & MC_STATUS_ADDRV) != 0)
		entry->rec.mr_addr = rdmsr(MSR_MC_ADDR(bank));
	if ((mc_status & MC_STATUS_MISCV) != 0)
		entry->rec.mr_misc = rdmsr(MSR_MC_MISC(bank));
	entry->rec.mr_tsc = rdtsc();
	entry->rec.mr_apic_id = PCPU_GET(apic_id);

	/*
	 * Only corrected events are cleared here; uncorrectable ones are
	 * left in the bank so that the BIOS can see them.
	 */
	if ((mc_status & (MC_STATUS_PCC | MC_STATUS_UC)) == 0) {
		wrmsr(MSR_MC_STATUS(bank), 0);
		/* NOTE(review): presumably CPUID is issued to serialize
		 * after the MSR write -- confirm. */
		do_cpuid(0, regs);
	}

	/* Append to the global list of records under the MCA spin lock. */
	mtx_lock_spin(&mca_lock);
	STAILQ_INSERT_TAIL(&mca_records, entry, link);
	mca_count++;
	mtx_unlock_spin(&mca_lock);

	return (&entry->rec);
}
static const char *
mca_error_ttype(uint16_t mca_error)
{
@ -219,11 +177,13 @@ mca_error_request(uint16_t mca_error)
}
/* Dump details about a single machine check. */
static void
mca_log(struct mca_record *rec)
static void __nonnull(1)
mca_log(const struct mca_record *rec)
{
uint16_t mca_error;
printf("MCA: bank %d, status 0x%016llx\n", rec->mr_bank,
(long long)rec->mr_status);
printf("MCA: CPU %d ", rec->mr_apic_id);
if (rec->mr_status & MC_STATUS_UC)
printf("UNCOR ");
@ -329,6 +289,59 @@ mca_log(struct mca_record *rec)
printf("MCA: Address 0x%llx\n", (long long)rec->mr_addr);
}
/*
 * Read the status of one machine check bank into a caller-supplied record.
 *
 * Returns 0 and leaves *rec untouched when the bank's status register does
 * not have MC_STATUS_VAL set.  Otherwise fills in every field of *rec
 * (status, bank, address, misc, TSC and APIC id) and returns 1.  No memory
 * is allocated here.
 */
static int __nonnull(2)
mca_check_status(int bank, struct mca_record *rec)
{
	uint64_t mc_status;
	u_int regs[4];

	mc_status = rdmsr(MSR_MC_STATUS(bank));
	if ((mc_status & MC_STATUS_VAL) == 0)
		return (0);

	/* Capture the state of the bank and the reporting CPU. */
	rec->mr_status = mc_status;
	rec->mr_bank = bank;
	/* Address and misc registers are read only when flagged valid. */
	rec->mr_addr = (mc_status & MC_STATUS_ADDRV) != 0 ?
	    rdmsr(MSR_MC_ADDR(bank)) : 0;
	rec->mr_misc = (mc_status & MC_STATUS_MISCV) != 0 ?
	    rdmsr(MSR_MC_MISC(bank)) : 0;
	rec->mr_tsc = rdtsc();
	rec->mr_apic_id = PCPU_GET(apic_id);

	/*
	 * Only corrected events are cleared here; uncorrectable ones are
	 * left in the bank so that the BIOS can see them.
	 */
	if ((mc_status & (MC_STATUS_PCC | MC_STATUS_UC)) == 0) {
		wrmsr(MSR_MC_STATUS(bank), 0);
		/* NOTE(review): presumably CPUID is issued to serialize
		 * after the MSR write -- confirm. */
		do_cpuid(0, regs);
	}

	return (1);
}
/*
 * Queue a copy of an already captured machine check record.
 *
 * The copy is allocated with M_NOWAIT.  If that allocation fails the event
 * cannot be queued, so it is printed via mca_log() right away instead of
 * being dropped silently.
 */
static void __nonnull(1)
mca_record_entry(const struct mca_record *record)
{
	struct mca_internal *entry;

	entry = malloc(sizeof(*entry), M_MCA, M_NOWAIT);
	if (entry != NULL) {
		/*
		 * The allocation is not zeroed, so set the fields
		 * explicitly before publishing the entry.
		 */
		entry->rec = *record;
		entry->logged = 0;

		/* Append to the global list under the MCA spin lock. */
		mtx_lock_spin(&mca_lock);
		STAILQ_INSERT_TAIL(&mca_records, entry, link);
		mca_count++;
		mtx_unlock_spin(&mca_lock);
		return;
	}

	printf("MCA: Unable to allocate space for an event.\n");
	mca_log(record);
}
/*
* This scans all the machine check banks of the current CPU to see if
* there are any machine checks. Any non-recoverable errors are
@ -341,7 +354,7 @@ mca_log(struct mca_record *rec)
static int
mca_scan(int mcip)
{
struct mca_record *rec;
struct mca_record rec;
uint64_t mcg_cap, ucmask;
int count, i, recoverable;
@ -354,13 +367,13 @@ mca_scan(int mcip)
ucmask |= MC_STATUS_OVER;
mcg_cap = rdmsr(MSR_MCG_CAP);
for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
rec = mca_record_entry(i);
if (rec != NULL) {
if (mca_check_status(i, &rec)) {
count++;
if (rec->mr_status & ucmask) {
if (rec.mr_status & ucmask) {
recoverable = 0;
mca_log(rec);
mca_log(&rec);
}
mca_record_entry(&rec);
}
}
return (mcip ? recoverable : count);

View file

@ -36,6 +36,7 @@ struct mca_record {
uint64_t mr_misc;
uint64_t mr_tsc;
int mr_apic_id;
int mr_bank;
};
#ifdef _KERNEL