PageRenderTime 60ms CodeModel.GetById 21ms app.highlight 32ms RepoModel.GetById 0ms app.codeStats 0ms

/drivers/edac/mce_amd.c

https://bitbucket.org/emiliolopez/linux
C | 1149 lines | 913 code | 211 blank | 25 comment | 207 complexity | 768d99e0fe812754a0f82e8f83325bf5 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.0, AGPL-1.0
   1#include <linux/module.h>
   2#include <linux/slab.h>
   3
   4#include <asm/cpu.h>
   5
   6#include "mce_amd.h"
   7
   8static struct amd_decoder_ops *fam_ops;
   9
  10static u8 xec_mask	 = 0xf;
  11
  12static bool report_gart_errors;
  13static void (*decode_dram_ecc)(int node_id, struct mce *m);
  14
  15void amd_report_gart_errors(bool v)
  16{
  17	report_gart_errors = v;
  18}
  19EXPORT_SYMBOL_GPL(amd_report_gart_errors);
  20
  21void amd_register_ecc_decoder(void (*f)(int, struct mce *))
  22{
  23	decode_dram_ecc = f;
  24}
  25EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
  26
  27void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
  28{
  29	if (decode_dram_ecc) {
  30		WARN_ON(decode_dram_ecc != f);
  31
  32		decode_dram_ecc = NULL;
  33	}
  34}
  35EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
  36
  37/*
  38 * string representation for the different MCA reported error types, see F3x48
  39 * or MSR0000_0411.
  40 */
  41
  42/* transaction type */
  43static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
  44
  45/* cache level */
  46static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
  47
  48/* memory transaction type */
  49static const char * const rrrr_msgs[] = {
  50       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
  51};
  52
  53/* participating processor */
  54const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
  55EXPORT_SYMBOL_GPL(pp_msgs);
  56
  57/* request timeout */
  58static const char * const to_msgs[] = { "no timeout", "timed out" };
  59
  60/* memory or i/o */
  61static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
  62
  63/* internal error type */
  64static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
  65
  66static const char * const f15h_mc1_mce_desc[] = {
  67	"UC during a demand linefill from L2",
  68	"Parity error during data load from IC",
  69	"Parity error for IC valid bit",
  70	"Main tag parity error",
  71	"Parity error in prediction queue",
  72	"PFB data/address parity error",
  73	"Parity error in the branch status reg",
  74	"PFB promotion address error",
  75	"Tag error during probe/victimization",
  76	"Parity error for IC probe tag valid bit",
  77	"PFB non-cacheable bit parity error",
  78	"PFB valid bit parity error",			/* xec = 0xd */
  79	"Microcode Patch Buffer",			/* xec = 010 */
  80	"uop queue",
  81	"insn buffer",
  82	"predecode buffer",
  83	"fetch address FIFO",
  84	"dispatch uop queue"
  85};
  86
  87static const char * const f15h_mc2_mce_desc[] = {
  88	"Fill ECC error on data fills",			/* xec = 0x4 */
  89	"Fill parity error on insn fills",
  90	"Prefetcher request FIFO parity error",
  91	"PRQ address parity error",
  92	"PRQ data parity error",
  93	"WCC Tag ECC error",
  94	"WCC Data ECC error",
  95	"WCB Data parity error",
  96	"VB Data ECC or parity error",
  97	"L2 Tag ECC error",				/* xec = 0x10 */
  98	"Hard L2 Tag ECC error",
  99	"Multiple hits on L2 tag",
 100	"XAB parity error",
 101	"PRB address parity error"
 102};
 103
 104static const char * const mc4_mce_desc[] = {
 105	"DRAM ECC error detected on the NB",
 106	"CRC error detected on HT link",
 107	"Link-defined sync error packets detected on HT link",
 108	"HT Master abort",
 109	"HT Target abort",
 110	"Invalid GART PTE entry during GART table walk",
 111	"Unsupported atomic RMW received from an IO link",
 112	"Watchdog timeout due to lack of progress",
 113	"DRAM ECC error detected on the NB",
 114	"SVM DMA Exclusion Vector error",
 115	"HT data error detected on link",
 116	"Protocol error (link, L3, probe filter)",
 117	"NB internal arrays parity error",
 118	"DRAM addr/ctl signals parity error",
 119	"IO link transmission error",
 120	"L3 data cache ECC error",			/* xec = 0x1c */
 121	"L3 cache tag error",
 122	"L3 LRU parity bits error",
 123	"ECC Error in the Probe Filter directory"
 124};
 125
 126static const char * const mc5_mce_desc[] = {
 127	"CPU Watchdog timer expire",
 128	"Wakeup array dest tag",
 129	"AG payload array",
 130	"EX payload array",
 131	"IDRF array",
 132	"Retire dispatch queue",
 133	"Mapper checkpoint array",
 134	"Physical register file EX0 port",
 135	"Physical register file EX1 port",
 136	"Physical register file AG0 port",
 137	"Physical register file AG1 port",
 138	"Flag register file",
 139	"DE error occurred",
 140	"Retire status queue"
 141};
 142
 143static const char * const mc6_mce_desc[] = {
 144	"Hardware Assertion",
 145	"Free List",
 146	"Physical Register File",
 147	"Retire Queue",
 148	"Scheduler table",
 149	"Status Register File",
 150};
 151
 152/* Scalable MCA error strings */
 153static const char * const smca_ls_mce_desc[] = {
 154	"Load queue parity",
 155	"Store queue parity",
 156	"Miss address buffer payload parity",
 157	"L1 TLB parity",
 158	"Reserved",
 159	"DC tag error type 6",
 160	"DC tag error type 1",
 161	"Internal error type 1",
 162	"Internal error type 2",
 163	"Sys Read data error thread 0",
 164	"Sys read data error thread 1",
 165	"DC tag error type 2",
 166	"DC data error type 1 (poison consumption)",
 167	"DC data error type 2",
 168	"DC data error type 3",
 169	"DC tag error type 4",
 170	"L2 TLB parity",
 171	"PDC parity error",
 172	"DC tag error type 3",
 173	"DC tag error type 5",
 174	"L2 fill data error",
 175};
 176
 177static const char * const smca_if_mce_desc[] = {
 178	"microtag probe port parity error",
 179	"IC microtag or full tag multi-hit error",
 180	"IC full tag parity",
 181	"IC data array parity",
 182	"Decoupling queue phys addr parity error",
 183	"L0 ITLB parity error",
 184	"L1 ITLB parity error",
 185	"L2 ITLB parity error",
 186	"BPQ snoop parity on Thread 0",
 187	"BPQ snoop parity on Thread 1",
 188	"L1 BTB multi-match error",
 189	"L2 BTB multi-match error",
 190	"L2 Cache Response Poison error",
 191	"System Read Data error",
 192};
 193
 194static const char * const smca_l2_mce_desc[] = {
 195	"L2M tag multi-way-hit error",
 196	"L2M tag ECC error",
 197	"L2M data ECC error",
 198	"HW assert",
 199};
 200
 201static const char * const smca_de_mce_desc[] = {
 202	"uop cache tag parity error",
 203	"uop cache data parity error",
 204	"Insn buffer parity error",
 205	"uop queue parity error",
 206	"Insn dispatch queue parity error",
 207	"Fetch address FIFO parity",
 208	"Patch RAM data parity",
 209	"Patch RAM sequencer parity",
 210	"uop buffer parity"
 211};
 212
 213static const char * const smca_ex_mce_desc[] = {
 214	"Watchdog timeout error",
 215	"Phy register file parity",
 216	"Flag register file parity",
 217	"Immediate displacement register file parity",
 218	"Address generator payload parity",
 219	"EX payload parity",
 220	"Checkpoint queue parity",
 221	"Retire dispatch queue parity",
 222	"Retire status queue parity error",
 223	"Scheduling queue parity error",
 224	"Branch buffer queue parity error",
 225};
 226
 227static const char * const smca_fp_mce_desc[] = {
 228	"Physical register file parity",
 229	"Freelist parity error",
 230	"Schedule queue parity",
 231	"NSQ parity error",
 232	"Retire queue parity",
 233	"Status register file parity",
 234	"Hardware assertion",
 235};
 236
 237static const char * const smca_l3_mce_desc[] = {
 238	"Shadow tag macro ECC error",
 239	"Shadow tag macro multi-way-hit error",
 240	"L3M tag ECC error",
 241	"L3M tag multi-way-hit error",
 242	"L3M data ECC error",
 243	"XI parity, L3 fill done channel error",
 244	"L3 victim queue parity",
 245	"L3 HW assert",
 246};
 247
 248static const char * const smca_cs_mce_desc[] = {
 249	"Illegal request from transport layer",
 250	"Address violation",
 251	"Security violation",
 252	"Illegal response from transport layer",
 253	"Unexpected response",
 254	"Parity error on incoming request or probe response data",
 255	"Parity error on incoming read response data",
 256	"Atomic request parity",
 257	"ECC error on probe filter access",
 258};
 259
 260static const char * const smca_pie_mce_desc[] = {
 261	"HW assert",
 262	"Internal PIE register security violation",
 263	"Error on GMI link",
 264	"Poison data written to internal PIE register",
 265};
 266
 267static const char * const smca_umc_mce_desc[] = {
 268	"DRAM ECC error",
 269	"Data poison error on DRAM",
 270	"SDP parity error",
 271	"Advanced peripheral bus error",
 272	"Command/address parity error",
 273	"Write data CRC error",
 274};
 275
 276static const char * const smca_pb_mce_desc[] = {
 277	"Parameter Block RAM ECC error",
 278};
 279
 280static const char * const smca_psp_mce_desc[] = {
 281	"PSP RAM ECC or parity error",
 282};
 283
 284static const char * const smca_smu_mce_desc[] = {
 285	"SMU RAM ECC or parity error",
 286};
 287
 288struct smca_mce_desc {
 289	const char * const *descs;
 290	unsigned int num_descs;
 291};
 292
 293static struct smca_mce_desc smca_mce_descs[] = {
 294	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
 295	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
 296	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
 297	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
 298	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
 299	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
 300	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
 301	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
 302	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
 303	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
 304	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
 305	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
 306	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
 307};
 308
 309static bool f12h_mc0_mce(u16 ec, u8 xec)
 310{
 311	bool ret = false;
 312
 313	if (MEM_ERROR(ec)) {
 314		u8 ll = LL(ec);
 315		ret = true;
 316
 317		if (ll == LL_L2)
 318			pr_cont("during L1 linefill from L2.\n");
 319		else if (ll == LL_L1)
 320			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
 321		else
 322			ret = false;
 323	}
 324	return ret;
 325}
 326
 327static bool f10h_mc0_mce(u16 ec, u8 xec)
 328{
 329	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
 330		pr_cont("during data scrub.\n");
 331		return true;
 332	}
 333	return f12h_mc0_mce(ec, xec);
 334}
 335
 336static bool k8_mc0_mce(u16 ec, u8 xec)
 337{
 338	if (BUS_ERROR(ec)) {
 339		pr_cont("during system linefill.\n");
 340		return true;
 341	}
 342
 343	return f10h_mc0_mce(ec, xec);
 344}
 345
 346static bool cat_mc0_mce(u16 ec, u8 xec)
 347{
 348	u8 r4	 = R4(ec);
 349	bool ret = true;
 350
 351	if (MEM_ERROR(ec)) {
 352
 353		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
 354			return false;
 355
 356		switch (r4) {
 357		case R4_DRD:
 358		case R4_DWR:
 359			pr_cont("Data/Tag parity error due to %s.\n",
 360				(r4 == R4_DRD ? "load/hw prf" : "store"));
 361			break;
 362		case R4_EVICT:
 363			pr_cont("Copyback parity error on a tag miss.\n");
 364			break;
 365		case R4_SNOOP:
 366			pr_cont("Tag parity error during snoop.\n");
 367			break;
 368		default:
 369			ret = false;
 370		}
 371	} else if (BUS_ERROR(ec)) {
 372
 373		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
 374			return false;
 375
 376		pr_cont("System read data error on a ");
 377
 378		switch (r4) {
 379		case R4_RD:
 380			pr_cont("TLB reload.\n");
 381			break;
 382		case R4_DWR:
 383			pr_cont("store.\n");
 384			break;
 385		case R4_DRD:
 386			pr_cont("load.\n");
 387			break;
 388		default:
 389			ret = false;
 390		}
 391	} else {
 392		ret = false;
 393	}
 394
 395	return ret;
 396}
 397
 398static bool f15h_mc0_mce(u16 ec, u8 xec)
 399{
 400	bool ret = true;
 401
 402	if (MEM_ERROR(ec)) {
 403
 404		switch (xec) {
 405		case 0x0:
 406			pr_cont("Data Array access error.\n");
 407			break;
 408
 409		case 0x1:
 410			pr_cont("UC error during a linefill from L2/NB.\n");
 411			break;
 412
 413		case 0x2:
 414		case 0x11:
 415			pr_cont("STQ access error.\n");
 416			break;
 417
 418		case 0x3:
 419			pr_cont("SCB access error.\n");
 420			break;
 421
 422		case 0x10:
 423			pr_cont("Tag error.\n");
 424			break;
 425
 426		case 0x12:
 427			pr_cont("LDQ access error.\n");
 428			break;
 429
 430		default:
 431			ret = false;
 432		}
 433	} else if (BUS_ERROR(ec)) {
 434
 435		if (!xec)
 436			pr_cont("System Read Data Error.\n");
 437		else
 438			pr_cont(" Internal error condition type %d.\n", xec);
 439	} else if (INT_ERROR(ec)) {
 440		if (xec <= 0x1f)
 441			pr_cont("Hardware Assert.\n");
 442		else
 443			ret = false;
 444
 445	} else
 446		ret = false;
 447
 448	return ret;
 449}
 450
 451static void decode_mc0_mce(struct mce *m)
 452{
 453	u16 ec = EC(m->status);
 454	u8 xec = XEC(m->status, xec_mask);
 455
 456	pr_emerg(HW_ERR "MC0 Error: ");
 457
 458	/* TLB error signatures are the same across families */
 459	if (TLB_ERROR(ec)) {
 460		if (TT(ec) == TT_DATA) {
 461			pr_cont("%s TLB %s.\n", LL_MSG(ec),
 462				((xec == 2) ? "locked miss"
 463					    : (xec ? "multimatch" : "parity")));
 464			return;
 465		}
 466	} else if (fam_ops->mc0_mce(ec, xec))
 467		;
 468	else
 469		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
 470}
 471
 472static bool k8_mc1_mce(u16 ec, u8 xec)
 473{
 474	u8 ll	 = LL(ec);
 475	bool ret = true;
 476
 477	if (!MEM_ERROR(ec))
 478		return false;
 479
 480	if (ll == 0x2)
 481		pr_cont("during a linefill from L2.\n");
 482	else if (ll == 0x1) {
 483		switch (R4(ec)) {
 484		case R4_IRD:
 485			pr_cont("Parity error during data load.\n");
 486			break;
 487
 488		case R4_EVICT:
 489			pr_cont("Copyback Parity/Victim error.\n");
 490			break;
 491
 492		case R4_SNOOP:
 493			pr_cont("Tag Snoop error.\n");
 494			break;
 495
 496		default:
 497			ret = false;
 498			break;
 499		}
 500	} else
 501		ret = false;
 502
 503	return ret;
 504}
 505
 506static bool cat_mc1_mce(u16 ec, u8 xec)
 507{
 508	u8 r4    = R4(ec);
 509	bool ret = true;
 510
 511	if (!MEM_ERROR(ec))
 512		return false;
 513
 514	if (TT(ec) != TT_INSTR)
 515		return false;
 516
 517	if (r4 == R4_IRD)
 518		pr_cont("Data/tag array parity error for a tag hit.\n");
 519	else if (r4 == R4_SNOOP)
 520		pr_cont("Tag error during snoop/victimization.\n");
 521	else if (xec == 0x0)
 522		pr_cont("Tag parity error from victim castout.\n");
 523	else if (xec == 0x2)
 524		pr_cont("Microcode patch RAM parity error.\n");
 525	else
 526		ret = false;
 527
 528	return ret;
 529}
 530
 531static bool f15h_mc1_mce(u16 ec, u8 xec)
 532{
 533	bool ret = true;
 534
 535	if (!MEM_ERROR(ec))
 536		return false;
 537
 538	switch (xec) {
 539	case 0x0 ... 0xa:
 540		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
 541		break;
 542
 543	case 0xd:
 544		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
 545		break;
 546
 547	case 0x10:
 548		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
 549		break;
 550
 551	case 0x11 ... 0x15:
 552		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
 553		break;
 554
 555	default:
 556		ret = false;
 557	}
 558	return ret;
 559}
 560
 561static void decode_mc1_mce(struct mce *m)
 562{
 563	u16 ec = EC(m->status);
 564	u8 xec = XEC(m->status, xec_mask);
 565
 566	pr_emerg(HW_ERR "MC1 Error: ");
 567
 568	if (TLB_ERROR(ec))
 569		pr_cont("%s TLB %s.\n", LL_MSG(ec),
 570			(xec ? "multimatch" : "parity error"));
 571	else if (BUS_ERROR(ec)) {
 572		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
 573
 574		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
 575	} else if (INT_ERROR(ec)) {
 576		if (xec <= 0x3f)
 577			pr_cont("Hardware Assert.\n");
 578		else
 579			goto wrong_mc1_mce;
 580	} else if (fam_ops->mc1_mce(ec, xec))
 581		;
 582	else
 583		goto wrong_mc1_mce;
 584
 585	return;
 586
 587wrong_mc1_mce:
 588	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
 589}
 590
 591static bool k8_mc2_mce(u16 ec, u8 xec)
 592{
 593	bool ret = true;
 594
 595	if (xec == 0x1)
 596		pr_cont(" in the write data buffers.\n");
 597	else if (xec == 0x3)
 598		pr_cont(" in the victim data buffers.\n");
 599	else if (xec == 0x2 && MEM_ERROR(ec))
 600		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
 601	else if (xec == 0x0) {
 602		if (TLB_ERROR(ec))
 603			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
 604				TT_MSG(ec));
 605		else if (BUS_ERROR(ec))
 606			pr_cont(": %s/ECC error in data read from NB: %s.\n",
 607				R4_MSG(ec), PP_MSG(ec));
 608		else if (MEM_ERROR(ec)) {
 609			u8 r4 = R4(ec);
 610
 611			if (r4 >= 0x7)
 612				pr_cont(": %s error during data copyback.\n",
 613					R4_MSG(ec));
 614			else if (r4 <= 0x1)
 615				pr_cont(": %s parity/ECC error during data "
 616					"access from L2.\n", R4_MSG(ec));
 617			else
 618				ret = false;
 619		} else
 620			ret = false;
 621	} else
 622		ret = false;
 623
 624	return ret;
 625}
 626
 627static bool f15h_mc2_mce(u16 ec, u8 xec)
 628{
 629	bool ret = true;
 630
 631	if (TLB_ERROR(ec)) {
 632		if (xec == 0x0)
 633			pr_cont("Data parity TLB read error.\n");
 634		else if (xec == 0x1)
 635			pr_cont("Poison data provided for TLB fill.\n");
 636		else
 637			ret = false;
 638	} else if (BUS_ERROR(ec)) {
 639		if (xec > 2)
 640			ret = false;
 641
 642		pr_cont("Error during attempted NB data read.\n");
 643	} else if (MEM_ERROR(ec)) {
 644		switch (xec) {
 645		case 0x4 ... 0xc:
 646			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
 647			break;
 648
 649		case 0x10 ... 0x14:
 650			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
 651			break;
 652
 653		default:
 654			ret = false;
 655		}
 656	} else if (INT_ERROR(ec)) {
 657		if (xec <= 0x3f)
 658			pr_cont("Hardware Assert.\n");
 659		else
 660			ret = false;
 661	}
 662
 663	return ret;
 664}
 665
 666static bool f16h_mc2_mce(u16 ec, u8 xec)
 667{
 668	u8 r4 = R4(ec);
 669
 670	if (!MEM_ERROR(ec))
 671		return false;
 672
 673	switch (xec) {
 674	case 0x04 ... 0x05:
 675		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
 676		break;
 677
 678	case 0x09 ... 0x0b:
 679	case 0x0d ... 0x0f:
 680		pr_cont("ECC error in L2 tag (%s).\n",
 681			((r4 == R4_GEN)   ? "BankReq" :
 682			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
 683		break;
 684
 685	case 0x10 ... 0x19:
 686	case 0x1b:
 687		pr_cont("ECC error in L2 data array (%s).\n",
 688			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
 689			((r4 == R4_GEN)   ? "Attr" :
 690			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
 691		break;
 692
 693	case 0x1c ... 0x1d:
 694	case 0x1f:
 695		pr_cont("Parity error in L2 attribute bits (%s).\n",
 696			((r4 == R4_RD)  ? "Hit"  :
 697			((r4 == R4_GEN) ? "Attr" : "Fill")));
 698		break;
 699
 700	default:
 701		return false;
 702	}
 703
 704	return true;
 705}
 706
 707static void decode_mc2_mce(struct mce *m)
 708{
 709	u16 ec = EC(m->status);
 710	u8 xec = XEC(m->status, xec_mask);
 711
 712	pr_emerg(HW_ERR "MC2 Error: ");
 713
 714	if (!fam_ops->mc2_mce(ec, xec))
 715		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
 716}
 717
 718static void decode_mc3_mce(struct mce *m)
 719{
 720	u16 ec = EC(m->status);
 721	u8 xec = XEC(m->status, xec_mask);
 722
 723	if (boot_cpu_data.x86 >= 0x14) {
 724		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
 725			 " please report on LKML.\n");
 726		return;
 727	}
 728
 729	pr_emerg(HW_ERR "MC3 Error");
 730
 731	if (xec == 0x0) {
 732		u8 r4 = R4(ec);
 733
 734		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
 735			goto wrong_mc3_mce;
 736
 737		pr_cont(" during %s.\n", R4_MSG(ec));
 738	} else
 739		goto wrong_mc3_mce;
 740
 741	return;
 742
 743 wrong_mc3_mce:
 744	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
 745}
 746
 747static void decode_mc4_mce(struct mce *m)
 748{
 749	unsigned int fam = x86_family(m->cpuid);
 750	int node_id = amd_get_nb_id(m->extcpu);
 751	u16 ec = EC(m->status);
 752	u8 xec = XEC(m->status, 0x1f);
 753	u8 offset = 0;
 754
 755	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
 756
 757	switch (xec) {
 758	case 0x0 ... 0xe:
 759
 760		/* special handling for DRAM ECCs */
 761		if (xec == 0x0 || xec == 0x8) {
 762			/* no ECCs on F11h */
 763			if (fam == 0x11)
 764				goto wrong_mc4_mce;
 765
 766			pr_cont("%s.\n", mc4_mce_desc[xec]);
 767
 768			if (decode_dram_ecc)
 769				decode_dram_ecc(node_id, m);
 770			return;
 771		}
 772		break;
 773
 774	case 0xf:
 775		if (TLB_ERROR(ec))
 776			pr_cont("GART Table Walk data error.\n");
 777		else if (BUS_ERROR(ec))
 778			pr_cont("DMA Exclusion Vector Table Walk error.\n");
 779		else
 780			goto wrong_mc4_mce;
 781		return;
 782
 783	case 0x19:
 784		if (fam == 0x15 || fam == 0x16)
 785			pr_cont("Compute Unit Data Error.\n");
 786		else
 787			goto wrong_mc4_mce;
 788		return;
 789
 790	case 0x1c ... 0x1f:
 791		offset = 13;
 792		break;
 793
 794	default:
 795		goto wrong_mc4_mce;
 796	}
 797
 798	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
 799	return;
 800
 801 wrong_mc4_mce:
 802	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
 803}
 804
 805static void decode_mc5_mce(struct mce *m)
 806{
 807	unsigned int fam = x86_family(m->cpuid);
 808	u16 ec = EC(m->status);
 809	u8 xec = XEC(m->status, xec_mask);
 810
 811	if (fam == 0xf || fam == 0x11)
 812		goto wrong_mc5_mce;
 813
 814	pr_emerg(HW_ERR "MC5 Error: ");
 815
 816	if (INT_ERROR(ec)) {
 817		if (xec <= 0x1f) {
 818			pr_cont("Hardware Assert.\n");
 819			return;
 820		} else
 821			goto wrong_mc5_mce;
 822	}
 823
 824	if (xec == 0x0 || xec == 0xc)
 825		pr_cont("%s.\n", mc5_mce_desc[xec]);
 826	else if (xec <= 0xd)
 827		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
 828	else
 829		goto wrong_mc5_mce;
 830
 831	return;
 832
 833 wrong_mc5_mce:
 834	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
 835}
 836
 837static void decode_mc6_mce(struct mce *m)
 838{
 839	u8 xec = XEC(m->status, xec_mask);
 840
 841	pr_emerg(HW_ERR "MC6 Error: ");
 842
 843	if (xec > 0x5)
 844		goto wrong_mc6_mce;
 845
 846	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
 847	return;
 848
 849 wrong_mc6_mce:
 850	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 851}
 852
 853/* Decode errors according to Scalable MCA specification */
 854static void decode_smca_error(struct mce *m)
 855{
 856	struct smca_hwid *hwid;
 857	unsigned int bank_type;
 858	const char *ip_name;
 859	u8 xec = XEC(m->status, xec_mask);
 860
 861	if (m->bank >= ARRAY_SIZE(smca_banks))
 862		return;
 863
 864	if (x86_family(m->cpuid) >= 0x17 && m->bank == 4)
 865		pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");
 866
 867	hwid = smca_banks[m->bank].hwid;
 868	if (!hwid)
 869		return;
 870
 871	bank_type = hwid->bank_type;
 872	ip_name = smca_get_long_name(bank_type);
 873
 874	pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
 875
 876	/* Only print the decode of valid error codes */
 877	if (xec < smca_mce_descs[bank_type].num_descs &&
 878			(hwid->xec_bitmap & BIT_ULL(xec))) {
 879		pr_emerg(HW_ERR "%s Error: ", ip_name);
 880		pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
 881	}
 882
 883	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
 884		decode_dram_ecc(cpu_to_node(m->extcpu), m);
 885}
 886
 887static inline void amd_decode_err_code(u16 ec)
 888{
 889	if (INT_ERROR(ec)) {
 890		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
 891		return;
 892	}
 893
 894	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
 895
 896	if (BUS_ERROR(ec))
 897		pr_cont(", mem/io: %s", II_MSG(ec));
 898	else
 899		pr_cont(", tx: %s", TT_MSG(ec));
 900
 901	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
 902		pr_cont(", mem-tx: %s", R4_MSG(ec));
 903
 904		if (BUS_ERROR(ec))
 905			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
 906	}
 907
 908	pr_cont("\n");
 909}
 910
 911/*
 912 * Filter out unwanted MCE signatures here.
 913 */
 914static bool amd_filter_mce(struct mce *m)
 915{
 916	/*
 917	 * NB GART TLB error reporting is disabled by default.
 918	 */
 919	if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
 920		return true;
 921
 922	return false;
 923}
 924
 925static const char *decode_error_status(struct mce *m)
 926{
 927	if (m->status & MCI_STATUS_UC) {
 928		if (m->status & MCI_STATUS_PCC)
 929			return "System Fatal error.";
 930		if (m->mcgstatus & MCG_STATUS_RIPV)
 931			return "Uncorrected, software restartable error.";
 932		return "Uncorrected, software containable error.";
 933	}
 934
 935	if (m->status & MCI_STATUS_DEFERRED)
 936		return "Deferred error, no action required.";
 937
 938	return "Corrected error, no action required.";
 939}
 940
 941static int
 942amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 943{
 944	struct mce *m = (struct mce *)data;
 945	unsigned int fam = x86_family(m->cpuid);
 946	int ecc;
 947
 948	if (amd_filter_mce(m))
 949		return NOTIFY_STOP;
 950
 951	pr_emerg(HW_ERR "%s\n", decode_error_status(m));
 952
 953	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
 954		m->extcpu,
 955		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
 956		m->bank,
 957		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
 958		((m->status & MCI_STATUS_UC)	? "UE"	  :
 959		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
 960		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
 961		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
 962		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));
 963
 964	if (fam >= 0x15) {
 965		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
 966
 967		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
 968		if (fam != 0x15 || m->bank != 4)
 969			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
 970	}
 971
 972	if (boot_cpu_has(X86_FEATURE_SMCA)) {
 973		u32 low, high;
 974		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
 975
 976		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
 977
 978		if (!rdmsr_safe(addr, &low, &high) &&
 979		    (low & MCI_CONFIG_MCAX))
 980			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
 981	}
 982
 983	/* do the two bits[14:13] together */
 984	ecc = (m->status >> 45) & 0x3;
 985	if (ecc)
 986		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
 987
 988	pr_cont("]: 0x%016llx\n", m->status);
 989
 990	if (m->status & MCI_STATUS_ADDRV)
 991		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
 992
 993	if (boot_cpu_has(X86_FEATURE_SMCA)) {
 994		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
 995
 996		if (m->status & MCI_STATUS_SYNDV)
 997			pr_cont(", Syndrome: 0x%016llx", m->synd);
 998
 999		pr_cont("\n");
1000
1001		decode_smca_error(m);
1002		goto err_code;
1003	}
1004
1005	if (m->tsc)
1006		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1007
1008	if (!fam_ops)
1009		goto err_code;
1010
1011	switch (m->bank) {
1012	case 0:
1013		decode_mc0_mce(m);
1014		break;
1015
1016	case 1:
1017		decode_mc1_mce(m);
1018		break;
1019
1020	case 2:
1021		decode_mc2_mce(m);
1022		break;
1023
1024	case 3:
1025		decode_mc3_mce(m);
1026		break;
1027
1028	case 4:
1029		decode_mc4_mce(m);
1030		break;
1031
1032	case 5:
1033		decode_mc5_mce(m);
1034		break;
1035
1036	case 6:
1037		decode_mc6_mce(m);
1038		break;
1039
1040	default:
1041		break;
1042	}
1043
1044 err_code:
1045	amd_decode_err_code(m->status & 0xffff);
1046
1047	return NOTIFY_STOP;
1048}
1049
1050static struct notifier_block amd_mce_dec_nb = {
1051	.notifier_call	= amd_decode_mce,
1052	.priority	= MCE_PRIO_EDAC,
1053};
1054
1055static int __init mce_amd_init(void)
1056{
1057	struct cpuinfo_x86 *c = &boot_cpu_data;
1058
1059	if (c->x86_vendor != X86_VENDOR_AMD)
1060		return -ENODEV;
1061
1062	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1063	if (!fam_ops)
1064		return -ENOMEM;
1065
1066	switch (c->x86) {
1067	case 0xf:
1068		fam_ops->mc0_mce = k8_mc0_mce;
1069		fam_ops->mc1_mce = k8_mc1_mce;
1070		fam_ops->mc2_mce = k8_mc2_mce;
1071		break;
1072
1073	case 0x10:
1074		fam_ops->mc0_mce = f10h_mc0_mce;
1075		fam_ops->mc1_mce = k8_mc1_mce;
1076		fam_ops->mc2_mce = k8_mc2_mce;
1077		break;
1078
1079	case 0x11:
1080		fam_ops->mc0_mce = k8_mc0_mce;
1081		fam_ops->mc1_mce = k8_mc1_mce;
1082		fam_ops->mc2_mce = k8_mc2_mce;
1083		break;
1084
1085	case 0x12:
1086		fam_ops->mc0_mce = f12h_mc0_mce;
1087		fam_ops->mc1_mce = k8_mc1_mce;
1088		fam_ops->mc2_mce = k8_mc2_mce;
1089		break;
1090
1091	case 0x14:
1092		fam_ops->mc0_mce = cat_mc0_mce;
1093		fam_ops->mc1_mce = cat_mc1_mce;
1094		fam_ops->mc2_mce = k8_mc2_mce;
1095		break;
1096
1097	case 0x15:
1098		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1099
1100		fam_ops->mc0_mce = f15h_mc0_mce;
1101		fam_ops->mc1_mce = f15h_mc1_mce;
1102		fam_ops->mc2_mce = f15h_mc2_mce;
1103		break;
1104
1105	case 0x16:
1106		xec_mask = 0x1f;
1107		fam_ops->mc0_mce = cat_mc0_mce;
1108		fam_ops->mc1_mce = cat_mc1_mce;
1109		fam_ops->mc2_mce = f16h_mc2_mce;
1110		break;
1111
1112	case 0x17:
1113		xec_mask = 0x3f;
1114		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1115			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1116			goto err_out;
1117		}
1118		break;
1119
1120	default:
1121		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1122		goto err_out;
1123	}
1124
1125	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1126
1127	mce_register_decode_chain(&amd_mce_dec_nb);
1128
1129	return 0;
1130
1131err_out:
1132	kfree(fam_ops);
1133	fam_ops = NULL;
1134	return -EINVAL;
1135}
1136early_initcall(mce_amd_init);
1137
#ifdef MODULE
/*
 * Module teardown: leave the MCE decode chain first so that the decoder
 * can no longer be invoked, then release the per-family ops.
 */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
#endif