PageRenderTime 4ms CodeModel.GetById 56ms app.highlight 78ms RepoModel.GetById 1ms app.codeStats 0ms

/drivers/edac/mce_amd.c

https://bitbucket.org/emiliolopez/linux
C | 1153 lines | 913 code | 211 blank | 29 comment | 207 complexity | 770951bdedec7fd062241347b6afbe2d MD5 | raw file
   1#include <linux/module.h>
   2#include <linux/slab.h>
   3
   4#include "mce_amd.h"
   5
/*
 * Per-family MC0/MC1/MC2 decode callbacks. Allocated and filled in by
 * mce_amd_init(); NULL when initialization did not complete.
 */
static struct amd_decoder_ops *fam_ops;

/*
 * Mask handed to XEC() when extracting the extended error code. Defaults to
 * 0xf; mce_amd_init() widens it to 0x1f or 0x3f for families 0x15-0x17.
 */
static u8 xec_mask	 = 0xf;

/* When false, amd_filter_mce() drops bank 4 errors with extended code 0x5. */
static bool report_gart_errors;

/*
 * Optional DRAM ECC decoder hook, set through amd_register_ecc_decoder() and
 * called from decode_mc4_mce() and decode_smca_errors() when non-NULL.
 */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
  12
/*
 * Enable or disable reporting of the GART errors that amd_filter_mce()
 * otherwise suppresses.
 *
 * @v: true to let those errors through, false to filter them out.
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
  18
/*
 * Install the callback used to decode DRAM ECC errors.
 *
 * @f: callback taking a node id and the MCE record. Any previously installed
 *     callback is replaced without notice.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
  24
  25void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
  26{
  27	if (decode_dram_ecc) {
  28		WARN_ON(decode_dram_ecc != f);
  29
  30		decode_dram_ecc = NULL;
  31	}
  32}
  33EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
  34
  35/*
  36 * string representation for the different MCA reported error types, see F3x48
  37 * or MSR0000_0411.
  38 */
  39
/*
 * Each table below maps the numeric value of one error-code field to its
 * printable name.
 * NOTE(review): the lookups appear to go through the *_MSG() helpers
 * (TT_MSG(), LL_MSG(), R4_MSG(), ...) which are used in this file but defined
 * elsewhere - confirm that the field widths cannot exceed the table sizes.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor; exported for use outside this file */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
  63
/*
 * Family 15h MC1 error descriptions, consumed by f15h_mc1_mce().
 *
 * The valid extended error codes (xec) are not contiguous, so the table is
 * packed and the lookup subtracts a per-range offset:
 *   xec 0x0  - 0xa  -> index xec
 *   xec 0xd         -> index xec - 2
 *   xec 0x10 - 0x15 -> index xec - 4
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};
  84
/*
 * Family 15h MC2 error descriptions, consumed by f15h_mc2_mce() for memory
 * errors. The table is packed; the lookup subtracts a per-range offset:
 *   xec 0x4  - 0xc  -> index xec - 0x4
 *   xec 0x10 - 0x14 -> index xec - 0x7
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};
 101
/*
 * MC4 error descriptions, consumed by decode_mc4_mce().
 *
 * Extended error codes 0x0-0xe index the table directly; codes 0x1c-0x1f are
 * looked up at index xec - 13. Codes 0xf and 0x19 are handled without this
 * table.
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
 123
/*
 * MC5 error descriptions, indexed directly by extended error code 0x0-0xd in
 * decode_mc5_mce(). All entries except 0x0 and 0xc are printed with a
 * trailing " parity error".
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};
 140
/*
 * MC6 error descriptions, indexed directly by extended error code 0x0-0x5 in
 * decode_mc6_mce(), which prints each entry followed by " parity error".
 */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};
 149
/* Scalable MCA error strings */

/*
 * Descriptions for banks of type SMCA_LS. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"Reserved",
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};
 174
/*
 * Descriptions for banks of type SMCA_IF. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_if_mce_desc[] = {
	"microtag probe port parity error",
	"IC microtag or full tag multi-hit error",
	"IC full tag parity",
	"IC data array parity",
	"Decoupling queue phys addr parity error",
	"L0 ITLB parity error",
	"L1 ITLB parity error",
	"L2 ITLB parity error",
	"BPQ snoop parity on Thread 0",
	"BPQ snoop parity on Thread 1",
	"L1 BTB multi-match error",
	"L2 BTB multi-match error",
	"L2 Cache Response Poison error",
	"System Read Data error",
};
 191
/*
 * Descriptions for banks of type SMCA_L2_CACHE. Registered in
 * smca_mce_descs[] and indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_l2_mce_desc[] = {
	"L2M tag multi-way-hit error",
	"L2M tag ECC error",
	"L2M data ECC error",
	"HW assert",
};
 198
/*
 * Descriptions for banks of type SMCA_DE. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_de_mce_desc[] = {
	"uop cache tag parity error",
	"uop cache data parity error",
	"Insn buffer parity error",
	"uop queue parity error",
	"Insn dispatch queue parity error",
	"Fetch address FIFO parity",
	"Patch RAM data parity",
	"Patch RAM sequencer parity",
	"uop buffer parity"
};
 210
/*
 * Descriptions for banks of type SMCA_EX. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_ex_mce_desc[] = {
	"Watchdog timeout error",
	"Phy register file parity",
	"Flag register file parity",
	"Immediate displacement register file parity",
	"Address generator payload parity",
	"EX payload parity",
	"Checkpoint queue parity",
	"Retire dispatch queue parity",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
};
 224
/*
 * Descriptions for banks of type SMCA_FP. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_fp_mce_desc[] = {
	"Physical register file parity",
	"Freelist parity error",
	"Schedule queue parity",
	"NSQ parity error",
	"Retire queue parity",
	"Status register file parity",
	"Hardware assertion",
};
 234
/*
 * Descriptions for banks of type SMCA_L3_CACHE. Registered in
 * smca_mce_descs[] and indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_l3_mce_desc[] = {
	"Shadow tag macro ECC error",
	"Shadow tag macro multi-way-hit error",
	"L3M tag ECC error",
	"L3M tag multi-way-hit error",
	"L3M data ECC error",
	"XI parity, L3 fill done channel error",
	"L3 victim queue parity",
	"L3 HW assert",
};
 245
/*
 * Descriptions for banks of type SMCA_CS. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_cs_mce_desc[] = {
	"Illegal request from transport layer",
	"Address violation",
	"Security violation",
	"Illegal response from transport layer",
	"Unexpected response",
	"Parity error on incoming request or probe response data",
	"Parity error on incoming read response data",
	"Atomic request parity",
	"ECC error on probe filter access",
};
 257
/*
 * Descriptions for banks of type SMCA_PIE. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_pie_mce_desc[] = {
	"HW assert",
	"Internal PIE register security violation",
	"Error on GMI link",
	"Poison data written to internal PIE register",
};
 264
/*
 * Descriptions for banks of type SMCA_UMC. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors(), which additionally
 * hands code 0 ("DRAM ECC error") to the registered DRAM ECC decoder.
 */
static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error on DRAM",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Command/address parity error",
	"Write data CRC error",
};
 273
/*
 * Descriptions for banks of type SMCA_PB. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_pb_mce_desc[] = {
	"Parameter Block RAM ECC error",
};
 277
/*
 * Descriptions for banks of type SMCA_PSP. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_psp_mce_desc[] = {
	"PSP RAM ECC or parity error",
};
 281
/*
 * Descriptions for banks of type SMCA_SMU. Registered in smca_mce_descs[] and
 * indexed by extended error code in decode_smca_errors().
 */
static const char * const smca_smu_mce_desc[] = {
	"SMU RAM ECC or parity error",
};
 285
/* One SMCA description table together with its length. */
struct smca_mce_desc {
	const char * const *descs;	/* strings, indexed by extended error code */
	unsigned int num_descs;		/* number of entries in @descs */
};
 290
/*
 * Description tables per SMCA bank type. decode_smca_errors() selects an
 * entry with the bank type taken from the bank's hwid and bounds-checks the
 * extended error code against num_descs before indexing descs.
 */
static struct smca_mce_desc smca_mce_descs[] = {
	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
};
 306
 307static bool f12h_mc0_mce(u16 ec, u8 xec)
 308{
 309	bool ret = false;
 310
 311	if (MEM_ERROR(ec)) {
 312		u8 ll = LL(ec);
 313		ret = true;
 314
 315		if (ll == LL_L2)
 316			pr_cont("during L1 linefill from L2.\n");
 317		else if (ll == LL_L1)
 318			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
 319		else
 320			ret = false;
 321	}
 322	return ret;
 323}
 324
 325static bool f10h_mc0_mce(u16 ec, u8 xec)
 326{
 327	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
 328		pr_cont("during data scrub.\n");
 329		return true;
 330	}
 331	return f12h_mc0_mce(ec, xec);
 332}
 333
 334static bool k8_mc0_mce(u16 ec, u8 xec)
 335{
 336	if (BUS_ERROR(ec)) {
 337		pr_cont("during system linefill.\n");
 338		return true;
 339	}
 340
 341	return f10h_mc0_mce(ec, xec);
 342}
 343
 344static bool cat_mc0_mce(u16 ec, u8 xec)
 345{
 346	u8 r4	 = R4(ec);
 347	bool ret = true;
 348
 349	if (MEM_ERROR(ec)) {
 350
 351		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
 352			return false;
 353
 354		switch (r4) {
 355		case R4_DRD:
 356		case R4_DWR:
 357			pr_cont("Data/Tag parity error due to %s.\n",
 358				(r4 == R4_DRD ? "load/hw prf" : "store"));
 359			break;
 360		case R4_EVICT:
 361			pr_cont("Copyback parity error on a tag miss.\n");
 362			break;
 363		case R4_SNOOP:
 364			pr_cont("Tag parity error during snoop.\n");
 365			break;
 366		default:
 367			ret = false;
 368		}
 369	} else if (BUS_ERROR(ec)) {
 370
 371		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
 372			return false;
 373
 374		pr_cont("System read data error on a ");
 375
 376		switch (r4) {
 377		case R4_RD:
 378			pr_cont("TLB reload.\n");
 379			break;
 380		case R4_DWR:
 381			pr_cont("store.\n");
 382			break;
 383		case R4_DRD:
 384			pr_cont("load.\n");
 385			break;
 386		default:
 387			ret = false;
 388		}
 389	} else {
 390		ret = false;
 391	}
 392
 393	return ret;
 394}
 395
 396static bool f15h_mc0_mce(u16 ec, u8 xec)
 397{
 398	bool ret = true;
 399
 400	if (MEM_ERROR(ec)) {
 401
 402		switch (xec) {
 403		case 0x0:
 404			pr_cont("Data Array access error.\n");
 405			break;
 406
 407		case 0x1:
 408			pr_cont("UC error during a linefill from L2/NB.\n");
 409			break;
 410
 411		case 0x2:
 412		case 0x11:
 413			pr_cont("STQ access error.\n");
 414			break;
 415
 416		case 0x3:
 417			pr_cont("SCB access error.\n");
 418			break;
 419
 420		case 0x10:
 421			pr_cont("Tag error.\n");
 422			break;
 423
 424		case 0x12:
 425			pr_cont("LDQ access error.\n");
 426			break;
 427
 428		default:
 429			ret = false;
 430		}
 431	} else if (BUS_ERROR(ec)) {
 432
 433		if (!xec)
 434			pr_cont("System Read Data Error.\n");
 435		else
 436			pr_cont(" Internal error condition type %d.\n", xec);
 437	} else if (INT_ERROR(ec)) {
 438		if (xec <= 0x1f)
 439			pr_cont("Hardware Assert.\n");
 440		else
 441			ret = false;
 442
 443	} else
 444		ret = false;
 445
 446	return ret;
 447}
 448
 449static void decode_mc0_mce(struct mce *m)
 450{
 451	u16 ec = EC(m->status);
 452	u8 xec = XEC(m->status, xec_mask);
 453
 454	pr_emerg(HW_ERR "MC0 Error: ");
 455
 456	/* TLB error signatures are the same across families */
 457	if (TLB_ERROR(ec)) {
 458		if (TT(ec) == TT_DATA) {
 459			pr_cont("%s TLB %s.\n", LL_MSG(ec),
 460				((xec == 2) ? "locked miss"
 461					    : (xec ? "multimatch" : "parity")));
 462			return;
 463		}
 464	} else if (fam_ops->mc0_mce(ec, xec))
 465		;
 466	else
 467		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
 468}
 469
 470static bool k8_mc1_mce(u16 ec, u8 xec)
 471{
 472	u8 ll	 = LL(ec);
 473	bool ret = true;
 474
 475	if (!MEM_ERROR(ec))
 476		return false;
 477
 478	if (ll == 0x2)
 479		pr_cont("during a linefill from L2.\n");
 480	else if (ll == 0x1) {
 481		switch (R4(ec)) {
 482		case R4_IRD:
 483			pr_cont("Parity error during data load.\n");
 484			break;
 485
 486		case R4_EVICT:
 487			pr_cont("Copyback Parity/Victim error.\n");
 488			break;
 489
 490		case R4_SNOOP:
 491			pr_cont("Tag Snoop error.\n");
 492			break;
 493
 494		default:
 495			ret = false;
 496			break;
 497		}
 498	} else
 499		ret = false;
 500
 501	return ret;
 502}
 503
 504static bool cat_mc1_mce(u16 ec, u8 xec)
 505{
 506	u8 r4    = R4(ec);
 507	bool ret = true;
 508
 509	if (!MEM_ERROR(ec))
 510		return false;
 511
 512	if (TT(ec) != TT_INSTR)
 513		return false;
 514
 515	if (r4 == R4_IRD)
 516		pr_cont("Data/tag array parity error for a tag hit.\n");
 517	else if (r4 == R4_SNOOP)
 518		pr_cont("Tag error during snoop/victimization.\n");
 519	else if (xec == 0x0)
 520		pr_cont("Tag parity error from victim castout.\n");
 521	else if (xec == 0x2)
 522		pr_cont("Microcode patch RAM parity error.\n");
 523	else
 524		ret = false;
 525
 526	return ret;
 527}
 528
 529static bool f15h_mc1_mce(u16 ec, u8 xec)
 530{
 531	bool ret = true;
 532
 533	if (!MEM_ERROR(ec))
 534		return false;
 535
 536	switch (xec) {
 537	case 0x0 ... 0xa:
 538		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
 539		break;
 540
 541	case 0xd:
 542		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
 543		break;
 544
 545	case 0x10:
 546		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
 547		break;
 548
 549	case 0x11 ... 0x15:
 550		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
 551		break;
 552
 553	default:
 554		ret = false;
 555	}
 556	return ret;
 557}
 558
 559static void decode_mc1_mce(struct mce *m)
 560{
 561	u16 ec = EC(m->status);
 562	u8 xec = XEC(m->status, xec_mask);
 563
 564	pr_emerg(HW_ERR "MC1 Error: ");
 565
 566	if (TLB_ERROR(ec))
 567		pr_cont("%s TLB %s.\n", LL_MSG(ec),
 568			(xec ? "multimatch" : "parity error"));
 569	else if (BUS_ERROR(ec)) {
 570		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
 571
 572		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
 573	} else if (INT_ERROR(ec)) {
 574		if (xec <= 0x3f)
 575			pr_cont("Hardware Assert.\n");
 576		else
 577			goto wrong_mc1_mce;
 578	} else if (fam_ops->mc1_mce(ec, xec))
 579		;
 580	else
 581		goto wrong_mc1_mce;
 582
 583	return;
 584
 585wrong_mc1_mce:
 586	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
 587}
 588
 589static bool k8_mc2_mce(u16 ec, u8 xec)
 590{
 591	bool ret = true;
 592
 593	if (xec == 0x1)
 594		pr_cont(" in the write data buffers.\n");
 595	else if (xec == 0x3)
 596		pr_cont(" in the victim data buffers.\n");
 597	else if (xec == 0x2 && MEM_ERROR(ec))
 598		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
 599	else if (xec == 0x0) {
 600		if (TLB_ERROR(ec))
 601			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
 602				TT_MSG(ec));
 603		else if (BUS_ERROR(ec))
 604			pr_cont(": %s/ECC error in data read from NB: %s.\n",
 605				R4_MSG(ec), PP_MSG(ec));
 606		else if (MEM_ERROR(ec)) {
 607			u8 r4 = R4(ec);
 608
 609			if (r4 >= 0x7)
 610				pr_cont(": %s error during data copyback.\n",
 611					R4_MSG(ec));
 612			else if (r4 <= 0x1)
 613				pr_cont(": %s parity/ECC error during data "
 614					"access from L2.\n", R4_MSG(ec));
 615			else
 616				ret = false;
 617		} else
 618			ret = false;
 619	} else
 620		ret = false;
 621
 622	return ret;
 623}
 624
 625static bool f15h_mc2_mce(u16 ec, u8 xec)
 626{
 627	bool ret = true;
 628
 629	if (TLB_ERROR(ec)) {
 630		if (xec == 0x0)
 631			pr_cont("Data parity TLB read error.\n");
 632		else if (xec == 0x1)
 633			pr_cont("Poison data provided for TLB fill.\n");
 634		else
 635			ret = false;
 636	} else if (BUS_ERROR(ec)) {
 637		if (xec > 2)
 638			ret = false;
 639
 640		pr_cont("Error during attempted NB data read.\n");
 641	} else if (MEM_ERROR(ec)) {
 642		switch (xec) {
 643		case 0x4 ... 0xc:
 644			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
 645			break;
 646
 647		case 0x10 ... 0x14:
 648			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
 649			break;
 650
 651		default:
 652			ret = false;
 653		}
 654	} else if (INT_ERROR(ec)) {
 655		if (xec <= 0x3f)
 656			pr_cont("Hardware Assert.\n");
 657		else
 658			ret = false;
 659	}
 660
 661	return ret;
 662}
 663
 664static bool f16h_mc2_mce(u16 ec, u8 xec)
 665{
 666	u8 r4 = R4(ec);
 667
 668	if (!MEM_ERROR(ec))
 669		return false;
 670
 671	switch (xec) {
 672	case 0x04 ... 0x05:
 673		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
 674		break;
 675
 676	case 0x09 ... 0x0b:
 677	case 0x0d ... 0x0f:
 678		pr_cont("ECC error in L2 tag (%s).\n",
 679			((r4 == R4_GEN)   ? "BankReq" :
 680			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
 681		break;
 682
 683	case 0x10 ... 0x19:
 684	case 0x1b:
 685		pr_cont("ECC error in L2 data array (%s).\n",
 686			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
 687			((r4 == R4_GEN)   ? "Attr" :
 688			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
 689		break;
 690
 691	case 0x1c ... 0x1d:
 692	case 0x1f:
 693		pr_cont("Parity error in L2 attribute bits (%s).\n",
 694			((r4 == R4_RD)  ? "Hit"  :
 695			((r4 == R4_GEN) ? "Attr" : "Fill")));
 696		break;
 697
 698	default:
 699		return false;
 700	}
 701
 702	return true;
 703}
 704
 705static void decode_mc2_mce(struct mce *m)
 706{
 707	u16 ec = EC(m->status);
 708	u8 xec = XEC(m->status, xec_mask);
 709
 710	pr_emerg(HW_ERR "MC2 Error: ");
 711
 712	if (!fam_ops->mc2_mce(ec, xec))
 713		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
 714}
 715
 716static void decode_mc3_mce(struct mce *m)
 717{
 718	u16 ec = EC(m->status);
 719	u8 xec = XEC(m->status, xec_mask);
 720
 721	if (boot_cpu_data.x86 >= 0x14) {
 722		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
 723			 " please report on LKML.\n");
 724		return;
 725	}
 726
 727	pr_emerg(HW_ERR "MC3 Error");
 728
 729	if (xec == 0x0) {
 730		u8 r4 = R4(ec);
 731
 732		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
 733			goto wrong_mc3_mce;
 734
 735		pr_cont(" during %s.\n", R4_MSG(ec));
 736	} else
 737		goto wrong_mc3_mce;
 738
 739	return;
 740
 741 wrong_mc3_mce:
 742	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
 743}
 744
 745static void decode_mc4_mce(struct mce *m)
 746{
 747	struct cpuinfo_x86 *c = &boot_cpu_data;
 748	int node_id = amd_get_nb_id(m->extcpu);
 749	u16 ec = EC(m->status);
 750	u8 xec = XEC(m->status, 0x1f);
 751	u8 offset = 0;
 752
 753	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
 754
 755	switch (xec) {
 756	case 0x0 ... 0xe:
 757
 758		/* special handling for DRAM ECCs */
 759		if (xec == 0x0 || xec == 0x8) {
 760			/* no ECCs on F11h */
 761			if (c->x86 == 0x11)
 762				goto wrong_mc4_mce;
 763
 764			pr_cont("%s.\n", mc4_mce_desc[xec]);
 765
 766			if (decode_dram_ecc)
 767				decode_dram_ecc(node_id, m);
 768			return;
 769		}
 770		break;
 771
 772	case 0xf:
 773		if (TLB_ERROR(ec))
 774			pr_cont("GART Table Walk data error.\n");
 775		else if (BUS_ERROR(ec))
 776			pr_cont("DMA Exclusion Vector Table Walk error.\n");
 777		else
 778			goto wrong_mc4_mce;
 779		return;
 780
 781	case 0x19:
 782		if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
 783			pr_cont("Compute Unit Data Error.\n");
 784		else
 785			goto wrong_mc4_mce;
 786		return;
 787
 788	case 0x1c ... 0x1f:
 789		offset = 13;
 790		break;
 791
 792	default:
 793		goto wrong_mc4_mce;
 794	}
 795
 796	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
 797	return;
 798
 799 wrong_mc4_mce:
 800	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
 801}
 802
 803static void decode_mc5_mce(struct mce *m)
 804{
 805	struct cpuinfo_x86 *c = &boot_cpu_data;
 806	u16 ec = EC(m->status);
 807	u8 xec = XEC(m->status, xec_mask);
 808
 809	if (c->x86 == 0xf || c->x86 == 0x11)
 810		goto wrong_mc5_mce;
 811
 812	pr_emerg(HW_ERR "MC5 Error: ");
 813
 814	if (INT_ERROR(ec)) {
 815		if (xec <= 0x1f) {
 816			pr_cont("Hardware Assert.\n");
 817			return;
 818		} else
 819			goto wrong_mc5_mce;
 820	}
 821
 822	if (xec == 0x0 || xec == 0xc)
 823		pr_cont("%s.\n", mc5_mce_desc[xec]);
 824	else if (xec <= 0xd)
 825		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
 826	else
 827		goto wrong_mc5_mce;
 828
 829	return;
 830
 831 wrong_mc5_mce:
 832	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
 833}
 834
 835static void decode_mc6_mce(struct mce *m)
 836{
 837	u8 xec = XEC(m->status, xec_mask);
 838
 839	pr_emerg(HW_ERR "MC6 Error: ");
 840
 841	if (xec > 0x5)
 842		goto wrong_mc6_mce;
 843
 844	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
 845	return;
 846
 847 wrong_mc6_mce:
 848	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 849}
 850
 851/* Decode errors according to Scalable MCA specification */
 852static void decode_smca_errors(struct mce *m)
 853{
 854	struct smca_hwid *hwid;
 855	unsigned int bank_type;
 856	const char *ip_name;
 857	u8 xec = XEC(m->status, xec_mask);
 858
 859	if (m->bank >= ARRAY_SIZE(smca_banks))
 860		return;
 861
 862	if (boot_cpu_data.x86 >= 0x17 && m->bank == 4)
 863		pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");
 864
 865	hwid = smca_banks[m->bank].hwid;
 866	if (!hwid)
 867		return;
 868
 869	bank_type = hwid->bank_type;
 870	ip_name = smca_get_long_name(bank_type);
 871
 872	pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
 873
 874	/* Only print the decode of valid error codes */
 875	if (xec < smca_mce_descs[bank_type].num_descs &&
 876			(hwid->xec_bitmap & BIT_ULL(xec))) {
 877		pr_emerg(HW_ERR "%s Error: ", ip_name);
 878		pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
 879	}
 880
 881	/*
 882	 * amd_get_nb_id() returns the last level cache id.
 883	 * The last level cache on Fam17h is 1 level below the node.
 884	 */
 885	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
 886		decode_dram_ecc(amd_get_nb_id(m->extcpu) >> 1, m);
 887}
 888
 889static inline void amd_decode_err_code(u16 ec)
 890{
 891	if (INT_ERROR(ec)) {
 892		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
 893		return;
 894	}
 895
 896	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
 897
 898	if (BUS_ERROR(ec))
 899		pr_cont(", mem/io: %s", II_MSG(ec));
 900	else
 901		pr_cont(", tx: %s", TT_MSG(ec));
 902
 903	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
 904		pr_cont(", mem-tx: %s", R4_MSG(ec));
 905
 906		if (BUS_ERROR(ec))
 907			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
 908	}
 909
 910	pr_cont("\n");
 911}
 912
 913/*
 914 * Filter out unwanted MCE signatures here.
 915 */
 916static bool amd_filter_mce(struct mce *m)
 917{
 918	u8 xec = (m->status >> 16) & 0x1f;
 919
 920	/*
 921	 * NB GART TLB error reporting is disabled by default.
 922	 */
 923	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
 924		return true;
 925
 926	return false;
 927}
 928
 929static const char *decode_error_status(struct mce *m)
 930{
 931	if (m->status & MCI_STATUS_UC) {
 932		if (m->status & MCI_STATUS_PCC)
 933			return "System Fatal error.";
 934		if (m->mcgstatus & MCG_STATUS_RIPV)
 935			return "Uncorrected, software restartable error.";
 936		return "Uncorrected, software containable error.";
 937	}
 938
 939	if (m->status & MCI_STATUS_DEFERRED)
 940		return "Deferred error, no action required.";
 941
 942	return "Corrected error, no action required.";
 943}
 944
/*
 * Notifier callback (hooked up through amd_mce_dec_nb) that decodes one MCE
 * record and prints it.
 *
 * @nb:   notifier block, unused
 * @val:  notifier value, unused
 * @data: the struct mce to decode
 *
 * Prints, in order: the severity sentence, the status flag summary, the
 * error address if valid, the bank-specific decode (SMCA or legacy per-bank)
 * and finally the generic error-code fields.
 *
 * Always returns NOTIFY_STOP, including for records dropped by
 * amd_filter_mce().
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
	int ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	/* The flag list is left open here and extended by the pr_cont()s below. */
	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		c->x86, c->x86_model, c->x86_mask,
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));

	if (c->x86 >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (c->x86 != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));

		/* TCC is only shown when the bank's CONFIG MSR reports MCAX. */
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
	}

	/*
	 * Do the two ECC bits together: status bits 46:45, i.e. bits 14:13 of
	 * the upper 32-bit half. A value of 2 prints "CECC", any other
	 * non-zero value prints "UECC".
	 */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	/* SMCA systems take their own decode path and skip the per-bank switch. */
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_errors(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Without per-family callbacks only the generic fields can be printed. */
	if (!fam_ops)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	/* The low 16 bits of the status register hold the error code. */
	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
1053
/*
 * Notifier block that hooks amd_decode_mce() into the MCE decode chain;
 * registered in mce_amd_init() and unregistered in mce_amd_exit().
 */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};
1058
1059static int __init mce_amd_init(void)
1060{
1061	struct cpuinfo_x86 *c = &boot_cpu_data;
1062
1063	if (c->x86_vendor != X86_VENDOR_AMD)
1064		return -ENODEV;
1065
1066	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1067	if (!fam_ops)
1068		return -ENOMEM;
1069
1070	switch (c->x86) {
1071	case 0xf:
1072		fam_ops->mc0_mce = k8_mc0_mce;
1073		fam_ops->mc1_mce = k8_mc1_mce;
1074		fam_ops->mc2_mce = k8_mc2_mce;
1075		break;
1076
1077	case 0x10:
1078		fam_ops->mc0_mce = f10h_mc0_mce;
1079		fam_ops->mc1_mce = k8_mc1_mce;
1080		fam_ops->mc2_mce = k8_mc2_mce;
1081		break;
1082
1083	case 0x11:
1084		fam_ops->mc0_mce = k8_mc0_mce;
1085		fam_ops->mc1_mce = k8_mc1_mce;
1086		fam_ops->mc2_mce = k8_mc2_mce;
1087		break;
1088
1089	case 0x12:
1090		fam_ops->mc0_mce = f12h_mc0_mce;
1091		fam_ops->mc1_mce = k8_mc1_mce;
1092		fam_ops->mc2_mce = k8_mc2_mce;
1093		break;
1094
1095	case 0x14:
1096		fam_ops->mc0_mce = cat_mc0_mce;
1097		fam_ops->mc1_mce = cat_mc1_mce;
1098		fam_ops->mc2_mce = k8_mc2_mce;
1099		break;
1100
1101	case 0x15:
1102		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1103
1104		fam_ops->mc0_mce = f15h_mc0_mce;
1105		fam_ops->mc1_mce = f15h_mc1_mce;
1106		fam_ops->mc2_mce = f15h_mc2_mce;
1107		break;
1108
1109	case 0x16:
1110		xec_mask = 0x1f;
1111		fam_ops->mc0_mce = cat_mc0_mce;
1112		fam_ops->mc1_mce = cat_mc1_mce;
1113		fam_ops->mc2_mce = f16h_mc2_mce;
1114		break;
1115
1116	case 0x17:
1117		xec_mask = 0x3f;
1118		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1119			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1120			goto err_out;
1121		}
1122		break;
1123
1124	default:
1125		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1126		goto err_out;
1127	}
1128
1129	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1130
1131	mce_register_decode_chain(&amd_mce_dec_nb);
1132
1133	return 0;
1134
1135err_out:
1136	kfree(fam_ops);
1137	fam_ops = NULL;
1138	return -EINVAL;
1139}
1140early_initcall(mce_amd_init);
1141
#ifdef MODULE
/*
 * Module teardown: take the decoder off the MCE decode chain first, then
 * release the per-family callback table allocated by mce_amd_init().
 */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif