1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4 
5 #include <asm/cpu.h>
6 
7 #include "mce_amd.h"
8 
/* Per-family bank decoder callbacks; the members are assigned in mce_amd_init(). */
static struct amd_decoder_ops fam_ops;

/*
 * Mask handed to XEC() when extracting the extended error code from the
 * status word. Starts at 0xf and is widened per family in mce_amd_init().
 */
static u8 xec_mask	 = 0xf;

/*
 * Optional DRAM ECC decoder hook. Installed and removed through
 * amd_register_ecc_decoder() / amd_unregister_ecc_decoder(); NULL when no
 * decoder is registered.
 */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
14 
/*
 * Install @f as the DRAM ECC decoder hook.
 *
 * The previous hook value, if any, is overwritten without a check.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
20 
amd_unregister_ecc_decoder(void (* f)(int,struct mce *))21 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
22 {
23 	if (decode_dram_ecc) {
24 		WARN_ON(decode_dram_ecc != f);
25 
26 		decode_dram_ecc = NULL;
27 	}
28 }
29 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
30 
/*
 * String representations of the different MCA-reported error code fields,
 * see F3x48 or MSR0000_0411.
 *
 * NOTE(review): these tables are presumably indexed through the *_MSG()
 * helper macros (TT_MSG(), LL_MSG(), R4_MSG(), ...) defined outside this
 * file - confirm against mce_amd.h before reordering any entry.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor; exported for use outside this file */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
59 
/*
 * MC1 error descriptions used by f15h_mc1_mce().
 *
 * The table is packed, so the lookup index is not always the extended error
 * code itself:
 *   xec 0x0  ... 0xa  -> index xec
 *   xec 0xd           -> index xec - 2
 *   xec 0x10 ... 0x15 -> index xec - 4
 * Entry order therefore must not change.
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};
80 
/*
 * MC2 memory-error descriptions used by f15h_mc2_mce().
 *
 * The table is packed:
 *   xec 0x4  ... 0xc  -> index xec - 0x4
 *   xec 0x10 ... 0x14 -> index xec - 0x7
 * Entry order therefore must not change.
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};
97 
/*
 * MC4 error descriptions used by decode_mc4_mce().
 *
 *   xec 0x0  ... 0xe  -> index xec
 *   xec 0x1c ... 0x1f -> index xec - 13
 * xec 0xf and 0x19 are decoded without this table. Entry order must not
 * change.
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
119 
/*
 * MC5 error descriptions, indexed directly by the extended error code
 * (0x0 ... 0xd) in decode_mc5_mce(). Entries 0x0 and 0xc are printed as-is;
 * all others get a " parity error" suffix there.
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};
136 
/*
 * MC6 error descriptions, indexed directly by the extended error code
 * (0x0 ... 0x5) in decode_mc6_mce(), which appends " parity error" to each.
 */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};
145 
f12h_mc0_mce(u16 ec,u8 xec)146 static bool f12h_mc0_mce(u16 ec, u8 xec)
147 {
148 	bool ret = false;
149 
150 	if (MEM_ERROR(ec)) {
151 		u8 ll = LL(ec);
152 		ret = true;
153 
154 		if (ll == LL_L2)
155 			pr_cont("during L1 linefill from L2.\n");
156 		else if (ll == LL_L1)
157 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
158 		else
159 			ret = false;
160 	}
161 	return ret;
162 }
163 
f10h_mc0_mce(u16 ec,u8 xec)164 static bool f10h_mc0_mce(u16 ec, u8 xec)
165 {
166 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
167 		pr_cont("during data scrub.\n");
168 		return true;
169 	}
170 	return f12h_mc0_mce(ec, xec);
171 }
172 
k8_mc0_mce(u16 ec,u8 xec)173 static bool k8_mc0_mce(u16 ec, u8 xec)
174 {
175 	if (BUS_ERROR(ec)) {
176 		pr_cont("during system linefill.\n");
177 		return true;
178 	}
179 
180 	return f10h_mc0_mce(ec, xec);
181 }
182 
cat_mc0_mce(u16 ec,u8 xec)183 static bool cat_mc0_mce(u16 ec, u8 xec)
184 {
185 	u8 r4	 = R4(ec);
186 	bool ret = true;
187 
188 	if (MEM_ERROR(ec)) {
189 
190 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
191 			return false;
192 
193 		switch (r4) {
194 		case R4_DRD:
195 		case R4_DWR:
196 			pr_cont("Data/Tag parity error due to %s.\n",
197 				(r4 == R4_DRD ? "load/hw prf" : "store"));
198 			break;
199 		case R4_EVICT:
200 			pr_cont("Copyback parity error on a tag miss.\n");
201 			break;
202 		case R4_SNOOP:
203 			pr_cont("Tag parity error during snoop.\n");
204 			break;
205 		default:
206 			ret = false;
207 		}
208 	} else if (BUS_ERROR(ec)) {
209 
210 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
211 			return false;
212 
213 		pr_cont("System read data error on a ");
214 
215 		switch (r4) {
216 		case R4_RD:
217 			pr_cont("TLB reload.\n");
218 			break;
219 		case R4_DWR:
220 			pr_cont("store.\n");
221 			break;
222 		case R4_DRD:
223 			pr_cont("load.\n");
224 			break;
225 		default:
226 			ret = false;
227 		}
228 	} else {
229 		ret = false;
230 	}
231 
232 	return ret;
233 }
234 
f15h_mc0_mce(u16 ec,u8 xec)235 static bool f15h_mc0_mce(u16 ec, u8 xec)
236 {
237 	bool ret = true;
238 
239 	if (MEM_ERROR(ec)) {
240 
241 		switch (xec) {
242 		case 0x0:
243 			pr_cont("Data Array access error.\n");
244 			break;
245 
246 		case 0x1:
247 			pr_cont("UC error during a linefill from L2/NB.\n");
248 			break;
249 
250 		case 0x2:
251 		case 0x11:
252 			pr_cont("STQ access error.\n");
253 			break;
254 
255 		case 0x3:
256 			pr_cont("SCB access error.\n");
257 			break;
258 
259 		case 0x10:
260 			pr_cont("Tag error.\n");
261 			break;
262 
263 		case 0x12:
264 			pr_cont("LDQ access error.\n");
265 			break;
266 
267 		default:
268 			ret = false;
269 		}
270 	} else if (BUS_ERROR(ec)) {
271 
272 		if (!xec)
273 			pr_cont("System Read Data Error.\n");
274 		else
275 			pr_cont(" Internal error condition type %d.\n", xec);
276 	} else if (INT_ERROR(ec)) {
277 		if (xec <= 0x1f)
278 			pr_cont("Hardware Assert.\n");
279 		else
280 			ret = false;
281 
282 	} else
283 		ret = false;
284 
285 	return ret;
286 }
287 
decode_mc0_mce(struct mce * m)288 static void decode_mc0_mce(struct mce *m)
289 {
290 	u16 ec = EC(m->status);
291 	u8 xec = XEC(m->status, xec_mask);
292 
293 	pr_emerg(HW_ERR "MC0 Error: ");
294 
295 	/* TLB error signatures are the same across families */
296 	if (TLB_ERROR(ec)) {
297 		if (TT(ec) == TT_DATA) {
298 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
299 				((xec == 2) ? "locked miss"
300 					    : (xec ? "multimatch" : "parity")));
301 			return;
302 		}
303 	} else if (fam_ops.mc0_mce(ec, xec))
304 		;
305 	else
306 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
307 }
308 
k8_mc1_mce(u16 ec,u8 xec)309 static bool k8_mc1_mce(u16 ec, u8 xec)
310 {
311 	u8 ll	 = LL(ec);
312 	bool ret = true;
313 
314 	if (!MEM_ERROR(ec))
315 		return false;
316 
317 	if (ll == 0x2)
318 		pr_cont("during a linefill from L2.\n");
319 	else if (ll == 0x1) {
320 		switch (R4(ec)) {
321 		case R4_IRD:
322 			pr_cont("Parity error during data load.\n");
323 			break;
324 
325 		case R4_EVICT:
326 			pr_cont("Copyback Parity/Victim error.\n");
327 			break;
328 
329 		case R4_SNOOP:
330 			pr_cont("Tag Snoop error.\n");
331 			break;
332 
333 		default:
334 			ret = false;
335 			break;
336 		}
337 	} else
338 		ret = false;
339 
340 	return ret;
341 }
342 
cat_mc1_mce(u16 ec,u8 xec)343 static bool cat_mc1_mce(u16 ec, u8 xec)
344 {
345 	u8 r4    = R4(ec);
346 	bool ret = true;
347 
348 	if (!MEM_ERROR(ec))
349 		return false;
350 
351 	if (TT(ec) != TT_INSTR)
352 		return false;
353 
354 	if (r4 == R4_IRD)
355 		pr_cont("Data/tag array parity error for a tag hit.\n");
356 	else if (r4 == R4_SNOOP)
357 		pr_cont("Tag error during snoop/victimization.\n");
358 	else if (xec == 0x0)
359 		pr_cont("Tag parity error from victim castout.\n");
360 	else if (xec == 0x2)
361 		pr_cont("Microcode patch RAM parity error.\n");
362 	else
363 		ret = false;
364 
365 	return ret;
366 }
367 
f15h_mc1_mce(u16 ec,u8 xec)368 static bool f15h_mc1_mce(u16 ec, u8 xec)
369 {
370 	bool ret = true;
371 
372 	if (!MEM_ERROR(ec))
373 		return false;
374 
375 	switch (xec) {
376 	case 0x0 ... 0xa:
377 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
378 		break;
379 
380 	case 0xd:
381 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
382 		break;
383 
384 	case 0x10:
385 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
386 		break;
387 
388 	case 0x11 ... 0x15:
389 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
390 		break;
391 
392 	default:
393 		ret = false;
394 	}
395 	return ret;
396 }
397 
decode_mc1_mce(struct mce * m)398 static void decode_mc1_mce(struct mce *m)
399 {
400 	u16 ec = EC(m->status);
401 	u8 xec = XEC(m->status, xec_mask);
402 
403 	pr_emerg(HW_ERR "MC1 Error: ");
404 
405 	if (TLB_ERROR(ec))
406 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
407 			(xec ? "multimatch" : "parity error"));
408 	else if (BUS_ERROR(ec)) {
409 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
410 
411 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
412 	} else if (INT_ERROR(ec)) {
413 		if (xec <= 0x3f)
414 			pr_cont("Hardware Assert.\n");
415 		else
416 			goto wrong_mc1_mce;
417 	} else if (fam_ops.mc1_mce(ec, xec))
418 		;
419 	else
420 		goto wrong_mc1_mce;
421 
422 	return;
423 
424 wrong_mc1_mce:
425 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
426 }
427 
k8_mc2_mce(u16 ec,u8 xec)428 static bool k8_mc2_mce(u16 ec, u8 xec)
429 {
430 	bool ret = true;
431 
432 	if (xec == 0x1)
433 		pr_cont(" in the write data buffers.\n");
434 	else if (xec == 0x3)
435 		pr_cont(" in the victim data buffers.\n");
436 	else if (xec == 0x2 && MEM_ERROR(ec))
437 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
438 	else if (xec == 0x0) {
439 		if (TLB_ERROR(ec))
440 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
441 				TT_MSG(ec));
442 		else if (BUS_ERROR(ec))
443 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
444 				R4_MSG(ec), PP_MSG(ec));
445 		else if (MEM_ERROR(ec)) {
446 			u8 r4 = R4(ec);
447 
448 			if (r4 >= 0x7)
449 				pr_cont(": %s error during data copyback.\n",
450 					R4_MSG(ec));
451 			else if (r4 <= 0x1)
452 				pr_cont(": %s parity/ECC error during data "
453 					"access from L2.\n", R4_MSG(ec));
454 			else
455 				ret = false;
456 		} else
457 			ret = false;
458 	} else
459 		ret = false;
460 
461 	return ret;
462 }
463 
f15h_mc2_mce(u16 ec,u8 xec)464 static bool f15h_mc2_mce(u16 ec, u8 xec)
465 {
466 	bool ret = true;
467 
468 	if (TLB_ERROR(ec)) {
469 		if (xec == 0x0)
470 			pr_cont("Data parity TLB read error.\n");
471 		else if (xec == 0x1)
472 			pr_cont("Poison data provided for TLB fill.\n");
473 		else
474 			ret = false;
475 	} else if (BUS_ERROR(ec)) {
476 		if (xec > 2)
477 			ret = false;
478 
479 		pr_cont("Error during attempted NB data read.\n");
480 	} else if (MEM_ERROR(ec)) {
481 		switch (xec) {
482 		case 0x4 ... 0xc:
483 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
484 			break;
485 
486 		case 0x10 ... 0x14:
487 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
488 			break;
489 
490 		default:
491 			ret = false;
492 		}
493 	} else if (INT_ERROR(ec)) {
494 		if (xec <= 0x3f)
495 			pr_cont("Hardware Assert.\n");
496 		else
497 			ret = false;
498 	}
499 
500 	return ret;
501 }
502 
f16h_mc2_mce(u16 ec,u8 xec)503 static bool f16h_mc2_mce(u16 ec, u8 xec)
504 {
505 	u8 r4 = R4(ec);
506 
507 	if (!MEM_ERROR(ec))
508 		return false;
509 
510 	switch (xec) {
511 	case 0x04 ... 0x05:
512 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
513 		break;
514 
515 	case 0x09 ... 0x0b:
516 	case 0x0d ... 0x0f:
517 		pr_cont("ECC error in L2 tag (%s).\n",
518 			((r4 == R4_GEN)   ? "BankReq" :
519 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
520 		break;
521 
522 	case 0x10 ... 0x19:
523 	case 0x1b:
524 		pr_cont("ECC error in L2 data array (%s).\n",
525 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
526 			((r4 == R4_GEN)   ? "Attr" :
527 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
528 		break;
529 
530 	case 0x1c ... 0x1d:
531 	case 0x1f:
532 		pr_cont("Parity error in L2 attribute bits (%s).\n",
533 			((r4 == R4_RD)  ? "Hit"  :
534 			((r4 == R4_GEN) ? "Attr" : "Fill")));
535 		break;
536 
537 	default:
538 		return false;
539 	}
540 
541 	return true;
542 }
543 
decode_mc2_mce(struct mce * m)544 static void decode_mc2_mce(struct mce *m)
545 {
546 	u16 ec = EC(m->status);
547 	u8 xec = XEC(m->status, xec_mask);
548 
549 	pr_emerg(HW_ERR "MC2 Error: ");
550 
551 	if (!fam_ops.mc2_mce(ec, xec))
552 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
553 }
554 
decode_mc3_mce(struct mce * m)555 static void decode_mc3_mce(struct mce *m)
556 {
557 	u16 ec = EC(m->status);
558 	u8 xec = XEC(m->status, xec_mask);
559 
560 	if (boot_cpu_data.x86 >= 0x14) {
561 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
562 			 " please report on LKML.\n");
563 		return;
564 	}
565 
566 	pr_emerg(HW_ERR "MC3 Error");
567 
568 	if (xec == 0x0) {
569 		u8 r4 = R4(ec);
570 
571 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
572 			goto wrong_mc3_mce;
573 
574 		pr_cont(" during %s.\n", R4_MSG(ec));
575 	} else
576 		goto wrong_mc3_mce;
577 
578 	return;
579 
580  wrong_mc3_mce:
581 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
582 }
583 
decode_mc4_mce(struct mce * m)584 static void decode_mc4_mce(struct mce *m)
585 {
586 	unsigned int fam = x86_family(m->cpuid);
587 	int node_id = topology_amd_node_id(m->extcpu);
588 	u16 ec = EC(m->status);
589 	u8 xec = XEC(m->status, 0x1f);
590 	u8 offset = 0;
591 
592 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
593 
594 	switch (xec) {
595 	case 0x0 ... 0xe:
596 
597 		/* special handling for DRAM ECCs */
598 		if (xec == 0x0 || xec == 0x8) {
599 			/* no ECCs on F11h */
600 			if (fam == 0x11)
601 				goto wrong_mc4_mce;
602 
603 			pr_cont("%s.\n", mc4_mce_desc[xec]);
604 
605 			if (decode_dram_ecc)
606 				decode_dram_ecc(node_id, m);
607 			return;
608 		}
609 		break;
610 
611 	case 0xf:
612 		if (TLB_ERROR(ec))
613 			pr_cont("GART Table Walk data error.\n");
614 		else if (BUS_ERROR(ec))
615 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
616 		else
617 			goto wrong_mc4_mce;
618 		return;
619 
620 	case 0x19:
621 		if (fam == 0x15 || fam == 0x16)
622 			pr_cont("Compute Unit Data Error.\n");
623 		else
624 			goto wrong_mc4_mce;
625 		return;
626 
627 	case 0x1c ... 0x1f:
628 		offset = 13;
629 		break;
630 
631 	default:
632 		goto wrong_mc4_mce;
633 	}
634 
635 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
636 	return;
637 
638  wrong_mc4_mce:
639 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
640 }
641 
decode_mc5_mce(struct mce * m)642 static void decode_mc5_mce(struct mce *m)
643 {
644 	unsigned int fam = x86_family(m->cpuid);
645 	u16 ec = EC(m->status);
646 	u8 xec = XEC(m->status, xec_mask);
647 
648 	if (fam == 0xf || fam == 0x11)
649 		goto wrong_mc5_mce;
650 
651 	pr_emerg(HW_ERR "MC5 Error: ");
652 
653 	if (INT_ERROR(ec)) {
654 		if (xec <= 0x1f) {
655 			pr_cont("Hardware Assert.\n");
656 			return;
657 		} else
658 			goto wrong_mc5_mce;
659 	}
660 
661 	if (xec == 0x0 || xec == 0xc)
662 		pr_cont("%s.\n", mc5_mce_desc[xec]);
663 	else if (xec <= 0xd)
664 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
665 	else
666 		goto wrong_mc5_mce;
667 
668 	return;
669 
670  wrong_mc5_mce:
671 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
672 }
673 
decode_mc6_mce(struct mce * m)674 static void decode_mc6_mce(struct mce *m)
675 {
676 	u8 xec = XEC(m->status, xec_mask);
677 
678 	pr_emerg(HW_ERR "MC6 Error: ");
679 
680 	if (xec > 0x5)
681 		goto wrong_mc6_mce;
682 
683 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
684 	return;
685 
686  wrong_mc6_mce:
687 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
688 }
689 
/*
 * Human-readable names of the SMCA bank types, indexed by
 * enum smca_bank_types and looked up through smca_get_long_name().
 * Versioned variants of a bank type share one name unless noted otherwise.
 */
static const char * const smca_long_names[] = {
	[SMCA_LS ... SMCA_LS_V2]	= "Load Store Unit",
	[SMCA_IF]			= "Instruction Fetch Unit",
	[SMCA_L2_CACHE]			= "L2 Cache",
	[SMCA_DE]			= "Decode Unit",
	[SMCA_RESERVED]			= "Reserved",
	[SMCA_EX]			= "Execution Unit",
	[SMCA_FP]			= "Floating Point Unit",
	[SMCA_L3_CACHE]			= "L3 Cache",
	[SMCA_CS ... SMCA_CS_V2]	= "Coherent Slave",
	[SMCA_PIE]			= "Power, Interrupts, etc.",

	/* UMC v2 is separate because both of them can exist in a single system. */
	[SMCA_UMC]			= "Unified Memory Controller",
	[SMCA_UMC_V2]			= "Unified Memory Controller v2",
	[SMCA_PB]			= "Parameter Block",
	[SMCA_PSP ... SMCA_PSP_V2]	= "Platform Security Processor",
	[SMCA_SMU ... SMCA_SMU_V2]	= "System Management Unit",
	[SMCA_MP5]			= "Microprocessor 5 Unit",
	[SMCA_MPDMA]			= "MPDMA Unit",
	[SMCA_NBIO]			= "Northbridge IO Unit",
	[SMCA_PCIE ... SMCA_PCIE_V2]	= "PCI Express Unit",
	[SMCA_XGMI_PCS]			= "Ext Global Memory Interconnect PCS Unit",
	[SMCA_NBIF]			= "NBIF Unit",
	[SMCA_SHUB]			= "System Hub Unit",
	[SMCA_SATA]			= "SATA Unit",
	[SMCA_USB]			= "USB Unit",
	[SMCA_GMI_PCS]			= "Global Memory Interconnect PCS Unit",
	[SMCA_XGMI_PHY]			= "Ext Global Memory Interconnect PHY Unit",
	[SMCA_WAFL_PHY]			= "WAFL PHY Unit",
	[SMCA_GMI_PHY]			= "Global Memory Interconnect PHY Unit",
};
722 
smca_get_long_name(enum smca_bank_types t)723 static const char *smca_get_long_name(enum smca_bank_types t)
724 {
725 	if (t >= N_SMCA_BANK_TYPES)
726 		return NULL;
727 
728 	return smca_long_names[t];
729 }
730 
731 /* Decode errors according to Scalable MCA specification */
decode_smca_error(struct mce * m)732 static void decode_smca_error(struct mce *m)
733 {
734 	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
735 	u8 xec = XEC(m->status, xec_mask);
736 
737 	if (bank_type >= N_SMCA_BANK_TYPES)
738 		return;
739 
740 	if (bank_type == SMCA_RESERVED) {
741 		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
742 		return;
743 	}
744 
745 	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);
746 
747 	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
748 	    xec == 0 && decode_dram_ecc)
749 		decode_dram_ecc(topology_amd_node_id(m->extcpu), m);
750 }
751 
/*
 * Print the generic breakdown of the error code @ec.
 *
 * Internal errors print only their internal error type. All other errors
 * print the cache level followed by the fields that apply to the error
 * type: bus errors add mem/io, memory transaction and participating
 * processor/timeout; other errors add the transaction type and, for memory
 * errors, the memory transaction.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec)) {
		pr_cont(", mem/io: %s", II_MSG(ec));
		pr_cont(", mem-tx: %s", R4_MSG(ec));
		pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	} else {
		pr_cont(", tx: %s", TT_MSG(ec));

		if (MEM_ERROR(ec))
			pr_cont(", mem-tx: %s", R4_MSG(ec));
	}

	pr_cont("\n");
}
775 
decode_error_status(struct mce * m)776 static const char *decode_error_status(struct mce *m)
777 {
778 	if (m->status & MCI_STATUS_UC) {
779 		if (m->status & MCI_STATUS_PCC)
780 			return "System Fatal error.";
781 		if (m->mcgstatus & MCG_STATUS_RIPV)
782 			return "Uncorrected, software restartable error.";
783 		return "Uncorrected, software containable error.";
784 	}
785 
786 	if (m->status & MCI_STATUS_DEFERRED)
787 		return "Deferred error, no action required.";
788 
789 	return "Corrected error, no action required.";
790 }
791 
/*
 * Notifier callback that decodes a machine check record and prints it.
 *
 * @nb:   notifier block (unused)
 * @val:  notifier event value (unused)
 * @data: pointer to the struct mce being reported
 *
 * Records already flagged MCE_HANDLED_CEC are skipped with NOTIFY_DONE.
 * Otherwise the severity, the status flags, the address and PPIN (when
 * present) are printed. With X86_FEATURE_SMCA set, IPID and syndrome are
 * printed and decoding goes through decode_smca_error(); without it, the
 * TSC is printed and banks 0-6 are dispatched to the per-bank decoders,
 * provided the family callbacks are installed. In all cases the generic
 * error code breakdown follows, the record is flagged MCE_HANDLED_EDAC and
 * NOTIFY_OK is returned.
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int fam = x86_family(m->cpuid);
	int ecc;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	/* Opens the status flags line; it is continued with pr_cont() below. */
	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		/* TCC is only shown when the bank's config MSR reads back with MCAX set. */
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* Status bits 46:45 (bits [14:13] of the upper 32 bits) are decoded together. */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	/* Closes the status flags line with the raw status value. */
	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		/* SMCA systems skip the legacy per-bank decoders entirely. */
		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	/* The generic error code is the low 16 bits of the status. */
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}
908 
/*
 * Notifier block that hooks amd_decode_mce() into the MCE decode chain at
 * EDAC priority; registered in mce_amd_init().
 */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};
913 
mce_amd_init(void)914 static int __init mce_amd_init(void)
915 {
916 	struct cpuinfo_x86 *c = &boot_cpu_data;
917 
918 	if (c->x86_vendor != X86_VENDOR_AMD &&
919 	    c->x86_vendor != X86_VENDOR_HYGON)
920 		return -ENODEV;
921 
922 	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
923 		return -ENODEV;
924 
925 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
926 		xec_mask = 0x3f;
927 		goto out;
928 	}
929 
930 	switch (c->x86) {
931 	case 0xf:
932 		fam_ops.mc0_mce = k8_mc0_mce;
933 		fam_ops.mc1_mce = k8_mc1_mce;
934 		fam_ops.mc2_mce = k8_mc2_mce;
935 		break;
936 
937 	case 0x10:
938 		fam_ops.mc0_mce = f10h_mc0_mce;
939 		fam_ops.mc1_mce = k8_mc1_mce;
940 		fam_ops.mc2_mce = k8_mc2_mce;
941 		break;
942 
943 	case 0x11:
944 		fam_ops.mc0_mce = k8_mc0_mce;
945 		fam_ops.mc1_mce = k8_mc1_mce;
946 		fam_ops.mc2_mce = k8_mc2_mce;
947 		break;
948 
949 	case 0x12:
950 		fam_ops.mc0_mce = f12h_mc0_mce;
951 		fam_ops.mc1_mce = k8_mc1_mce;
952 		fam_ops.mc2_mce = k8_mc2_mce;
953 		break;
954 
955 	case 0x14:
956 		fam_ops.mc0_mce = cat_mc0_mce;
957 		fam_ops.mc1_mce = cat_mc1_mce;
958 		fam_ops.mc2_mce = k8_mc2_mce;
959 		break;
960 
961 	case 0x15:
962 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
963 
964 		fam_ops.mc0_mce = f15h_mc0_mce;
965 		fam_ops.mc1_mce = f15h_mc1_mce;
966 		fam_ops.mc2_mce = f15h_mc2_mce;
967 		break;
968 
969 	case 0x16:
970 		xec_mask = 0x1f;
971 		fam_ops.mc0_mce = cat_mc0_mce;
972 		fam_ops.mc1_mce = cat_mc1_mce;
973 		fam_ops.mc2_mce = f16h_mc2_mce;
974 		break;
975 
976 	case 0x17:
977 	case 0x18:
978 		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
979 		return -EINVAL;
980 
981 	default:
982 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
983 		return -EINVAL;
984 	}
985 
986 out:
987 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
988 
989 	mce_register_decode_chain(&amd_mce_dec_nb);
990 
991 	return 0;
992 }
993 early_initcall(mce_amd_init);
994 
#ifdef MODULE
/* Module unload: take the decoder off the MCE decode chain again. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
1006