
/arch/x86/kernel/cpu/mcheck/mce.c

https://bitbucket.org/thekraven/iscream_thunderc-2.6.35
C | 2221 lines | 1410 code | 338 blank | 473 comment | 305 complexity | 519145303c4d5010c92a831f512e1730 MD5
Possible License(s): GPL-2.0, LGPL-2.0, AGPL-1.0


   1/*
   2 * Machine check handler.
   3 *
   4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5 * Rest from unknown author(s).
   6 * 2004 Andi Kleen. Rewrote most of it.
   7 * Copyright 2008 Intel Corporation
   8 * Author: Andi Kleen
   9 */
  10#include <linux/thread_info.h>
  11#include <linux/capability.h>
  12#include <linux/miscdevice.h>
  13#include <linux/interrupt.h>
  14#include <linux/ratelimit.h>
  15#include <linux/kallsyms.h>
  16#include <linux/rcupdate.h>
  17#include <linux/kobject.h>
  18#include <linux/uaccess.h>
  19#include <linux/kdebug.h>
  20#include <linux/kernel.h>
  21#include <linux/percpu.h>
  22#include <linux/string.h>
  23#include <linux/sysdev.h>
  24#include <linux/delay.h>
  25#include <linux/ctype.h>
  26#include <linux/sched.h>
  27#include <linux/sysfs.h>
  28#include <linux/types.h>
  29#include <linux/slab.h>
  30#include <linux/init.h>
  31#include <linux/kmod.h>
  32#include <linux/poll.h>
  33#include <linux/nmi.h>
  34#include <linux/cpu.h>
  35#include <linux/smp.h>
  36#include <linux/fs.h>
  37#include <linux/mm.h>
  38#include <linux/debugfs.h>
  39#include <linux/edac_mce.h>
  40
  41#include <asm/processor.h>
  42#include <asm/hw_irq.h>
  43#include <asm/apic.h>
  44#include <asm/idle.h>
  45#include <asm/ipi.h>
  46#include <asm/mce.h>
  47#include <asm/msr.h>
  48
  49#include "mce-internal.h"
  50
  51static DEFINE_MUTEX(mce_read_mutex);
  52
  53#define rcu_dereference_check_mce(p) \
  54	rcu_dereference_check((p), \
  55			      rcu_read_lock_sched_held() || \
  56			      lockdep_is_held(&mce_read_mutex))
  57
  58#define CREATE_TRACE_POINTS
  59#include <trace/events/mce.h>
  60
  61int mce_disabled __read_mostly;
  62
  63#define MISC_MCELOG_MINOR	227
  64
  65#define SPINUNIT 100	/* 100ns */
  66
  67atomic_t mce_entry;
  68
  69DEFINE_PER_CPU(unsigned, mce_exception_count);
  70
  71/*
  72 * Tolerant levels:
  73 *   0: always panic on uncorrected errors, log corrected errors
  74 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
  75 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
  76 *   3: never panic or SIGBUS, log all errors (for testing only)
  77 */
  78static int			tolerant		__read_mostly = 1;
  79static int			banks			__read_mostly;
  80static int			rip_msr			__read_mostly;
  81static int			mce_bootlog		__read_mostly = -1;
  82static int			monarch_timeout		__read_mostly = -1;
  83static int			mce_panic_timeout	__read_mostly;
  84static int			mce_dont_log_ce		__read_mostly;
  85int				mce_cmci_disabled	__read_mostly;
  86int				mce_ignore_ce		__read_mostly;
  87int				mce_ser			__read_mostly;
  88
  89struct mce_bank                *mce_banks		__read_mostly;
  90
  91/* User mode helper program triggered by machine check event */
  92static unsigned long		mce_need_notify;
  93static char			mce_helper[128];
  94static char			*mce_helper_argv[2] = { mce_helper, NULL };
  95
  96static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  97static DEFINE_PER_CPU(struct mce, mces_seen);
  98static int			cpu_missing;
  99
 100/*
 101 * CPU/chipset specific EDAC code can register a notifier call here to print
 102 * MCE errors in a human-readable form.
 103 */
 104ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 105EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
 106
 107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
 108			       void *data)
 109{
 110	pr_emerg("No human readable MCE decoding support on this CPU type.\n");
 111	pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
 112
 113	return NOTIFY_STOP;
 114}
 115
 116static struct notifier_block mce_dec_nb = {
 117	.notifier_call = default_decode_mce,
 118	.priority      = -1,
 119};
 120
 121/* MCA banks polled by the period polling timer for corrected events */
 122DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 123	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 124};
 125
 126static DEFINE_PER_CPU(struct work_struct, mce_work);
 127
 128/* Do initial initialization of a struct mce */
 129void mce_setup(struct mce *m)
 130{
 131	memset(m, 0, sizeof(struct mce));
 132	m->cpu = m->extcpu = smp_processor_id();
 133	rdtscll(m->tsc);
 134	/* We hope get_seconds stays lockless */
 135	m->time = get_seconds();
 136	m->cpuvendor = boot_cpu_data.x86_vendor;
 137	m->cpuid = cpuid_eax(1);
 138#ifdef CONFIG_SMP
 139	m->socketid = cpu_data(m->extcpu).phys_proc_id;
 140#endif
 141	m->apicid = cpu_data(m->extcpu).initial_apicid;
 142	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 143}
 144
 145DEFINE_PER_CPU(struct mce, injectm);
 146EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 147
 148/*
 149 * Lockless MCE logging infrastructure.
  150 * This avoids deadlocks on printk locks without having to break locks. It
  151 * also separates MCEs from kernel messages to avoid bogus bug reports.
 152 */
 153
 154static struct mce_log mcelog = {
 155	.signature	= MCE_LOG_SIGNATURE,
 156	.len		= MCE_LOG_LEN,
 157	.recordlen	= sizeof(struct mce),
 158};
 159
 160void mce_log(struct mce *mce)
 161{
 162	unsigned next, entry;
 163
 164	/* Emit the trace record: */
 165	trace_mce_record(mce);
 166
 167	mce->finished = 0;
 168	wmb();
 169	for (;;) {
 170		entry = rcu_dereference_check_mce(mcelog.next);
 171		for (;;) {
 172			/*
 173			 * If edac_mce is enabled, it will check the error type
  174			 * and will process it if it is a known error.
  175			 * Otherwise, the error will be sent through the
  176			 * mcelog interface.
 177			 */
 178			if (edac_mce_parse(mce))
 179				return;
 180
 181			/*
 182			 * When the buffer fills up discard new entries.
 183			 * Assume that the earlier errors are the more
 184			 * interesting ones:
 185			 */
 186			if (entry >= MCE_LOG_LEN) {
 187				set_bit(MCE_OVERFLOW,
 188					(unsigned long *)&mcelog.flags);
 189				return;
 190			}
 191			/* Old left over entry. Skip: */
 192			if (mcelog.entry[entry].finished) {
 193				entry++;
 194				continue;
 195			}
 196			break;
 197		}
 198		smp_rmb();
 199		next = entry + 1;
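     		/*
     		 * Reserve the slot: only the CPU whose cmpxchg on mcelog.next
     		 * succeeds owns entry[entry]; the losers re-read mcelog.next
     		 * and rescan for a free slot.
     		 */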
 200		if (cmpxchg(&mcelog.next, entry, next) == entry)
 201			break;
 202	}
 203	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 204	wmb();
 205	mcelog.entry[entry].finished = 1;
 206	wmb();
 207
 208	mce->finished = 1;
 209	set_bit(0, &mce_need_notify);
 210}
 211
 212static void print_mce(struct mce *m)
 213{
 214	pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 215	       m->extcpu, m->mcgstatus, m->bank, m->status);
 216
 217	if (m->ip) {
 218		pr_emerg("RIP%s %02x:<%016Lx> ",
 219			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 220				m->cs, m->ip);
 221
 222		if (m->cs == __KERNEL_CS)
 223			print_symbol("{%s}", m->ip);
 224		pr_cont("\n");
 225	}
 226
 227	pr_emerg("TSC %llx ", m->tsc);
 228	if (m->addr)
 229		pr_cont("ADDR %llx ", m->addr);
 230	if (m->misc)
 231		pr_cont("MISC %llx ", m->misc);
 232
 233	pr_cont("\n");
 234	pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
 235		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
 236
 237	/*
 238	 * Print out human-readable details about the MCE error,
 239	 * (if the CPU has an implementation for that)
 240	 */
 241	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 242}
 243
 244static void print_mce_head(void)
 245{
 246	pr_emerg("\nHARDWARE ERROR\n");
 247}
 248
 249static void print_mce_tail(void)
 250{
 251	pr_emerg("This is not a software problem!\n");
 252}
 253
 254#define PANIC_TIMEOUT 5 /* 5 seconds */
 255
 256static atomic_t mce_paniced;
 257
 258static int fake_panic;
 259static atomic_t mce_fake_paniced;
 260
 261/* Panic in progress. Enable interrupts and wait for final IPI */
 262static void wait_for_panic(void)
 263{
 264	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 265
 266	preempt_disable();
 267	local_irq_enable();
 268	while (timeout-- > 0)
 269		udelay(1);
 270	if (panic_timeout == 0)
 271		panic_timeout = mce_panic_timeout;
 272	panic("Panicing machine check CPU died");
 273}
 274
 275static void mce_panic(char *msg, struct mce *final, char *exp)
 276{
 277	int i, apei_err = 0;
 278
 279	if (!fake_panic) {
 280		/*
 281		 * Make sure only one CPU runs in machine check panic
 282		 */
 283		if (atomic_inc_return(&mce_paniced) > 1)
 284			wait_for_panic();
 285		barrier();
 286
 287		bust_spinlocks(1);
 288		console_verbose();
 289	} else {
 290		/* Don't log too much for fake panic */
 291		if (atomic_inc_return(&mce_fake_paniced) > 1)
 292			return;
 293	}
 294	print_mce_head();
 295	/* First print corrected ones that are still unlogged */
 296	for (i = 0; i < MCE_LOG_LEN; i++) {
 297		struct mce *m = &mcelog.entry[i];
 298		if (!(m->status & MCI_STATUS_VAL))
 299			continue;
 300		if (!(m->status & MCI_STATUS_UC)) {
 301			print_mce(m);
 302			if (!apei_err)
 303				apei_err = apei_write_mce(m);
 304		}
 305	}
 306	/* Now print uncorrected but with the final one last */
 307	for (i = 0; i < MCE_LOG_LEN; i++) {
 308		struct mce *m = &mcelog.entry[i];
 309		if (!(m->status & MCI_STATUS_VAL))
 310			continue;
 311		if (!(m->status & MCI_STATUS_UC))
 312			continue;
 313		if (!final || memcmp(m, final, sizeof(struct mce))) {
 314			print_mce(m);
 315			if (!apei_err)
 316				apei_err = apei_write_mce(m);
 317		}
 318	}
 319	if (final) {
 320		print_mce(final);
 321		if (!apei_err)
 322			apei_err = apei_write_mce(final);
 323	}
 324	if (cpu_missing)
 325		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
 326	print_mce_tail();
 327	if (exp)
 328		printk(KERN_EMERG "Machine check: %s\n", exp);
 329	if (!fake_panic) {
 330		if (panic_timeout == 0)
 331			panic_timeout = mce_panic_timeout;
 332		panic(msg);
 333	} else
 334		printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
 335}
 336
 337/* Support code for software error injection */
 338
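     /*
      * Map an MSR number to the offset of the matching field in the per-CPU
      * injectm struct, so injected "MSR" accesses hit the fake values
      * instead of hardware.
      */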
 339static int msr_to_offset(u32 msr)
 340{
 341	unsigned bank = __get_cpu_var(injectm.bank);
 342
 343	if (msr == rip_msr)
 344		return offsetof(struct mce, ip);
 345	if (msr == MSR_IA32_MCx_STATUS(bank))
 346		return offsetof(struct mce, status);
 347	if (msr == MSR_IA32_MCx_ADDR(bank))
 348		return offsetof(struct mce, addr);
 349	if (msr == MSR_IA32_MCx_MISC(bank))
 350		return offsetof(struct mce, misc);
 351	if (msr == MSR_IA32_MCG_STATUS)
 352		return offsetof(struct mce, mcgstatus);
 353	return -1;
 354}
 355
 356/* MSR access wrappers used for error injection */
 357static u64 mce_rdmsrl(u32 msr)
 358{
 359	u64 v;
 360
 361	if (__get_cpu_var(injectm).finished) {
 362		int offset = msr_to_offset(msr);
 363
 364		if (offset < 0)
 365			return 0;
 366		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
 367	}
 368
 369	if (rdmsrl_safe(msr, &v)) {
 370		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 371		/*
 372		 * Return zero in case the access faulted. This should
 373		 * not happen normally but can happen if the CPU does
 374		 * something weird, or if the code is buggy.
 375		 */
 376		v = 0;
 377	}
 378
 379	return v;
 380}
 381
 382static void mce_wrmsrl(u32 msr, u64 v)
 383{
 384	if (__get_cpu_var(injectm).finished) {
 385		int offset = msr_to_offset(msr);
 386
 387		if (offset >= 0)
 388			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
 389		return;
 390	}
 391	wrmsrl(msr, v);
 392}
 393
 394/*
 395 * Simple lockless ring to communicate PFNs from the exception handler with the
 396 * process context work function. This is vastly simplified because there's
 397 * only a single reader and a single writer.
 398 */
 399#define MCE_RING_SIZE 16	/* we use one entry less */
 400
 401struct mce_ring {
 402	unsigned short start;
 403	unsigned short end;
 404	unsigned long ring[MCE_RING_SIZE];
 405};
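     /*
      * The ring is empty when start == end and full when
      * (end + 1) % MCE_RING_SIZE == start, so one slot always stays unused.
      */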
 406static DEFINE_PER_CPU(struct mce_ring, mce_ring);
 407
 408/* Runs with CPU affinity in workqueue */
 409static int mce_ring_empty(void)
 410{
 411	struct mce_ring *r = &__get_cpu_var(mce_ring);
 412
 413	return r->start == r->end;
 414}
 415
 416static int mce_ring_get(unsigned long *pfn)
 417{
 418	struct mce_ring *r;
 419	int ret = 0;
 420
 421	*pfn = 0;
 422	get_cpu();
 423	r = &__get_cpu_var(mce_ring);
 424	if (r->start == r->end)
 425		goto out;
 426	*pfn = r->ring[r->start];
 427	r->start = (r->start + 1) % MCE_RING_SIZE;
 428	ret = 1;
 429out:
 430	put_cpu();
 431	return ret;
 432}
 433
 434/* Always runs in MCE context with preempt off */
 435static int mce_ring_add(unsigned long pfn)
 436{
 437	struct mce_ring *r = &__get_cpu_var(mce_ring);
 438	unsigned next;
 439
 440	next = (r->end + 1) % MCE_RING_SIZE;
 441	if (next == r->start)
 442		return -1;
 443	r->ring[r->end] = pfn;
 444	wmb();
 445	r->end = next;
 446	return 0;
 447}
 448
 449int mce_available(struct cpuinfo_x86 *c)
 450{
 451	if (mce_disabled)
 452		return 0;
 453	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 454}
 455
 456static void mce_schedule_work(void)
 457{
 458	if (!mce_ring_empty()) {
 459		struct work_struct *work = &__get_cpu_var(mce_work);
 460		if (!work_pending(work))
 461			schedule_work(work);
 462	}
 463}
 464
 465/*
 466 * Get the address of the instruction at the time of the machine check
 467 * error.
 468 */
 469static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 470{
 471
 472	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
 473		m->ip = regs->ip;
 474		m->cs = regs->cs;
 475	} else {
 476		m->ip = 0;
 477		m->cs = 0;
 478	}
 479	if (rip_msr)
 480		m->ip = mce_rdmsrl(rip_msr);
 481}
 482
 483#ifdef CONFIG_X86_LOCAL_APIC
 484/*
 485 * Called after interrupts have been reenabled again
 486 * when a MCE happened during an interrupts off region
 487 * in the kernel.
 488 */
 489asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
 490{
 491	ack_APIC_irq();
 492	exit_idle();
 493	irq_enter();
 494	mce_notify_irq();
 495	mce_schedule_work();
 496	irq_exit();
 497}
 498#endif
 499
 500static void mce_report_event(struct pt_regs *regs)
 501{
 502	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 503		mce_notify_irq();
 504		/*
 505		 * Triggering the work queue here is just an insurance
 506		 * policy in case the syscall exit notify handler
 507		 * doesn't run soon enough or ends up running on the
 508		 * wrong CPU (can happen when audit sleeps)
 509		 */
 510		mce_schedule_work();
 511		return;
 512	}
 513
 514#ifdef CONFIG_X86_LOCAL_APIC
 515	/*
 516	 * Without APIC do not notify. The event will be picked
 517	 * up eventually.
 518	 */
 519	if (!cpu_has_apic)
 520		return;
 521
 522	/*
  523	 * When interrupts are disabled we cannot use
  524	 * kernel services safely. Trigger a self interrupt
  525	 * through the APIC so that the notification is done
  526	 * after interrupts are reenabled again.
 527	 */
 528	apic->send_IPI_self(MCE_SELF_VECTOR);
 529
 530	/*
 531	 * Wait for idle afterwards again so that we don't leave the
 532	 * APIC in a non idle state because the normal APIC writes
 533	 * cannot exclude us.
 534	 */
 535	apic_wait_icr_idle();
 536#endif
 537}
 538
 539DEFINE_PER_CPU(unsigned, mce_poll_count);
 540
 541/*
 542 * Poll for corrected events or events that happened before reset.
 543 * Those are just logged through /dev/mcelog.
 544 *
 545 * This is executed in standard interrupt context.
 546 *
  547 * Note: the spec recommends panicking for fatal unsignalled
  548 * errors here. However this would be quite problematic --
  549 * we would need to reimplement the Monarch handling and
  550 * it would mess up the exclusion between the exception handler
  551 * and the poll handler -- so we skip this for now.
  552 * These cases should not happen anyway, or only when the CPU
  553 * is already totally confused. In this case it's likely it will
 554 * not fully execute the machine check handler either.
 555 */
 556void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 557{
 558	struct mce m;
 559	int i;
 560
 561	percpu_inc(mce_poll_count);
 562
 563	mce_setup(&m);
 564
 565	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 566	for (i = 0; i < banks; i++) {
 567		if (!mce_banks[i].ctl || !test_bit(i, *b))
 568			continue;
 569
 570		m.misc = 0;
 571		m.addr = 0;
 572		m.bank = i;
 573		m.tsc = 0;
 574
 575		barrier();
 576		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 577		if (!(m.status & MCI_STATUS_VAL))
 578			continue;
 579
 580		/*
 581		 * Uncorrected or signalled events are handled by the exception
 582		 * handler when it is enabled, so don't process those here.
 583		 *
 584		 * TBD do the same check for MCI_STATUS_EN here?
 585		 */
 586		if (!(flags & MCP_UC) &&
 587		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 588			continue;
 589
 590		if (m.status & MCI_STATUS_MISCV)
 591			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
 592		if (m.status & MCI_STATUS_ADDRV)
 593			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
 594
 595		if (!(flags & MCP_TIMESTAMP))
 596			m.tsc = 0;
 597		/*
 598		 * Don't get the IP here because it's unlikely to
 599		 * have anything to do with the actual error location.
 600		 */
 601		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
 602			mce_log(&m);
 603			add_taint(TAINT_MACHINE_CHECK);
 604		}
 605
 606		/*
 607		 * Clear state for this bank.
 608		 */
 609		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 610	}
 611
 612	/*
 613	 * Don't clear MCG_STATUS here because it's only defined for
 614	 * exceptions.
 615	 */
 616
 617	sync_core();
 618}
 619EXPORT_SYMBOL_GPL(machine_check_poll);
 620
 621/*
 622 * Do a quick check if any of the events requires a panic.
 623 * This decides if we keep the events around or clear them.
 624 */
 625static int mce_no_way_out(struct mce *m, char **msg)
 626{
 627	int i;
 628
 629	for (i = 0; i < banks; i++) {
 630		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 631		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
 632			return 1;
 633	}
 634	return 0;
 635}
 636
 637/*
 638 * Variable to establish order between CPUs while scanning.
  639 * Each CPU initially spins until mce_executing equals its number.
 640 */
 641static atomic_t mce_executing;
 642
 643/*
 644 * Defines order of CPUs on entry. First CPU becomes Monarch.
 645 */
 646static atomic_t mce_callin;
 647
 648/*
 649 * Check if a timeout waiting for other CPUs happened.
 650 */
 651static int mce_timed_out(u64 *t)
 652{
 653	/*
 654	 * The others already did panic for some reason.
 655	 * Bail out like in a timeout.
 656	 * rmb() to tell the compiler that system_state
 657	 * might have been modified by someone else.
 658	 */
 659	rmb();
 660	if (atomic_read(&mce_paniced))
 661		wait_for_panic();
 662	if (!monarch_timeout)
 663		goto out;
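     	/* *t is in ns; each spin in the caller burns SPINUNIT (100 ns). */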
 664	if ((s64)*t < SPINUNIT) {
 665		/* CHECKME: Make panic default for 1 too? */
 666		if (tolerant < 1)
 667			mce_panic("Timeout synchronizing machine check over CPUs",
 668				  NULL, NULL);
 669		cpu_missing = 1;
 670		return 1;
 671	}
 672	*t -= SPINUNIT;
 673out:
 674	touch_nmi_watchdog();
 675	return 0;
 676}
 677
 678/*
 679 * The Monarch's reign.  The Monarch is the CPU who entered
 680 * the machine check handler first. It waits for the others to
  681 * raise the exception too and then grades them. If any
  682 * error is fatal, it panics. Only then does it let the others continue.
 683 *
 684 * The other CPUs entering the MCE handler will be controlled by the
 685 * Monarch. They are called Subjects.
 686 *
  687 * This way we prevent any potential data corruption in an unrecoverable case
  688 * and also make sure that all CPUs' errors are always examined.
  689 *
  690 * This also detects the case of a machine check event coming from outer
  691 * space (not detected by any CPU). In this case some external agent wants
  692 * us to shut down, so panic too.
  693 *
  694 * The other CPUs might still decide to panic if the handler happens
  695 * in an unrecoverable place, but in this case the system is in a semi-stable
 696 * state and won't corrupt anything by itself. It's ok to let the others
 697 * continue for a bit first.
 698 *
 699 * All the spin loops have timeouts; when a timeout happens a CPU
 700 * typically elects itself to be Monarch.
 701 */
 702static void mce_reign(void)
 703{
 704	int cpu;
 705	struct mce *m = NULL;
 706	int global_worst = 0;
 707	char *msg = NULL;
 708	char *nmsg = NULL;
 709
 710	/*
 711	 * This CPU is the Monarch and the other CPUs have run
 712	 * through their handlers.
 713	 * Grade the severity of the errors of all the CPUs.
 714	 */
 715	for_each_possible_cpu(cpu) {
 716		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
 717					    &nmsg);
 718		if (severity > global_worst) {
 719			msg = nmsg;
 720			global_worst = severity;
 721			m = &per_cpu(mces_seen, cpu);
 722		}
 723	}
 724
 725	/*
 726	 * Cannot recover? Panic here then.
 727	 * This dumps all the mces in the log buffer and stops the
 728	 * other CPUs.
 729	 */
 730	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
 731		mce_panic("Fatal Machine check", m, msg);
 732
 733	/*
 734	 * For UC somewhere we let the CPU who detects it handle it.
 735	 * Also must let continue the others, otherwise the handling
 736	 * CPU could deadlock on a lock.
 737	 */
 738
 739	/*
 740	 * No machine check event found. Must be some external
 741	 * source or one CPU is hung. Panic.
 742	 */
 743	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
 744		mce_panic("Machine check from unknown source", NULL, NULL);
 745
 746	/*
 747	 * Now clear all the mces_seen so that they don't reappear on
 748	 * the next mce.
 749	 */
 750	for_each_possible_cpu(cpu)
 751		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 752}
 753
 754static atomic_t global_nwo;
 755
 756/*
 757 * Start of Monarch synchronization. This waits until all CPUs have
 758 * entered the exception handler and then determines if any of them
 759 * saw a fatal event that requires panic. Then it executes them
 760 * in the entry order.
 761 * TBD double check parallel CPU hotunplug
 762 */
 763static int mce_start(int *no_way_out)
 764{
 765	int order;
 766	int cpus = num_online_cpus();
 767	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
 768
 769	if (!timeout)
 770		return -1;
 771
 772	atomic_add(*no_way_out, &global_nwo);
 773	/*
 774	 * global_nwo should be updated before mce_callin
 775	 */
 776	smp_wmb();
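     	/*
     	 * The first CPU to increment mce_callin gets order == 1 and becomes
     	 * the Monarch below; all later callers become Subjects.
     	 */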
 777	order = atomic_inc_return(&mce_callin);
 778
 779	/*
 780	 * Wait for everyone.
 781	 */
 782	while (atomic_read(&mce_callin) != cpus) {
 783		if (mce_timed_out(&timeout)) {
 784			atomic_set(&global_nwo, 0);
 785			return -1;
 786		}
 787		ndelay(SPINUNIT);
 788	}
 789
 790	/*
 791	 * mce_callin should be read before global_nwo
 792	 */
 793	smp_rmb();
 794
 795	if (order == 1) {
 796		/*
 797		 * Monarch: Starts executing now, the others wait.
 798		 */
 799		atomic_set(&mce_executing, 1);
 800	} else {
 801		/*
 802		 * Subject: Now start the scanning loop one by one in
 803		 * the original callin order.
  804		 * This way any shared banks will only be seen by one
  805		 * CPU before being cleared, avoiding duplicates.
 806		 */
 807		while (atomic_read(&mce_executing) < order) {
 808			if (mce_timed_out(&timeout)) {
 809				atomic_set(&global_nwo, 0);
 810				return -1;
 811			}
 812			ndelay(SPINUNIT);
 813		}
 814	}
 815
 816	/*
 817	 * Cache the global no_way_out state.
 818	 */
 819	*no_way_out = atomic_read(&global_nwo);
 820
 821	return order;
 822}
 823
 824/*
 825 * Synchronize between CPUs after main scanning loop.
 826 * This invokes the bulk of the Monarch processing.
 827 */
 828static int mce_end(int order)
 829{
 830	int ret = -1;
 831	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
 832
 833	if (!timeout)
 834		goto reset;
 835	if (order < 0)
 836		goto reset;
 837
 838	/*
 839	 * Allow others to run.
 840	 */
 841	atomic_inc(&mce_executing);
 842
 843	if (order == 1) {
 844		/* CHECKME: Can this race with a parallel hotplug? */
 845		int cpus = num_online_cpus();
 846
 847		/*
 848		 * Monarch: Wait for everyone to go through their scanning
 849		 * loops.
 850		 */
 851		while (atomic_read(&mce_executing) <= cpus) {
 852			if (mce_timed_out(&timeout))
 853				goto reset;
 854			ndelay(SPINUNIT);
 855		}
 856
 857		mce_reign();
 858		barrier();
 859		ret = 0;
 860	} else {
 861		/*
 862		 * Subject: Wait for Monarch to finish.
 863		 */
 864		while (atomic_read(&mce_executing) != 0) {
 865			if (mce_timed_out(&timeout))
 866				goto reset;
 867			ndelay(SPINUNIT);
 868		}
 869
 870		/*
 871		 * Don't reset anything. That's done by the Monarch.
 872		 */
 873		return 0;
 874	}
 875
 876	/*
 877	 * Reset all global state.
 878	 */
 879reset:
 880	atomic_set(&global_nwo, 0);
 881	atomic_set(&mce_callin, 0);
 882	barrier();
 883
 884	/*
 885	 * Let others run again.
 886	 */
 887	atomic_set(&mce_executing, 0);
 888	return ret;
 889}
 890
 891/*
 892 * Check if the address reported by the CPU is in a format we can parse.
 893 * It would be possible to add code for most other cases, but all would
 894 * be somewhat complicated (e.g. segment offset would require an instruction
  895 * parser). So only support physical addresses up to page granularity for now.
 896 */
 897static int mce_usable_address(struct mce *m)
 898{
 899	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 900		return 0;
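     	/*
     	 * MCi_MISC bits 5:0 give the least significant valid address bit,
     	 * bits 8:6 the address mode; accept only physical addresses that
     	 * are valid to at least page granularity.
     	 */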
 901	if ((m->misc & 0x3f) > PAGE_SHIFT)
 902		return 0;
 903	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
 904		return 0;
 905	return 1;
 906}
 907
 908static void mce_clear_state(unsigned long *toclear)
 909{
 910	int i;
 911
 912	for (i = 0; i < banks; i++) {
 913		if (test_bit(i, toclear))
 914			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 915	}
 916}
 917
 918/*
 919 * The actual machine check handler. This only handles real
 920 * exceptions when something got corrupted coming in through int 18.
 921 *
 922 * This is executed in NMI context not subject to normal locking rules. This
 923 * implies that most kernel services cannot be safely used. Don't even
 924 * think about putting a printk in there!
 925 *
 926 * On Intel systems this is entered on all CPUs in parallel through
 927 * MCE broadcast. However some CPUs might be broken beyond repair,
 928 * so be always careful when synchronizing with others.
 929 */
 930void do_machine_check(struct pt_regs *regs, long error_code)
 931{
 932	struct mce m, *final;
 933	int i;
 934	int worst = 0;
 935	int severity;
 936	/*
 937	 * Establish sequential order between the CPUs entering the machine
 938	 * check handler.
 939	 */
 940	int order;
 941	/*
 942	 * If no_way_out gets set, there is no safe way to recover from this
 943	 * MCE.  If tolerant is cranked up, we'll try anyway.
 944	 */
 945	int no_way_out = 0;
 946	/*
 947	 * If kill_it gets set, there might be a way to recover from this
 948	 * error.
 949	 */
 950	int kill_it = 0;
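     	/* Banks scanned here whose status registers must be cleared at the end. */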
 951	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 952	char *msg = "Unknown";
 953
 954	atomic_inc(&mce_entry);
 955
 956	percpu_inc(mce_exception_count);
 957
 958	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 959			   18, SIGKILL) == NOTIFY_STOP)
 960		goto out;
 961	if (!banks)
 962		goto out;
 963
 964	mce_setup(&m);
 965
 966	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 967	final = &__get_cpu_var(mces_seen);
 968	*final = m;
 969
 970	no_way_out = mce_no_way_out(&m, &msg);
 971
 972	barrier();
 973
 974	/*
  975	 * If there is no restart IP we must always kill or panic.
 976	 */
 977	if (!(m.mcgstatus & MCG_STATUS_RIPV))
 978		kill_it = 1;
 979
 980	/*
 981	 * Go through all the banks in exclusion of the other CPUs.
 982	 * This way we don't report duplicated events on shared banks
 983	 * because the first one to see it will clear it.
 984	 */
 985	order = mce_start(&no_way_out);
 986	for (i = 0; i < banks; i++) {
 987		__clear_bit(i, toclear);
 988		if (!mce_banks[i].ctl)
 989			continue;
 990
 991		m.misc = 0;
 992		m.addr = 0;
 993		m.bank = i;
 994
 995		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 996		if ((m.status & MCI_STATUS_VAL) == 0)
 997			continue;
 998
 999		/*
 1000		 * Non-uncorrected or non-signaled errors are handled by
 1001		 * machine_check_poll(). Leave them alone, unless this panics.
1002		 */
1003		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1004			!no_way_out)
1005			continue;
1006
1007		/*
1008		 * Set taint even when machine check was not enabled.
1009		 */
1010		add_taint(TAINT_MACHINE_CHECK);
1011
1012		severity = mce_severity(&m, tolerant, NULL);
1013
1014		/*
 1015		 * If the machine check was meant for the corrected-error (poll)
 1016		 * handler, don't touch it here, unless we're panicking.
1017		 */
1018		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
1019			continue;
1020		__set_bit(i, toclear);
1021		if (severity == MCE_NO_SEVERITY) {
1022			/*
1023			 * Machine check event was not enabled. Clear, but
1024			 * ignore.
1025			 */
1026			continue;
1027		}
1028
1029		/*
1030		 * Kill on action required.
1031		 */
1032		if (severity == MCE_AR_SEVERITY)
1033			kill_it = 1;
1034
1035		if (m.status & MCI_STATUS_MISCV)
1036			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
1037		if (m.status & MCI_STATUS_ADDRV)
1038			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
1039
1040		/*
1041		 * Action optional error. Queue address for later processing.
1042		 * When the ring overflows we just ignore the AO error.
1043		 * RED-PEN add some logging mechanism when
 1044		 * mce_usable_address or mce_ring_add fails.
1045		 * RED-PEN don't ignore overflow for tolerant == 0
1046		 */
1047		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1048			mce_ring_add(m.addr >> PAGE_SHIFT);
1049
1050		mce_get_rip(&m, regs);
1051		mce_log(&m);
1052
1053		if (severity > worst) {
1054			*final = m;
1055			worst = severity;
1056		}
1057	}
1058
1059	if (!no_way_out)
1060		mce_clear_state(toclear);
1061
1062	/*
1063	 * Do most of the synchronization with other CPUs.
1064	 * When there's any problem use only local no_way_out state.
1065	 */
1066	if (mce_end(order) < 0)
1067		no_way_out = worst >= MCE_PANIC_SEVERITY;
1068
1069	/*
1070	 * If we have decided that we just CAN'T continue, and the user
1071	 * has not set tolerant to an insane level, give up and die.
1072	 *
1073	 * This is mainly used in the case when the system doesn't
1074	 * support MCE broadcasting or it has been disabled.
1075	 */
1076	if (no_way_out && tolerant < 3)
1077		mce_panic("Fatal machine check on current CPU", final, msg);
1078
1079	/*
1080	 * If the error seems to be unrecoverable, something should be
1081	 * done.  Try to kill as little as possible.  If we can kill just
1082	 * one task, do that.  If the user has set the tolerance very
1083	 * high, don't try to do anything at all.
1084	 */
1085
1086	if (kill_it && tolerant < 3)
1087		force_sig(SIGBUS, current);
1088
1089	/* notify userspace ASAP */
1090	set_thread_flag(TIF_MCE_NOTIFY);
1091
1092	if (worst > 0)
1093		mce_report_event(regs);
1094	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1095out:
1096	atomic_dec(&mce_entry);
1097	sync_core();
1098}
1099EXPORT_SYMBOL_GPL(do_machine_check);
1100
1101/* dummy to break dependency. actual code is in mm/memory-failure.c */
1102void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1103{
1104	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1105}
1106
1107/*
1108 * Called after mce notification in process context. This code
1109 * is allowed to sleep. Call the high level VM handler to process
1110 * any corrupted pages.
1111 * Assume that the work queue code only calls this one at a time
1112 * per CPU.
1113 * Note we don't disable preemption, so this code might run on the wrong
1114 * CPU. In this case the event is picked up by the scheduled work queue.
1115 * This is merely a fast path to expedite processing in some common
1116 * cases.
1117 */
1118void mce_notify_process(void)
1119{
1120	unsigned long pfn;
1121	mce_notify_irq();
1122	while (mce_ring_get(&pfn))
1123		memory_failure(pfn, MCE_VECTOR);
1124}
1125
1126static void mce_process_work(struct work_struct *dummy)
1127{
1128	mce_notify_process();
1129}
1130
1131#ifdef CONFIG_X86_MCE_INTEL
1132/***
1133 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1134 * @cpu: The CPU on which the event occurred.
1135 * @status: Event status information
1136 *
1137 * This function should be called by the thermal interrupt after the
1138 * event has been processed and the decision was made to log the event
1139 * further.
1140 *
1141 * The status parameter will be saved to the 'status' field of 'struct mce'
1142 * and historically has been the register value of the
1143 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1144 */
1145void mce_log_therm_throt_event(__u64 status)
1146{
1147	struct mce m;
1148
1149	mce_setup(&m);
1150	m.bank = MCE_THERMAL_BANK;
1151	m.status = status;
1152	mce_log(&m);
1153}
1154#endif /* CONFIG_X86_MCE_INTEL */
1155
1156/*
1157 * Periodic polling timer for "silent" machine check errors.  If the
1158 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1159 * errors, poll 2x slower (up to check_interval seconds).
1160 */
1161static int check_interval = 5 * 60; /* 5 minutes */
1162
1163static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1164static DEFINE_PER_CPU(struct timer_list, mce_timer);
1165
1166static void mce_start_timer(unsigned long data)
1167{
1168	struct timer_list *t = &per_cpu(mce_timer, data);
1169	int *n;
1170
1171	WARN_ON(smp_processor_id() != data);
1172
1173	if (mce_available(&current_cpu_data)) {
1174		machine_check_poll(MCP_TIMESTAMP,
1175				&__get_cpu_var(mce_poll_banks));
1176	}
1177
1178	/*
1179	 * Alert userspace if needed.  If we logged an MCE, reduce the
1180	 * polling interval, otherwise increase the polling interval.
1181	 */
1182	n = &__get_cpu_var(mce_next_interval);
1183	if (mce_notify_irq())
1184		*n = max(*n/2, HZ/100);
1185	else
1186		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1187
1188	t->expires = jiffies + *n;
1189	add_timer_on(t, smp_processor_id());
1190}
1191
1192static void mce_do_trigger(struct work_struct *work)
1193{
1194	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1195}
1196
1197static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1198
1199/*
1200 * Notify the user(s) about new machine check events.
1201 * Can be called from interrupt context, but not from machine check/NMI
1202 * context.
1203 */
1204int mce_notify_irq(void)
1205{
1206	/* Not more than two messages every minute */
1207	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1208
1209	clear_thread_flag(TIF_MCE_NOTIFY);
1210
1211	if (test_and_clear_bit(0, &mce_need_notify)) {
1212		wake_up_interruptible(&mce_wait);
1213
1214		/*
1215		 * There is no risk of missing notifications because
1216		 * work_pending is always cleared before the function is
1217		 * executed.
1218		 */
1219		if (mce_helper[0] && !work_pending(&mce_trigger_work))
1220			schedule_work(&mce_trigger_work);
1221
1222		if (__ratelimit(&ratelimit))
1223			printk(KERN_INFO "Machine check events logged\n");
1224
1225		return 1;
1226	}
1227	return 0;
1228}
1229EXPORT_SYMBOL_GPL(mce_notify_irq);
1230
1231static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1232{
1233	int i;
1234
1235	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1236	if (!mce_banks)
1237		return -ENOMEM;
1238	for (i = 0; i < banks; i++) {
1239		struct mce_bank *b = &mce_banks[i];
1240
1241		b->ctl = -1ULL;
1242		b->init = 1;
1243	}
1244	return 0;
1245}
1246
1247/*
1248 * Initialize Machine Checks for a CPU.
1249 */
1250static int __cpuinit __mcheck_cpu_cap_init(void)
1251{
1252	unsigned b;
1253	u64 cap;
1254
1255	rdmsrl(MSR_IA32_MCG_CAP, cap);
1256
1257	b = cap & MCG_BANKCNT_MASK;
1258	if (!banks)
1259		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1260
1261	if (b > MAX_NR_BANKS) {
1262		printk(KERN_WARNING
1263		       "MCE: Using only %u machine check banks out of %u\n",
1264			MAX_NR_BANKS, b);
1265		b = MAX_NR_BANKS;
1266	}
1267
1268	/* Don't support asymmetric configurations today */
1269	WARN_ON(banks != 0 && b != banks);
1270	banks = b;
1271	if (!mce_banks) {
1272		int err = __mcheck_cpu_mce_banks_init();
1273
1274		if (err)
1275			return err;
1276	}
1277
1278	/* Use accurate RIP reporting if available. */
1279	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1280		rip_msr = MSR_IA32_MCG_EIP;
1281
1282	if (cap & MCG_SER_P)
1283		mce_ser = 1;
1284
1285	return 0;
1286}
1287
1288static void __mcheck_cpu_init_generic(void)
1289{
1290	mce_banks_t all_banks;
1291	u64 cap;
1292	int i;
1293
1294	/*
1295	 * Log the machine checks left over from the previous reset.
1296	 */
1297	bitmap_fill(all_banks, MAX_NR_BANKS);
1298	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1299
1300	set_in_cr4(X86_CR4_MCE);
1301
1302	rdmsrl(MSR_IA32_MCG_CAP, cap);
1303	if (cap & MCG_CTL_P)
1304		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1305
1306	for (i = 0; i < banks; i++) {
1307		struct mce_bank *b = &mce_banks[i];
1308
1309		if (!b->init)
1310			continue;
1311		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1312		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1313	}
1314}
1315
1316/* Add per CPU specific workarounds here */
1317static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1318{
1319	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1320		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
1321		return -EOPNOTSUPP;
1322	}
1323
1324	/* This should be disabled by the BIOS, but isn't always */
1325	if (c->x86_vendor == X86_VENDOR_AMD) {
1326		if (c->x86 == 15 && banks > 4) {
1327			/*
1328			 * disable GART TBL walk error reporting, which
1329			 * trips off incorrectly with the IOMMU & 3ware
1330			 * & Cerberus:
1331			 */
1332			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1333		}
1334		if (c->x86 <= 17 && mce_bootlog < 0) {
1335			/*
1336			 * Lots of broken BIOS around that don't clear them
1337			 * by default and leave crap in there. Don't log:
1338			 */
1339			mce_bootlog = 0;
1340		}
1341		/*
1342		 * Various K7s with broken bank 0 around. Always disable
1343		 * by default.
1344		 */
1345		 if (c->x86 == 6 && banks > 0)
1346			mce_banks[0].ctl = 0;
1347	}
1348
1349	if (c->x86_vendor == X86_VENDOR_INTEL) {
1350		/*
1351		 * SDM documents that on family 6 bank 0 should not be written
1352		 * because it aliases to another special BIOS controlled
1353		 * register.
 1354		 * But it's not aliased anymore on model 0x1a+.
1355		 * Don't ignore bank 0 completely because there could be a
1356		 * valid event later, merely don't write CTL0.
1357		 */
1358
1359		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1360			mce_banks[0].init = 0;
1361
1362		/*
1363		 * All newer Intel systems support MCE broadcasting. Enable
1364		 * synchronization with a one second timeout.
1365		 */
1366		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1367			monarch_timeout < 0)
1368			monarch_timeout = USEC_PER_SEC;
1369
1370		/*
1371		 * There are also broken BIOSes on some Pentium M and
1372		 * earlier systems:
1373		 */
1374		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1375			mce_bootlog = 0;
1376	}
1377	if (monarch_timeout < 0)
1378		monarch_timeout = 0;
1379	if (mce_bootlog != 0)
1380		mce_panic_timeout = 30;
1381
1382	return 0;
1383}
1384
1385static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1386{
1387	if (c->x86 != 5)
1388		return;
1389	switch (c->x86_vendor) {
1390	case X86_VENDOR_INTEL:
1391		intel_p5_mcheck_init(c);
1392		break;
1393	case X86_VENDOR_CENTAUR:
1394		winchip_mcheck_init(c);
1395		break;
1396	}
1397}
1398
1399static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1400{
1401	switch (c->x86_vendor) {
1402	case X86_VENDOR_INTEL:
1403		mce_intel_feature_init(c);
1404		break;
1405	case X86_VENDOR_AMD:
1406		mce_amd_feature_init(c);
1407		break;
1408	default:
1409		break;
1410	}
1411}
1412
1413static void __mcheck_cpu_init_timer(void)
1414{
1415	struct timer_list *t = &__get_cpu_var(mce_timer);
1416	int *n = &__get_cpu_var(mce_next_interval);
1417
1418	setup_timer(t, mce_start_timer, smp_processor_id());
1419
1420	if (mce_ignore_ce)
1421		return;
1422
1423	*n = check_interval * HZ;
1424	if (!*n)
1425		return;
1426	t->expires = round_jiffies(jiffies + *n);
1427	add_timer_on(t, smp_processor_id());
1428}
1429
1430/* Handle unconfigured int18 (should never happen) */
1431static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1432{
1433	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1434	       smp_processor_id());
1435}
1436
1437/* Call the installed machine check handler for this CPU setup. */
1438void (*machine_check_vector)(struct pt_regs *, long error_code) =
1439						unexpected_machine_check;
1440
1441/*
1442 * Called for each booted CPU to set up machine checks.
1443 * Must be called with preempt off:
1444 */
1445void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1446{
1447	if (mce_disabled)
1448		return;
1449
1450	__mcheck_cpu_ancient_init(c);
1451
1452	if (!mce_available(c))
1453		return;
1454
1455	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1456		mce_disabled = 1;
1457		return;
1458	}
1459
1460	machine_check_vector = do_machine_check;
1461
1462	__mcheck_cpu_init_generic();
1463	__mcheck_cpu_init_vendor(c);
1464	__mcheck_cpu_init_timer();
1465	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1466
1467}
1468
1469/*
1470 * Character device to read and clear the MCE log.
1471 */
1472
1473static DEFINE_SPINLOCK(mce_state_lock);
1474static int		open_count;		/* #times opened */
1475static int		open_exclu;		/* already open exclusive? */
1476
1477static int mce_open(struct inode *inode, struct file *file)
1478{
1479	spin_lock(&mce_state_lock);
1480
1481	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1482		spin_unlock(&mce_state_lock);
1483
1484		return -EBUSY;
1485	}
1486
1487	if (file->f_flags & O_EXCL)
1488		open_exclu = 1;
1489	open_count++;
1490
1491	spin_unlock(&mce_state_lock);
1492
1493	return nonseekable_open(inode, file);
1494}
1495
1496static int mce_release(struct inode *inode, struct file *file)
1497{
1498	spin_lock(&mce_state_lock);
1499
1500	open_count--;
1501	open_exclu = 0;
1502
1503	spin_unlock(&mce_state_lock);
1504
1505	return 0;
1506}
1507
1508static void collect_tscs(void *data)
1509{
1510	unsigned long *cpu_tsc = (unsigned long *)data;
1511
1512	rdtscll(cpu_tsc[smp_processor_id()]);
1513}
1514
1515static int mce_apei_read_done;
1516
1517/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1518static int __mce_read_apei(char __user **ubuf, size_t usize)
1519{
1520	int rc;
1521	u64 record_id;
1522	struct mce m;
1523
1524	if (usize < sizeof(struct mce))
1525		return -EINVAL;
1526
1527	rc = apei_read_mce(&m, &record_id);
1528	/* Error or no more MCE record */
1529	if (rc <= 0) {
1530		mce_apei_read_done = 1;
1531		return rc;
1532	}
1533	rc = -EFAULT;
1534	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1535		return rc;
1536	/*
 1537	 * Ideally we should clear the record only after it has been
 1538	 * flushed to disk or sent over the network by /sbin/mcelog,
 1539	 * but we have no interface to support that now, so just
 1540	 * clear it here to avoid duplication.
1541	 */
1542	rc = apei_clear_mce(record_id);
1543	if (rc) {
1544		mce_apei_read_done = 1;
1545		return rc;
1546	}
1547	*ubuf += sizeof(struct mce);
1548
1549	return 0;
1550}
1551
1552static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1553			loff_t *off)
1554{
1555	char __user *buf = ubuf;
1556	unsigned long *cpu_tsc;
1557	unsigned prev, next;
1558	int i, err;
1559
1560	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1561	if (!cpu_tsc)
1562		return -ENOMEM;
1563
1564	mutex_lock(&mce_read_mutex);
1565
1566	if (!mce_apei_read_done) {
1567		err = __mce_read_apei(&buf, usize);
1568		if (err || buf != ubuf)
1569			goto out;
1570	}
1571
1572	next = rcu_dereference_check_mce(mcelog.next);
1573
1574	/* Only supports full reads right now */
1575	err = -EINVAL;
1576	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1577		goto out;
1578
1579	err = 0;
1580	prev = 0;
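     	/*
     	 * Copy out all finished records, then try to reset mcelog.next to 0.
     	 * If more records were appended meanwhile, the cmpxchg fails and
     	 * returns the new index, and the loop copies those as well.
     	 */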
1581	do {
1582		for (i = prev; i < next; i++) {
1583			unsigned long start = jiffies;
1584
1585			while (!mcelog.entry[i].finished) {
1586				if (time_after_eq(jiffies, start + 2)) {
1587					memset(mcelog.entry + i, 0,
1588					       sizeof(struct mce));
1589					goto timeout;
1590				}
1591				cpu_relax();
1592			}
1593			smp_rmb();
1594			err |= copy_to_user(buf, mcelog.entry + i,
1595					    sizeof(struct mce));
1596			buf += sizeof(struct mce);
1597timeout:
1598			;
1599		}
1600
1601		memset(mcelog.entry + prev, 0,
1602		       (next - prev) * sizeof(struct mce));
1603		prev = next;
1604		next = cmpxchg(&mcelog.next, prev, 0);
1605	} while (next != prev);
1606
1607	synchronize_sched();
1608
1609	/*
1610	 * Collect entries that were still getting written before the
1611	 * synchronize.
1612	 */
1613	on_each_cpu(collect_tscs, cpu_tsc, 1);
1614
1615	for (i = next; i < MCE_LOG_LEN; i++) {
1616		if (mcelog.entry[i].finished &&
1617		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1618			err |= copy_to_user(buf, mcelog.entry+i,
1619					    sizeof(struct mce));
1620			smp_rmb();
1621			buf += sizeof(struct mce);
1622			memset(&mcelog.entry[i], 0, sizeof(struct mce));
1623		}
1624	}
1625
1626	if (err)
1627		err = -EFAULT;
1628
1629out:
1630	mutex_unlock(&mce_read_mutex);
1631	kfree(cpu_tsc);
1632
1633	return err ? err : buf - ubuf;
1634}
1635
1636static unsigned int mce_poll(struct file *file, poll_table *wait)
1637{
1638	poll_wait(file, &mce_wait, wait);
1639	if (rcu_dereference_check_mce(mcelog.next))
1640		return POLLIN | POLLRDNORM;
1641	if (!mce_apei_read_done && apei_check_mce())
1642		return POLLIN | POLLRDNORM;
1643	return 0;
1644}
1645
1646static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1647{
1648	int __user *p = (int __user *)arg;
1649
1650	if (!capable(CAP_SYS_ADMIN))
1651		return -EPERM;
1652
1653	switch (cmd) {
1654	case MCE_GET_RECORD_LEN:
1655		return put_user(sizeof(struct mce), p);
1656	case MCE_GET_LOG_LEN:
1657		return put_user(MCE_LOG_LEN, p);
1658	case MCE_GETCLEAR_FLAGS: {
1659		unsigned flags;
1660
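     		/* Atomically snapshot and clear the overflow flags word. */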
1661		do {
1662			flags = mcelog.flags;
1663		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1664
1665		return put_user(flags, p);
1666	}
1667	default:
1668		return -ENOTTY;
1669	}
1670}
1671
1672/* Modified in mce-inject.c, so not static or const */
1673struct file_operations mce_chrdev_ops = {
1674	.open			= mce_open,
1675	.release		= mce_release,
1676	.read			= mce_read,
1677	.poll			= mce_poll,
1678	.unlocked_ioctl		= mce_ioctl,
1679};
1680EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1681
1682static struct miscdevice mce_log_device = {
1683	MISC_MCELOG_MINOR,
1684	"mcelog",
1685	&mce_chrdev_ops,
1686};
1687
1688/*
1689 * mce=off Disables machine check
1690 * mce=no_cmci Disables CMCI
1691 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1692 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1693 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1694 *	monarchtimeout is how long to wait for other CPUs on machine
1695 *	check, or 0 to not wait
1696 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1697 * mce=nobootlog Don't log MCEs from before booting.
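      * Example: "mce=2,500" sets tolerant to 2 and the Monarch timeout to 500 us.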
1698 */
1699static int __init mcheck_enable(char *str)
1700{
1701	if (*str == 0) {
1702		enable_p5_mce();
1703		return 1;
1704	}
1705	if (*str == '=')
1706		str++;
1707	if (!strcmp(str, "off"))
1708		mce_disabled = 1;
1709	else if (!strcmp(str, "no_cmci"))
1710		mce_cmci_disabled = 1;
1711	else if (!strcmp(str, "dont_log_ce"))
1712		mce_dont_log_ce = 1;
1713	else if (!strcmp(str, "ignore_ce"))
1714		mce_ignore_ce = 1;
1715	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1716		mce_bootlog = (str[0] == 'b');
1717	else if (isdigit(str[0])) {
1718		get_option(&str, &tolerant);
1719		if (*str == ',') {
1720			++str;
1721			get_option(&str, &monarch_timeout);
1722		}
1723	} else {
1724		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1725		       str);
1726		return 0;
1727	}
1728	return 1;
1729}
1730__setup("mce", mcheck_enable);
1731
1732int __init mcheck_init(void)
1733{
1734	atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1735
1736	mcheck_intel_therm_init();
1737
1738	return 0;
1739}
1740
1741/*
1742 * Sysfs support
1743 */
1744
1745/*
1746 * Disable machine checks on suspend and shutdown. We can't really handle
1747 * them later.
1748 */
1749static int mce_disable_error_reporting(void)
1750{
1751	int i;
1752
1753	for (i = 0; i < banks; i++) {
1754		struct mce_bank *b = &mce_banks[i];
1755
1756		if (b->init)
1757			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1758	}
1759	return 0;
1760}
1761
1762static int mce_suspend(struct sys_device *dev, pm_message_t state)
1763{
1764	return mce_disable_error_reporting();
1765}
1766
1767static int mce_shutdown(struct sys_device *dev)
1768{
1769	return mce_disable_error_reporting();
1770}
1771
1772/*
1773 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1774 * Only one CPU is active at this time, the others get re-added later using
1775 * CPU hotplug:
1776 */
1777static int mce_resume(struct sys_device *dev)
1778{
1779	__mcheck_cpu_init_generic();
1780	__mcheck_cpu_init_vendor(&current_cpu_data);
1781
1782	return 0;
1783}
1784
1785static void mce_cpu_restart(void *data)
1786{
1787	del_timer_sync(&__get_cpu_var(mce_timer));
1788	if (!mce_available(&current_cpu_data))
1789		return;
1790	__mcheck_cpu_init_generic();
1791	__mcheck_cpu_init_timer();
1792}
1793
1794/* Reinit MCEs after user configuration changes */
1795static void mce_restart(void)
1796{
1797	on_each_cpu(mce_cpu_restart, NULL, 1);
1798}
1799
1800/* Toggle features for corrected errors */
1801static void mce_disable_ce(void *all)
1802{
1803	if (!mce_available(&current_cpu_data))
1804		return;
1805	if (all)
1806		del_timer_sync(&__get_cpu_var(mce_timer));
1807	cmci_clear();
1808}
1809
1810static void mce_enable_ce(void *all)
1811{
1812	if (!mce_available(&current_cpu_data))
1813		return;
1814	cmci_reenable();
1815	cmci_recheck();
1816	if (all)
1817		__mcheck_cpu_init_timer();
1818}
1819
1820static struct sysdev_class mce_sysclass = {
1821	.suspend	= mce_suspend,
1822	.shutdown	= mce_shutdown,
1823	.resume		= mce_resume,
1824	.name		= "machinecheck",
1825};
1826
1827DEFINE_PER_CPU(struct sys_device, mce_dev);
1828
1829__cpuinitdata
1830void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1831
1832static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
1833{
1834	return container_of(attr, struct mce_bank, attr);
1835}
1836
1837static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1838			 char *buf)
1839{
1840	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1841}
1842
1843static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1844			const char *buf, size_t size)
1845{
1846	u64 new;
1847
1848	if (strict_strtoull(buf, 0, &new) < 0)
1849		return -EINVAL;
1850
1851	attr_to_bank(attr)->ctl = new;
1852	mce_restart();
1853
1854	return size;
1855}
1856
1857static ssize_t
1858show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1859{
1860	strcpy(buf, mce_helper);
1861	strcat(buf, "\n");
1862	return strlen(mce_helper) + 1;
1863}
1864
1865static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1866				const char *buf, size_t siz)
1867{
1868	char *p;
1869
1870	strncpy(mce_helper, buf, sizeof(mce_helper));
1871	mce_helper[sizeof(mce_helper)-1] = 0;
1872	p = strchr(mce_helper, '\n');
1873
1874	if (p)
1875		*p = 0;
1876
1877	return strlen(mce_helper) + !!p;
1878}
1879
1880static ssize_t set_ignore_ce(struct sys_device *s,
1881			     struct sysdev_attribute *attr,
1882			     const char *buf, size_t size)
1883{
1884	u64 new;
1885
1886	if (strict_strtoull(buf, 0, &new) < 0)
1887		return -EINVAL;
1888
1889	if (mce_ignore_ce ^ !!new) {
1890		if (new) {
1891			/* disable ce features */
1892			on_each_cpu(mce_disable_ce, (void *)1, 1);
1893			mce_ignore_ce = 1;
1894		} else {
1895			/* enable ce features */
1896			mce_ignore_ce = 0;
1897			on_each_cpu(mce_enable_ce, (void *)1, 1);
1898		}
1899	}
1900	return size;
1901}
1902
1903static ssize_t set_cmci_disabled(struct sys_device *s,
1904				 struct sysdev_attribute *attr,
1905				 const char *buf, size_t size)
1906{
1907	u64 new;
1908
1909	if (strict_strtoull(buf, 0, &new) < 0)
1910		return -EINVAL;
1911
1912	if (mce_cmci_disabled ^ !!new) {
1913		if (new) {
1914			/* disable cmci */
1915			on_each_cpu(mce_disable_ce, NULL, 1);
1916			mce_cmci_disabled = 1;
1917		} else {
1918			/* enable cmci */
1919			mce_cmci_disabled = 0;
1920			on_each_cpu(mce_enable_ce, NULL, 1);
1921		}
1922	}
1923	return size;
1924}
1925
1926static ssize_t store_int_with_restart(struct sys_device *s,
1927				      struct sysdev_attribute *attr,
1928				      const char *buf, size_t size)
1929{
1930	ssize_t ret = sysdev_store_int(s, attr, buf, size);
1931	mce_restart();
1932	return ret;
1933}
1934
1935static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1936static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1937static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1938static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1939
1940static struct sysdev_ext_attribute attr_check_interval = {
1941	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1942		     store_int_with_restart),
1943	&check_interval
1944};
1945
1946static struct sysdev_ext_attribute attr_ignore_ce = {
1947	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
1948	&mce_ignore_ce
1949};
1950
1951static struct sysdev_ext_attribute attr_cmci_disabled = {
1952	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
1953	&mce_cmci_disabled
1954};
1955
1956static struct sysdev_attribute *mce_attrs[] = {
1957	&attr_tolerant.attr,
1958	&attr_check_interval.attr,
1959	&attr_trigger,
1960	&attr_monarch_timeout.attr,
1961	&attr_dont_log_ce.attr,
1962	&attr_ignore_ce.attr,
1963	&attr_cmci_disabled.attr,
1964	NULL
1965};
1966
1967static cpumask_var_t mce_dev_initialized;
1968
1969/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1970static __cpuinit int mce_create_device(unsigned int cpu)
1971{
1972	int err;
1973	int i, j;
1974
1975	if (!mce_available(&boot_cpu_data))
1976		return -EIO;
1977
1978	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1979	per_cpu(mce_dev, cpu).id	= cpu;
1980	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
1981
1982	err = sysdev_register(&per_cpu(mce_dev, cpu));
1983	if (err)
1984		return err;
1985
1986	for (i = 0; mce_attrs[i]; i++) {
1987		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1988		if (err)
1989			goto error;
1990	}
1991	for (j = 0; j < banks; j++) {
1992		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1993					&mce_banks[j].attr);
1994		if (err)
1995			goto error2;
1996	}
1997	cpumask_set_cpu(cpu, mce_dev_initialized);
1998
1999	return 0;
2000error2:
2001	while (--j >= 0)
2002		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
2003error:
2004	while (--i >= 0)
2005		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
2006
2007	sysdev_unregister(&per_cpu(mce_dev, cpu));
2008
2009	return err;
2010}
2011
2012static __cpuinit void mce_remove_device(unsigned int cpu)
2013{
2014	int i;
2015
2016	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
2017		return;
2018
2019	for (i = 0; mce_attrs[i]; i++)
2020		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
2021
2022	for (i = 0; i < banks; i++)
2023		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
2024
2025	sysdev_unregister(&per_cpu(mce_dev, cpu));
2026	cpumask_clear_cpu(cpu, mce_dev_initialized);
2027}
2028
2029/* Make sure there are no machine checks on offlined CPUs. */
2030static void __cpuinit mce_disable_cpu(void *h)
2031{
2032	unsigned long action = *(unsigned long *)h;
2033	int i;
2034
2035	if (!mce_available(&current_cpu_data))
2036		return;
2037
2038	if (!(action & CPU_TASKS_FROZEN))
2039		cmci_clear();
2040	for (i = 0; i < banks; i++) {
2041		struct mce_bank *b = &mce_banks[i];
2042
2043		if (b->init)
2044			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2045	}
2046}
2047
2048static void __cpuinit mce_reenable_cpu(void *h)
2049{
2050	unsigned long action = *(unsigned long *)h;
2051	int i;
2052
2053	if (!mce_available(&current_cpu_data))
2054		return;
2055
2056	if (!(action & CPU_TASKS_FROZEN))
2057		cmci_reenable();
2058	for (i = 0; i < banks; i++) {
2059		struct mce_bank *b = &mce_banks[i];
2060
2061		if (b->init)
2062			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2063	}
2064}
2065
2066/* Get notified when a cpu comes on/off. Be hotplug friendly. */
2067static int __cpuinit
2068mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2069{
2070	unsigned int cpu = (unsigned long)hcpu;
2071	struct timer_list *t = &per_cpu(mce_timer, cpu);
2072
2073	switch (action) {
2074	case CPU_ONLINE:
2075	case CPU_ONLINE_FROZEN:
2076		mce_create_device(cpu);
2077		if (threshold_cpu_callback)
2078			threshold_cpu_callback(action, cpu);
2079		break;
2080	case CPU_DEAD:
2081	case CPU_DEAD_FROZEN:
2082		if (threshold_cpu_callback)
2083			threshold_cpu_callback(action, cpu);
2084		mce_remove_device(cpu);
2085		break;
2086	case CPU_DOWN_PREPARE:
2087	case CPU_DOWN_PREPARE_FROZEN:
2088		del_timer_sync(t);
2089		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2090		break;
2091	case CPU_DOWN_FAILED:
2092	case CPU_DOWN_FAILED_FROZEN:
2093		if (!mce_ignore_ce && check_interval) {
2094			t->expires = round_jiffies(jiffies +
2095					   __get_cpu_var(mce_next_interval));
2096			add_timer_on(t, cpu);
2097		}
2098		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2099		break;
2100	case CPU_POST_DEAD:
2101		/* intentionally ignoring frozen here */
2102		cmci_rediscover(cpu);
2103		break;
2104	}
2105	return NOTIFY_OK;
2106}
2107
2108static struct notifier_block mce_cpu_notifier __cpuinitdata = {
2109	.notifier_call = mce_cpu_callback,
2110};
2111
2112static __init void mce_init_banks(void)
2113{
2114	int i;
2115
2116	for (i = 0; i < banks; i++) {
2117		struct mce_bank *b = &mce_banks[i];
2118		struct sysdev_attribute *a = &b->attr;
2119
2120		sysfs_attr_init(&a->attr);
2121		a->attr.name	= b->attrname;
2122		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2123
2124		a->attr.mode	= 0644;
2125		a->show		= show_bank;
2126		a->store	= set_bank;
2127	}
2128}
2129
2130static __init int mcheck_init_device(void)
2131{
2132	int err;
2133	i

Large files are truncated by the code viewer; the full file is available at the repository linked above.