
/arch/x86/kernel/tsc_sync.c

http://github.com/torvalds/linux
// SPDX-License-Identifier: GPL-2.0
/*
 * check TSC synchronization.
 *
 * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
 *
 * We check whether all boot CPUs have their TSCs synchronized,
 * print a warning if not and turn off the TSC clock-source.
 *
 * The warp-check is point-to-point between two CPUs, the CPU
 * initiating the bootup is the 'source CPU', the freshly booting
 * CPU is the 'target CPU'.
 *
 * Only two CPUs may participate - they can enter in any order.
 * ( The serial nature of the boot logic and the CPU hotplug lock
 *   protects against more than 2 CPUs entering this code. )
 */
#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>
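
/*
 * Per-CPU TSC_ADJUST bookkeeping: 'bootval' is the MSR value observed when
 * the CPU came up, 'adjusted' is the value this CPU is expected to run
 * with, 'nextcheck' rate-limits the periodic verification and 'warned'
 * makes sure the mismatch warning is emitted only once per CPU.
 */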
struct tsc_adjust {
        s64             bootval;
        s64             adjusted;
        unsigned long   nextcheck;
        bool            warned;
};

static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);

/*
 * TSCs on different sockets may be reset asynchronously.
 * This may cause the TSC ADJUST value on socket 0 to be non-zero.
 */
bool __read_mostly tsc_async_resets;

void mark_tsc_async_resets(char *reason)
{
        if (tsc_async_resets)
                return;
        tsc_async_resets = true;
        pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
}

void tsc_verify_tsc_adjust(bool resume)
{
        struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
        s64 curval;

        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                return;

        /* Skip unnecessary error messages if TSC already unstable */
        if (check_tsc_unstable())
                return;

        /* Rate limit the MSR check */
        if (!resume && time_before(jiffies, adj->nextcheck))
                return;
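
        /*
         * Push the next check one second (HZ jiffies) into the future, so
         * outside of resume the MSR is re-read at most about once a second.
         */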
        adj->nextcheck = jiffies + HZ;

        rdmsrl(MSR_IA32_TSC_ADJUST, curval);
        if (adj->adjusted == curval)
                return;

        /* Restore the original value */
        wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);

        if (!adj->warned || resume) {
                pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
                        smp_processor_id(), adj->adjusted, curval);
                adj->warned = true;
        }
}
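
/*
 * Example for tsc_sanitize_first_cpu() below: if the firmware left
 * TSC_ADJUST at -300 on the boot CPU, the value is forced back to 0 (with
 * a FW_BUG warning) unless tsc_async_resets is set. A non-boot CPU keeps
 * whatever value it came up with, since a physically hotplugged socket
 * legitimately needs a non-zero ADJUST value to line up with the already
 * running CPUs.
 */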
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
                                   unsigned int cpu, bool bootcpu)
{
        /*
         * First online CPU in a package stores the boot value in the
         * adjustment value. This value might change later via the sync
         * mechanism. If that fails we can still yell about boot values not
         * being consistent.
         *
         * On the boot CPU we just force-set the ADJUST value to 0 if it's
         * non-zero. We don't do that on non-boot CPUs because physical
         * hotplug should have set the ADJUST register to a value > 0 so
         * the TSC is in sync with the already running CPUs.
         *
         * Also don't force the ADJUST value to zero if that is a valid value
         * for socket 0 as determined by the system arch. This is required
         * when multiple sockets are reset asynchronously with each other
         * and socket 0 may not have a TSC ADJUST value of 0.
         */
        if (bootcpu && bootval != 0) {
                if (likely(!tsc_async_resets)) {
                        pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
                                cpu, bootval);
                        wrmsrl(MSR_IA32_TSC_ADJUST, 0);
                        bootval = 0;
                } else {
                        pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
                                cpu, bootval);
                }
        }
        cur->adjusted = bootval;
}

#ifndef CONFIG_SMP
bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
{
        struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
        s64 bootval;

        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                return false;

        /* Skip unnecessary error messages if TSC already unstable */
        if (check_tsc_unstable())
                return false;

        rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
        cur->bootval = bootval;
        cur->nextcheck = jiffies + HZ;
        tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
        return false;
}

#else /* !CONFIG_SMP */

/*
 * Store and check the TSC ADJUST MSR if available
 */
bool tsc_store_and_check_tsc_adjust(bool bootcpu)
{
        struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
        unsigned int refcpu, cpu = smp_processor_id();
        struct cpumask *mask;
        s64 bootval;

        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                return false;

        rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
        cur->bootval = bootval;
        cur->nextcheck = jiffies + HZ;
        cur->warned = false;

        /*
         * If a non-zero TSC value for socket 0 may be valid, then the
         * default adjusted value cannot be assumed to be zero either.
         */
        if (tsc_async_resets)
                cur->adjusted = bootval;

        /*
         * Check whether this CPU is the first in a package to come up. In
         * this case do not check the boot value against another package
         * because the new package might have been physically hotplugged,
         * where TSC_ADJUST is expected to be different. When called on the
         * boot CPU topology_core_cpumask() might not be available yet.
         */
        mask = topology_core_cpumask(cpu);
        refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;

        if (refcpu >= nr_cpu_ids) {
                tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
                                       bootcpu);
                return false;
        }

        ref = per_cpu_ptr(&tsc_adjust, refcpu);

        /*
         * Compare the boot value and complain if it differs in the
         * package.
         */
        if (bootval != ref->bootval)
                printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");

        /*
         * The TSC_ADJUST values in a package must be the same. If the boot
         * value on this newly upcoming CPU differs from the adjustment
         * value of the already online CPU in this package, set it to that
         * adjusted value.
         */
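        /*
         * E.g. if a sibling is already online with ref->adjusted == 150 and
         * this CPU booted with bootval == 0, this CPU is programmed to 150
         * as well and the warp test is skipped via the 'true' return below.
         */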
        if (bootval != ref->adjusted) {
                cur->adjusted = ref->adjusted;
                wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
        }
        /*
         * We have the TSCs forced to be in sync on this package. Skip sync
         * test:
         */
        return true;
}

/*
 * Entry/exit counters that make sure that both CPUs
 * run the measurement code at once:
 */
static atomic_t start_count;
static atomic_t stop_count;
static atomic_t skip_test;
static atomic_t test_runs;

/*
 * We use a raw spinlock in this exceptional case, because
 * we want to have the fastest, inlined, non-debug version
 * of a critical section, to be able to prove TSC time-warps:
 */
static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
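
/*
 * 'last_tsc' is the most recent TSC value read under sync_lock by either
 * CPU, 'max_warp' the largest observed backwards step, 'nr_warps' the
 * total number of observed warps and 'random_warps' counts warps that
 * were seen on both CPUs rather than on just one of them.
 */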
static cycles_t last_tsc;
static cycles_t max_warp;
static int nr_warps;
static int random_warps;

/*
 * TSC-warp measurement loop running on both CPUs. This is not called
 * if there is no TSC.
 */
static cycles_t check_tsc_warp(unsigned int timeout)
{
        cycles_t start, now, prev, end, cur_max_warp = 0;
        int i, cur_warps = 0;

        start = rdtsc_ordered();
        /*
         * The measurement runs for 'timeout' msecs:
         */
        end = start + (cycles_t) tsc_khz * timeout;
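        /*
         * tsc_khz is the TSC frequency in kHz, i.e. cycles per msec, so
         * with e.g. a 2 GHz TSC (tsc_khz == 2000000) and the 20 msec
         * timeout, 'end' lies 40 million cycles after 'start'.
         */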

        for (i = 0; ; i++) {
                /*
                 * We take the global lock, measure TSC, save the
                 * previous TSC that was measured (possibly on
                 * another CPU) and update the previous TSC timestamp.
                 */
                arch_spin_lock(&sync_lock);
                prev = last_tsc;
                now = rdtsc_ordered();
                last_tsc = now;
                arch_spin_unlock(&sync_lock);

                /*
                 * Be nice every now and then (and also check whether
                 * measurement is done [we also insert a 10 million
                 * loops safety exit, so we don't lock up in case the
                 * TSC readout is totally broken]):
                 */
                if (unlikely(!(i & 7))) {
                        if (now > end || i > 10000000)
                                break;
                        cpu_relax();
                        touch_nmi_watchdog();
                }

                /*
                 * Outside the critical section we can now see whether
                 * we saw a time-warp of the TSC going backwards:
                 */
                if (unlikely(prev > now)) {
                        arch_spin_lock(&sync_lock);
                        max_warp = max(max_warp, prev - now);
                        cur_max_warp = max_warp;

                        /*
                         * Check whether this bounces back and forth. Only
                         * one CPU should observe time going backwards.
                         */
                        if (cur_warps != nr_warps)
                                random_warps++;

                        nr_warps++;
                        cur_warps = nr_warps;
                        arch_spin_unlock(&sync_lock);
                }
        }
        WARN(!(now-start),
             "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
             now-start, end-start);
        return cur_max_warp;
}

/*
 * If the target CPU coming online doesn't have any of its core-siblings
 * online, a timeout of 20msec will be used for the TSC-warp measurement
 * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
 * information about this socket already (and this information grows as we
 * have more and more logical-siblings in that socket).
 *
 * Ideally we should be able to skip the TSC sync check on the other
 * core-siblings, if the first logical CPU in a socket passed the sync test.
 * But as the TSC is per-logical CPU and can potentially be modified wrongly
 * by the BIOS, a TSC sync test of smaller duration should still be able
 * to catch such errors. Also this will catch the condition where all the
 * cores in the socket don't get reset at the same time.
 */
static inline unsigned int loop_timeout(int cpu)
{
        return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
}

/*
 * Source CPU calls into this - it waits for the freshly booted
 * target CPU to arrive and then starts the measurement:
 */
void check_tsc_sync_source(int cpu)
{
        int cpus = 2;

        /*
         * No need to check if we already know that the TSC is not
         * synchronized or if we have no TSC.
         */
        if (unsynchronized_tsc())
                return;

        /*
         * Set the maximum number of test runs to
         *  1 if the CPU does not provide the TSC_ADJUST MSR
         *  3 if the MSR is available, so the target can try to adjust
         */
        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                atomic_set(&test_runs, 1);
        else
                atomic_set(&test_runs, 3);
retry:
        /*
         * Wait for the target to start or to skip the test:
         */
        while (atomic_read(&start_count) != cpus - 1) {
                if (atomic_read(&skip_test) > 0) {
                        atomic_set(&skip_test, 0);
                        return;
                }
                cpu_relax();
        }

        /*
         * Trigger the target to continue into the measurement too:
         */
        atomic_inc(&start_count);

        check_tsc_warp(loop_timeout(cpu));

        while (atomic_read(&stop_count) != cpus-1)
                cpu_relax();

        /*
         * If the test was successful set the number of runs to zero and
         * stop. If not, decrement the number of runs and check if we can
         * retry. In case of random warps no retry is attempted.
         */
        if (!nr_warps) {
                atomic_set(&test_runs, 0);

                pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
                        smp_processor_id(), cpu);

        } else if (atomic_dec_and_test(&test_runs) || random_warps) {
                /* Force it to 0 if random warps brought us here */
                atomic_set(&test_runs, 0);

                pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n",
                        smp_processor_id(), cpu);
                pr_warn("Measured %Ld cycles TSC warp between CPUs, "
                        "turning off TSC clock.\n", max_warp);
                if (random_warps)
                        pr_warn("TSC warped randomly between CPUs\n");
                mark_tsc_unstable("check_tsc_sync_source failed");
        }

        /*
         * Reset it - just in case we boot another CPU later:
         */
        atomic_set(&start_count, 0);
        random_warps = 0;
        nr_warps = 0;
        max_warp = 0;
        last_tsc = 0;

        /*
         * Let the target continue with the bootup:
         */
        atomic_inc(&stop_count);

        /*
         * Retry, if there is a chance to do so.
         */
        if (atomic_read(&test_runs) > 0)
                goto retry;
}
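
/*
 * Rendezvous between the two sides: the target registers itself by
 * incrementing start_count (or increments skip_test and bails out), the
 * source waits for that, bumps start_count again to release the target,
 * and both run check_tsc_warp(). Afterwards the target increments
 * stop_count, the source evaluates the result, resets start_count and the
 * warp statistics, and finally bumps stop_count to let the target continue
 * (and possibly retry with an adjusted TSC).
 */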

/*
 * Freshly booted CPUs call into this:
 */
void check_tsc_sync_target(void)
{
        struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
        unsigned int cpu = smp_processor_id();
        cycles_t cur_max_warp, gbl_max_warp;
        int cpus = 2;

        /* Also aborts if there is no TSC. */
        if (unsynchronized_tsc())
                return;

        /*
         * Store, verify and sanitize the TSC adjust register. If
         * successful skip the test.
         *
         * The test is also skipped when the TSC is marked reliable. This
         * is true for SoCs which have no fallback clocksource. On these
         * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
         * register might have been wrecked by the BIOS.
         */
        if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) {
                atomic_inc(&skip_test);
                return;
        }

retry:
        /*
         * Register this CPU's participation and wait for the
         * source CPU to start the measurement:
         */
        atomic_inc(&start_count);
        while (atomic_read(&start_count) != cpus)
                cpu_relax();

        cur_max_warp = check_tsc_warp(loop_timeout(cpu));

        /*
         * Store the maximum observed warp value for a potential retry:
         */
        gbl_max_warp = max_warp;

        /*
         * Ok, we are done:
         */
        atomic_inc(&stop_count);

        /*
         * Wait for the source CPU to print stuff:
         */
        while (atomic_read(&stop_count) != cpus)
                cpu_relax();

        /*
         * Reset it for the next sync test:
         */
        atomic_set(&stop_count, 0);

        /*
         * Check the number of remaining test runs. If not zero, the test
         * failed and a retry with adjusted TSC is possible. If zero the
         * test was either successful or failed terminally.
         */
        if (!atomic_read(&test_runs))
                return;

        /*
         * If the warp value of this CPU is 0, then the other CPU
         * observed time going backwards so this TSC was ahead and
         * needs to move backwards.
         */
        if (!cur_max_warp)
                cur_max_warp = -gbl_max_warp;

        /*
         * Add the result to the previous adjustment value.
         *
         * The adjustment value is slightly off by the overhead of the
         * sync mechanism (observed values are ~200 TSC cycles), but this
         * really depends on CPU, node distance and frequency. So
         * compensating for this is hard to get right. Experiments show
         * that the warp is no longer detectable when the observed warp
         * value is used. In the worst case the adjustment needs to go
         * through a 3rd run for fine tuning.
         */
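        /*
         * For example, if the source CPU observed a 500 cycle warp while
         * this CPU observed none, cur_max_warp is -500 here and the write
         * below moves this CPU's TSC 500 cycles backwards before the next
         * test run.
         */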
        cur->adjusted += cur_max_warp;

        pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
                cpu, cur_max_warp, cur->adjusted);

        wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
        goto retry;
}

#endif /* CONFIG_SMP */