
/drivers/gpu/drm/i915/i915_gpu_error.c

http://github.com/torvalds/linux
  1. /*
  2. * Copyright (c) 2008 Intel Corporation
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a
  5. * copy of this software and associated documentation files (the "Software"),
  6. * to deal in the Software without restriction, including without limitation
  7. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. * and/or sell copies of the Software, and to permit persons to whom the
  9. * Software is furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice (including the next
  12. * paragraph) shall be included in all copies or substantial portions of the
  13. * Software.
  14. *
  15. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21. * IN THE SOFTWARE.
  22. *
  23. * Authors:
  24. * Eric Anholt <eric@anholt.net>
  25. * Keith Packard <keithp@keithp.com>
  26. * Mika Kuoppala <mika.kuoppala@intel.com>
  27. *
  28. */
  29. #include <linux/ascii85.h>
  30. #include <linux/nmi.h>
  31. #include <linux/pagevec.h>
  32. #include <linux/scatterlist.h>
  33. #include <linux/utsname.h>
  34. #include <linux/zlib.h>
  35. #include <drm/drm_print.h>
  36. #include "display/intel_atomic.h"
  37. #include "display/intel_csr.h"
  38. #include "display/intel_overlay.h"
  39. #include "gem/i915_gem_context.h"
  40. #include "gem/i915_gem_lmem.h"
  41. #include "gt/intel_gt_pm.h"
  42. #include "i915_drv.h"
  43. #include "i915_gpu_error.h"
  44. #include "i915_memcpy.h"
  45. #include "i915_scatterlist.h"
  46. #define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
  47. #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
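/*
 * Allocation policy for error capture: both flag sets include __GFP_NOWARN so
 * that failed allocations degrade the dump quietly instead of spamming the
 * log. ALLOW_FAIL is for paths that may sleep, ATOMIC_MAYFAIL for paths that
 * run under spinlocks (see capture_engine()).
 */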
  48. static void __sg_set_buf(struct scatterlist *sg,
  49. void *addr, unsigned int len, loff_t it)
  50. {
  51. sg->page_link = (unsigned long)virt_to_page(addr);
  52. sg->offset = offset_in_page(addr);
  53. sg->length = len;
  54. sg->dma_address = it;
  55. }
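/*
 * The error string is accumulated in kmalloc'd chunks; each full chunk is
 * recorded as a scatterlist entry by __sg_set_buf(), with the (otherwise
 * unused) dma_address field holding the chunk's byte offset into the dump
 * so that i915_gpu_coredump_copy_to_buffer() can seek within it later.
 */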
  56. static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
  57. {
  58. if (!len)
  59. return false;
  60. if (e->bytes + len + 1 <= e->size)
  61. return true;
  62. if (e->bytes) {
  63. __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
  64. e->iter += e->bytes;
  65. e->buf = NULL;
  66. e->bytes = 0;
  67. }
  68. if (e->cur == e->end) {
  69. struct scatterlist *sgl;
  70. sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
  71. if (!sgl) {
  72. e->err = -ENOMEM;
  73. return false;
  74. }
  75. if (e->cur) {
  76. e->cur->offset = 0;
  77. e->cur->length = 0;
  78. e->cur->page_link =
  79. (unsigned long)sgl | SG_CHAIN;
  80. } else {
  81. e->sgl = sgl;
  82. }
  83. e->cur = sgl;
  84. e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
  85. }
  86. e->size = ALIGN(len + 1, SZ_64K);
  87. e->buf = kmalloc(e->size, ALLOW_FAIL);
  88. if (!e->buf) {
  89. e->size = PAGE_ALIGN(len + 1);
  90. e->buf = kmalloc(e->size, GFP_KERNEL);
  91. }
  92. if (!e->buf) {
  93. e->err = -ENOMEM;
  94. return false;
  95. }
  96. return true;
  97. }
  98. __printf(2, 0)
  99. static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
  100. const char *fmt, va_list args)
  101. {
  102. va_list ap;
  103. int len;
  104. if (e->err)
  105. return;
  106. va_copy(ap, args);
  107. len = vsnprintf(NULL, 0, fmt, ap);
  108. va_end(ap);
  109. if (len <= 0) {
  110. e->err = len;
  111. return;
  112. }
  113. if (!__i915_error_grow(e, len))
  114. return;
  115. GEM_BUG_ON(e->bytes >= e->size);
  116. len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
  117. if (len < 0) {
  118. e->err = len;
  119. return;
  120. }
  121. e->bytes += len;
  122. }
  123. static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
  124. {
  125. unsigned len;
  126. if (e->err || !str)
  127. return;
  128. len = strlen(str);
  129. if (!__i915_error_grow(e, len))
  130. return;
  131. GEM_BUG_ON(e->bytes + len > e->size);
  132. memcpy(e->buf + e->bytes, str, len);
  133. e->bytes += len;
  134. }
  135. #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
  136. #define err_puts(e, s) i915_error_puts(e, s)
  137. static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
  138. {
  139. i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
  140. }
  141. static inline struct drm_printer
  142. i915_error_printer(struct drm_i915_error_state_buf *e)
  143. {
  144. struct drm_printer p = {
  145. .printfn = __i915_printfn_error,
  146. .arg = e,
  147. };
  148. return p;
  149. }
  150. /* single threaded page allocator with a reserved stash for emergencies */
  151. static void pool_fini(struct pagevec *pv)
  152. {
  153. pagevec_release(pv);
  154. }
  155. static int pool_refill(struct pagevec *pv, gfp_t gfp)
  156. {
  157. while (pagevec_space(pv)) {
  158. struct page *p;
  159. p = alloc_page(gfp);
  160. if (!p)
  161. return -ENOMEM;
  162. pagevec_add(pv, p);
  163. }
  164. return 0;
  165. }
  166. static int pool_init(struct pagevec *pv, gfp_t gfp)
  167. {
  168. int err;
  169. pagevec_init(pv);
  170. err = pool_refill(pv, gfp);
  171. if (err)
  172. pool_fini(pv);
  173. return err;
  174. }
  175. static void *pool_alloc(struct pagevec *pv, gfp_t gfp)
  176. {
  177. struct page *p;
  178. p = alloc_page(gfp);
  179. if (!p && pagevec_count(pv))
  180. p = pv->pages[--pv->nr];
  181. return p ? page_address(p) : NULL;
  182. }
  183. static void pool_free(struct pagevec *pv, void *addr)
  184. {
  185. struct page *p = virt_to_page(addr);
  186. if (pagevec_space(pv))
  187. pagevec_add(pv, p);
  188. else
  189. __free_page(p);
  190. }
  191. #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
  192. struct i915_vma_compress {
  193. struct pagevec pool;
  194. struct z_stream_s zstream;
  195. void *tmp;
  196. };
  197. static bool compress_init(struct i915_vma_compress *c)
  198. {
  199. struct z_stream_s *zstream = &c->zstream;
  200. if (pool_init(&c->pool, ALLOW_FAIL))
  201. return false;
  202. zstream->workspace =
  203. kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
  204. ALLOW_FAIL);
  205. if (!zstream->workspace) {
  206. pool_fini(&c->pool);
  207. return false;
  208. }
  209. c->tmp = NULL;
  210. if (i915_has_memcpy_from_wc())
  211. c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
  212. return true;
  213. }
  214. static bool compress_start(struct i915_vma_compress *c)
  215. {
  216. struct z_stream_s *zstream = &c->zstream;
  217. void *workspace = zstream->workspace;
  218. memset(zstream, 0, sizeof(*zstream));
  219. zstream->workspace = workspace;
  220. return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
  221. }
  222. static void *compress_next_page(struct i915_vma_compress *c,
  223. struct i915_vma_coredump *dst)
  224. {
  225. void *page;
  226. if (dst->page_count >= dst->num_pages)
  227. return ERR_PTR(-ENOSPC);
  228. page = pool_alloc(&c->pool, ALLOW_FAIL);
  229. if (!page)
  230. return ERR_PTR(-ENOMEM);
  231. return dst->pages[dst->page_count++] = page;
  232. }
  233. static int compress_page(struct i915_vma_compress *c,
  234. void *src,
  235. struct i915_vma_coredump *dst,
  236. bool wc)
  237. {
  238. struct z_stream_s *zstream = &c->zstream;
  239. zstream->next_in = src;
  240. if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
  241. zstream->next_in = c->tmp;
  242. zstream->avail_in = PAGE_SIZE;
  243. do {
  244. if (zstream->avail_out == 0) {
  245. zstream->next_out = compress_next_page(c, dst);
  246. if (IS_ERR(zstream->next_out))
  247. return PTR_ERR(zstream->next_out);
  248. zstream->avail_out = PAGE_SIZE;
  249. }
  250. if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
  251. return -EIO;
  252. } while (zstream->avail_in);
  253. /* Fallback to uncompressed if we increase size? */
  254. if (0 && zstream->total_out > zstream->total_in)
  255. return -E2BIG;
  256. return 0;
  257. }
  258. static int compress_flush(struct i915_vma_compress *c,
  259. struct i915_vma_coredump *dst)
  260. {
  261. struct z_stream_s *zstream = &c->zstream;
  262. do {
  263. switch (zlib_deflate(zstream, Z_FINISH)) {
  264. case Z_OK: /* more space requested */
  265. zstream->next_out = compress_next_page(c, dst);
  266. if (IS_ERR(zstream->next_out))
  267. return PTR_ERR(zstream->next_out);
  268. zstream->avail_out = PAGE_SIZE;
  269. break;
  270. case Z_STREAM_END:
  271. goto end;
  272. default: /* any error */
  273. return -EIO;
  274. }
  275. } while (1);
  276. end:
  277. memset(zstream->next_out, 0, zstream->avail_out);
  278. dst->unused = zstream->avail_out;
  279. return 0;
  280. }
  281. static void compress_finish(struct i915_vma_compress *c)
  282. {
  283. zlib_deflateEnd(&c->zstream);
  284. }
  285. static void compress_fini(struct i915_vma_compress *c)
  286. {
  287. kfree(c->zstream.workspace);
  288. if (c->tmp)
  289. pool_free(&c->pool, c->tmp);
  290. pool_fini(&c->pool);
  291. }
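/*
 * The single character emitted before each object's page data tells the
 * consumer of the dump how the pages were stored: ':' for zlib-deflated
 * pages, '~' (in the !CONFIG_DRM_I915_COMPRESS_ERROR variant below) for
 * raw pages. Both encodings are then ascii85 armoured by print_error_vma().
 */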
  292. static void err_compression_marker(struct drm_i915_error_state_buf *m)
  293. {
  294. err_puts(m, ":");
  295. }
  296. #else
  297. struct i915_vma_compress {
  298. struct pagevec pool;
  299. };
  300. static bool compress_init(struct i915_vma_compress *c)
  301. {
  302. return pool_init(&c->pool, ALLOW_FAIL) == 0;
  303. }
  304. static bool compress_start(struct i915_vma_compress *c)
  305. {
  306. return true;
  307. }
  308. static int compress_page(struct i915_vma_compress *c,
  309. void *src,
  310. struct i915_vma_coredump *dst,
  311. bool wc)
  312. {
  313. void *ptr;
  314. ptr = pool_alloc(&c->pool, ALLOW_FAIL);
  315. if (!ptr)
  316. return -ENOMEM;
  317. if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
  318. memcpy(ptr, src, PAGE_SIZE);
  319. dst->pages[dst->page_count++] = ptr;
  320. return 0;
  321. }
  322. static int compress_flush(struct i915_vma_compress *c,
  323. struct i915_vma_coredump *dst)
  324. {
  325. return 0;
  326. }
  327. static void compress_finish(struct i915_vma_compress *c)
  328. {
  329. }
  330. static void compress_fini(struct i915_vma_compress *c)
  331. {
  332. pool_fini(&c->pool);
  333. }
  334. static void err_compression_marker(struct drm_i915_error_state_buf *m)
  335. {
  336. err_puts(m, "~");
  337. }
  338. #endif
  339. static void error_print_instdone(struct drm_i915_error_state_buf *m,
  340. const struct intel_engine_coredump *ee)
  341. {
  342. const struct sseu_dev_info *sseu = &RUNTIME_INFO(m->i915)->sseu;
  343. int slice;
  344. int subslice;
  345. err_printf(m, " INSTDONE: 0x%08x\n",
  346. ee->instdone.instdone);
  347. if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3)
  348. return;
  349. err_printf(m, " SC_INSTDONE: 0x%08x\n",
  350. ee->instdone.slice_common);
  351. if (INTEL_GEN(m->i915) <= 6)
  352. return;
  353. for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
  354. err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
  355. slice, subslice,
  356. ee->instdone.sampler[slice][subslice]);
  357. for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
  358. err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n",
  359. slice, subslice,
  360. ee->instdone.row[slice][subslice]);
  361. if (INTEL_GEN(m->i915) < 12)
  362. return;
  363. err_printf(m, " SC_INSTDONE_EXTRA: 0x%08x\n",
  364. ee->instdone.slice_common_extra[0]);
  365. err_printf(m, " SC_INSTDONE_EXTRA2: 0x%08x\n",
  366. ee->instdone.slice_common_extra[1]);
  367. }
  368. static void error_print_request(struct drm_i915_error_state_buf *m,
  369. const char *prefix,
  370. const struct i915_request_coredump *erq)
  371. {
  372. if (!erq->seqno)
  373. return;
  374. err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, start %08x, head %08x, tail %08x\n",
  375. prefix, erq->pid, erq->context, erq->seqno,
  376. test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
  377. &erq->flags) ? "!" : "",
  378. test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
  379. &erq->flags) ? "+" : "",
  380. erq->sched_attr.priority,
  381. erq->start, erq->head, erq->tail);
  382. }
  383. static void error_print_context(struct drm_i915_error_state_buf *m,
  384. const char *header,
  385. const struct i915_gem_context_coredump *ctx)
  386. {
  387. const u32 period = RUNTIME_INFO(m->i915)->cs_timestamp_period_ns;
  388. err_printf(m, "%s%s[%d] prio %d, guilty %d active %d, runtime total %lluns, avg %lluns\n",
  389. header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
  390. ctx->guilty, ctx->active,
  391. ctx->total_runtime * period,
  392. mul_u32_u32(ctx->avg_runtime, period));
  393. }
  394. static struct i915_vma_coredump *
  395. __find_vma(struct i915_vma_coredump *vma, const char *name)
  396. {
  397. while (vma) {
  398. if (strcmp(vma->name, name) == 0)
  399. return vma;
  400. vma = vma->next;
  401. }
  402. return NULL;
  403. }
  404. static struct i915_vma_coredump *
  405. find_batch(const struct intel_engine_coredump *ee)
  406. {
  407. return __find_vma(ee->vma, "batch");
  408. }
  409. static void error_print_engine(struct drm_i915_error_state_buf *m,
  410. const struct intel_engine_coredump *ee)
  411. {
  412. struct i915_vma_coredump *batch;
  413. int n;
  414. err_printf(m, "%s command stream:\n", ee->engine->name);
  415. err_printf(m, " CCID: 0x%08x\n", ee->ccid);
  416. err_printf(m, " START: 0x%08x\n", ee->start);
  417. err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head);
  418. err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n",
  419. ee->tail, ee->rq_post, ee->rq_tail);
  420. err_printf(m, " CTL: 0x%08x\n", ee->ctl);
  421. err_printf(m, " MODE: 0x%08x\n", ee->mode);
  422. err_printf(m, " HWS: 0x%08x\n", ee->hws);
  423. err_printf(m, " ACTHD: 0x%08x %08x\n",
  424. (u32)(ee->acthd>>32), (u32)ee->acthd);
  425. err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir);
  426. err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr);
  427. err_printf(m, " ESR: 0x%08x\n", ee->esr);
  428. error_print_instdone(m, ee);
  429. batch = find_batch(ee);
  430. if (batch) {
  431. u64 start = batch->gtt_offset;
  432. u64 end = start + batch->gtt_size;
  433. err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n",
  434. upper_32_bits(start), lower_32_bits(start),
  435. upper_32_bits(end), lower_32_bits(end));
  436. }
  437. if (INTEL_GEN(m->i915) >= 4) {
  438. err_printf(m, " BBADDR: 0x%08x_%08x\n",
  439. (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
  440. err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate);
  441. err_printf(m, " INSTPS: 0x%08x\n", ee->instps);
  442. }
  443. err_printf(m, " INSTPM: 0x%08x\n", ee->instpm);
  444. err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
  445. lower_32_bits(ee->faddr));
  446. if (INTEL_GEN(m->i915) >= 6) {
  447. err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi);
  448. err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg);
  449. }
  450. if (HAS_PPGTT(m->i915)) {
  451. err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);
  452. if (INTEL_GEN(m->i915) >= 8) {
  453. int i;
  454. for (i = 0; i < 4; i++)
  455. err_printf(m, " PDP%d: 0x%016llx\n",
  456. i, ee->vm_info.pdp[i]);
  457. } else {
  458. err_printf(m, " PP_DIR_BASE: 0x%08x\n",
  459. ee->vm_info.pp_dir_base);
  460. }
  461. }
  462. err_printf(m, " engine reset count: %u\n", ee->reset_count);
  463. for (n = 0; n < ee->num_ports; n++) {
  464. err_printf(m, " ELSP[%d]:", n);
  465. error_print_request(m, " ", &ee->execlist[n]);
  466. }
  467. error_print_context(m, " Active context: ", &ee->context);
  468. }
  469. void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
  470. {
  471. va_list args;
  472. va_start(args, f);
  473. i915_error_vprintf(e, f, args);
  474. va_end(args);
  475. }
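/*
 * Dump one captured object: a header line with its name and GGTT offset,
 * the compression marker, then every captured page ascii85-encoded (minus
 * the unused tail of the final page).
 */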
  476. static void print_error_vma(struct drm_i915_error_state_buf *m,
  477. const struct intel_engine_cs *engine,
  478. const struct i915_vma_coredump *vma)
  479. {
  480. char out[ASCII85_BUFSZ];
  481. int page;
  482. if (!vma)
  483. return;
  484. err_printf(m, "%s --- %s = 0x%08x %08x\n",
  485. engine ? engine->name : "global", vma->name,
  486. upper_32_bits(vma->gtt_offset),
  487. lower_32_bits(vma->gtt_offset));
  488. if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
  489. err_printf(m, "gtt_page_sizes = 0x%08x\n", vma->gtt_page_sizes);
  490. err_compression_marker(m);
  491. for (page = 0; page < vma->page_count; page++) {
  492. int i, len;
  493. len = PAGE_SIZE;
  494. if (page == vma->page_count - 1)
  495. len -= vma->unused;
  496. len = ascii85_encode_len(len);
  497. for (i = 0; i < len; i++)
  498. err_puts(m, ascii85_encode(vma->pages[page][i], out));
  499. }
  500. err_puts(m, "\n");
  501. }
  502. static void err_print_capabilities(struct drm_i915_error_state_buf *m,
  503. const struct intel_device_info *info,
  504. const struct intel_runtime_info *runtime,
  505. const struct intel_driver_caps *caps)
  506. {
  507. struct drm_printer p = i915_error_printer(m);
  508. intel_device_info_print_static(info, &p);
  509. intel_device_info_print_runtime(runtime, &p);
  510. intel_device_info_print_topology(&runtime->sseu, &p);
  511. intel_driver_caps_print(caps, &p);
  512. }
  513. static void err_print_params(struct drm_i915_error_state_buf *m,
  514. const struct i915_params *params)
  515. {
  516. struct drm_printer p = i915_error_printer(m);
  517. i915_params_dump(params, &p);
  518. }
  519. static void err_print_pciid(struct drm_i915_error_state_buf *m,
  520. struct drm_i915_private *i915)
  521. {
  522. struct pci_dev *pdev = i915->drm.pdev;
  523. err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
  524. err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
  525. err_printf(m, "PCI Subsystem: %04x:%04x\n",
  526. pdev->subsystem_vendor,
  527. pdev->subsystem_device);
  528. }
  529. static void err_print_uc(struct drm_i915_error_state_buf *m,
  530. const struct intel_uc_coredump *error_uc)
  531. {
  532. struct drm_printer p = i915_error_printer(m);
  533. intel_uc_fw_dump(&error_uc->guc_fw, &p);
  534. intel_uc_fw_dump(&error_uc->huc_fw, &p);
  535. print_error_vma(m, NULL, error_uc->guc_log);
  536. }
  537. static void err_free_sgl(struct scatterlist *sgl)
  538. {
  539. while (sgl) {
  540. struct scatterlist *sg;
  541. for (sg = sgl; !sg_is_chain(sg); sg++) {
  542. kfree(sg_virt(sg));
  543. if (sg_is_last(sg))
  544. break;
  545. }
  546. sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
  547. free_page((unsigned long)sgl);
  548. sgl = sg;
  549. }
  550. }
  551. static void err_print_gt(struct drm_i915_error_state_buf *m,
  552. struct intel_gt_coredump *gt)
  553. {
  554. const struct intel_engine_coredump *ee;
  555. int i;
  556. err_printf(m, "GT awake: %s\n", yesno(gt->awake));
  557. err_printf(m, "EIR: 0x%08x\n", gt->eir);
  558. err_printf(m, "IER: 0x%08x\n", gt->ier);
  559. for (i = 0; i < gt->ngtier; i++)
  560. err_printf(m, "GTIER[%d]: 0x%08x\n", i, gt->gtier[i]);
  561. err_printf(m, "PGTBL_ER: 0x%08x\n", gt->pgtbl_er);
  562. err_printf(m, "FORCEWAKE: 0x%08x\n", gt->forcewake);
  563. err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr);
  564. for (i = 0; i < gt->nfence; i++)
  565. err_printf(m, " fence[%d] = %08llx\n", i, gt->fence[i]);
  566. if (IS_GEN_RANGE(m->i915, 6, 11)) {
  567. err_printf(m, "ERROR: 0x%08x\n", gt->error);
  568. err_printf(m, "DONE_REG: 0x%08x\n", gt->done_reg);
  569. }
  570. if (INTEL_GEN(m->i915) >= 8)
  571. err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
  572. gt->fault_data1, gt->fault_data0);
  573. if (IS_GEN(m->i915, 7))
  574. err_printf(m, "ERR_INT: 0x%08x\n", gt->err_int);
  575. if (IS_GEN_RANGE(m->i915, 8, 11))
  576. err_printf(m, "GTT_CACHE_EN: 0x%08x\n", gt->gtt_cache);
  577. if (IS_GEN(m->i915, 12))
  578. err_printf(m, "AUX_ERR_DBG: 0x%08x\n", gt->aux_err);
  579. if (INTEL_GEN(m->i915) >= 12) {
  580. int i;
  581. for (i = 0; i < GEN12_SFC_DONE_MAX; i++)
  582. err_printf(m, " SFC_DONE[%d]: 0x%08x\n", i,
  583. gt->sfc_done[i]);
  584. err_printf(m, " GAM_DONE: 0x%08x\n", gt->gam_done);
  585. }
  586. for (ee = gt->engine; ee; ee = ee->next) {
  587. const struct i915_vma_coredump *vma;
  588. error_print_engine(m, ee);
  589. for (vma = ee->vma; vma; vma = vma->next)
  590. print_error_vma(m, ee->engine, vma);
  591. }
  592. if (gt->uc)
  593. err_print_uc(m, gt->uc);
  594. }
  595. static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
  596. struct i915_gpu_coredump *error)
  597. {
  598. const struct intel_engine_coredump *ee;
  599. struct timespec64 ts;
  600. if (*error->error_msg)
  601. err_printf(m, "%s\n", error->error_msg);
  602. err_printf(m, "Kernel: %s %s\n",
  603. init_utsname()->release,
  604. init_utsname()->machine);
  605. err_printf(m, "Driver: %s\n", DRIVER_DATE);
  606. ts = ktime_to_timespec64(error->time);
  607. err_printf(m, "Time: %lld s %ld us\n",
  608. (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
  609. ts = ktime_to_timespec64(error->boottime);
  610. err_printf(m, "Boottime: %lld s %ld us\n",
  611. (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
  612. ts = ktime_to_timespec64(error->uptime);
  613. err_printf(m, "Uptime: %lld s %ld us\n",
  614. (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
  615. err_printf(m, "Capture: %lu jiffies; %d ms ago\n",
  616. error->capture, jiffies_to_msecs(jiffies - error->capture));
  617. for (ee = error->gt ? error->gt->engine : NULL; ee; ee = ee->next)
  618. err_printf(m, "Active process (on ring %s): %s [%d]\n",
  619. ee->engine->name,
  620. ee->context.comm,
  621. ee->context.pid);
  622. err_printf(m, "Reset count: %u\n", error->reset_count);
  623. err_printf(m, "Suspend count: %u\n", error->suspend_count);
  624. err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
  625. err_printf(m, "Subplatform: 0x%x\n",
  626. intel_subplatform(&error->runtime_info,
  627. error->device_info.platform));
  628. err_print_pciid(m, m->i915);
  629. err_printf(m, "IOMMU enabled?: %d\n", error->iommu);
  630. if (HAS_CSR(m->i915)) {
  631. struct intel_csr *csr = &m->i915->csr;
  632. err_printf(m, "DMC loaded: %s\n",
  633. yesno(csr->dmc_payload != NULL));
  634. err_printf(m, "DMC fw version: %d.%d\n",
  635. CSR_VERSION_MAJOR(csr->version),
  636. CSR_VERSION_MINOR(csr->version));
  637. }
  638. err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
  639. err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
  640. if (error->gt)
  641. err_print_gt(m, error->gt);
  642. if (error->overlay)
  643. intel_overlay_print_error_state(m, error->overlay);
  644. if (error->display)
  645. intel_display_print_error_state(m, error->display);
  646. err_print_capabilities(m, &error->device_info, &error->runtime_info,
  647. &error->driver_caps);
  648. err_print_params(m, &error->params);
  649. }
  650. static int err_print_to_sgl(struct i915_gpu_coredump *error)
  651. {
  652. struct drm_i915_error_state_buf m;
  653. if (IS_ERR(error))
  654. return PTR_ERR(error);
  655. if (READ_ONCE(error->sgl))
  656. return 0;
  657. memset(&m, 0, sizeof(m));
  658. m.i915 = error->i915;
  659. __err_print_to_sgl(&m, error);
  660. if (m.buf) {
  661. __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
  662. m.bytes = 0;
  663. m.buf = NULL;
  664. }
  665. if (m.cur) {
  666. GEM_BUG_ON(m.end < m.cur);
  667. sg_mark_end(m.cur - 1);
  668. }
  669. GEM_BUG_ON(m.sgl && !m.cur);
  670. if (m.err) {
  671. err_free_sgl(m.sgl);
  672. return m.err;
  673. }
  674. if (cmpxchg(&error->sgl, NULL, m.sgl))
  675. err_free_sgl(m.sgl);
  676. return 0;
  677. }
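/*
 * Serve a partial read of the rendered error state. The scatterlist built by
 * err_print_to_sgl() is walked using the dma_address field as the logical
 * file offset; error->fit caches the chunk we last stopped in so that
 * sequential reads do not rescan the whole chain from the start.
 */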
  678. ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
  679. char *buf, loff_t off, size_t rem)
  680. {
  681. struct scatterlist *sg;
  682. size_t count;
  683. loff_t pos;
  684. int err;
  685. if (!error || !rem)
  686. return 0;
  687. err = err_print_to_sgl(error);
  688. if (err)
  689. return err;
  690. sg = READ_ONCE(error->fit);
  691. if (!sg || off < sg->dma_address)
  692. sg = error->sgl;
  693. if (!sg)
  694. return 0;
  695. pos = sg->dma_address;
  696. count = 0;
  697. do {
  698. size_t len, start;
  699. if (sg_is_chain(sg)) {
  700. sg = sg_chain_ptr(sg);
  701. GEM_BUG_ON(sg_is_chain(sg));
  702. }
  703. len = sg->length;
  704. if (pos + len <= off) {
  705. pos += len;
  706. continue;
  707. }
  708. start = sg->offset;
  709. if (pos < off) {
  710. GEM_BUG_ON(off - pos > len);
  711. len -= off - pos;
  712. start += off - pos;
  713. pos = off;
  714. }
  715. len = min(len, rem);
  716. GEM_BUG_ON(!len || len > sg->length);
  717. memcpy(buf, page_address(sg_page(sg)) + start, len);
  718. count += len;
  719. pos += len;
  720. buf += len;
  721. rem -= len;
  722. if (!rem) {
  723. WRITE_ONCE(error->fit, sg);
  724. break;
  725. }
  726. } while (!sg_is_last(sg++));
  727. return count;
  728. }
  729. static void i915_vma_coredump_free(struct i915_vma_coredump *vma)
  730. {
  731. while (vma) {
  732. struct i915_vma_coredump *next = vma->next;
  733. int page;
  734. for (page = 0; page < vma->page_count; page++)
  735. free_page((unsigned long)vma->pages[page]);
  736. kfree(vma);
  737. vma = next;
  738. }
  739. }
  740. static void cleanup_params(struct i915_gpu_coredump *error)
  741. {
  742. i915_params_free(&error->params);
  743. }
  744. static void cleanup_uc(struct intel_uc_coredump *uc)
  745. {
  746. kfree(uc->guc_fw.path);
  747. kfree(uc->huc_fw.path);
  748. i915_vma_coredump_free(uc->guc_log);
  749. kfree(uc);
  750. }
  751. static void cleanup_gt(struct intel_gt_coredump *gt)
  752. {
  753. while (gt->engine) {
  754. struct intel_engine_coredump *ee = gt->engine;
  755. gt->engine = ee->next;
  756. i915_vma_coredump_free(ee->vma);
  757. kfree(ee);
  758. }
  759. if (gt->uc)
  760. cleanup_uc(gt->uc);
  761. kfree(gt);
  762. }
  763. void __i915_gpu_coredump_free(struct kref *error_ref)
  764. {
  765. struct i915_gpu_coredump *error =
  766. container_of(error_ref, typeof(*error), ref);
  767. while (error->gt) {
  768. struct intel_gt_coredump *gt = error->gt;
  769. error->gt = gt->next;
  770. cleanup_gt(gt);
  771. }
  772. kfree(error->overlay);
  773. kfree(error->display);
  774. cleanup_params(error);
  775. err_free_sgl(error->sgl);
  776. kfree(error);
  777. }
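/*
 * Copy the contents of a vma into the coredump, page by page, choosing the
 * access method by where the pages live: through the reserved GGTT
 * error_capture slot (WC readback), through the lmem iomapping for local
 * memory objects, or via kmap() + clflush for ordinary system pages.
 */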
  778. static struct i915_vma_coredump *
  779. i915_vma_coredump_create(const struct intel_gt *gt,
  780. const struct i915_vma *vma,
  781. const char *name,
  782. struct i915_vma_compress *compress)
  783. {
  784. struct i915_ggtt *ggtt = gt->ggtt;
  785. const u64 slot = ggtt->error_capture.start;
  786. struct i915_vma_coredump *dst;
  787. unsigned long num_pages;
  788. struct sgt_iter iter;
  789. int ret;
  790. might_sleep();
  791. if (!vma || !vma->pages || !compress)
  792. return NULL;
  793. num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
  794. num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
  795. dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL);
  796. if (!dst)
  797. return NULL;
  798. if (!compress_start(compress)) {
  799. kfree(dst);
  800. return NULL;
  801. }
  802. strcpy(dst->name, name);
  803. dst->next = NULL;
  804. dst->gtt_offset = vma->node.start;
  805. dst->gtt_size = vma->node.size;
  806. dst->gtt_page_sizes = vma->page_sizes.gtt;
  807. dst->num_pages = num_pages;
  808. dst->page_count = 0;
  809. dst->unused = 0;
  810. ret = -EINVAL;
  811. if (drm_mm_node_allocated(&ggtt->error_capture)) {
  812. void __iomem *s;
  813. dma_addr_t dma;
  814. for_each_sgt_daddr(dma, iter, vma->pages) {
  815. ggtt->vm.insert_page(&ggtt->vm, dma, slot,
  816. I915_CACHE_NONE, 0);
  817. mb();
  818. s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
  819. ret = compress_page(compress,
  820. (void __force *)s, dst,
  821. true);
  822. io_mapping_unmap(s);
  823. if (ret)
  824. break;
  825. }
  826. } else if (i915_gem_object_is_lmem(vma->obj)) {
  827. struct intel_memory_region *mem = vma->obj->mm.region;
  828. dma_addr_t dma;
  829. for_each_sgt_daddr(dma, iter, vma->pages) {
  830. void __iomem *s;
  831. s = io_mapping_map_wc(&mem->iomap, dma, PAGE_SIZE);
  832. ret = compress_page(compress,
  833. (void __force *)s, dst,
  834. true);
  835. io_mapping_unmap(s);
  836. if (ret)
  837. break;
  838. }
  839. } else {
  840. struct page *page;
  841. for_each_sgt_page(page, iter, vma->pages) {
  842. void *s;
  843. drm_clflush_pages(&page, 1);
  844. s = kmap(page);
  845. ret = compress_page(compress, s, dst, false);
  846. kunmap(page);
  847. drm_clflush_pages(&page, 1);
  848. if (ret)
  849. break;
  850. }
  851. }
  852. if (ret || compress_flush(compress, dst)) {
  853. while (dst->page_count--)
  854. pool_free(&compress->pool, dst->pages[dst->page_count]);
  855. kfree(dst);
  856. dst = NULL;
  857. }
  858. compress_finish(compress);
  859. return dst;
  860. }
  861. static void gt_record_fences(struct intel_gt_coredump *gt)
  862. {
  863. struct i915_ggtt *ggtt = gt->_gt->ggtt;
  864. struct intel_uncore *uncore = gt->_gt->uncore;
  865. int i;
  866. if (INTEL_GEN(uncore->i915) >= 6) {
  867. for (i = 0; i < ggtt->num_fences; i++)
  868. gt->fence[i] =
  869. intel_uncore_read64(uncore,
  870. FENCE_REG_GEN6_LO(i));
  871. } else if (INTEL_GEN(uncore->i915) >= 4) {
  872. for (i = 0; i < ggtt->num_fences; i++)
  873. gt->fence[i] =
  874. intel_uncore_read64(uncore,
  875. FENCE_REG_965_LO(i));
  876. } else {
  877. for (i = 0; i < ggtt->num_fences; i++)
  878. gt->fence[i] =
  879. intel_uncore_read(uncore, FENCE_REG(i));
  880. }
  881. gt->nfence = i;
  882. }
  883. static void engine_record_registers(struct intel_engine_coredump *ee)
  884. {
  885. const struct intel_engine_cs *engine = ee->engine;
  886. struct drm_i915_private *i915 = engine->i915;
  887. if (INTEL_GEN(i915) >= 6) {
  888. ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
  889. if (INTEL_GEN(i915) >= 12)
  890. ee->fault_reg = intel_uncore_read(engine->uncore,
  891. GEN12_RING_FAULT_REG);
  892. else if (INTEL_GEN(i915) >= 8)
  893. ee->fault_reg = intel_uncore_read(engine->uncore,
  894. GEN8_RING_FAULT_REG);
  895. else
  896. ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
  897. }
  898. if (INTEL_GEN(i915) >= 4) {
  899. ee->esr = ENGINE_READ(engine, RING_ESR);
  900. ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
  901. ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
  902. ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
  903. ee->instps = ENGINE_READ(engine, RING_INSTPS);
  904. ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
  905. ee->ccid = ENGINE_READ(engine, CCID);
  906. if (INTEL_GEN(i915) >= 8) {
  907. ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
  908. ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
  909. }
  910. ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
  911. } else {
  912. ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
  913. ee->ipeir = ENGINE_READ(engine, IPEIR);
  914. ee->ipehr = ENGINE_READ(engine, IPEHR);
  915. }
  916. intel_engine_get_instdone(engine, &ee->instdone);
  917. ee->instpm = ENGINE_READ(engine, RING_INSTPM);
  918. ee->acthd = intel_engine_get_active_head(engine);
  919. ee->start = ENGINE_READ(engine, RING_START);
  920. ee->head = ENGINE_READ(engine, RING_HEAD);
  921. ee->tail = ENGINE_READ(engine, RING_TAIL);
  922. ee->ctl = ENGINE_READ(engine, RING_CTL);
  923. if (INTEL_GEN(i915) > 2)
  924. ee->mode = ENGINE_READ(engine, RING_MI_MODE);
  925. if (!HWS_NEEDS_PHYSICAL(i915)) {
  926. i915_reg_t mmio;
  927. if (IS_GEN(i915, 7)) {
  928. switch (engine->id) {
  929. default:
  930. MISSING_CASE(engine->id);
  931. /* fall through */
  932. case RCS0:
  933. mmio = RENDER_HWS_PGA_GEN7;
  934. break;
  935. case BCS0:
  936. mmio = BLT_HWS_PGA_GEN7;
  937. break;
  938. case VCS0:
  939. mmio = BSD_HWS_PGA_GEN7;
  940. break;
  941. case VECS0:
  942. mmio = VEBOX_HWS_PGA_GEN7;
  943. break;
  944. }
  945. } else if (IS_GEN(engine->i915, 6)) {
  946. mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
  947. } else {
  948. /* XXX: gen8 returns to sanity */
  949. mmio = RING_HWS_PGA(engine->mmio_base);
  950. }
  951. ee->hws = intel_uncore_read(engine->uncore, mmio);
  952. }
  953. ee->reset_count = i915_reset_engine_count(&i915->gpu_error, engine);
  954. if (HAS_PPGTT(i915)) {
  955. int i;
  956. ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);
  957. if (IS_GEN(i915, 6)) {
  958. ee->vm_info.pp_dir_base =
  959. ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
  960. } else if (IS_GEN(i915, 7)) {
  961. ee->vm_info.pp_dir_base =
  962. ENGINE_READ(engine, RING_PP_DIR_BASE);
  963. } else if (INTEL_GEN(i915) >= 8) {
  964. u32 base = engine->mmio_base;
  965. for (i = 0; i < 4; i++) {
  966. ee->vm_info.pdp[i] =
  967. intel_uncore_read(engine->uncore,
  968. GEN8_RING_PDP_UDW(base, i));
  969. ee->vm_info.pdp[i] <<= 32;
  970. ee->vm_info.pdp[i] |=
  971. intel_uncore_read(engine->uncore,
  972. GEN8_RING_PDP_LDW(base, i));
  973. }
  974. }
  975. }
  976. }
  977. static void record_request(const struct i915_request *request,
  978. struct i915_request_coredump *erq)
  979. {
  980. const struct i915_gem_context *ctx;
  981. erq->flags = request->fence.flags;
  982. erq->context = request->fence.context;
  983. erq->seqno = request->fence.seqno;
  984. erq->sched_attr = request->sched.attr;
  985. erq->start = i915_ggtt_offset(request->ring->vma);
  986. erq->head = request->head;
  987. erq->tail = request->tail;
  988. erq->pid = 0;
  989. rcu_read_lock();
  990. ctx = rcu_dereference(request->context->gem_context);
  991. if (ctx)
  992. erq->pid = pid_nr(ctx->pid);
  993. rcu_read_unlock();
  994. }
  995. static void engine_record_execlists(struct intel_engine_coredump *ee)
  996. {
  997. const struct intel_engine_execlists * const el = &ee->engine->execlists;
  998. struct i915_request * const *port = el->active;
  999. unsigned int n = 0;
  1000. while (*port)
  1001. record_request(*port++, &ee->execlist[n++]);
  1002. ee->num_ports = n;
  1003. }
  1004. static bool record_context(struct i915_gem_context_coredump *e,
  1005. const struct i915_request *rq)
  1006. {
  1007. struct i915_gem_context *ctx;
  1008. struct task_struct *task;
  1009. bool simulated;
  1010. rcu_read_lock();
  1011. ctx = rcu_dereference(rq->context->gem_context);
  1012. if (ctx && !kref_get_unless_zero(&ctx->ref))
  1013. ctx = NULL;
  1014. rcu_read_unlock();
  1015. if (!ctx)
  1016. return true;
  1017. rcu_read_lock();
  1018. task = pid_task(ctx->pid, PIDTYPE_PID);
  1019. if (task) {
  1020. strcpy(e->comm, task->comm);
  1021. e->pid = task->pid;
  1022. }
  1023. rcu_read_unlock();
  1024. e->sched_attr = ctx->sched;
  1025. e->guilty = atomic_read(&ctx->guilty_count);
  1026. e->active = atomic_read(&ctx->active_count);
  1027. e->total_runtime = rq->context->runtime.total;
  1028. e->avg_runtime = ewma_runtime_read(&rq->context->runtime.avg);
  1029. simulated = i915_gem_context_no_error_capture(ctx);
  1030. i915_gem_context_put(ctx);
  1031. return simulated;
  1032. }
  1033. struct intel_engine_capture_vma {
  1034. struct intel_engine_capture_vma *next;
  1035. struct i915_vma *vma;
  1036. char name[16];
  1037. };
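/*
 * During request capture we only take references on the interesting vma
 * (i915_active_acquire_if_busy() + i915_vma_get()); the actual page copies
 * are deferred to intel_engine_coredump_add_vma(), which runs after
 * engine->active.lock has been dropped.
 */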
  1038. static struct intel_engine_capture_vma *
  1039. capture_vma(struct intel_engine_capture_vma *next,
  1040. struct i915_vma *vma,
  1041. const char *name,
  1042. gfp_t gfp)
  1043. {
  1044. struct intel_engine_capture_vma *c;
  1045. if (!vma)
  1046. return next;
  1047. c = kmalloc(sizeof(*c), gfp);
  1048. if (!c)
  1049. return next;
  1050. if (!i915_active_acquire_if_busy(&vma->active)) {
  1051. kfree(c);
  1052. return next;
  1053. }
  1054. strcpy(c->name, name);
  1055. c->vma = i915_vma_get(vma);
  1056. c->next = next;
  1057. return c;
  1058. }
  1059. static struct intel_engine_capture_vma *
  1060. capture_user(struct intel_engine_capture_vma *capture,
  1061. const struct i915_request *rq,
  1062. gfp_t gfp)
  1063. {
  1064. struct i915_capture_list *c;
  1065. for (c = rq->capture_list; c; c = c->next)
  1066. capture = capture_vma(capture, c->vma, "user", gfp);
  1067. return capture;
  1068. }
  1069. static struct i915_vma_coredump *
  1070. capture_object(const struct intel_gt *gt,
  1071. struct drm_i915_gem_object *obj,
  1072. const char *name,
  1073. struct i915_vma_compress *compress)
  1074. {
  1075. if (obj && i915_gem_object_has_pages(obj)) {
  1076. struct i915_vma fake = {
  1077. .node = { .start = U64_MAX, .size = obj->base.size },
  1078. .size = obj->base.size,
  1079. .pages = obj->mm.pages,
  1080. .obj = obj,
  1081. };
  1082. return i915_vma_coredump_create(gt, &fake, name, compress);
  1083. } else {
  1084. return NULL;
  1085. }
  1086. }
  1087. static void add_vma(struct intel_engine_coredump *ee,
  1088. struct i915_vma_coredump *vma)
  1089. {
  1090. if (vma) {
  1091. vma->next = ee->vma;
  1092. ee->vma = vma;
  1093. }
  1094. }
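/*
 * An engine coredump is assembled in three steps:
 * intel_engine_coredump_alloc() snapshots the engine registers and execlists
 * ports, intel_engine_coredump_add_request() records the hanging request and
 * pins its vma, and intel_engine_coredump_add_vma() finally copies (and
 * optionally compresses) the pinned buffers.
 */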
  1095. struct intel_engine_coredump *
  1096. intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp)
  1097. {
  1098. struct intel_engine_coredump *ee;
  1099. ee = kzalloc(sizeof(*ee), gfp);
  1100. if (!ee)
  1101. return NULL;
  1102. ee->engine = engine;
  1103. engine_record_registers(ee);
  1104. engine_record_execlists(ee);
  1105. return ee;
  1106. }
  1107. struct intel_engine_capture_vma *
  1108. intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
  1109. struct i915_request *rq,
  1110. gfp_t gfp)
  1111. {
  1112. struct intel_engine_capture_vma *vma = NULL;
  1113. ee->simulated |= record_context(&ee->context, rq);
  1114. if (ee->simulated)
  1115. return NULL;
  1116. /*
  1117. * We need to copy these to an anonymous buffer
  1118. * as the simplest method to avoid being overwritten
  1119. * by userspace.
  1120. */
  1121. vma = capture_vma(vma, rq->batch, "batch", gfp);
  1122. vma = capture_user(vma, rq, gfp);
  1123. vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
  1124. vma = capture_vma(vma, rq->context->state, "HW context", gfp);
  1125. ee->rq_head = rq->head;
  1126. ee->rq_post = rq->postfix;
  1127. ee->rq_tail = rq->tail;
  1128. return vma;
  1129. }
  1130. void
  1131. intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
  1132. struct intel_engine_capture_vma *capture,
  1133. struct i915_vma_compress *compress)
  1134. {
  1135. const struct intel_engine_cs *engine = ee->engine;
  1136. while (capture) {
  1137. struct intel_engine_capture_vma *this = capture;
  1138. struct i915_vma *vma = this->vma;
  1139. add_vma(ee,
  1140. i915_vma_coredump_create(engine->gt,
  1141. vma, this->name,
  1142. compress));
  1143. i915_active_release(&vma->active);
  1144. i915_vma_put(vma);
  1145. capture = this->next;
  1146. kfree(this);
  1147. }
  1148. add_vma(ee,
  1149. i915_vma_coredump_create(engine->gt,
  1150. engine->status_page.vma,
  1151. "HW Status",
  1152. compress));
  1153. add_vma(ee,
  1154. i915_vma_coredump_create(engine->gt,
  1155. engine->wa_ctx.vma,
  1156. "WA context",
  1157. compress));
  1158. add_vma(ee,
  1159. capture_object(engine->gt,
  1160. engine->default_state,
  1161. "NULL context",
  1162. compress));
  1163. }
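/*
 * Find the active request on the engine and capture it. The work done under
 * engine->active.lock uses ATOMIC_MAYFAIL; the heavyweight vma copy is
 * performed afterwards, outside the spinlock.
 */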
  1164. static struct intel_engine_coredump *
  1165. capture_engine(struct intel_engine_cs *engine,
  1166. struct i915_vma_compress *compress)
  1167. {
  1168. struct intel_engine_capture_vma *capture = NULL;
  1169. struct intel_engine_coredump *ee;
  1170. struct i915_request *rq;
  1171. unsigned long flags;
  1172. ee = intel_engine_coredump_alloc(engine, GFP_KERNEL);
  1173. if (!ee)
  1174. return NULL;
  1175. spin_lock_irqsave(&engine->active.lock, flags);
  1176. rq = intel_engine_find_active_request(engine);
  1177. if (rq)
  1178. capture = intel_engine_coredump_add_request(ee, rq,
  1179. ATOMIC_MAYFAIL);
  1180. spin_unlock_irqrestore(&engine->active.lock, flags);
  1181. if (!capture) {
  1182. kfree(ee);
  1183. return NULL;
  1184. }
  1185. intel_engine_coredump_add_vma(ee, capture, compress);
  1186. return ee;
  1187. }
  1188. static void
  1189. gt_record_engines(struct intel_gt_coredump *gt,
  1190. struct i915_vma_compress *compress)
  1191. {
  1192. struct intel_engine_cs *engine;
  1193. enum intel_engine_id id;
  1194. for_each_engine(engine, gt->_gt, id) {
  1195. struct intel_engine_coredump *ee;
  1196. /* Refill our page pool before entering atomic section */
  1197. pool_refill(&compress->pool, ALLOW_FAIL);
  1198. ee = capture_engine(engine, compress);
  1199. if (!ee)
  1200. continue;
  1201. gt->simulated |= ee->simulated;
  1202. if (ee->simulated) {
  1203. kfree(ee);
  1204. continue;
  1205. }
  1206. ee->next = gt->engine;
  1207. gt->engine = ee;
  1208. }
  1209. }
  1210. static struct intel_uc_coredump *
  1211. gt_record_uc(struct intel_gt_coredump *gt,
  1212. struct i915_vma_compress *compress)
  1213. {
  1214. const struct intel_uc *uc = &gt->_gt->uc;
  1215. struct intel_uc_coredump *error_uc;
  1216. error_uc = kzalloc(sizeof(*error_uc), ALLOW_FAIL);
  1217. if (!error_uc)
  1218. return NULL;
  1219. memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));
  1220. memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw));
  1221. /* Non-default firmware paths will be specified by the modparam.
  1222. * As modparams are generally accessible from userspace, make
  1223. * explicit copies of the firmware paths.
  1224. */
  1225. error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
  1226. error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
  1227. error_uc->guc_log =
  1228. i915_vma_coredump_create(gt->_gt,
  1229. uc->guc.log.vma, "GuC log buffer",
  1230. compress);
  1231. return error_uc;
  1232. }
  1233. static void gt_capture_prepare(struct intel_gt_coredump *gt)
  1234. {
  1235. struct i915_ggtt *ggtt = gt->_gt->ggtt;
  1236. mutex_lock(&ggtt->error_mutex);
  1237. }
  1238. static void gt_capture_finish(struct intel_gt_coredump *gt)
  1239. {
  1240. struct i915_ggtt *ggtt = gt->_gt->ggtt;
  1241. if (drm_mm_node_allocated(&ggtt->error_capture))
  1242. ggtt->vm.clear_range(&ggtt->vm,
  1243. ggtt->error_capture.start,
  1244. PAGE_SIZE);
  1245. mutex_unlock(&ggtt->error_mutex);
  1246. }
  1247. /* Capture all registers which don't fit into another category. */
  1248. static void gt_record_regs(struct intel_gt_coredump *gt)
  1249. {
  1250. struct intel_uncore *uncore = gt->_gt->uncore;
  1251. struct drm_i915_private *i915 = uncore->i915;
  1252. int i;
  1253. /*
  1254. * General organization
  1255. * 1. Registers specific to a single generation
  1256. * 2. Registers which belong to multiple generations
  1257. * 3. Feature specific registers.
  1258. * 4. Everything else
  1259. * Please try to follow the order.
  1260. */
  1261. /* 1: Registers specific to a single generation */
  1262. if (IS_VALLEYVIEW(i915)) {
  1263. gt->gtier[0] = intel_uncore_read(uncore, GTIER);
  1264. gt->ier = intel_uncore_read(uncore, VLV_IER);
  1265. gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
  1266. }
  1267. if (IS_GEN(i915, 7))
  1268. gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);
  1269. if (INTEL_GEN(i915) >= 12) {
  1270. gt->fault_data0 = intel_uncore_read(uncore,
  1271. GEN12_FAULT_TLB_DATA0);
  1272. gt->fault_data1 = intel_uncore_read(uncore,
  1273. GEN12_FAULT_TLB_DATA1);
  1274. } else if (INTEL_GEN(i915) >= 8) {
  1275. gt->fault_data0 = intel_uncore_read(uncore,
  1276. GEN8_FAULT_TLB_DATA0);
  1277. gt->fault_data1 = intel_uncore_read(uncore,
  1278. GEN8_FAULT_TLB_DATA1);
  1279. }
  1280. if (IS_GEN(i915, 6)) {
  1281. gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
  1282. gt->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
  1283. gt->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
  1284. }
  1285. /* 2: Registers which belong to multiple generations */
  1286. if (INTEL_GEN(i915) >= 7)
  1287. gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);
  1288. if (INTEL_GEN(i915) >= 6) {
  1289. gt->derrmr = intel_uncore_read(uncore, DERRMR);
  1290. if (INTEL_GEN(i915) < 12) {
  1291. gt->error = intel_uncore_read(uncore, ERROR_GEN6);
  1292. gt->done_reg = intel_uncore_read(uncore, DONE_REG);
  1293. }
  1294. }
  1295. /* 3: Feature specific registers */
  1296. if (IS_GEN_RANGE(i915, 6, 7)) {
  1297. gt->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
  1298. gt->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
  1299. }
  1300. if (IS_GEN_RANGE(i915, 8, 11))
  1301. gt->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);
  1302. if (IS_GEN(i915, 12))
  1303. gt->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG);
  1304. if (INTEL_GEN(i915) >= 12) {
  1305. for (i = 0; i < GEN12_SFC_DONE_MAX; i++) {
  1306. gt->sfc_done[i] =
  1307. intel_uncore_read(uncore, GEN12_SFC_DONE(i));
  1308. }
  1309. gt->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE);
  1310. }
  1311. /* 4: Everything else */
  1312. if (INTEL_GEN(i915) >= 11) {
  1313. gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
  1314. gt->gtier[0] =
  1315. intel_uncore_read(uncore,
  1316. GEN11_RENDER_COPY_INTR_ENABLE);
  1317. gt->gtier[1] =
  1318. intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
  1319. gt->gtier[2] =
  1320. intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
  1321. gt->gtier[3] =
  1322. intel_uncore_read(uncore,
  1323. GEN11_GPM_WGBOXPERF_INTR_ENABLE);
  1324. gt->gtier[4] =
  1325. intel_uncore_read(uncore,
  1326. GEN11_CRYPTO_RSVD_INTR_ENABLE);
  1327. gt->gtier[5] =
  1328. intel_uncore_read(uncore,
  1329. GEN11_GUNIT_CSME_INTR_ENABLE);
  1330. gt->ngtier = 6;
  1331. } else if (INTEL_GEN(i915) >= 8) {
  1332. gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
  1333. for (i = 0; i < 4; i++)
  1334. gt->gtier[i] =
  1335. intel_uncore_read(uncore, GEN8_GT_IER(i));
  1336. gt->ngtier = 4;
  1337. } else if (HAS_PCH_SPLIT(i915)) {
  1338. gt->ier = intel_uncore_read(uncore, DEIER);
  1339. gt->gtier[0] = intel_uncore_read(uncore, GTIER);
  1340. gt->ngtier = 1;
  1341. } else if (IS_GEN(i915, 2)) {
  1342. gt->ier = intel_uncore_read16(uncore, GEN2_IER);
  1343. } else if (!IS_VALLEYVIEW(i915)) {
  1344. gt->ier = intel_uncore_read(uncore, GEN2_IER);
  1345. }
  1346. gt->eir = intel_uncore_read(uncore, EIR);
  1347. gt->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
  1348. }
  1349. /*
  1350. * Generate a semi-unique error code. The code is not meant to have meaning;
  1351. * its only purpose is to try to prevent false duplicate bug reports by
  1352. * grossly estimating a GPU error state.
  1353. *
  1354. * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
  1355. * the hang if we could strip the GTT offset information from it.
  1356. *
  1357. * It's only a small step better than a random number in its current form.
  1358. */
  1359. static u32 generate_ecode(const struct intel_engine_coredump *ee)
  1360. {
  1361. /*
  1362. * IPEHR would be an ideal way to detect errors, as it's the gross
  1363. * measure of "the command that hung". However, it includes some very common
  1364. * synchronization commands which almost always appear when the hang is
  1365. * strictly a client bug. Use instdone to help differentiate those cases.
  1366. */
  1367. return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
  1368. }
  1369. static const char *error_msg(struct i915_gpu_coredump *error)
  1370. {
  1371. struct intel_engine_coredump *first = NULL;
  1372. struct intel_gt_coredump *gt;
  1373. intel_engine_mask_t engines;
  1374. int len;
  1375. engines = 0;
  1376. for (gt = error->gt; gt; gt = gt->next) {
  1377. struct intel_engine_coredump *cs;
  1378. if (gt->engine && !first)
  1379. first = gt->engine;
  1380. for (cs = gt->engine; cs; cs = cs->next)
  1381. engines |= cs->engine->mask;
  1382. }
  1383. len = scnprintf(error->error_msg, sizeof(error->error_msg),
  1384. "GPU HANG: ecode %d:%x:%08x",
  1385. INTEL_GEN(error->i915), engines,
  1386. generate_ecode(first));
  1387. if (first && first->context.pid) {
  1388. /* Just show the first executing process, more is confusing */
  1389. len += scnprintf(error->error_msg + len,
  1390. sizeof(error->error_msg) - len,
  1391. ", in %s [%d]",
  1392. first->context.comm, first->context.pid);
  1393. }
  1394. return error->error_msg;
  1395. }
  1396. static void capture_gen(struct i915_gpu_coredump *error)
  1397. {
  1398. struct drm_i915_private *i915 = error->i915;
  1399. error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
  1400. error->suspended = i915->runtime_pm.suspended;
  1401. error->iommu = -1;
  1402. #ifdef CONFIG_INTEL_IOMMU
  1403. error->iommu = intel_iommu_gfx_mapped;
  1404. #endif
  1405. error->reset_count = i915_reset_count(&i915->gpu_error);
  1406. error->suspend_count = i915->suspend_count;
  1407. i915_params_copy(&error->params, &i915_modparams);
  1408. memcpy(&error->device_info,
  1409. INTEL_INFO(i915),
  1410. sizeof(error->device_info));
  1411. memcpy(&error->runtime_info,
  1412. RUNTIME_INFO(i915),
  1413. sizeof(error->runtime_info));
  1414. error->driver_caps = i915->caps;
  1415. }
  1416. struct i915_gpu_coredump *
  1417. i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
  1418. {
  1419. struct i915_gpu_coredump *error;
  1420. if (!i915_modparams.error_capture)
  1421. return NULL;
  1422. error = kzalloc(sizeof(*error), gfp);
  1423. if (!error)
  1424. return NULL;
  1425. kref_init(&error->ref);
  1426. error->i915 = i915;
  1427. error->time = ktime_get_real();
  1428. error->boottime = ktime_get_boottime();
  1429. error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
  1430. error->capture = jiffies;
  1431. capture_gen(error);
  1432. return error;
  1433. }
  1434. #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
  1435. struct intel_gt_coredump *
  1436. intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp)
  1437. {
  1438. struct intel_gt_coredump *gc;
  1439. gc = kzalloc(sizeof(*gc), gfp);
  1440. if (!gc)
  1441. return NULL;
  1442. gc->_gt = gt;
  1443. gc->awake = intel_gt_pm_is_awake(gt);
  1444. gt_record_regs(gc);
  1445. gt_record_fences(gc);
  1446. return gc;
  1447. }
  1448. struct i915_vma_compress *
  1449. i915_vma_capture_prepare(struct intel_gt_coredump *gt)
  1450. {
  1451. struct i915_vma_compress *compress;
  1452. compress = kmalloc(sizeof(*compress), ALLOW_FAIL);
  1453. if (!compress)
  1454. return NULL;
  1455. if (!compress_init(compress)) {
  1456. kfree(compress);
  1457. return NULL;
  1458. }
  1459. gt_capture_prepare(gt);
  1460. return compress;
  1461. }
  1462. void i915_vma_capture_finish(struct intel_gt_coredump *gt,
  1463. struct i915_vma_compress *compress)
  1464. {
  1465. if (!compress)
  1466. return;
  1467. gt_capture_finish(gt);
  1468. compress_fini(compress);
  1469. kfree(compress);
  1470. }
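/*
 * Build a complete snapshot of the GPU at the time of the error: device
 * parameters and capabilities, per-gt registers, fences, engines and uC
 * firmware state, plus overlay and display error state. A previously stored
 * ERR_PTR in gpu_error.first_error means capture has been disabled (see
 * i915_disable_error_state()) and is returned as-is.
 */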
  1471. struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
  1472. {
  1473. struct i915_gpu_coredump *error;
  1474. /* Check if GPU capture has been disabled */
  1475. error = READ_ONCE(i915->gpu_error.first_error);
  1476. if (IS_ERR(error))
  1477. return error;
  1478. error = i915_gpu_coredump_alloc(i915, ALLOW_FAIL);
  1479. if (!error)
  1480. return ERR_PTR(-ENOMEM);
  1481. error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL);
  1482. if (error->gt) {
  1483. struct i915_vma_compress *compress;
  1484. compress = i915_vma_capture_prepare(error->gt);
  1485. if (!compress) {
  1486. kfree(error->gt);
  1487. kfree(error);
  1488. return ERR_PTR(-ENOMEM);
  1489. }
  1490. gt_record_engines(error->gt, compress);
  1491. if (INTEL_INFO(i915)->has_gt_uc)
  1492. error->gt->uc = gt_record_uc(error->gt, compress);
  1493. i915_vma_capture_finish(error->gt, compress);
  1494. error->simulated |= error->gt->simulated;
  1495. }
  1496. error->overlay = intel_overlay_capture_error_state(i915);
  1497. error->display = intel_display_capture_error_state(i915);
  1498. return error;
  1499. }
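/*
 * Publish the captured error state. Only the first error is kept (cmpxchg on
 * gpu_error.first_error); later hangs are dropped until the stored dump is
 * cleared via i915_reset_error_state(). The dump itself is typically
 * retrieved by userspace from /sys/class/drm/card<N>/error, as advertised in
 * the log message below.
 */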
  1500. void i915_error_state_store(struct i915_gpu_coredump *error)
  1501. {
  1502. struct drm_i915_private *i915;
  1503. static bool warned;
  1504. if (IS_ERR_OR_NULL(error))
  1505. return;
  1506. i915 = error->i915;
  1507. dev_info(i915->drm.dev, "%s\n", error_msg(error));
  1508. if (error->simulated ||
  1509. cmpxchg(&i915->gpu_error.first_error, NULL, error))
  1510. return;
  1511. i915_gpu_coredump_get(error);
  1512. if (!xchg(&warned, true) &&
  1513. ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
  1514. pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
  1515. pr_info("Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/intel/issues/new.\n");
  1516. pr_info("Please see https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs for details.\n");
  1517. pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
  1518. pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n");
  1519. pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n",
  1520. i915->drm.primary->index);
  1521. }
  1522. }
  1523. /**
  1524. * i915_capture_error_state - capture an error record for later analysis
  1525. * @i915: i915 device
  1526. *
  1527. * Should be called when an error is detected (either a hang or an error
  1528. * interrupt) to capture error state from the time of the error. Fills
  1529. * out a structure which becomes available in debugfs for user level tools
  1530. * to pick up.
  1531. */
  1532. void i915_capture_error_state(struct drm_i915_private *i915)
  1533. {
  1534. struct i915_gpu_coredump *error;
  1535. error = i915_gpu_coredump(i915);
  1536. if (IS_ERR(error)) {
  1537. cmpxchg(&i915->gpu_error.first_error, NULL, error);
  1538. return;
  1539. }
  1540. i915_error_state_store(error);
  1541. i915_gpu_coredump_put(error);
  1542. }
  1543. struct i915_gpu_coredump *
  1544. i915_first_error_state(struct drm_i915_private *i915)
  1545. {
  1546. struct i915_gpu_coredump *error;
  1547. spin_lock_irq(&i915->gpu_error.lock);
  1548. error = i915->gpu_error.first_error;
  1549. if (!IS_ERR_OR_NULL(error))
  1550. i915_gpu_coredump_get(error);
  1551. spin_unlock_irq(&i915->gpu_error.lock);
  1552. return error;
  1553. }
  1554. void i915_reset_error_state(struct drm_i915_private *i915)
  1555. {
  1556. struct i915_gpu_coredump *error;
  1557. spin_lock_irq(&i915->gpu_error.lock);
  1558. error = i915->gpu_error.first_error;
  1559. if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
  1560. i915->gpu_error.first_error = NULL;
  1561. spin_unlock_irq(&i915->gpu_error.lock);
  1562. if (!IS_ERR_OR_NULL(error))
  1563. i915_gpu_coredump_put(error);
  1564. }
  1565. void i915_disable_error_state(struct drm_i915_private *i915, int err)
  1566. {
  1567. spin_lock_irq(&i915->gpu_error.lock);
  1568. if (!i915->gpu_error.first_error)
  1569. i915->gpu_error.first_error = ERR_PTR(err);
  1570. spin_unlock_irq(&i915->gpu_error.lock);
  1571. }