/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/highmem.h>
#include <linux/nmi.h>
#include <linux/pagevec.h>
#include <linux/scatterlist.h>
#include <linux/string_helpers.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_cache.h>
#include <drm/drm_print.h>

#include "display/intel_dmc.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"
#include "gem/i915_gem_lmem.h"
#include "gt/intel_engine_regs.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_mcr.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_gt_regs.h"
#include "gt/uc/intel_guc_capture.h"

#include "i915_driver.h"
#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_memcpy.h"
#include "i915_reg.h"
#include "i915_scatterlist.h"
#include "i915_utils.h"

#define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)

static void __sg_set_buf(struct scatterlist *sg,
			 void *addr, unsigned int len, loff_t it)
{
	STUB();
#ifdef notyet
	sg->page_link = (unsigned long)virt_to_page(addr);
	sg->offset = offset_in_page(addr);
	sg->length = len;
	sg->dma_address = it;
#endif
}

static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
	STUB();
	return false;
#ifdef notyet
	if (!len)
		return false;

	if (e->bytes + len + 1 <= e->size)
		return true;

	if (e->bytes) {
		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
		e->iter += e->bytes;
		e->buf = NULL;
		e->bytes = 0;
	}

	if (e->cur == e->end) {
		struct scatterlist *sgl;

		sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
		if (!sgl) {
			e->err = -ENOMEM;
			return false;
		}

		if (e->cur) {
			e->cur->offset = 0;
			e->cur->length = 0;
			e->cur->page_link =
				(unsigned long)sgl | SG_CHAIN;
		} else {
			e->sgl = sgl;
		}

		e->cur = sgl;
		e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
	}

	e->size = ALIGN(len + 1, SZ_64K);
	e->buf = kmalloc(e->size, ALLOW_FAIL);
	if (!e->buf) {
		e->size = PAGE_ALIGN(len + 1);
		e->buf =
kmalloc(e->size, GFP_KERNEL); 122 } 123 if (!e->buf) { 124 e->err = -ENOMEM; 125 return false; 126 } 127 128 return true; 129 #endif 130 } 131 132 __printf(2, 0) 133 static void i915_error_vprintf(struct drm_i915_error_state_buf *e, 134 const char *fmt, va_list args) 135 { 136 va_list ap; 137 int len; 138 139 if (e->err) 140 return; 141 142 va_copy(ap, args); 143 len = vsnprintf(NULL, 0, fmt, ap); 144 va_end(ap); 145 if (len <= 0) { 146 e->err = len; 147 return; 148 } 149 150 if (!__i915_error_grow(e, len)) 151 return; 152 153 GEM_BUG_ON(e->bytes >= e->size); 154 len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args); 155 if (len < 0) { 156 e->err = len; 157 return; 158 } 159 e->bytes += len; 160 } 161 162 static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str) 163 { 164 unsigned len; 165 166 if (e->err || !str) 167 return; 168 169 len = strlen(str); 170 if (!__i915_error_grow(e, len)) 171 return; 172 173 GEM_BUG_ON(e->bytes + len > e->size); 174 memcpy(e->buf + e->bytes, str, len); 175 e->bytes += len; 176 } 177 178 #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__) 179 #define err_puts(e, s) i915_error_puts(e, s) 180 181 static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf) 182 { 183 i915_error_vprintf(p->arg, vaf->fmt, *vaf->va); 184 } 185 186 static inline struct drm_printer 187 i915_error_printer(struct drm_i915_error_state_buf *e) 188 { 189 struct drm_printer p = { 190 .printfn = __i915_printfn_error, 191 .arg = e, 192 }; 193 return p; 194 } 195 196 /* single threaded page allocator with a reserved stash for emergencies */ 197 static void pool_fini(struct folio_batch *fbatch) 198 { 199 STUB(); 200 #ifdef notyet 201 folio_batch_release(fbatch); 202 #endif 203 } 204 205 static int pool_refill(struct folio_batch *fbatch, gfp_t gfp) 206 { 207 STUB(); 208 return -ENOSYS; 209 #ifdef notyet 210 while (folio_batch_space(fbatch)) { 211 struct folio *folio; 212 213 folio = folio_alloc(gfp, 0); 214 if (!folio) 215 return -ENOMEM; 216 217 folio_batch_add(fbatch, folio); 218 } 219 220 return 0; 221 #endif 222 } 223 224 static int intel_pool_init(struct folio_batch *fbatch, gfp_t gfp) 225 { 226 int err; 227 228 STUB(); 229 return -ENOSYS; 230 #ifdef notyet 231 folio_batch_init(fbatch); 232 233 err = pool_refill(fbatch, gfp); 234 if (err) 235 pool_fini(fbatch); 236 237 return err; 238 #endif 239 } 240 241 static void *pool_alloc(struct folio_batch *fbatch, gfp_t gfp) 242 { 243 STUB(); 244 return NULL; 245 #ifdef notyet 246 struct folio *folio; 247 248 folio = folio_alloc(gfp, 0); 249 if (!folio && folio_batch_count(fbatch)) 250 folio = fbatch->folios[--fbatch->nr]; 251 252 return folio ? 
folio_address(folio) : NULL;
#endif
}

static void pool_free(struct folio_batch *fbatch, void *addr)
{
	STUB();
#ifdef notyet
	struct folio *folio = virt_to_folio(addr);

	if (folio_batch_space(fbatch))
		folio_batch_add(fbatch, folio);
	else
		folio_put(folio);
#endif
}

#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct i915_vma_compress {
	struct folio_batch pool;
	struct z_stream_s zstream;
	void *tmp;
};

static bool compress_init(struct i915_vma_compress *c)
{
	struct z_stream_s *zstream = &c->zstream;

	if (intel_pool_init(&c->pool, ALLOW_FAIL))
		return false;

	zstream->workspace =
		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			ALLOW_FAIL);
	if (!zstream->workspace) {
		pool_fini(&c->pool);
		return false;
	}

	c->tmp = NULL;
	if (i915_has_memcpy_from_wc())
		c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);

	return true;
}

static bool compress_start(struct i915_vma_compress *c)
{
	struct z_stream_s *zstream = &c->zstream;
	void *workspace = zstream->workspace;

	memset(zstream, 0, sizeof(*zstream));
	zstream->workspace = workspace;

	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
}

static void *compress_next_page(struct i915_vma_compress *c,
				struct i915_vma_coredump *dst)
{
	void *page_addr;
	struct vm_page *page;

	page_addr = pool_alloc(&c->pool, ALLOW_FAIL);
	if (!page_addr)
		return ERR_PTR(-ENOMEM);

	page = virt_to_page(page_addr);
	list_add_tail(&page->lru, &dst->page_list);
	return page_addr;
}

static int compress_page(struct i915_vma_compress *c,
			 void *src,
			 struct i915_vma_coredump *dst,
			 bool wc)
{
	struct z_stream_s *zstream = &c->zstream;

	zstream->next_in = src;
	if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
		zstream->next_in = c->tmp;
	zstream->avail_in = PAGE_SIZE;

	do {
		if (zstream->avail_out == 0) {
			zstream->next_out = compress_next_page(c, dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
		}

		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
			return -EIO;

		cond_resched();
	} while (zstream->avail_in);

	/* Fallback to uncompressed if we increase size?
*/ 353 if (0 && zstream->total_out > zstream->total_in) 354 return -E2BIG; 355 356 return 0; 357 } 358 359 static int compress_flush(struct i915_vma_compress *c, 360 struct i915_vma_coredump *dst) 361 { 362 struct z_stream_s *zstream = &c->zstream; 363 364 do { 365 switch (zlib_deflate(zstream, Z_FINISH)) { 366 case Z_OK: /* more space requested */ 367 zstream->next_out = compress_next_page(c, dst); 368 if (IS_ERR(zstream->next_out)) 369 return PTR_ERR(zstream->next_out); 370 371 zstream->avail_out = PAGE_SIZE; 372 break; 373 374 case Z_STREAM_END: 375 goto end; 376 377 default: /* any error */ 378 return -EIO; 379 } 380 } while (1); 381 382 end: 383 memset(zstream->next_out, 0, zstream->avail_out); 384 dst->unused = zstream->avail_out; 385 return 0; 386 } 387 388 static void compress_finish(struct i915_vma_compress *c) 389 { 390 zlib_deflateEnd(&c->zstream); 391 } 392 393 static void compress_fini(struct i915_vma_compress *c) 394 { 395 kfree(c->zstream.workspace); 396 if (c->tmp) 397 pool_free(&c->pool, c->tmp); 398 pool_fini(&c->pool); 399 } 400 401 static void err_compression_marker(struct drm_i915_error_state_buf *m) 402 { 403 err_puts(m, ":"); 404 } 405 406 #else 407 408 struct i915_vma_compress { 409 struct folio_batch pool; 410 }; 411 412 static bool compress_init(struct i915_vma_compress *c) 413 { 414 return intel_pool_init(&c->pool, ALLOW_FAIL) == 0; 415 } 416 417 static bool compress_start(struct i915_vma_compress *c) 418 { 419 return true; 420 } 421 422 static int compress_page(struct i915_vma_compress *c, 423 void *src, 424 struct i915_vma_coredump *dst, 425 bool wc) 426 { 427 STUB(); 428 return -ENOSYS; 429 #ifdef notyet 430 void *ptr; 431 432 ptr = pool_alloc(&c->pool, ALLOW_FAIL); 433 if (!ptr) 434 return -ENOMEM; 435 436 if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE))) 437 memcpy(ptr, src, PAGE_SIZE); 438 list_add_tail(&virt_to_page(ptr)->lru, &dst->page_list); 439 cond_resched(); 440 441 return 0; 442 #endif 443 } 444 445 static int compress_flush(struct i915_vma_compress *c, 446 struct i915_vma_coredump *dst) 447 { 448 return 0; 449 } 450 451 static void compress_finish(struct i915_vma_compress *c) 452 { 453 } 454 455 static void compress_fini(struct i915_vma_compress *c) 456 { 457 pool_fini(&c->pool); 458 } 459 460 static void err_compression_marker(struct drm_i915_error_state_buf *m) 461 { 462 err_puts(m, "~"); 463 } 464 465 #endif 466 467 static void error_print_instdone(struct drm_i915_error_state_buf *m, 468 const struct intel_engine_coredump *ee) 469 { 470 int slice; 471 int subslice; 472 int iter; 473 474 err_printf(m, " INSTDONE: 0x%08x\n", 475 ee->instdone.instdone); 476 477 if (ee->engine->class != RENDER_CLASS || GRAPHICS_VER(m->i915) <= 3) 478 return; 479 480 err_printf(m, " SC_INSTDONE: 0x%08x\n", 481 ee->instdone.slice_common); 482 483 if (GRAPHICS_VER(m->i915) <= 6) 484 return; 485 486 for_each_ss_steering(iter, ee->engine->gt, slice, subslice) 487 err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n", 488 slice, subslice, 489 ee->instdone.sampler[slice][subslice]); 490 491 for_each_ss_steering(iter, ee->engine->gt, slice, subslice) 492 err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n", 493 slice, subslice, 494 ee->instdone.row[slice][subslice]); 495 496 if (GRAPHICS_VER(m->i915) < 12) 497 return; 498 499 if (GRAPHICS_VER_FULL(m->i915) >= IP_VER(12, 55)) { 500 for_each_ss_steering(iter, ee->engine->gt, slice, subslice) 501 err_printf(m, " GEOM_SVGUNIT_INSTDONE[%d][%d]: 0x%08x\n", 502 slice, subslice, 503 ee->instdone.geom_svg[slice][subslice]); 504 } 505 
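	/* Gen12+ also exposes extra slice-common debug registers (SC_INSTDONE_EXTRA) */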
506 err_printf(m, " SC_INSTDONE_EXTRA: 0x%08x\n", 507 ee->instdone.slice_common_extra[0]); 508 err_printf(m, " SC_INSTDONE_EXTRA2: 0x%08x\n", 509 ee->instdone.slice_common_extra[1]); 510 } 511 512 static void error_print_request(struct drm_i915_error_state_buf *m, 513 const char *prefix, 514 const struct i915_request_coredump *erq) 515 { 516 if (!erq->seqno) 517 return; 518 519 err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, head %08x, tail %08x\n", 520 prefix, erq->pid, erq->context, erq->seqno, 521 test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 522 &erq->flags) ? "!" : "", 523 test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 524 &erq->flags) ? "+" : "", 525 erq->sched_attr.priority, 526 erq->head, erq->tail); 527 } 528 529 static void error_print_context(struct drm_i915_error_state_buf *m, 530 const char *header, 531 const struct i915_gem_context_coredump *ctx) 532 { 533 err_printf(m, "%s%s[%d] prio %d, guilty %d active %d, runtime total %lluns, avg %lluns\n", 534 header, ctx->comm, ctx->pid, ctx->sched_attr.priority, 535 ctx->guilty, ctx->active, 536 ctx->total_runtime, ctx->avg_runtime); 537 err_printf(m, " context timeline seqno %u\n", ctx->hwsp_seqno); 538 } 539 540 static struct i915_vma_coredump * 541 __find_vma(struct i915_vma_coredump *vma, const char *name) 542 { 543 while (vma) { 544 if (strcmp(vma->name, name) == 0) 545 return vma; 546 vma = vma->next; 547 } 548 549 return NULL; 550 } 551 552 struct i915_vma_coredump * 553 intel_gpu_error_find_batch(const struct intel_engine_coredump *ee) 554 { 555 return __find_vma(ee->vma, "batch"); 556 } 557 558 static void error_print_engine(struct drm_i915_error_state_buf *m, 559 const struct intel_engine_coredump *ee) 560 { 561 struct i915_vma_coredump *batch; 562 int n; 563 564 err_printf(m, "%s command stream:\n", ee->engine->name); 565 err_printf(m, " CCID: 0x%08x\n", ee->ccid); 566 err_printf(m, " START: 0x%08x\n", ee->start); 567 err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); 568 err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n", 569 ee->tail, ee->rq_post, ee->rq_tail); 570 err_printf(m, " CTL: 0x%08x\n", ee->ctl); 571 err_printf(m, " MODE: 0x%08x\n", ee->mode); 572 err_printf(m, " HWS: 0x%08x\n", ee->hws); 573 err_printf(m, " ACTHD: 0x%08x %08x\n", 574 (u32)(ee->acthd>>32), (u32)ee->acthd); 575 err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir); 576 err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr); 577 err_printf(m, " ESR: 0x%08x\n", ee->esr); 578 579 error_print_instdone(m, ee); 580 581 batch = intel_gpu_error_find_batch(ee); 582 if (batch) { 583 u64 start = batch->gtt_offset; 584 u64 end = start + batch->gtt_size; 585 586 err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n", 587 upper_32_bits(start), lower_32_bits(start), 588 upper_32_bits(end), lower_32_bits(end)); 589 } 590 if (GRAPHICS_VER(m->i915) >= 4) { 591 err_printf(m, " BBADDR: 0x%08x_%08x\n", 592 (u32)(ee->bbaddr>>32), (u32)ee->bbaddr); 593 err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate); 594 err_printf(m, " INSTPS: 0x%08x\n", ee->instps); 595 } 596 err_printf(m, " INSTPM: 0x%08x\n", ee->instpm); 597 err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr), 598 lower_32_bits(ee->faddr)); 599 if (GRAPHICS_VER(m->i915) >= 6) { 600 err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi); 601 err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg); 602 } 603 if (GRAPHICS_VER(m->i915) >= 11) { 604 err_printf(m, " NOPID: 0x%08x\n", ee->nopid); 605 err_printf(m, " EXCC: 0x%08x\n", ee->excc); 606 err_printf(m, " CMD_CCTL: 0x%08x\n", ee->cmd_cctl); 607 err_printf(m, " CSCMDOP: 
0x%08x\n", ee->cscmdop); 608 err_printf(m, " CTX_SR_CTL: 0x%08x\n", ee->ctx_sr_ctl); 609 err_printf(m, " DMA_FADDR_HI: 0x%08x\n", ee->dma_faddr_hi); 610 err_printf(m, " DMA_FADDR_LO: 0x%08x\n", ee->dma_faddr_lo); 611 } 612 if (HAS_PPGTT(m->i915)) { 613 err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode); 614 615 if (GRAPHICS_VER(m->i915) >= 8) { 616 int i; 617 for (i = 0; i < 4; i++) 618 err_printf(m, " PDP%d: 0x%016llx\n", 619 i, ee->vm_info.pdp[i]); 620 } else { 621 err_printf(m, " PP_DIR_BASE: 0x%08x\n", 622 ee->vm_info.pp_dir_base); 623 } 624 } 625 626 for (n = 0; n < ee->num_ports; n++) { 627 err_printf(m, " ELSP[%d]:", n); 628 error_print_request(m, " ", &ee->execlist[n]); 629 } 630 } 631 632 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) 633 { 634 va_list args; 635 636 va_start(args, f); 637 i915_error_vprintf(e, f, args); 638 va_end(args); 639 } 640 641 void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m, 642 const struct intel_engine_cs *engine, 643 const struct i915_vma_coredump *vma) 644 { 645 STUB(); 646 #ifdef notyet 647 char out[ASCII85_BUFSZ]; 648 struct vm_page *page; 649 650 if (!vma) 651 return; 652 653 err_printf(m, "%s --- %s = 0x%08x %08x\n", 654 engine ? engine->name : "global", vma->name, 655 upper_32_bits(vma->gtt_offset), 656 lower_32_bits(vma->gtt_offset)); 657 658 if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K) 659 err_printf(m, "gtt_page_sizes = 0x%08x\n", vma->gtt_page_sizes); 660 661 err_compression_marker(m); 662 list_for_each_entry(page, &vma->page_list, lru) { 663 int i, len; 664 const u32 *addr = page_address(page); 665 666 len = PAGE_SIZE; 667 if (page == list_last_entry(&vma->page_list, typeof(*page), lru)) 668 len -= vma->unused; 669 len = ascii85_encode_len(len); 670 671 for (i = 0; i < len; i++) 672 err_puts(m, ascii85_encode(addr[i], out)); 673 } 674 err_puts(m, "\n"); 675 #endif 676 } 677 678 static void err_print_capabilities(struct drm_i915_error_state_buf *m, 679 struct i915_gpu_coredump *error) 680 { 681 struct drm_printer p = i915_error_printer(m); 682 683 intel_device_info_print(&error->device_info, &error->runtime_info, &p); 684 intel_display_device_info_print(&error->display_device_info, 685 &error->display_runtime_info, &p); 686 intel_driver_caps_print(&error->driver_caps, &p); 687 } 688 689 static void err_print_params(struct drm_i915_error_state_buf *m, 690 const struct i915_params *params) 691 { 692 struct drm_printer p = i915_error_printer(m); 693 694 i915_params_dump(params, &p); 695 } 696 697 static void err_print_pciid(struct drm_i915_error_state_buf *m, 698 struct drm_i915_private *i915) 699 { 700 struct pci_dev *pdev = i915->drm.pdev; 701 702 err_printf(m, "PCI ID: 0x%04x\n", pdev->device); 703 err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision); 704 err_printf(m, "PCI Subsystem: %04x:%04x\n", 705 pdev->subsystem_vendor, 706 pdev->subsystem_device); 707 } 708 709 static void err_print_guc_ctb(struct drm_i915_error_state_buf *m, 710 const char *name, 711 const struct intel_ctb_coredump *ctb) 712 { 713 if (!ctb->size) 714 return; 715 716 err_printf(m, "GuC %s CTB: raw: 0x%08X, 0x%08X/%08X, cached: 0x%08X/%08X, desc = 0x%08X, buf = 0x%08X x 0x%08X\n", 717 name, ctb->raw_status, ctb->raw_head, ctb->raw_tail, 718 ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size); 719 } 720 721 static void err_print_uc(struct drm_i915_error_state_buf *m, 722 const struct intel_uc_coredump *error_uc) 723 { 724 struct drm_printer p = i915_error_printer(m); 725 726 
intel_uc_fw_dump(&error_uc->guc_fw, &p);
	intel_uc_fw_dump(&error_uc->huc_fw, &p);
	err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp);
	intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log);
	err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence);
	err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0);
	err_print_guc_ctb(m, "Recv", error_uc->guc.ctb + 1);
	intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_ctb);
}

static void err_free_sgl(struct scatterlist *sgl)
{
	STUB();
#ifdef notyet
	while (sgl) {
		struct scatterlist *sg;

		for (sg = sgl; !sg_is_chain(sg); sg++) {
			kfree(sg_virt(sg));
			if (sg_is_last(sg))
				break;
		}

		sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
		free_page((unsigned long)sgl);
		sgl = sg;
	}
#endif
}

static void err_print_gt_info(struct drm_i915_error_state_buf *m,
			      struct intel_gt_coredump *gt)
{
	struct drm_printer p = i915_error_printer(m);

	intel_gt_info_print(&gt->info, &p);
	intel_sseu_print_topology(gt->_gt->i915, &gt->info.sseu, &p);
}

static void err_print_gt_display(struct drm_i915_error_state_buf *m,
				 struct intel_gt_coredump *gt)
{
	err_printf(m, "IER: 0x%08x\n", gt->ier);
	err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr);
}

static void err_print_gt_global_nonguc(struct drm_i915_error_state_buf *m,
				       struct intel_gt_coredump *gt)
{
	int i;

	err_printf(m, "GT awake: %s\n", str_yes_no(gt->awake));
	err_printf(m, "CS timestamp frequency: %u Hz, %d ns\n",
		   gt->clock_frequency, gt->clock_period_ns);
	err_printf(m, "EIR: 0x%08x\n", gt->eir);
	err_printf(m, "PGTBL_ER: 0x%08x\n", gt->pgtbl_er);

	for (i = 0; i < gt->ngtier; i++)
		err_printf(m, "GTIER[%d]: 0x%08x\n", i, gt->gtier[i]);
}

static void err_print_gt_global(struct drm_i915_error_state_buf *m,
				struct intel_gt_coredump *gt)
{
	err_printf(m, "FORCEWAKE: 0x%08x\n", gt->forcewake);

	if (IS_GRAPHICS_VER(m->i915, 6, 11)) {
		err_printf(m, "ERROR: 0x%08x\n", gt->error);
		err_printf(m, "DONE_REG: 0x%08x\n", gt->done_reg);
	}

	if (GRAPHICS_VER(m->i915) >= 8)
		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
			   gt->fault_data1, gt->fault_data0);

	if (GRAPHICS_VER(m->i915) == 7)
		err_printf(m, "ERR_INT: 0x%08x\n", gt->err_int);

	if (IS_GRAPHICS_VER(m->i915, 8, 11))
		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", gt->gtt_cache);

	if (GRAPHICS_VER(m->i915) == 12)
		err_printf(m, "AUX_ERR_DBG: 0x%08x\n", gt->aux_err);

	if (GRAPHICS_VER(m->i915) >= 12) {
		int i;

		for (i = 0; i < I915_MAX_SFC; i++) {
			/*
			 * SFC_DONE resides in the VD forcewake domain, so it
			 * only exists if the corresponding VCS engine is
			 * present.
818 */ 819 if ((gt->_gt->info.sfc_mask & BIT(i)) == 0 || 820 !HAS_ENGINE(gt->_gt, _VCS(i * 2))) 821 continue; 822 823 err_printf(m, " SFC_DONE[%d]: 0x%08x\n", i, 824 gt->sfc_done[i]); 825 } 826 827 err_printf(m, " GAM_DONE: 0x%08x\n", gt->gam_done); 828 } 829 } 830 831 static void err_print_gt_fences(struct drm_i915_error_state_buf *m, 832 struct intel_gt_coredump *gt) 833 { 834 int i; 835 836 for (i = 0; i < gt->nfence; i++) 837 err_printf(m, " fence[%d] = %08llx\n", i, gt->fence[i]); 838 } 839 840 static void err_print_gt_engines(struct drm_i915_error_state_buf *m, 841 struct intel_gt_coredump *gt) 842 { 843 const struct intel_engine_coredump *ee; 844 845 for (ee = gt->engine; ee; ee = ee->next) { 846 const struct i915_vma_coredump *vma; 847 848 if (gt->uc && gt->uc->guc.is_guc_capture) { 849 if (ee->guc_capture_node) 850 intel_guc_capture_print_engine_node(m, ee); 851 else 852 err_printf(m, " Missing GuC capture node for %s\n", 853 ee->engine->name); 854 } else { 855 error_print_engine(m, ee); 856 } 857 858 err_printf(m, " hung: %u\n", ee->hung); 859 err_printf(m, " engine reset count: %u\n", ee->reset_count); 860 error_print_context(m, " Active context: ", &ee->context); 861 862 for (vma = ee->vma; vma; vma = vma->next) 863 intel_gpu_error_print_vma(m, ee->engine, vma); 864 } 865 866 } 867 868 static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, 869 struct i915_gpu_coredump *error) 870 { 871 const struct intel_engine_coredump *ee; 872 struct timespec64 ts; 873 874 if (*error->error_msg) 875 err_printf(m, "%s\n", error->error_msg); 876 #ifdef __linux__ 877 err_printf(m, "Kernel: %s %s\n", 878 init_utsname()->release, 879 init_utsname()->machine); 880 #else 881 extern char machine[]; 882 err_printf(m, "Kernel: %s %s\n", 883 osrelease, 884 machine); 885 #endif 886 err_printf(m, "Driver: %s\n", DRIVER_DATE); 887 ts = ktime_to_timespec64(error->time); 888 err_printf(m, "Time: %lld s %ld us\n", 889 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 890 ts = ktime_to_timespec64(error->boottime); 891 err_printf(m, "Boottime: %lld s %ld us\n", 892 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 893 ts = ktime_to_timespec64(error->uptime); 894 err_printf(m, "Uptime: %lld s %ld us\n", 895 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 896 err_printf(m, "Capture: %lu jiffies; %d ms ago\n", 897 error->capture, jiffies_to_msecs(jiffies - error->capture)); 898 899 for (ee = error->gt ? 
error->gt->engine : NULL; ee; ee = ee->next) 900 err_printf(m, "Active process (on ring %s): %s [%d]\n", 901 ee->engine->name, 902 ee->context.comm, 903 ee->context.pid); 904 905 err_printf(m, "Reset count: %u\n", error->reset_count); 906 err_printf(m, "Suspend count: %u\n", error->suspend_count); 907 err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); 908 err_printf(m, "Subplatform: 0x%x\n", 909 intel_subplatform(&error->runtime_info, 910 error->device_info.platform)); 911 err_print_pciid(m, m->i915); 912 913 err_printf(m, "IOMMU enabled?: %d\n", error->iommu); 914 915 intel_dmc_print_error_state(m, m->i915); 916 917 err_printf(m, "RPM wakelock: %s\n", str_yes_no(error->wakelock)); 918 err_printf(m, "PM suspended: %s\n", str_yes_no(error->suspended)); 919 920 if (error->gt) { 921 bool print_guc_capture = false; 922 923 if (error->gt->uc && error->gt->uc->guc.is_guc_capture) 924 print_guc_capture = true; 925 926 err_print_gt_display(m, error->gt); 927 err_print_gt_global_nonguc(m, error->gt); 928 err_print_gt_fences(m, error->gt); 929 930 /* 931 * GuC dumped global, eng-class and eng-instance registers together 932 * as part of engine state dump so we print in err_print_gt_engines 933 */ 934 if (!print_guc_capture) 935 err_print_gt_global(m, error->gt); 936 937 err_print_gt_engines(m, error->gt); 938 939 if (error->gt->uc) 940 err_print_uc(m, error->gt->uc); 941 942 err_print_gt_info(m, error->gt); 943 } 944 945 if (error->overlay) 946 intel_overlay_print_error_state(m, error->overlay); 947 948 err_print_capabilities(m, error); 949 err_print_params(m, &error->params); 950 } 951 952 static int err_print_to_sgl(struct i915_gpu_coredump *error) 953 { 954 struct drm_i915_error_state_buf m; 955 956 if (IS_ERR(error)) 957 return PTR_ERR(error); 958 959 if (READ_ONCE(error->sgl)) 960 return 0; 961 962 memset(&m, 0, sizeof(m)); 963 m.i915 = error->i915; 964 965 __err_print_to_sgl(&m, error); 966 967 if (m.buf) { 968 __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter); 969 m.bytes = 0; 970 m.buf = NULL; 971 } 972 if (m.cur) { 973 GEM_BUG_ON(m.end < m.cur); 974 sg_mark_end(m.cur - 1); 975 } 976 GEM_BUG_ON(m.sgl && !m.cur); 977 978 if (m.err) { 979 err_free_sgl(m.sgl); 980 return m.err; 981 } 982 983 if (cmpxchg(&error->sgl, NULL, m.sgl)) 984 err_free_sgl(m.sgl); 985 986 return 0; 987 } 988 989 ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error, 990 char *buf, loff_t off, size_t rem) 991 { 992 STUB(); 993 return -ENOSYS; 994 #ifdef notyet 995 struct scatterlist *sg; 996 size_t count; 997 loff_t pos; 998 int err; 999 1000 if (!error || !rem) 1001 return 0; 1002 1003 err = err_print_to_sgl(error); 1004 if (err) 1005 return err; 1006 1007 sg = READ_ONCE(error->fit); 1008 if (!sg || off < sg->dma_address) 1009 sg = error->sgl; 1010 if (!sg) 1011 return 0; 1012 1013 pos = sg->dma_address; 1014 count = 0; 1015 do { 1016 size_t len, start; 1017 1018 if (sg_is_chain(sg)) { 1019 sg = sg_chain_ptr(sg); 1020 GEM_BUG_ON(sg_is_chain(sg)); 1021 } 1022 1023 len = sg->length; 1024 if (pos + len <= off) { 1025 pos += len; 1026 continue; 1027 } 1028 1029 start = sg->offset; 1030 if (pos < off) { 1031 GEM_BUG_ON(off - pos > len); 1032 len -= off - pos; 1033 start += off - pos; 1034 pos = off; 1035 } 1036 1037 len = min(len, rem); 1038 GEM_BUG_ON(!len || len > sg->length); 1039 1040 memcpy(buf, page_address(sg_page(sg)) + start, len); 1041 1042 count += len; 1043 pos += len; 1044 1045 buf += len; 1046 rem -= len; 1047 if (!rem) { 1048 WRITE_ONCE(error->fit, sg); 1049 break; 
1050 } 1051 } while (!sg_is_last(sg++)); 1052 1053 return count; 1054 #endif 1055 } 1056 1057 static void i915_vma_coredump_free(struct i915_vma_coredump *vma) 1058 { 1059 STUB(); 1060 #ifdef notyet 1061 while (vma) { 1062 struct i915_vma_coredump *next = vma->next; 1063 struct vm_page *page, *n; 1064 1065 list_for_each_entry_safe(page, n, &vma->page_list, lru) { 1066 list_del_init(&page->lru); 1067 __free_page(page); 1068 } 1069 1070 kfree(vma); 1071 vma = next; 1072 } 1073 #endif 1074 } 1075 1076 static void cleanup_params(struct i915_gpu_coredump *error) 1077 { 1078 i915_params_free(&error->params); 1079 } 1080 1081 static void cleanup_uc(struct intel_uc_coredump *uc) 1082 { 1083 kfree(uc->guc_fw.file_selected.path); 1084 kfree(uc->huc_fw.file_selected.path); 1085 kfree(uc->guc_fw.file_wanted.path); 1086 kfree(uc->huc_fw.file_wanted.path); 1087 i915_vma_coredump_free(uc->guc.vma_log); 1088 i915_vma_coredump_free(uc->guc.vma_ctb); 1089 1090 kfree(uc); 1091 } 1092 1093 static void cleanup_gt(struct intel_gt_coredump *gt) 1094 { 1095 while (gt->engine) { 1096 struct intel_engine_coredump *ee = gt->engine; 1097 1098 gt->engine = ee->next; 1099 1100 i915_vma_coredump_free(ee->vma); 1101 intel_guc_capture_free_node(ee); 1102 kfree(ee); 1103 } 1104 1105 if (gt->uc) 1106 cleanup_uc(gt->uc); 1107 1108 kfree(gt); 1109 } 1110 1111 void __i915_gpu_coredump_free(struct kref *error_ref) 1112 { 1113 struct i915_gpu_coredump *error = 1114 container_of(error_ref, typeof(*error), ref); 1115 1116 while (error->gt) { 1117 struct intel_gt_coredump *gt = error->gt; 1118 1119 error->gt = gt->next; 1120 cleanup_gt(gt); 1121 } 1122 1123 kfree(error->overlay); 1124 1125 cleanup_params(error); 1126 1127 err_free_sgl(error->sgl); 1128 kfree(error); 1129 } 1130 1131 static struct i915_vma_coredump * 1132 i915_vma_coredump_create(const struct intel_gt *gt, 1133 const struct i915_vma_resource *vma_res, 1134 struct i915_vma_compress *compress, 1135 const char *name) 1136 1137 { 1138 STUB(); 1139 return NULL; 1140 #ifdef notyet 1141 struct i915_ggtt *ggtt = gt->ggtt; 1142 const u64 slot = ggtt->error_capture.start; 1143 struct i915_vma_coredump *dst; 1144 struct sgt_iter iter; 1145 int ret; 1146 1147 might_sleep(); 1148 1149 if (!vma_res || !vma_res->bi.pages || !compress) 1150 return NULL; 1151 1152 dst = kmalloc(sizeof(*dst), ALLOW_FAIL); 1153 if (!dst) 1154 return NULL; 1155 1156 if (!compress_start(compress)) { 1157 kfree(dst); 1158 return NULL; 1159 } 1160 1161 INIT_LIST_HEAD(&dst->page_list); 1162 strlcpy(dst->name, name, sizeof(dst->name)); 1163 dst->next = NULL; 1164 1165 dst->gtt_offset = vma_res->start; 1166 dst->gtt_size = vma_res->node_size; 1167 dst->gtt_page_sizes = vma_res->page_sizes_gtt; 1168 dst->unused = 0; 1169 1170 ret = -EINVAL; 1171 if (drm_mm_node_allocated(&ggtt->error_capture)) { 1172 void __iomem *s; 1173 dma_addr_t dma; 1174 1175 for_each_sgt_daddr(dma, iter, vma_res->bi.pages) { 1176 mutex_lock(&ggtt->error_mutex); 1177 if (ggtt->vm.raw_insert_page) 1178 ggtt->vm.raw_insert_page(&ggtt->vm, dma, slot, 1179 i915_gem_get_pat_index(gt->i915, 1180 I915_CACHE_NONE), 1181 0); 1182 else 1183 ggtt->vm.insert_page(&ggtt->vm, dma, slot, 1184 i915_gem_get_pat_index(gt->i915, 1185 I915_CACHE_NONE), 1186 0); 1187 mb(); 1188 1189 s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE); 1190 ret = compress_page(compress, 1191 (void __force *)s, dst, 1192 true); 1193 io_mapping_unmap(s); 1194 1195 mb(); 1196 ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); 1197 mutex_unlock(&ggtt->error_mutex); 1198 if (ret) 
1199 break; 1200 } 1201 } else if (vma_res->bi.lmem) { 1202 struct intel_memory_region *mem = vma_res->mr; 1203 dma_addr_t dma; 1204 1205 for_each_sgt_daddr(dma, iter, vma_res->bi.pages) { 1206 dma_addr_t offset = dma - mem->region.start; 1207 void __iomem *s; 1208 1209 if (offset + PAGE_SIZE > resource_size(&mem->io)) { 1210 ret = -EINVAL; 1211 break; 1212 } 1213 1214 s = io_mapping_map_wc(&mem->iomap, offset, PAGE_SIZE); 1215 ret = compress_page(compress, 1216 (void __force *)s, dst, 1217 true); 1218 io_mapping_unmap(s); 1219 if (ret) 1220 break; 1221 } 1222 } else { 1223 struct vm_page *page; 1224 1225 for_each_sgt_page(page, iter, vma_res->bi.pages) { 1226 void *s; 1227 1228 drm_clflush_pages(&page, 1); 1229 1230 s = kmap_local_page(page); 1231 ret = compress_page(compress, s, dst, false); 1232 kunmap_local(s); 1233 1234 drm_clflush_pages(&page, 1); 1235 1236 if (ret) 1237 break; 1238 } 1239 } 1240 1241 if (ret || compress_flush(compress, dst)) { 1242 struct vm_page *page, *n; 1243 1244 list_for_each_entry_safe_reverse(page, n, &dst->page_list, lru) { 1245 list_del_init(&page->lru); 1246 pool_free(&compress->pool, page_address(page)); 1247 } 1248 1249 kfree(dst); 1250 dst = NULL; 1251 } 1252 compress_finish(compress); 1253 1254 return dst; 1255 #endif 1256 } 1257 1258 static void gt_record_fences(struct intel_gt_coredump *gt) 1259 { 1260 struct i915_ggtt *ggtt = gt->_gt->ggtt; 1261 struct intel_uncore *uncore = gt->_gt->uncore; 1262 int i; 1263 1264 if (GRAPHICS_VER(uncore->i915) >= 6) { 1265 for (i = 0; i < ggtt->num_fences; i++) 1266 gt->fence[i] = 1267 intel_uncore_read64(uncore, 1268 FENCE_REG_GEN6_LO(i)); 1269 } else if (GRAPHICS_VER(uncore->i915) >= 4) { 1270 for (i = 0; i < ggtt->num_fences; i++) 1271 gt->fence[i] = 1272 intel_uncore_read64(uncore, 1273 FENCE_REG_965_LO(i)); 1274 } else { 1275 for (i = 0; i < ggtt->num_fences; i++) 1276 gt->fence[i] = 1277 intel_uncore_read(uncore, FENCE_REG(i)); 1278 } 1279 gt->nfence = i; 1280 } 1281 1282 static void engine_record_registers(struct intel_engine_coredump *ee) 1283 { 1284 const struct intel_engine_cs *engine = ee->engine; 1285 struct drm_i915_private *i915 = engine->i915; 1286 1287 if (GRAPHICS_VER(i915) >= 6) { 1288 ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); 1289 1290 if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) 1291 ee->fault_reg = intel_gt_mcr_read_any(engine->gt, 1292 XEHP_RING_FAULT_REG); 1293 else if (GRAPHICS_VER(i915) >= 12) 1294 ee->fault_reg = intel_uncore_read(engine->uncore, 1295 GEN12_RING_FAULT_REG); 1296 else if (GRAPHICS_VER(i915) >= 8) 1297 ee->fault_reg = intel_uncore_read(engine->uncore, 1298 GEN8_RING_FAULT_REG); 1299 else 1300 ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine); 1301 } 1302 1303 if (GRAPHICS_VER(i915) >= 4) { 1304 ee->esr = ENGINE_READ(engine, RING_ESR); 1305 ee->faddr = ENGINE_READ(engine, RING_DMA_FADD); 1306 ee->ipeir = ENGINE_READ(engine, RING_IPEIR); 1307 ee->ipehr = ENGINE_READ(engine, RING_IPEHR); 1308 ee->instps = ENGINE_READ(engine, RING_INSTPS); 1309 ee->bbaddr = ENGINE_READ(engine, RING_BBADDR); 1310 ee->ccid = ENGINE_READ(engine, CCID); 1311 if (GRAPHICS_VER(i915) >= 8) { 1312 ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32; 1313 ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32; 1314 } 1315 ee->bbstate = ENGINE_READ(engine, RING_BBSTATE); 1316 } else { 1317 ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX); 1318 ee->ipeir = ENGINE_READ(engine, IPEIR); 1319 ee->ipehr = ENGINE_READ(engine, IPEHR); 1320 } 1321 1322 if (GRAPHICS_VER(i915) >= 11) { 1323 
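		/* Additional command streamer state exposed on gen11+ */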
ee->cmd_cctl = ENGINE_READ(engine, RING_CMD_CCTL); 1324 ee->cscmdop = ENGINE_READ(engine, RING_CSCMDOP); 1325 ee->ctx_sr_ctl = ENGINE_READ(engine, RING_CTX_SR_CTL); 1326 ee->dma_faddr_hi = ENGINE_READ(engine, RING_DMA_FADD_UDW); 1327 ee->dma_faddr_lo = ENGINE_READ(engine, RING_DMA_FADD); 1328 ee->nopid = ENGINE_READ(engine, RING_NOPID); 1329 ee->excc = ENGINE_READ(engine, RING_EXCC); 1330 } 1331 1332 intel_engine_get_instdone(engine, &ee->instdone); 1333 1334 ee->instpm = ENGINE_READ(engine, RING_INSTPM); 1335 ee->acthd = intel_engine_get_active_head(engine); 1336 ee->start = ENGINE_READ(engine, RING_START); 1337 ee->head = ENGINE_READ(engine, RING_HEAD); 1338 ee->tail = ENGINE_READ(engine, RING_TAIL); 1339 ee->ctl = ENGINE_READ(engine, RING_CTL); 1340 if (GRAPHICS_VER(i915) > 2) 1341 ee->mode = ENGINE_READ(engine, RING_MI_MODE); 1342 1343 if (!HWS_NEEDS_PHYSICAL(i915)) { 1344 i915_reg_t mmio; 1345 1346 if (GRAPHICS_VER(i915) == 7) { 1347 switch (engine->id) { 1348 default: 1349 MISSING_CASE(engine->id); 1350 fallthrough; 1351 case RCS0: 1352 mmio = RENDER_HWS_PGA_GEN7; 1353 break; 1354 case BCS0: 1355 mmio = BLT_HWS_PGA_GEN7; 1356 break; 1357 case VCS0: 1358 mmio = BSD_HWS_PGA_GEN7; 1359 break; 1360 case VECS0: 1361 mmio = VEBOX_HWS_PGA_GEN7; 1362 break; 1363 } 1364 } else if (GRAPHICS_VER(engine->i915) == 6) { 1365 mmio = RING_HWS_PGA_GEN6(engine->mmio_base); 1366 } else { 1367 /* XXX: gen8 returns to sanity */ 1368 mmio = RING_HWS_PGA(engine->mmio_base); 1369 } 1370 1371 ee->hws = intel_uncore_read(engine->uncore, mmio); 1372 } 1373 1374 ee->reset_count = i915_reset_engine_count(&i915->gpu_error, engine); 1375 1376 if (HAS_PPGTT(i915)) { 1377 int i; 1378 1379 ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7); 1380 1381 if (GRAPHICS_VER(i915) == 6) { 1382 ee->vm_info.pp_dir_base = 1383 ENGINE_READ(engine, RING_PP_DIR_BASE_READ); 1384 } else if (GRAPHICS_VER(i915) == 7) { 1385 ee->vm_info.pp_dir_base = 1386 ENGINE_READ(engine, RING_PP_DIR_BASE); 1387 } else if (GRAPHICS_VER(i915) >= 8) { 1388 u32 base = engine->mmio_base; 1389 1390 for (i = 0; i < 4; i++) { 1391 ee->vm_info.pdp[i] = 1392 intel_uncore_read(engine->uncore, 1393 GEN8_RING_PDP_UDW(base, i)); 1394 ee->vm_info.pdp[i] <<= 32; 1395 ee->vm_info.pdp[i] |= 1396 intel_uncore_read(engine->uncore, 1397 GEN8_RING_PDP_LDW(base, i)); 1398 } 1399 } 1400 } 1401 } 1402 1403 static void record_request(const struct i915_request *request, 1404 struct i915_request_coredump *erq) 1405 { 1406 erq->flags = request->fence.flags; 1407 erq->context = request->fence.context; 1408 erq->seqno = request->fence.seqno; 1409 erq->sched_attr = request->sched.attr; 1410 erq->head = request->head; 1411 erq->tail = request->tail; 1412 1413 erq->pid = 0; 1414 rcu_read_lock(); 1415 if (!intel_context_is_closed(request->context)) { 1416 const struct i915_gem_context *ctx; 1417 1418 ctx = rcu_dereference(request->context->gem_context); 1419 if (ctx) 1420 #ifdef __linux__ 1421 erq->pid = pid_nr(ctx->pid); 1422 #else 1423 erq->pid = ctx->pid; 1424 #endif 1425 } 1426 rcu_read_unlock(); 1427 } 1428 1429 static void engine_record_execlists(struct intel_engine_coredump *ee) 1430 { 1431 const struct intel_engine_execlists * const el = &ee->engine->execlists; 1432 struct i915_request * const *port = el->active; 1433 unsigned int n = 0; 1434 1435 while (*port) 1436 record_request(*port++, &ee->execlist[n++]); 1437 1438 ee->num_ports = n; 1439 } 1440 1441 static bool record_context(struct i915_gem_context_coredump *e, 1442 struct intel_context *ce) 1443 { 1444 struct 
i915_gem_context *ctx;
	struct task_struct *task;
	bool simulated;

	rcu_read_lock();
	ctx = rcu_dereference(ce->gem_context);
	if (ctx && !kref_get_unless_zero(&ctx->ref))
		ctx = NULL;
	rcu_read_unlock();
	if (!ctx)
		return true;

#ifdef __linux__
	rcu_read_lock();
	task = pid_task(ctx->pid, PIDTYPE_PID);
	if (task) {
		strcpy(e->comm, task->comm);
		e->pid = task->pid;
	}
	rcu_read_unlock();
#endif

	e->sched_attr = ctx->sched;
	e->guilty = atomic_read(&ctx->guilty_count);
	e->active = atomic_read(&ctx->active_count);
	e->hwsp_seqno = (ce->timeline && ce->timeline->hwsp_seqno) ?
				*ce->timeline->hwsp_seqno : ~0U;

	e->total_runtime = intel_context_get_total_runtime_ns(ce);
	e->avg_runtime = intel_context_get_avg_runtime_ns(ce);

	simulated = i915_gem_context_no_error_capture(ctx);

	i915_gem_context_put(ctx);
	return simulated;
}

struct intel_engine_capture_vma {
	struct intel_engine_capture_vma *next;
	struct i915_vma_resource *vma_res;
	char name[16];
	bool lockdep_cookie;
};

static struct intel_engine_capture_vma *
capture_vma_snapshot(struct intel_engine_capture_vma *next,
		     struct i915_vma_resource *vma_res,
		     gfp_t gfp, const char *name)
{
	struct intel_engine_capture_vma *c;

	if (!vma_res)
		return next;

	c = kmalloc(sizeof(*c), gfp);
	if (!c)
		return next;

	if (!i915_vma_resource_hold(vma_res, &c->lockdep_cookie)) {
		kfree(c);
		return next;
	}

	strlcpy(c->name, name, sizeof(c->name));
	c->vma_res = i915_vma_resource_get(vma_res);

	c->next = next;
	return c;
}

static struct intel_engine_capture_vma *
capture_vma(struct intel_engine_capture_vma *next,
	    struct i915_vma *vma,
	    const char *name,
	    gfp_t gfp)
{
	if (!vma)
		return next;

	/*
	 * If the vma isn't pinned, then the vma should be snapshotted
	 * to a struct i915_vma_snapshot at command submission time.
	 * Not here.
1527 */ 1528 if (GEM_WARN_ON(!i915_vma_is_pinned(vma))) 1529 return next; 1530 1531 next = capture_vma_snapshot(next, vma->resource, gfp, name); 1532 1533 return next; 1534 } 1535 1536 static struct intel_engine_capture_vma * 1537 capture_user(struct intel_engine_capture_vma *capture, 1538 const struct i915_request *rq, 1539 gfp_t gfp) 1540 { 1541 struct i915_capture_list *c; 1542 1543 for (c = rq->capture_list; c; c = c->next) 1544 capture = capture_vma_snapshot(capture, c->vma_res, gfp, 1545 "user"); 1546 1547 return capture; 1548 } 1549 1550 static void add_vma(struct intel_engine_coredump *ee, 1551 struct i915_vma_coredump *vma) 1552 { 1553 if (vma) { 1554 vma->next = ee->vma; 1555 ee->vma = vma; 1556 } 1557 } 1558 1559 static struct i915_vma_coredump * 1560 create_vma_coredump(const struct intel_gt *gt, struct i915_vma *vma, 1561 const char *name, struct i915_vma_compress *compress) 1562 { 1563 struct i915_vma_coredump *ret = NULL; 1564 struct i915_vma_resource *vma_res; 1565 bool lockdep_cookie; 1566 1567 if (!vma) 1568 return NULL; 1569 1570 vma_res = vma->resource; 1571 1572 if (i915_vma_resource_hold(vma_res, &lockdep_cookie)) { 1573 ret = i915_vma_coredump_create(gt, vma_res, compress, name); 1574 i915_vma_resource_unhold(vma_res, lockdep_cookie); 1575 } 1576 1577 return ret; 1578 } 1579 1580 static void add_vma_coredump(struct intel_engine_coredump *ee, 1581 const struct intel_gt *gt, 1582 struct i915_vma *vma, 1583 const char *name, 1584 struct i915_vma_compress *compress) 1585 { 1586 add_vma(ee, create_vma_coredump(gt, vma, name, compress)); 1587 } 1588 1589 struct intel_engine_coredump * 1590 intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags) 1591 { 1592 struct intel_engine_coredump *ee; 1593 1594 ee = kzalloc(sizeof(*ee), gfp); 1595 if (!ee) 1596 return NULL; 1597 1598 ee->engine = engine; 1599 1600 if (!(dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)) { 1601 engine_record_registers(ee); 1602 engine_record_execlists(ee); 1603 } 1604 1605 return ee; 1606 } 1607 1608 static struct intel_engine_capture_vma * 1609 engine_coredump_add_context(struct intel_engine_coredump *ee, 1610 struct intel_context *ce, 1611 gfp_t gfp) 1612 { 1613 struct intel_engine_capture_vma *vma = NULL; 1614 1615 ee->simulated |= record_context(&ee->context, ce); 1616 if (ee->simulated) 1617 return NULL; 1618 1619 /* 1620 * We need to copy these to an anonymous buffer 1621 * as the simplest method to avoid being overwritten 1622 * by userspace. 1623 */ 1624 vma = capture_vma(vma, ce->ring->vma, "ring", gfp); 1625 vma = capture_vma(vma, ce->state, "HW context", gfp); 1626 1627 return vma; 1628 } 1629 1630 struct intel_engine_capture_vma * 1631 intel_engine_coredump_add_request(struct intel_engine_coredump *ee, 1632 struct i915_request *rq, 1633 gfp_t gfp) 1634 { 1635 struct intel_engine_capture_vma *vma; 1636 1637 vma = engine_coredump_add_context(ee, rq->context, gfp); 1638 if (!vma) 1639 return NULL; 1640 1641 /* 1642 * We need to copy these to an anonymous buffer 1643 * as the simplest method to avoid being overwritten 1644 * by userspace. 
1645 */ 1646 vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch"); 1647 vma = capture_user(vma, rq, gfp); 1648 1649 ee->rq_head = rq->head; 1650 ee->rq_post = rq->postfix; 1651 ee->rq_tail = rq->tail; 1652 1653 return vma; 1654 } 1655 1656 void 1657 intel_engine_coredump_add_vma(struct intel_engine_coredump *ee, 1658 struct intel_engine_capture_vma *capture, 1659 struct i915_vma_compress *compress) 1660 { 1661 const struct intel_engine_cs *engine = ee->engine; 1662 1663 while (capture) { 1664 struct intel_engine_capture_vma *this = capture; 1665 struct i915_vma_resource *vma_res = this->vma_res; 1666 1667 add_vma(ee, 1668 i915_vma_coredump_create(engine->gt, vma_res, 1669 compress, this->name)); 1670 1671 i915_vma_resource_unhold(vma_res, this->lockdep_cookie); 1672 i915_vma_resource_put(vma_res); 1673 1674 capture = this->next; 1675 kfree(this); 1676 } 1677 1678 add_vma_coredump(ee, engine->gt, engine->status_page.vma, 1679 "HW Status", compress); 1680 1681 add_vma_coredump(ee, engine->gt, engine->wa_ctx.vma, 1682 "WA context", compress); 1683 } 1684 1685 static struct intel_engine_coredump * 1686 capture_engine(struct intel_engine_cs *engine, 1687 struct i915_vma_compress *compress, 1688 u32 dump_flags) 1689 { 1690 struct intel_engine_capture_vma *capture = NULL; 1691 struct intel_engine_coredump *ee; 1692 struct intel_context *ce = NULL; 1693 struct i915_request *rq = NULL; 1694 1695 ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags); 1696 if (!ee) 1697 return NULL; 1698 1699 intel_engine_get_hung_entity(engine, &ce, &rq); 1700 if (rq && !i915_request_started(rq)) { 1701 /* 1702 * We want to know also what is the guc_id of the context, 1703 * but if we don't have the context reference, then skip 1704 * printing it. 1705 */ 1706 if (ce) 1707 drm_info(&engine->gt->i915->drm, 1708 "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n", 1709 engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id); 1710 else 1711 drm_info(&engine->gt->i915->drm, 1712 "Got hung context on %s with active request %lld:%lld not yet started\n", 1713 engine->name, rq->fence.context, rq->fence.seqno); 1714 } 1715 1716 if (rq) { 1717 capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); 1718 i915_request_put(rq); 1719 } else if (ce) { 1720 capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL); 1721 } 1722 1723 if (capture) { 1724 intel_engine_coredump_add_vma(ee, capture, compress); 1725 1726 if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) 1727 intel_guc_capture_get_matching_node(engine->gt, ee, ce); 1728 } else { 1729 kfree(ee); 1730 ee = NULL; 1731 } 1732 1733 return ee; 1734 } 1735 1736 static void 1737 gt_record_engines(struct intel_gt_coredump *gt, 1738 intel_engine_mask_t engine_mask, 1739 struct i915_vma_compress *compress, 1740 u32 dump_flags) 1741 { 1742 struct intel_engine_cs *engine; 1743 enum intel_engine_id id; 1744 1745 for_each_engine(engine, gt->_gt, id) { 1746 struct intel_engine_coredump *ee; 1747 1748 /* Refill our page pool before entering atomic section */ 1749 pool_refill(&compress->pool, ALLOW_FAIL); 1750 1751 ee = capture_engine(engine, compress, dump_flags); 1752 if (!ee) 1753 continue; 1754 1755 ee->hung = engine->mask & engine_mask; 1756 1757 gt->simulated |= ee->simulated; 1758 if (ee->simulated) { 1759 if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) 1760 intel_guc_capture_free_node(ee); 1761 kfree(ee); 1762 continue; 1763 } 1764 1765 ee->next = gt->engine; 1766 gt->engine = ee; 1767 } 1768 } 1769 1770 static 
void gt_record_guc_ctb(struct intel_ctb_coredump *saved,
		       const struct intel_guc_ct_buffer *ctb,
		       const void *blob_ptr, struct intel_guc *guc)
{
	if (!ctb || !ctb->desc)
		return;

	saved->raw_status = ctb->desc->status;
	saved->raw_head = ctb->desc->head;
	saved->raw_tail = ctb->desc->tail;
	saved->head = ctb->head;
	saved->tail = ctb->tail;
	saved->size = ctb->size;
	saved->desc_offset = ((void *)ctb->desc) - blob_ptr;
	saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr;
}

static struct intel_uc_coredump *
gt_record_uc(struct intel_gt_coredump *gt,
	     struct i915_vma_compress *compress)
{
	const struct intel_uc *uc = &gt->_gt->uc;
	struct intel_uc_coredump *error_uc;

	error_uc = kzalloc(sizeof(*error_uc), ALLOW_FAIL);
	if (!error_uc)
		return NULL;

	memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));
	memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw));

	error_uc->guc_fw.file_selected.path = kstrdup(uc->guc.fw.file_selected.path, ALLOW_FAIL);
	error_uc->huc_fw.file_selected.path = kstrdup(uc->huc.fw.file_selected.path, ALLOW_FAIL);
	error_uc->guc_fw.file_wanted.path = kstrdup(uc->guc.fw.file_wanted.path, ALLOW_FAIL);
	error_uc->huc_fw.file_wanted.path = kstrdup(uc->huc.fw.file_wanted.path, ALLOW_FAIL);

	/*
	 * Save the GuC log and include a timestamp reference for converting the
	 * log times to system times (in conjunction with the error->boottime and
	 * gt->clock_frequency fields saved elsewhere).
	 */
	error_uc->guc.timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP);
	error_uc->guc.vma_log = create_vma_coredump(gt->_gt, uc->guc.log.vma,
						    "GuC log buffer", compress);
	error_uc->guc.vma_ctb = create_vma_coredump(gt->_gt, uc->guc.ct.vma,
						    "GuC CT buffer", compress);
	error_uc->guc.last_fence = uc->guc.ct.requests.last_fence;
	gt_record_guc_ctb(error_uc->guc.ctb + 0, &uc->guc.ct.ctbs.send,
			  uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
	gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv,
			  uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);

	return error_uc;
}

/* Capture display registers. */
static void gt_record_display_regs(struct intel_gt_coredump *gt)
{
	struct intel_uncore *uncore = gt->_gt->uncore;
	struct drm_i915_private *i915 = uncore->i915;

	if (GRAPHICS_VER(i915) >= 6)
		gt->derrmr = intel_uncore_read(uncore, DERRMR);

	if (GRAPHICS_VER(i915) >= 8)
		gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
	else if (IS_VALLEYVIEW(i915))
		gt->ier = intel_uncore_read(uncore, VLV_IER);
	else if (HAS_PCH_SPLIT(i915))
		gt->ier = intel_uncore_read(uncore, DEIER);
	else if (GRAPHICS_VER(i915) == 2)
		gt->ier = intel_uncore_read16(uncore, GEN2_IER);
	else
		gt->ier = intel_uncore_read(uncore, GEN2_IER);
}

/* Capture all other registers that GuC doesn't capture. */
static void gt_record_global_nonguc_regs(struct intel_gt_coredump *gt)
{
	struct intel_uncore *uncore = gt->_gt->uncore;
	struct drm_i915_private *i915 = uncore->i915;
	int i;

	if (IS_VALLEYVIEW(i915)) {
		gt->gtier[0] = intel_uncore_read(uncore, GTIER);
		gt->ngtier = 1;
	} else if (GRAPHICS_VER(i915) >= 11) {
		gt->gtier[0] =
			intel_uncore_read(uncore,
					  GEN11_RENDER_COPY_INTR_ENABLE);
		gt->gtier[1] =
			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
		gt->gtier[2] =
			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
		gt->gtier[3] =
			intel_uncore_read(uncore,
					  GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		gt->gtier[4] =
			intel_uncore_read(uncore,
					  GEN11_CRYPTO_RSVD_INTR_ENABLE);
		gt->gtier[5] =
			intel_uncore_read(uncore,
					  GEN11_GUNIT_CSME_INTR_ENABLE);
		gt->ngtier = 6;
	} else if (GRAPHICS_VER(i915) >= 8) {
		for (i = 0; i < 4; i++)
			gt->gtier[i] =
				intel_uncore_read(uncore, GEN8_GT_IER(i));
		gt->ngtier = 4;
	} else if (HAS_PCH_SPLIT(i915)) {
		gt->gtier[0] = intel_uncore_read(uncore, GTIER);
		gt->ngtier = 1;
	}

	gt->eir = intel_uncore_read(uncore, EIR);
	gt->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
}

/*
 * Capture all registers that relate to workload submission.
 * NOTE: In GuC submission, when GuC resets an engine, it can dump these for us
 */
static void gt_record_global_regs(struct intel_gt_coredump *gt)
{
	struct intel_uncore *uncore = gt->_gt->uncore;
	struct drm_i915_private *i915 = uncore->i915;
	int i;

	/*
	 * General organization
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers.
	 * 4. Everything else
	 * Please try to follow the order.
	 */

	/* 1: Registers specific to a single generation */
	if (IS_VALLEYVIEW(i915))
		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);

	if (GRAPHICS_VER(i915) == 7)
		gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);

	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) {
		gt->fault_data0 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt,
							XEHP_FAULT_TLB_DATA0);
		gt->fault_data1 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt,
							XEHP_FAULT_TLB_DATA1);
	} else if (GRAPHICS_VER(i915) >= 12) {
		gt->fault_data0 = intel_uncore_read(uncore,
						    GEN12_FAULT_TLB_DATA0);
		gt->fault_data1 = intel_uncore_read(uncore,
						    GEN12_FAULT_TLB_DATA1);
	} else if (GRAPHICS_VER(i915) >= 8) {
		gt->fault_data0 = intel_uncore_read(uncore,
						    GEN8_FAULT_TLB_DATA0);
		gt->fault_data1 = intel_uncore_read(uncore,
						    GEN8_FAULT_TLB_DATA1);
	}

	if (GRAPHICS_VER(i915) == 6) {
		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
		gt->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
		gt->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
	}

	/* 2: Registers which belong to multiple generations */
	if (GRAPHICS_VER(i915) >= 7)
		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);

	if (GRAPHICS_VER(i915) >= 6) {
		if (GRAPHICS_VER(i915) < 12) {
			gt->error = intel_uncore_read(uncore, ERROR_GEN6);
			gt->done_reg = intel_uncore_read(uncore, DONE_REG);
		}
	}

	/* 3: Feature specific registers */
	if (IS_GRAPHICS_VER(i915, 6, 7)) {
		gt->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
		gt->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
	}

	if (IS_GRAPHICS_VER(i915, 8, 11))
		gt->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);

	if (GRAPHICS_VER(i915) == 12)
		gt->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG);

	if (GRAPHICS_VER(i915) >= 12) {
		for (i = 0; i < I915_MAX_SFC; i++) {
			/*
			 * SFC_DONE resides in the VD forcewake domain, so it
			 * only exists if the corresponding VCS engine is
			 * present.
			 */
			if ((gt->_gt->info.sfc_mask & BIT(i)) == 0 ||
			    !HAS_ENGINE(gt->_gt, _VCS(i * 2)))
				continue;

			gt->sfc_done[i] =
				intel_uncore_read(uncore, GEN12_SFC_DONE(i));
		}

		gt->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE);
	}
}

static void gt_record_info(struct intel_gt_coredump *gt)
{
	memcpy(&gt->info, &gt->_gt->info, sizeof(struct intel_gt_info));
	gt->clock_frequency = gt->_gt->clock_frequency;
	gt->clock_period_ns = gt->_gt->clock_period_ns;
}

/*
 * Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 generate_ecode(const struct intel_engine_coredump *ee)
{
	/*
	 * IPEHR would be an ideal way to detect errors, as it's the gross
	 * measure of "the command that hung." However, it has some very common
	 * synchronization commands which almost always appear even when the
	 * hang is strictly a client bug. Use instdone to help differentiate
	 * those cases.
2003 */ 2004 return ee ? ee->ipehr ^ ee->instdone.instdone : 0; 2005 } 2006 2007 static const char *error_msg(struct i915_gpu_coredump *error) 2008 { 2009 struct intel_engine_coredump *first = NULL; 2010 unsigned int hung_classes = 0; 2011 struct intel_gt_coredump *gt; 2012 int len; 2013 2014 for (gt = error->gt; gt; gt = gt->next) { 2015 struct intel_engine_coredump *cs; 2016 2017 for (cs = gt->engine; cs; cs = cs->next) { 2018 if (cs->hung) { 2019 hung_classes |= BIT(cs->engine->uabi_class); 2020 if (!first) 2021 first = cs; 2022 } 2023 } 2024 } 2025 2026 len = scnprintf(error->error_msg, sizeof(error->error_msg), 2027 "GPU HANG: ecode %d:%x:%08x", 2028 GRAPHICS_VER(error->i915), hung_classes, 2029 generate_ecode(first)); 2030 if (first && first->context.pid) { 2031 /* Just show the first executing process, more is confusing */ 2032 len += scnprintf(error->error_msg + len, 2033 sizeof(error->error_msg) - len, 2034 ", in %s [%d]", 2035 first->context.comm, first->context.pid); 2036 } 2037 2038 return error->error_msg; 2039 } 2040 2041 static void capture_gen(struct i915_gpu_coredump *error) 2042 { 2043 struct drm_i915_private *i915 = error->i915; 2044 2045 error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count); 2046 error->suspended = i915->runtime_pm.suspended; 2047 2048 error->iommu = i915_vtd_active(i915); 2049 error->reset_count = i915_reset_count(&i915->gpu_error); 2050 error->suspend_count = i915->suspend_count; 2051 2052 i915_params_copy(&error->params, &i915->params); 2053 memcpy(&error->device_info, 2054 INTEL_INFO(i915), 2055 sizeof(error->device_info)); 2056 memcpy(&error->runtime_info, 2057 RUNTIME_INFO(i915), 2058 sizeof(error->runtime_info)); 2059 memcpy(&error->display_device_info, DISPLAY_INFO(i915), 2060 sizeof(error->display_device_info)); 2061 memcpy(&error->display_runtime_info, DISPLAY_RUNTIME_INFO(i915), 2062 sizeof(error->display_runtime_info)); 2063 error->driver_caps = i915->caps; 2064 } 2065 2066 struct i915_gpu_coredump * 2067 i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp) 2068 { 2069 struct i915_gpu_coredump *error; 2070 2071 if (!i915->params.error_capture) 2072 return NULL; 2073 2074 error = kzalloc(sizeof(*error), gfp); 2075 if (!error) 2076 return NULL; 2077 2078 kref_init(&error->ref); 2079 error->i915 = i915; 2080 2081 error->time = ktime_get_real(); 2082 error->boottime = ktime_get_boottime(); 2083 error->uptime = ktime_sub(ktime_get(), to_gt(i915)->last_init_time); 2084 error->capture = jiffies; 2085 2086 capture_gen(error); 2087 2088 return error; 2089 } 2090 2091 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) 2092 2093 struct intel_gt_coredump * 2094 intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags) 2095 { 2096 struct intel_gt_coredump *gc; 2097 2098 gc = kzalloc(sizeof(*gc), gfp); 2099 if (!gc) 2100 return NULL; 2101 2102 gc->_gt = gt; 2103 gc->awake = intel_gt_pm_is_awake(gt); 2104 2105 gt_record_display_regs(gc); 2106 gt_record_global_nonguc_regs(gc); 2107 2108 /* 2109 * GuC dumps global, eng-class and eng-instance registers 2110 * (that can change as part of engine state during execution) 2111 * before an engine is reset due to a hung context. 2112 * GuC captures and reports all three groups of registers 2113 * together as a single set before the engine is reset. 2114 * Thus, if GuC triggered the context reset we retrieve 2115 * the register values as part of gt_record_engines. 

static void capture_gen(struct i915_gpu_coredump *error)
{
	struct drm_i915_private *i915 = error->i915;

	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;

	error->iommu = i915_vtd_active(i915);
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;

	i915_params_copy(&error->params, &i915->params);
	memcpy(&error->device_info,
	       INTEL_INFO(i915),
	       sizeof(error->device_info));
	memcpy(&error->runtime_info,
	       RUNTIME_INFO(i915),
	       sizeof(error->runtime_info));
	memcpy(&error->display_device_info, DISPLAY_INFO(i915),
	       sizeof(error->display_device_info));
	memcpy(&error->display_runtime_info, DISPLAY_RUNTIME_INFO(i915),
	       sizeof(error->display_runtime_info));
	error->driver_caps = i915->caps;
}

struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
{
	struct i915_gpu_coredump *error;

	if (!i915->params.error_capture)
		return NULL;

	error = kzalloc(sizeof(*error), gfp);
	if (!error)
		return NULL;

	kref_init(&error->ref);
	error->i915 = i915;

	error->time = ktime_get_real();
	error->boottime = ktime_get_boottime();
	error->uptime = ktime_sub(ktime_get(), to_gt(i915)->last_init_time);
	error->capture = jiffies;

	capture_gen(error);

	return error;
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags)
{
	struct intel_gt_coredump *gc;

	gc = kzalloc(sizeof(*gc), gfp);
	if (!gc)
		return NULL;

	gc->_gt = gt;
	gc->awake = intel_gt_pm_is_awake(gt);

	gt_record_display_regs(gc);
	gt_record_global_nonguc_regs(gc);

	/*
	 * GuC dumps global, eng-class and eng-instance registers
	 * (that can change as part of engine state during execution)
	 * before an engine is reset due to a hung context.
	 * GuC captures and reports all three groups of registers
	 * together as a single set before the engine is reset.
	 * Thus, if GuC triggered the context reset we retrieve
	 * the register values as part of gt_record_engines.
	 */
	if (!(dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE))
		gt_record_global_regs(gc);

	gt_record_fences(gc);

	return gc;
}

struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt)
{
	struct i915_vma_compress *compress;

	compress = kmalloc(sizeof(*compress), ALLOW_FAIL);
	if (!compress)
		return NULL;

	if (!compress_init(compress)) {
		kfree(compress);
		return NULL;
	}

	return compress;
}

void i915_vma_capture_finish(struct intel_gt_coredump *gt,
			     struct i915_vma_compress *compress)
{
	if (!compress)
		return;

	compress_fini(compress);
	kfree(compress);
}

static struct i915_gpu_coredump *
__i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
{
	struct drm_i915_private *i915 = gt->i915;
	struct i915_gpu_coredump *error;

	/* Check if GPU capture has been disabled */
	error = READ_ONCE(i915->gpu_error.first_error);
	if (IS_ERR(error))
		return error;

	error = i915_gpu_coredump_alloc(i915, ALLOW_FAIL);
	if (!error)
		return ERR_PTR(-ENOMEM);

	error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL, dump_flags);
	if (error->gt) {
		struct i915_vma_compress *compress;

		compress = i915_vma_capture_prepare(error->gt);
		if (!compress) {
			kfree(error->gt);
			kfree(error);
			return ERR_PTR(-ENOMEM);
		}

		if (INTEL_INFO(i915)->has_gt_uc) {
			error->gt->uc = gt_record_uc(error->gt, compress);
			if (error->gt->uc) {
				if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
					error->gt->uc->guc.is_guc_capture = true;
				else
					GEM_BUG_ON(error->gt->uc->guc.is_guc_capture);
			}
		}

		gt_record_info(error->gt);
		gt_record_engines(error->gt, engine_mask, compress, dump_flags);

		i915_vma_capture_finish(error->gt, compress);

		error->simulated |= error->gt->simulated;
	}

	error->overlay = intel_overlay_capture_error_state(i915);

	return error;
}
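
/*
 * Note on i915->gpu_error.first_error (illustrative, not built): the field
 * doubles as the "capture disabled" switch. NULL means no error has been
 * recorded yet, a valid pointer is the first (and only retained) coredump,
 * and an ERR_PTR() - installed by i915_disable_error_state() below - makes
 * the IS_ERR() check above bail out of every future capture. A minimal
 * sketch of that check, using a hypothetical helper name:
 */
#if 0
static bool i915_error_capture_is_disabled(struct drm_i915_private *i915)
{
	/* hypothetical helper, not part of the driver */
	return IS_ERR(READ_ONCE(i915->gpu_error.first_error));
}
#endif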
issue.\n"); 2242 pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n"); 2243 pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n", 2244 i915->drm.primary->index); 2245 } 2246 } 2247 2248 /** 2249 * i915_capture_error_state - capture an error record for later analysis 2250 * @gt: intel_gt which originated the hang 2251 * @engine_mask: hung engines 2252 * @dump_flags: dump flags 2253 * 2254 * Should be called when an error is detected (either a hang or an error 2255 * interrupt) to capture error state from the time of the error. Fills 2256 * out a structure which becomes available in debugfs for user level tools 2257 * to pick up. 2258 */ 2259 void i915_capture_error_state(struct intel_gt *gt, 2260 intel_engine_mask_t engine_mask, u32 dump_flags) 2261 { 2262 struct i915_gpu_coredump *error; 2263 2264 error = i915_gpu_coredump(gt, engine_mask, dump_flags); 2265 if (IS_ERR(error)) { 2266 cmpxchg(>->i915->gpu_error.first_error, NULL, error); 2267 return; 2268 } 2269 2270 i915_error_state_store(error); 2271 i915_gpu_coredump_put(error); 2272 } 2273 2274 struct i915_gpu_coredump * 2275 i915_first_error_state(struct drm_i915_private *i915) 2276 { 2277 struct i915_gpu_coredump *error; 2278 2279 spin_lock_irq(&i915->gpu_error.lock); 2280 error = i915->gpu_error.first_error; 2281 if (!IS_ERR_OR_NULL(error)) 2282 i915_gpu_coredump_get(error); 2283 spin_unlock_irq(&i915->gpu_error.lock); 2284 2285 return error; 2286 } 2287 2288 void i915_reset_error_state(struct drm_i915_private *i915) 2289 { 2290 struct i915_gpu_coredump *error; 2291 2292 spin_lock_irq(&i915->gpu_error.lock); 2293 error = i915->gpu_error.first_error; 2294 if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */ 2295 i915->gpu_error.first_error = NULL; 2296 spin_unlock_irq(&i915->gpu_error.lock); 2297 2298 if (!IS_ERR_OR_NULL(error)) 2299 i915_gpu_coredump_put(error); 2300 } 2301 2302 void i915_disable_error_state(struct drm_i915_private *i915, int err) 2303 { 2304 spin_lock_irq(&i915->gpu_error.lock); 2305 if (!i915->gpu_error.first_error) 2306 i915->gpu_error.first_error = ERR_PTR(err); 2307 spin_unlock_irq(&i915->gpu_error.lock); 2308 } 2309 2310 #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) 2311 void intel_klog_error_capture(struct intel_gt *gt, 2312 intel_engine_mask_t engine_mask) 2313 { 2314 static int g_count; 2315 struct drm_i915_private *i915 = gt->i915; 2316 struct i915_gpu_coredump *error; 2317 intel_wakeref_t wakeref; 2318 size_t buf_size = PAGE_SIZE * 128; 2319 size_t pos_err; 2320 char *buf, *ptr, *next; 2321 int l_count = g_count++; 2322 int line = 0; 2323 2324 /* Can't allocate memory during a reset */ 2325 if (test_bit(I915_RESET_BACKOFF, >->reset.flags)) { 2326 drm_err(>->i915->drm, "[Capture/%d.%d] Inside GT reset, skipping error capture :(\n", 2327 l_count, line++); 2328 return; 2329 } 2330 2331 error = READ_ONCE(i915->gpu_error.first_error); 2332 if (error) { 2333 drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error capture first...\n", 2334 l_count, line++); 2335 i915_reset_error_state(i915); 2336 } 2337 2338 with_intel_runtime_pm(&i915->runtime_pm, wakeref) 2339 error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE); 2340 2341 if (IS_ERR(error)) { 2342 drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error capture: %ld!\n", 2343 l_count, line++, PTR_ERR(error)); 2344 return; 2345 } 2346 2347 buf = kvmalloc(buf_size, GFP_KERNEL); 2348 if (!buf) { 2349 drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer for error capture!\n", 

/**
 * i915_capture_error_state - capture an error record for later analysis
 * @gt: intel_gt which originated the hang
 * @engine_mask: hung engines
 * @dump_flags: dump flags
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error. Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask, u32 dump_flags)
{
	struct i915_gpu_coredump *error;

	error = i915_gpu_coredump(gt, engine_mask, dump_flags);
	if (IS_ERR(error)) {
		cmpxchg(&gt->i915->gpu_error.first_error, NULL, error);
		return;
	}

	i915_error_state_store(error);
	i915_gpu_coredump_put(error);
}

struct i915_gpu_coredump *
i915_first_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_coredump *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (!IS_ERR_OR_NULL(error))
		i915_gpu_coredump_get(error);
	spin_unlock_irq(&i915->gpu_error.lock);

	return error;
}

void i915_reset_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_coredump *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
		i915->gpu_error.first_error = NULL;
	spin_unlock_irq(&i915->gpu_error.lock);

	if (!IS_ERR_OR_NULL(error))
		i915_gpu_coredump_put(error);
}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
	spin_lock_irq(&i915->gpu_error.lock);
	if (!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
}

#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
void intel_klog_error_capture(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask)
{
	static int g_count;
	struct drm_i915_private *i915 = gt->i915;
	struct i915_gpu_coredump *error;
	intel_wakeref_t wakeref;
	size_t buf_size = PAGE_SIZE * 128;
	size_t pos_err;
	char *buf, *ptr, *next;
	int l_count = g_count++;
	int line = 0;

	/* Can't allocate memory during a reset */
	if (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		drm_err(&gt->i915->drm, "[Capture/%d.%d] Inside GT reset, skipping error capture :(\n",
			l_count, line++);
		return;
	}

	error = READ_ONCE(i915->gpu_error.first_error);
	if (error) {
		drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error capture first...\n",
			l_count, line++);
		i915_reset_error_state(i915);
	}

	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
		error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE);

	if (IS_ERR(error)) {
		drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error capture: %ld!\n",
			l_count, line++, PTR_ERR(error));
		return;
	}

	buf = kvmalloc(buf_size, GFP_KERNEL);
	if (!buf) {
		drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer for error capture!\n",
			l_count, line++);
		i915_gpu_coredump_put(error);
		return;
	}

	drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915 error capture for %ps...\n",
		 l_count, line++, __builtin_return_address(0));

	/* Largest string length safe to print via dmesg */
# define MAX_CHUNK 800

	pos_err = 0;
	while (1) {
		ssize_t got = i915_gpu_coredump_copy_to_buffer(error, buf, pos_err, buf_size - 1);

		if (got <= 0)
			break;

		buf[got] = 0;
		pos_err += got;

		ptr = buf;
		while (got > 0) {
			size_t count;
			char tag[2];

			next = strnchr(ptr, got, '\n');
			if (next) {
				count = next - ptr;
				*next = 0;
				tag[0] = '>';
				tag[1] = '<';
			} else {
				count = got;
				tag[0] = '}';
				tag[1] = '{';
			}

			if (count > MAX_CHUNK) {
				size_t pos;
				char *ptr2 = ptr;

				for (pos = MAX_CHUNK; pos < count; pos += MAX_CHUNK) {
					char chr = ptr[pos];

					ptr[pos] = 0;
					drm_info(&i915->drm, "[Capture/%d.%d] }%s{\n",
						 l_count, line++, ptr2);
					ptr[pos] = chr;
					ptr2 = ptr + pos;

					/*
					 * If spewing large amounts of data via a serial console,
					 * this can be a very slow process. So be friendly and try
					 * not to cause 'softlockup on CPU' problems.
					 */
					cond_resched();
				}

				if (ptr2 < (ptr + count))
					drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
						 l_count, line++, tag[0], ptr2, tag[1]);
				else if (tag[0] == '>')
					drm_info(&i915->drm, "[Capture/%d.%d] ><\n",
						 l_count, line++);
			} else {
				drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
					 l_count, line++, tag[0], ptr, tag[1]);
			}

			ptr = next;
			got -= count;
			if (next) {
				ptr++;
				got--;
			}

			/* As above. */
			cond_resched();
		}

		if (got)
			drm_info(&i915->drm, "[Capture/%d.%d] Got %zd bytes remaining!\n",
				 l_count, line++, got);
	}

	kvfree(buf);

	drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, line++, pos_err);
}
#endif
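
/*
 * Illustrative sketch (not built, userspace): intel_klog_error_capture()
 * brackets complete dump lines with '>' ... '<' and continuation chunks with
 * '}' ... '{', so a log post-processor can stitch dmesg output back into the
 * original error dump. The prefix handling and buffer sizes below are
 * assumptions about typical dmesg formatting, not part of the driver.
 */
#if 0
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];

	while (fgets(line, sizeof(line), stdin)) {
		char *start = strstr(line, "[Capture/");
		char *open, *close;

		if (!start)
			continue;

		open = strchr(start, ']');
		if (!open || (open[2] != '>' && open[2] != '}'))
			continue;

		close = strrchr(open, open[2] == '>' ? '<' : '{');
		if (!close || close < open + 3)
			continue;

		/* payload sits between the opening and closing tag */
		fwrite(open + 3, 1, close - (open + 3), stdout);
		if (open[2] == '>')
			fputc('\n', stdout); /* '>'..'<' marks a complete line */
	}

	return 0;
}
#endif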