1 /* 2 * Copyright (c) 2008 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * Keith Packard <keithp@keithp.com> 26 * Mika Kuoppala <mika.kuoppala@intel.com> 27 * 28 */ 29 30 #include <generated/utsrelease.h> 31 #include <linux/stop_machine.h> 32 #include <linux/zlib.h> 33 #include <drm/drm_print.h> 34 #ifdef __linux__ 35 #include <linux/ascii85.h> 36 #endif 37 38 #include "i915_drv.h" 39 #include "i915_gpu_error.h" 40 41 static inline const struct intel_engine_cs * 42 engine_lookup(const struct drm_i915_private *i915, unsigned int id) 43 { 44 if (id >= I915_NUM_ENGINES) 45 return NULL; 46 47 return i915->engine[id]; 48 } 49 50 static inline const char * 51 __engine_name(const struct intel_engine_cs *engine) 52 { 53 return engine ? engine->name : ""; 54 } 55 56 static const char * 57 engine_name(const struct drm_i915_private *i915, unsigned int id) 58 { 59 return __engine_name(engine_lookup(i915, id)); 60 } 61 62 static const char *tiling_flag(int tiling) 63 { 64 switch (tiling) { 65 default: 66 case I915_TILING_NONE: return ""; 67 case I915_TILING_X: return " X"; 68 case I915_TILING_Y: return " Y"; 69 } 70 } 71 72 static const char *dirty_flag(int dirty) 73 { 74 return dirty ? " dirty" : ""; 75 } 76 77 static const char *purgeable_flag(int purgeable) 78 { 79 return purgeable ? 
" purgeable" : ""; 80 } 81 82 static bool __i915_error_ok(struct drm_i915_error_state_buf *e) 83 { 84 85 if (!e->err && WARN(e->bytes > (e->size - 1), "overflow")) { 86 e->err = -ENOSPC; 87 return false; 88 } 89 90 if (e->bytes == e->size - 1 || e->err) 91 return false; 92 93 return true; 94 } 95 96 static bool __i915_error_seek(struct drm_i915_error_state_buf *e, 97 unsigned len) 98 { 99 if (e->pos + len <= e->start) { 100 e->pos += len; 101 return false; 102 } 103 104 /* First vsnprintf needs to fit in its entirety for memmove */ 105 if (len >= e->size) { 106 e->err = -EIO; 107 return false; 108 } 109 110 return true; 111 } 112 113 static void __i915_error_advance(struct drm_i915_error_state_buf *e, 114 unsigned len) 115 { 116 /* If this is first printf in this window, adjust it so that 117 * start position matches start of the buffer 118 */ 119 120 if (e->pos < e->start) { 121 const size_t off = e->start - e->pos; 122 123 /* Should not happen but be paranoid */ 124 if (off > len || e->bytes) { 125 e->err = -EIO; 126 return; 127 } 128 129 memmove(e->buf, e->buf + off, len - off); 130 e->bytes = len - off; 131 e->pos = e->start; 132 return; 133 } 134 135 e->bytes += len; 136 e->pos += len; 137 } 138 139 __printf(2, 0) 140 static void i915_error_vprintf(struct drm_i915_error_state_buf *e, 141 const char *f, va_list args) 142 { 143 unsigned len; 144 145 if (!__i915_error_ok(e)) 146 return; 147 148 /* Seek the first printf which is hits start position */ 149 if (e->pos < e->start) { 150 va_list tmp; 151 152 va_copy(tmp, args); 153 len = vsnprintf(NULL, 0, f, tmp); 154 va_end(tmp); 155 156 if (!__i915_error_seek(e, len)) 157 return; 158 } 159 160 len = vsnprintf(e->buf + e->bytes, e->size - e->bytes, f, args); 161 if (len >= e->size - e->bytes) 162 len = e->size - e->bytes - 1; 163 164 __i915_error_advance(e, len); 165 } 166 167 static void i915_error_puts(struct drm_i915_error_state_buf *e, 168 const char *str) 169 { 170 unsigned len; 171 172 if (!__i915_error_ok(e)) 173 return; 174 175 len = strlen(str); 176 177 /* Seek the first printf which is hits start position */ 178 if (e->pos < e->start) { 179 if (!__i915_error_seek(e, len)) 180 return; 181 } 182 183 if (len >= e->size - e->bytes) 184 len = e->size - e->bytes - 1; 185 memcpy(e->buf + e->bytes, str, len); 186 187 __i915_error_advance(e, len); 188 } 189 190 #define err_printf(e, ...) 
i915_error_printf(e, __VA_ARGS__) 191 #define err_puts(e, s) i915_error_puts(e, s) 192 193 static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf) 194 { 195 i915_error_vprintf(p->arg, vaf->fmt, *vaf->va); 196 } 197 198 static inline struct drm_printer 199 i915_error_printer(struct drm_i915_error_state_buf *e) 200 { 201 struct drm_printer p = { 202 .printfn = __i915_printfn_error, 203 .arg = e, 204 }; 205 return p; 206 } 207 208 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR 209 210 struct compress { 211 struct z_stream_s zstream; 212 void *tmp; 213 }; 214 215 static bool compress_init(struct compress *c) 216 { 217 struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream)); 218 219 zstream->workspace = 220 kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), 221 GFP_ATOMIC | __GFP_NOWARN); 222 if (!zstream->workspace) 223 return false; 224 225 if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) { 226 kfree(zstream->workspace); 227 return false; 228 } 229 230 c->tmp = NULL; 231 if (i915_has_memcpy_from_wc()) 232 c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN); 233 234 return true; 235 } 236 237 static void *compress_next_page(struct drm_i915_error_object *dst) 238 { 239 unsigned long page; 240 241 if (dst->page_count >= dst->num_pages) 242 return ERR_PTR(-ENOSPC); 243 244 page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); 245 if (!page) 246 return ERR_PTR(-ENOMEM); 247 248 return dst->pages[dst->page_count++] = (void *)page; 249 } 250 251 static int compress_page(struct compress *c, 252 void *src, 253 struct drm_i915_error_object *dst) 254 { 255 struct z_stream_s *zstream = &c->zstream; 256 257 zstream->next_in = src; 258 if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) 259 zstream->next_in = c->tmp; 260 zstream->avail_in = PAGE_SIZE; 261 262 do { 263 if (zstream->avail_out == 0) { 264 zstream->next_out = compress_next_page(dst); 265 if (IS_ERR(zstream->next_out)) 266 return PTR_ERR(zstream->next_out); 267 268 zstream->avail_out = PAGE_SIZE; 269 } 270 271 if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) 272 return -EIO; 273 } while (zstream->avail_in); 274 275 /* Fallback to uncompressed if we increase size? 
*/ 276 if (0 && zstream->total_out > zstream->total_in) 277 return -E2BIG; 278 279 return 0; 280 } 281 282 static int compress_flush(struct compress *c, 283 struct drm_i915_error_object *dst) 284 { 285 struct z_stream_s *zstream = &c->zstream; 286 287 do { 288 switch (zlib_deflate(zstream, Z_FINISH)) { 289 case Z_OK: /* more space requested */ 290 zstream->next_out = compress_next_page(dst); 291 if (IS_ERR(zstream->next_out)) 292 return PTR_ERR(zstream->next_out); 293 294 zstream->avail_out = PAGE_SIZE; 295 break; 296 297 case Z_STREAM_END: 298 goto end; 299 300 default: /* any error */ 301 return -EIO; 302 } 303 } while (1); 304 305 end: 306 memset(zstream->next_out, 0, zstream->avail_out); 307 dst->unused = zstream->avail_out; 308 return 0; 309 } 310 311 static void compress_fini(struct compress *c, 312 struct drm_i915_error_object *dst) 313 { 314 struct z_stream_s *zstream = &c->zstream; 315 316 zlib_deflateEnd(zstream); 317 kfree(zstream->workspace); 318 if (c->tmp) 319 free_page((unsigned long)c->tmp); 320 } 321 322 static void err_compression_marker(struct drm_i915_error_state_buf *m) 323 { 324 err_puts(m, ":"); 325 } 326 327 #else 328 329 struct compress { 330 }; 331 332 static bool compress_init(struct compress *c) 333 { 334 return true; 335 } 336 337 static int compress_page(struct compress *c, 338 void *src, 339 struct drm_i915_error_object *dst) 340 { 341 unsigned long page; 342 void *ptr; 343 344 page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); 345 if (!page) 346 return -ENOMEM; 347 348 ptr = (void *)page; 349 if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE)) 350 memcpy(ptr, src, PAGE_SIZE); 351 dst->pages[dst->page_count++] = ptr; 352 353 return 0; 354 } 355 356 static int compress_flush(struct compress *c, 357 struct drm_i915_error_object *dst) 358 { 359 return 0; 360 } 361 362 static void compress_fini(struct compress *c, 363 struct drm_i915_error_object *dst) 364 { 365 } 366 367 static void err_compression_marker(struct drm_i915_error_state_buf *m) 368 { 369 err_puts(m, "~"); 370 } 371 372 #endif 373 374 static void print_error_buffers(struct drm_i915_error_state_buf *m, 375 const char *name, 376 struct drm_i915_error_buffer *err, 377 int count) 378 { 379 err_printf(m, "%s [%d]:\n", name, count); 380 381 while (count--) { 382 err_printf(m, " %08x_%08x %8u %02x %02x %02x", 383 upper_32_bits(err->gtt_offset), 384 lower_32_bits(err->gtt_offset), 385 err->size, 386 err->read_domains, 387 err->write_domain, 388 err->wseqno); 389 err_puts(m, tiling_flag(err->tiling)); 390 err_puts(m, dirty_flag(err->dirty)); 391 err_puts(m, purgeable_flag(err->purgeable)); 392 err_puts(m, err->userptr ? " userptr" : ""); 393 err_puts(m, err->engine != -1 ? 
" " : ""); 394 err_puts(m, engine_name(m->i915, err->engine)); 395 err_puts(m, i915_cache_level_str(m->i915, err->cache_level)); 396 397 if (err->name) 398 err_printf(m, " (name: %d)", err->name); 399 if (err->fence_reg != I915_FENCE_REG_NONE) 400 err_printf(m, " (fence: %d)", err->fence_reg); 401 402 err_puts(m, "\n"); 403 err++; 404 } 405 } 406 407 static void error_print_instdone(struct drm_i915_error_state_buf *m, 408 const struct drm_i915_error_engine *ee) 409 { 410 int slice; 411 int subslice; 412 413 err_printf(m, " INSTDONE: 0x%08x\n", 414 ee->instdone.instdone); 415 416 if (ee->engine_id != RCS || INTEL_GEN(m->i915) <= 3) 417 return; 418 419 err_printf(m, " SC_INSTDONE: 0x%08x\n", 420 ee->instdone.slice_common); 421 422 if (INTEL_GEN(m->i915) <= 6) 423 return; 424 425 for_each_instdone_slice_subslice(m->i915, slice, subslice) 426 err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n", 427 slice, subslice, 428 ee->instdone.sampler[slice][subslice]); 429 430 for_each_instdone_slice_subslice(m->i915, slice, subslice) 431 err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n", 432 slice, subslice, 433 ee->instdone.row[slice][subslice]); 434 } 435 436 static const char *bannable(const struct drm_i915_error_context *ctx) 437 { 438 return ctx->bannable ? "" : " (unbannable)"; 439 } 440 441 static void error_print_request(struct drm_i915_error_state_buf *m, 442 const char *prefix, 443 const struct drm_i915_error_request *erq, 444 const unsigned long epoch) 445 { 446 if (!erq->seqno) 447 return; 448 449 err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", 450 prefix, erq->pid, erq->ban_score, 451 erq->context, erq->seqno, erq->sched_attr.priority, 452 jiffies_to_msecs(erq->jiffies - epoch), 453 erq->start, erq->head, erq->tail); 454 } 455 456 static void error_print_context(struct drm_i915_error_state_buf *m, 457 const char *header, 458 const struct drm_i915_error_context *ctx) 459 { 460 err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n", 461 header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id, 462 ctx->sched_attr.priority, ctx->ban_score, bannable(ctx), 463 ctx->guilty, ctx->active); 464 } 465 466 static void error_print_engine(struct drm_i915_error_state_buf *m, 467 const struct drm_i915_error_engine *ee, 468 const unsigned long epoch) 469 { 470 int n; 471 472 err_printf(m, "%s command stream:\n", 473 engine_name(m->i915, ee->engine_id)); 474 err_printf(m, " IDLE?: %s\n", yesno(ee->idle)); 475 err_printf(m, " START: 0x%08x\n", ee->start); 476 err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); 477 err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n", 478 ee->tail, ee->rq_post, ee->rq_tail); 479 err_printf(m, " CTL: 0x%08x\n", ee->ctl); 480 err_printf(m, " MODE: 0x%08x\n", ee->mode); 481 err_printf(m, " HWS: 0x%08x\n", ee->hws); 482 err_printf(m, " ACTHD: 0x%08x %08x\n", 483 (u32)(ee->acthd>>32), (u32)ee->acthd); 484 err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir); 485 err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr); 486 487 error_print_instdone(m, ee); 488 489 if (ee->batchbuffer) { 490 u64 start = ee->batchbuffer->gtt_offset; 491 u64 end = start + ee->batchbuffer->gtt_size; 492 493 err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n", 494 upper_32_bits(start), lower_32_bits(start), 495 upper_32_bits(end), lower_32_bits(end)); 496 } 497 if (INTEL_GEN(m->i915) >= 4) { 498 err_printf(m, " BBADDR: 0x%08x_%08x\n", 499 (u32)(ee->bbaddr>>32), (u32)ee->bbaddr); 500 err_printf(m, " BB_STATE: 
0x%08x\n", ee->bbstate); 501 err_printf(m, " INSTPS: 0x%08x\n", ee->instps); 502 } 503 err_printf(m, " INSTPM: 0x%08x\n", ee->instpm); 504 err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr), 505 lower_32_bits(ee->faddr)); 506 if (INTEL_GEN(m->i915) >= 6) { 507 err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi); 508 err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg); 509 err_printf(m, " SYNC_0: 0x%08x\n", 510 ee->semaphore_mboxes[0]); 511 err_printf(m, " SYNC_1: 0x%08x\n", 512 ee->semaphore_mboxes[1]); 513 if (HAS_VEBOX(m->i915)) 514 err_printf(m, " SYNC_2: 0x%08x\n", 515 ee->semaphore_mboxes[2]); 516 } 517 if (USES_PPGTT(m->i915)) { 518 err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode); 519 520 if (INTEL_GEN(m->i915) >= 8) { 521 int i; 522 for (i = 0; i < 4; i++) 523 err_printf(m, " PDP%d: 0x%016llx\n", 524 i, ee->vm_info.pdp[i]); 525 } else { 526 err_printf(m, " PP_DIR_BASE: 0x%08x\n", 527 ee->vm_info.pp_dir_base); 528 } 529 } 530 err_printf(m, " seqno: 0x%08x\n", ee->seqno); 531 err_printf(m, " last_seqno: 0x%08x\n", ee->last_seqno); 532 err_printf(m, " waiting: %s\n", yesno(ee->waiting)); 533 err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); 534 err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); 535 err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled)); 536 err_printf(m, " hangcheck action: %s\n", 537 hangcheck_action_to_str(ee->hangcheck_action)); 538 err_printf(m, " hangcheck action timestamp: %dms (%lu%s)\n", 539 jiffies_to_msecs(ee->hangcheck_timestamp - epoch), 540 ee->hangcheck_timestamp, 541 ee->hangcheck_timestamp == epoch ? "; epoch" : ""); 542 err_printf(m, " engine reset count: %u\n", ee->reset_count); 543 544 for (n = 0; n < ee->num_ports; n++) { 545 err_printf(m, " ELSP[%d]:", n); 546 error_print_request(m, " ", &ee->execlist[n], epoch); 547 } 548 549 error_print_context(m, " Active context: ", &ee->context); 550 } 551 552 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) 553 { 554 va_list args; 555 556 va_start(args, f); 557 i915_error_vprintf(e, f, args); 558 va_end(args); 559 } 560 561 static void print_error_obj(struct drm_i915_error_state_buf *m, 562 struct intel_engine_cs *engine, 563 const char *name, 564 struct drm_i915_error_object *obj) 565 { 566 STUB(); 567 #ifdef notyet 568 char out[ASCII85_BUFSZ]; 569 int page; 570 571 if (!obj) 572 return; 573 574 if (name) { 575 err_printf(m, "%s --- %s = 0x%08x %08x\n", 576 engine ? 
engine->name : "global", name, 577 upper_32_bits(obj->gtt_offset), 578 lower_32_bits(obj->gtt_offset)); 579 } 580 581 err_compression_marker(m); 582 for (page = 0; page < obj->page_count; page++) { 583 int i, len; 584 585 len = PAGE_SIZE; 586 if (page == obj->page_count - 1) 587 len -= obj->unused; 588 len = ascii85_encode_len(len); 589 590 for (i = 0; i < len; i++) 591 err_puts(m, ascii85_encode(obj->pages[page][i], out)); 592 } 593 err_puts(m, "\n"); 594 #endif 595 } 596 597 static void err_print_capabilities(struct drm_i915_error_state_buf *m, 598 const struct intel_device_info *info, 599 const struct intel_driver_caps *caps) 600 { 601 struct drm_printer p = i915_error_printer(m); 602 603 intel_device_info_dump_flags(info, &p); 604 intel_driver_caps_print(caps, &p); 605 intel_device_info_dump_topology(&info->sseu, &p); 606 } 607 608 static void err_print_params(struct drm_i915_error_state_buf *m, 609 const struct i915_params *params) 610 { 611 struct drm_printer p = i915_error_printer(m); 612 613 i915_params_dump(params, &p); 614 } 615 616 static void err_print_pciid(struct drm_i915_error_state_buf *m, 617 struct drm_i915_private *i915) 618 { 619 struct pci_dev *pdev = i915->drm.pdev; 620 621 err_printf(m, "PCI ID: 0x%04x\n", pdev->device); 622 err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision); 623 err_printf(m, "PCI Subsystem: %04x:%04x\n", 624 pdev->subsystem_vendor, 625 pdev->subsystem_device); 626 } 627 628 static void err_print_uc(struct drm_i915_error_state_buf *m, 629 const struct i915_error_uc *error_uc) 630 { 631 struct drm_printer p = i915_error_printer(m); 632 const struct i915_gpu_state *error = 633 container_of(error_uc, typeof(*error), uc); 634 635 if (!error->device_info.has_guc) 636 return; 637 638 intel_uc_fw_dump(&error_uc->guc_fw, &p); 639 intel_uc_fw_dump(&error_uc->huc_fw, &p); 640 print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log); 641 } 642 643 int i915_error_state_to_str(struct drm_i915_error_state_buf *m, 644 const struct i915_gpu_state *error) 645 { 646 struct drm_i915_private *dev_priv = m->i915; 647 struct drm_i915_error_object *obj; 648 struct timespec64 ts; 649 int i, j; 650 651 if (!error) { 652 err_printf(m, "No error state collected\n"); 653 return 0; 654 } 655 656 if (*error->error_msg) 657 err_printf(m, "%s\n", error->error_msg); 658 err_printf(m, "Kernel: " UTS_RELEASE "\n"); 659 ts = ktime_to_timespec64(error->time); 660 err_printf(m, "Time: %lld s %ld us\n", 661 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 662 ts = ktime_to_timespec64(error->boottime); 663 err_printf(m, "Boottime: %lld s %ld us\n", 664 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 665 ts = ktime_to_timespec64(error->uptime); 666 err_printf(m, "Uptime: %lld s %ld us\n", 667 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 668 err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); 669 err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", 670 error->capture, 671 jiffies_to_msecs(jiffies - error->capture), 672 jiffies_to_msecs(error->capture - error->epoch)); 673 674 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 675 if (error->engine[i].hangcheck_stalled && 676 error->engine[i].context.pid) { 677 err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n", 678 engine_name(m->i915, i), 679 error->engine[i].context.comm, 680 error->engine[i].context.pid, 681 error->engine[i].context.ban_score, 682 bannable(&error->engine[i].context)); 683 } 684 } 685 err_printf(m, "Reset count: %u\n", error->reset_count); 686 err_printf(m, "Suspend count: 
%u\n", error->suspend_count); 687 err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); 688 err_print_pciid(m, error->i915); 689 690 err_printf(m, "IOMMU enabled?: %d\n", error->iommu); 691 692 if (HAS_CSR(dev_priv)) { 693 struct intel_csr *csr = &dev_priv->csr; 694 695 err_printf(m, "DMC loaded: %s\n", 696 yesno(csr->dmc_payload != NULL)); 697 err_printf(m, "DMC fw version: %d.%d\n", 698 CSR_VERSION_MAJOR(csr->version), 699 CSR_VERSION_MINOR(csr->version)); 700 } 701 702 err_printf(m, "GT awake: %s\n", yesno(error->awake)); 703 err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock)); 704 err_printf(m, "PM suspended: %s\n", yesno(error->suspended)); 705 err_printf(m, "EIR: 0x%08x\n", error->eir); 706 err_printf(m, "IER: 0x%08x\n", error->ier); 707 for (i = 0; i < error->ngtier; i++) 708 err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); 709 err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); 710 err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); 711 err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); 712 err_printf(m, "CCID: 0x%08x\n", error->ccid); 713 err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings); 714 715 for (i = 0; i < error->nfence; i++) 716 err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); 717 718 if (INTEL_GEN(dev_priv) >= 6) { 719 err_printf(m, "ERROR: 0x%08x\n", error->error); 720 721 if (INTEL_GEN(dev_priv) >= 8) 722 err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", 723 error->fault_data1, error->fault_data0); 724 725 err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg); 726 } 727 728 if (IS_GEN7(dev_priv)) 729 err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); 730 731 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 732 if (error->engine[i].engine_id != -1) 733 error_print_engine(m, &error->engine[i], error->epoch); 734 } 735 736 for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) { 737 char buf[128]; 738 int len, first = 1; 739 740 if (!error->active_vm[i]) 741 break; 742 743 len = scnprintf(buf, sizeof(buf), "Active ("); 744 for (j = 0; j < ARRAY_SIZE(error->engine); j++) { 745 if (error->engine[j].vm != error->active_vm[i]) 746 continue; 747 748 len += scnprintf(buf + len, sizeof(buf), "%s%s", 749 first ? 
"" : ", ", 750 dev_priv->engine[j]->name); 751 first = 0; 752 } 753 scnprintf(buf + len, sizeof(buf), ")"); 754 print_error_buffers(m, buf, 755 error->active_bo[i], 756 error->active_bo_count[i]); 757 } 758 759 print_error_buffers(m, "Pinned (global)", 760 error->pinned_bo, 761 error->pinned_bo_count); 762 763 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 764 const struct drm_i915_error_engine *ee = &error->engine[i]; 765 766 obj = ee->batchbuffer; 767 if (obj) { 768 err_puts(m, dev_priv->engine[i]->name); 769 if (ee->context.pid) 770 err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)", 771 ee->context.comm, 772 ee->context.pid, 773 ee->context.handle, 774 ee->context.hw_id, 775 ee->context.ban_score, 776 bannable(&ee->context)); 777 err_printf(m, " --- gtt_offset = 0x%08x %08x\n", 778 upper_32_bits(obj->gtt_offset), 779 lower_32_bits(obj->gtt_offset)); 780 print_error_obj(m, dev_priv->engine[i], NULL, obj); 781 } 782 783 for (j = 0; j < ee->user_bo_count; j++) 784 print_error_obj(m, dev_priv->engine[i], 785 "user", ee->user_bo[j]); 786 787 if (ee->num_requests) { 788 err_printf(m, "%s --- %d requests\n", 789 dev_priv->engine[i]->name, 790 ee->num_requests); 791 for (j = 0; j < ee->num_requests; j++) 792 error_print_request(m, " ", 793 &ee->requests[j], 794 error->epoch); 795 } 796 797 if (IS_ERR(ee->waiters)) { 798 err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n", 799 dev_priv->engine[i]->name); 800 } else if (ee->num_waiters) { 801 err_printf(m, "%s --- %d waiters\n", 802 dev_priv->engine[i]->name, 803 ee->num_waiters); 804 for (j = 0; j < ee->num_waiters; j++) { 805 err_printf(m, " seqno 0x%08x for %s [%d]\n", 806 ee->waiters[j].seqno, 807 ee->waiters[j].comm, 808 ee->waiters[j].pid); 809 } 810 } 811 812 print_error_obj(m, dev_priv->engine[i], 813 "ringbuffer", ee->ringbuffer); 814 815 print_error_obj(m, dev_priv->engine[i], 816 "HW Status", ee->hws_page); 817 818 print_error_obj(m, dev_priv->engine[i], 819 "HW context", ee->ctx); 820 821 print_error_obj(m, dev_priv->engine[i], 822 "WA context", ee->wa_ctx); 823 824 print_error_obj(m, dev_priv->engine[i], 825 "WA batchbuffer", ee->wa_batchbuffer); 826 827 print_error_obj(m, dev_priv->engine[i], 828 "NULL context", ee->default_state); 829 } 830 831 if (error->overlay) 832 intel_overlay_print_error_state(m, error->overlay); 833 834 if (error->display) 835 intel_display_print_error_state(m, error->display); 836 837 err_print_capabilities(m, &error->device_info, &error->driver_caps); 838 err_print_params(m, &error->params); 839 err_print_uc(m, &error->uc); 840 841 if (m->bytes == 0 && m->err) 842 return m->err; 843 844 return 0; 845 } 846 847 int i915_error_state_buf_init(struct drm_i915_error_state_buf *ebuf, 848 struct drm_i915_private *i915, 849 size_t count, loff_t pos) 850 { 851 memset(ebuf, 0, sizeof(*ebuf)); 852 ebuf->i915 = i915; 853 854 /* We need to have enough room to store any i915_error_state printf 855 * so that we can move it to start position. 856 */ 857 ebuf->size = count + 1 > PAGE_SIZE ? 
count + 1 : PAGE_SIZE; 858 ebuf->buf = kmalloc(ebuf->size, 859 GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); 860 861 if (ebuf->buf == NULL) { 862 ebuf->size = PAGE_SIZE; 863 ebuf->buf = kmalloc(ebuf->size, GFP_KERNEL); 864 } 865 866 if (ebuf->buf == NULL) { 867 ebuf->size = 128; 868 ebuf->buf = kmalloc(ebuf->size, GFP_KERNEL); 869 } 870 871 if (ebuf->buf == NULL) 872 return -ENOMEM; 873 874 ebuf->start = pos; 875 876 return 0; 877 } 878 879 static void i915_error_object_free(struct drm_i915_error_object *obj) 880 { 881 int page; 882 883 if (obj == NULL) 884 return; 885 886 for (page = 0; page < obj->page_count; page++) 887 free_page((unsigned long)obj->pages[page]); 888 889 kfree(obj); 890 } 891 892 static __always_inline void free_param(const char *type, void *x) 893 { 894 if (!__builtin_strcmp(type, "char *")) 895 kfree(*(void **)x); 896 } 897 898 static void cleanup_params(struct i915_gpu_state *error) 899 { 900 #define FREE(T, x, ...) free_param(#T, &error->params.x); 901 I915_PARAMS_FOR_EACH(FREE); 902 #undef FREE 903 } 904 905 static void cleanup_uc_state(struct i915_gpu_state *error) 906 { 907 struct i915_error_uc *error_uc = &error->uc; 908 909 kfree(error_uc->guc_fw.path); 910 kfree(error_uc->huc_fw.path); 911 i915_error_object_free(error_uc->guc_log); 912 } 913 914 void __i915_gpu_state_free(struct kref *error_ref) 915 { 916 struct i915_gpu_state *error = 917 container_of(error_ref, typeof(*error), ref); 918 long i, j; 919 920 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 921 struct drm_i915_error_engine *ee = &error->engine[i]; 922 923 for (j = 0; j < ee->user_bo_count; j++) 924 i915_error_object_free(ee->user_bo[j]); 925 kfree(ee->user_bo); 926 927 i915_error_object_free(ee->batchbuffer); 928 i915_error_object_free(ee->wa_batchbuffer); 929 i915_error_object_free(ee->ringbuffer); 930 i915_error_object_free(ee->hws_page); 931 i915_error_object_free(ee->ctx); 932 i915_error_object_free(ee->wa_ctx); 933 934 kfree(ee->requests); 935 if (!IS_ERR_OR_NULL(ee->waiters)) 936 kfree(ee->waiters); 937 } 938 939 for (i = 0; i < ARRAY_SIZE(error->active_bo); i++) 940 kfree(error->active_bo[i]); 941 kfree(error->pinned_bo); 942 943 kfree(error->overlay); 944 kfree(error->display); 945 946 cleanup_params(error); 947 cleanup_uc_state(error); 948 949 kfree(error); 950 } 951 952 static struct drm_i915_error_object * 953 i915_error_object_create(struct drm_i915_private *i915, 954 struct i915_vma *vma) 955 { 956 struct i915_ggtt *ggtt = &i915->ggtt; 957 const u64 slot = ggtt->error_capture.start; 958 struct drm_i915_error_object *dst; 959 struct compress compress; 960 unsigned long num_pages; 961 struct sgt_iter iter; 962 dma_addr_t dma; 963 int ret; 964 bus_space_handle_t bsh; 965 966 if (!vma) 967 return NULL; 968 969 num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT; 970 num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */ 971 dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), 972 GFP_ATOMIC | __GFP_NOWARN); 973 if (!dst) 974 return NULL; 975 976 dst->gtt_offset = vma->node.start; 977 dst->gtt_size = vma->node.size; 978 dst->num_pages = num_pages; 979 dst->page_count = 0; 980 dst->unused = 0; 981 982 if (!compress_init(&compress)) { 983 kfree(dst); 984 return NULL; 985 } 986 987 ret = -EINVAL; 988 for_each_sgt_dma(dma, iter, vma->pages) { 989 void __iomem *s; 990 991 ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); 992 993 #ifdef __linux__ 994 s = io_mapping_map_atomic_wc(&ggtt->iomap, slot); 995 #else 996 agp_map_atomic(i915->agph, 
slot, &bsh); 997 s = bus_space_vaddr(i915->bst, bsh); 998 #endif 999 ret = compress_page(&compress, (void __force *)s, dst); 1000 #ifdef __linux__ 1001 io_mapping_unmap_atomic(s); 1002 #else 1003 agp_unmap_atomic(i915->agph, bsh); 1004 #endif 1005 if (ret) 1006 break; 1007 } 1008 1009 if (ret || compress_flush(&compress, dst)) { 1010 while (dst->page_count--) 1011 free_page((unsigned long)dst->pages[dst->page_count]); 1012 kfree(dst); 1013 dst = NULL; 1014 } 1015 1016 compress_fini(&compress, dst); 1017 ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); 1018 return dst; 1019 } 1020 1021 /* The error capture is special as it tries to run underneath the normal 1022 * locking rules - so we use the raw version of the i915_gem_active lookup. 1023 */ 1024 static inline uint32_t 1025 __active_get_seqno(struct i915_gem_active *active) 1026 { 1027 struct i915_request *request; 1028 1029 request = __i915_gem_active_peek(active); 1030 return request ? request->global_seqno : 0; 1031 } 1032 1033 static inline int 1034 __active_get_engine_id(struct i915_gem_active *active) 1035 { 1036 struct i915_request *request; 1037 1038 request = __i915_gem_active_peek(active); 1039 return request ? request->engine->id : -1; 1040 } 1041 1042 static void capture_bo(struct drm_i915_error_buffer *err, 1043 struct i915_vma *vma) 1044 { 1045 struct drm_i915_gem_object *obj = vma->obj; 1046 1047 err->size = obj->base.size; 1048 err->name = obj->base.name; 1049 1050 err->wseqno = __active_get_seqno(&obj->frontbuffer_write); 1051 err->engine = __active_get_engine_id(&obj->frontbuffer_write); 1052 1053 err->gtt_offset = vma->node.start; 1054 err->read_domains = obj->read_domains; 1055 err->write_domain = obj->write_domain; 1056 err->fence_reg = vma->fence ? vma->fence->id : -1; 1057 err->tiling = i915_gem_object_get_tiling(obj); 1058 err->dirty = obj->mm.dirty; 1059 err->purgeable = obj->mm.madv != I915_MADV_WILLNEED; 1060 err->userptr = obj->userptr.mm != NULL; 1061 err->cache_level = obj->cache_level; 1062 } 1063 1064 static u32 capture_error_bo(struct drm_i915_error_buffer *err, 1065 int count, struct list_head *head, 1066 bool pinned_only) 1067 { 1068 struct i915_vma *vma; 1069 int i = 0; 1070 1071 list_for_each_entry(vma, head, vm_link) { 1072 if (!vma->obj) 1073 continue; 1074 1075 if (pinned_only && !i915_vma_is_pinned(vma)) 1076 continue; 1077 1078 capture_bo(err++, vma); 1079 if (++i == count) 1080 break; 1081 } 1082 1083 return i; 1084 } 1085 1086 /* Generate a semi-unique error code. The code is not meant to have meaning; its 1087 * only purpose is to try to prevent false duplicated bug reports by 1088 * grossly estimating a GPU error state. 1089 * 1090 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine 1091 * the hang if we could strip the GTT offset information from it. 1092 * 1093 * It's only a small step better than a random number in its current form. 1094 */ 1095 static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv, 1096 struct i915_gpu_state *error, 1097 int *engine_id) 1098 { 1099 uint32_t error_code = 0; 1100 int i; 1101 1102 /* IPEHR would be an ideal way to detect errors, as it's the gross 1103 * measure of "the command that hung." However, it has some very common 1104 * synchronization commands which almost always appear in cases that are 1105 * strictly a client bug. Use instdone to differentiate those somewhat. 
1106 */ 1107 for (i = 0; i < I915_NUM_ENGINES; i++) { 1108 if (error->engine[i].hangcheck_stalled) { 1109 if (engine_id) 1110 *engine_id = i; 1111 1112 return error->engine[i].ipehr ^ 1113 error->engine[i].instdone.instdone; 1114 } 1115 } 1116 1117 return error_code; 1118 } 1119 1120 static void gem_record_fences(struct i915_gpu_state *error) 1121 { 1122 struct drm_i915_private *dev_priv = error->i915; 1123 int i; 1124 1125 if (INTEL_GEN(dev_priv) >= 6) { 1126 for (i = 0; i < dev_priv->num_fence_regs; i++) 1127 error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i)); 1128 } else if (INTEL_GEN(dev_priv) >= 4) { 1129 for (i = 0; i < dev_priv->num_fence_regs; i++) 1130 error->fence[i] = I915_READ64(FENCE_REG_965_LO(i)); 1131 } else { 1132 for (i = 0; i < dev_priv->num_fence_regs; i++) 1133 error->fence[i] = I915_READ(FENCE_REG(i)); 1134 } 1135 error->nfence = i; 1136 } 1137 1138 static void gen6_record_semaphore_state(struct intel_engine_cs *engine, 1139 struct drm_i915_error_engine *ee) 1140 { 1141 struct drm_i915_private *dev_priv = engine->i915; 1142 1143 ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base)); 1144 ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base)); 1145 if (HAS_VEBOX(dev_priv)) 1146 ee->semaphore_mboxes[2] = 1147 I915_READ(RING_SYNC_2(engine->mmio_base)); 1148 } 1149 1150 static void error_record_engine_waiters(struct intel_engine_cs *engine, 1151 struct drm_i915_error_engine *ee) 1152 { 1153 struct intel_breadcrumbs *b = &engine->breadcrumbs; 1154 struct drm_i915_error_waiter *waiter; 1155 struct rb_node *rb; 1156 int count; 1157 1158 ee->num_waiters = 0; 1159 ee->waiters = NULL; 1160 1161 if (RB_EMPTY_ROOT(&b->waiters)) 1162 return; 1163 1164 if (!spin_trylock_irq(&b->rb_lock)) { 1165 ee->waiters = ERR_PTR(-EDEADLK); 1166 return; 1167 } 1168 1169 count = 0; 1170 for (rb = rb_first(&b->waiters); rb != NULL; rb = rb_next(rb)) 1171 count++; 1172 spin_unlock_irq(&b->rb_lock); 1173 1174 waiter = NULL; 1175 if (count) 1176 waiter = kmalloc_array(count, 1177 sizeof(struct drm_i915_error_waiter), 1178 GFP_ATOMIC); 1179 if (!waiter) 1180 return; 1181 1182 if (!spin_trylock_irq(&b->rb_lock)) { 1183 kfree(waiter); 1184 ee->waiters = ERR_PTR(-EDEADLK); 1185 return; 1186 } 1187 1188 ee->waiters = waiter; 1189 for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) { 1190 struct intel_wait *w = rb_entry(rb, typeof(*w), node); 1191 1192 #ifdef __linux__ 1193 strcpy(waiter->comm, w->tsk->comm); 1194 waiter->pid = w->tsk->pid; 1195 #else 1196 strlcpy(waiter->comm, w->tsk->p_p->ps_comm, 1197 sizeof(waiter->comm)); 1198 waiter->pid = w->tsk->p_p->ps_pid; 1199 #endif 1200 waiter->seqno = w->seqno; 1201 waiter++; 1202 1203 if (++ee->num_waiters == count) 1204 break; 1205 } 1206 spin_unlock_irq(&b->rb_lock); 1207 } 1208 1209 static void error_record_engine_registers(struct i915_gpu_state *error, 1210 struct intel_engine_cs *engine, 1211 struct drm_i915_error_engine *ee) 1212 { 1213 struct drm_i915_private *dev_priv = engine->i915; 1214 1215 if (INTEL_GEN(dev_priv) >= 6) { 1216 ee->rc_psmi = I915_READ(RING_PSMI_CTL(engine->mmio_base)); 1217 if (INTEL_GEN(dev_priv) >= 8) { 1218 ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG); 1219 } else { 1220 gen6_record_semaphore_state(engine, ee); 1221 ee->fault_reg = I915_READ(RING_FAULT_REG(engine)); 1222 } 1223 } 1224 1225 if (INTEL_GEN(dev_priv) >= 4) { 1226 ee->faddr = I915_READ(RING_DMA_FADD(engine->mmio_base)); 1227 ee->ipeir = I915_READ(RING_IPEIR(engine->mmio_base)); 1228 ee->ipehr = 
I915_READ(RING_IPEHR(engine->mmio_base)); 1229 ee->instps = I915_READ(RING_INSTPS(engine->mmio_base)); 1230 ee->bbaddr = I915_READ(RING_BBADDR(engine->mmio_base)); 1231 if (INTEL_GEN(dev_priv) >= 8) { 1232 ee->faddr |= (u64) I915_READ(RING_DMA_FADD_UDW(engine->mmio_base)) << 32; 1233 ee->bbaddr |= (u64) I915_READ(RING_BBADDR_UDW(engine->mmio_base)) << 32; 1234 } 1235 ee->bbstate = I915_READ(RING_BBSTATE(engine->mmio_base)); 1236 } else { 1237 ee->faddr = I915_READ(DMA_FADD_I8XX); 1238 ee->ipeir = I915_READ(IPEIR); 1239 ee->ipehr = I915_READ(IPEHR); 1240 } 1241 1242 intel_engine_get_instdone(engine, &ee->instdone); 1243 1244 ee->waiting = intel_engine_has_waiter(engine); 1245 ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base)); 1246 ee->acthd = intel_engine_get_active_head(engine); 1247 ee->seqno = intel_engine_get_seqno(engine); 1248 ee->last_seqno = intel_engine_last_submit(engine); 1249 ee->start = I915_READ_START(engine); 1250 ee->head = I915_READ_HEAD(engine); 1251 ee->tail = I915_READ_TAIL(engine); 1252 ee->ctl = I915_READ_CTL(engine); 1253 if (INTEL_GEN(dev_priv) > 2) 1254 ee->mode = I915_READ_MODE(engine); 1255 1256 if (!HWS_NEEDS_PHYSICAL(dev_priv)) { 1257 i915_reg_t mmio; 1258 1259 if (IS_GEN7(dev_priv)) { 1260 switch (engine->id) { 1261 default: 1262 case RCS: 1263 mmio = RENDER_HWS_PGA_GEN7; 1264 break; 1265 case BCS: 1266 mmio = BLT_HWS_PGA_GEN7; 1267 break; 1268 case VCS: 1269 mmio = BSD_HWS_PGA_GEN7; 1270 break; 1271 case VECS: 1272 mmio = VEBOX_HWS_PGA_GEN7; 1273 break; 1274 } 1275 } else if (IS_GEN6(engine->i915)) { 1276 mmio = RING_HWS_PGA_GEN6(engine->mmio_base); 1277 } else { 1278 /* XXX: gen8 returns to sanity */ 1279 mmio = RING_HWS_PGA(engine->mmio_base); 1280 } 1281 1282 ee->hws = I915_READ(mmio); 1283 } 1284 1285 ee->idle = intel_engine_is_idle(engine); 1286 ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; 1287 ee->hangcheck_action = engine->hangcheck.action; 1288 ee->hangcheck_stalled = engine->hangcheck.stalled; 1289 ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, 1290 engine); 1291 1292 if (USES_PPGTT(dev_priv)) { 1293 int i; 1294 1295 ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine)); 1296 1297 if (IS_GEN6(dev_priv)) 1298 ee->vm_info.pp_dir_base = 1299 I915_READ(RING_PP_DIR_BASE_READ(engine)); 1300 else if (IS_GEN7(dev_priv)) 1301 ee->vm_info.pp_dir_base = 1302 I915_READ(RING_PP_DIR_BASE(engine)); 1303 else if (INTEL_GEN(dev_priv) >= 8) 1304 for (i = 0; i < 4; i++) { 1305 ee->vm_info.pdp[i] = 1306 I915_READ(GEN8_RING_PDP_UDW(engine, i)); 1307 ee->vm_info.pdp[i] <<= 32; 1308 ee->vm_info.pdp[i] |= 1309 I915_READ(GEN8_RING_PDP_LDW(engine, i)); 1310 } 1311 } 1312 } 1313 1314 static void record_request(struct i915_request *request, 1315 struct drm_i915_error_request *erq) 1316 { 1317 struct i915_gem_context *ctx = request->gem_context; 1318 1319 erq->context = ctx->hw_id; 1320 erq->sched_attr = request->sched.attr; 1321 erq->ban_score = atomic_read(&ctx->ban_score); 1322 erq->seqno = request->global_seqno; 1323 erq->jiffies = request->emitted_jiffies; 1324 erq->start = i915_ggtt_offset(request->ring->vma); 1325 erq->head = request->head; 1326 erq->tail = request->tail; 1327 1328 rcu_read_lock(); 1329 #ifdef __linux__ 1330 erq->pid = ctx->pid ? 
pid_nr(ctx->pid) : 0; 1331 #else 1332 erq->pid = ctx->pid; 1333 #endif 1334 rcu_read_unlock(); 1335 } 1336 1337 static void engine_record_requests(struct intel_engine_cs *engine, 1338 struct i915_request *first, 1339 struct drm_i915_error_engine *ee) 1340 { 1341 struct i915_request *request; 1342 int count; 1343 1344 count = 0; 1345 request = first; 1346 list_for_each_entry_from(request, &engine->timeline.requests, link) 1347 count++; 1348 if (!count) 1349 return; 1350 1351 ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC); 1352 if (!ee->requests) 1353 return; 1354 1355 ee->num_requests = count; 1356 1357 count = 0; 1358 request = first; 1359 list_for_each_entry_from(request, &engine->timeline.requests, link) { 1360 if (count >= ee->num_requests) { 1361 /* 1362 * If the ring request list was changed in 1363 * between the point where the error request 1364 * list was created and dimensioned and this 1365 * point then just exit early to avoid crashes. 1366 * 1367 * We don't need to communicate that the 1368 * request list changed state during error 1369 * state capture and that the error state is 1370 * slightly incorrect as a consequence since we 1371 * are typically only interested in the request 1372 * list state at the point of error state 1373 * capture, not in any changes happening during 1374 * the capture. 1375 */ 1376 break; 1377 } 1378 1379 record_request(request, &ee->requests[count++]); 1380 } 1381 ee->num_requests = count; 1382 } 1383 1384 static void error_record_engine_execlists(struct intel_engine_cs *engine, 1385 struct drm_i915_error_engine *ee) 1386 { 1387 const struct intel_engine_execlists * const execlists = &engine->execlists; 1388 unsigned int n; 1389 1390 for (n = 0; n < execlists_num_ports(execlists); n++) { 1391 struct i915_request *rq = port_request(&execlists->port[n]); 1392 1393 if (!rq) 1394 break; 1395 1396 record_request(rq, &ee->execlist[n]); 1397 } 1398 1399 ee->num_ports = n; 1400 } 1401 1402 static void record_context(struct drm_i915_error_context *e, 1403 struct i915_gem_context *ctx) 1404 { 1405 #ifdef __linux__ 1406 if (ctx->pid) { 1407 struct task_struct *task; 1408 1409 rcu_read_lock(); 1410 task = pid_task(ctx->pid, PIDTYPE_PID); 1411 if (task) { 1412 strcpy(e->comm, task->comm); 1413 e->pid = task->pid; 1414 } 1415 rcu_read_unlock(); 1416 } 1417 #endif 1418 1419 e->handle = ctx->user_handle; 1420 e->hw_id = ctx->hw_id; 1421 e->sched_attr = ctx->sched; 1422 e->ban_score = atomic_read(&ctx->ban_score); 1423 e->bannable = i915_gem_context_is_bannable(ctx); 1424 e->guilty = atomic_read(&ctx->guilty_count); 1425 e->active = atomic_read(&ctx->active_count); 1426 } 1427 1428 static void request_record_user_bo(struct i915_request *request, 1429 struct drm_i915_error_engine *ee) 1430 { 1431 struct i915_capture_list *c; 1432 struct drm_i915_error_object **bo; 1433 long count; 1434 1435 count = 0; 1436 for (c = request->capture_list; c; c = c->next) 1437 count++; 1438 1439 bo = NULL; 1440 if (count) 1441 bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC); 1442 if (!bo) 1443 return; 1444 1445 count = 0; 1446 for (c = request->capture_list; c; c = c->next) { 1447 bo[count] = i915_error_object_create(request->i915, c->vma); 1448 if (!bo[count]) 1449 break; 1450 count++; 1451 } 1452 1453 ee->user_bo = bo; 1454 ee->user_bo_count = count; 1455 } 1456 1457 static struct drm_i915_error_object * 1458 capture_object(struct drm_i915_private *dev_priv, 1459 struct drm_i915_gem_object *obj) 1460 { 1461 if (obj && i915_gem_object_has_pages(obj)) { 1462 struct 
i915_vma fake = { 1463 .node = { .start = U64_MAX, .size = obj->base.size }, 1464 .size = obj->base.size, 1465 .pages = obj->mm.pages, 1466 .obj = obj, 1467 }; 1468 1469 return i915_error_object_create(dev_priv, &fake); 1470 } else { 1471 return NULL; 1472 } 1473 } 1474 1475 static void gem_record_rings(struct i915_gpu_state *error) 1476 { 1477 struct drm_i915_private *i915 = error->i915; 1478 struct i915_ggtt *ggtt = &i915->ggtt; 1479 int i; 1480 1481 for (i = 0; i < I915_NUM_ENGINES; i++) { 1482 struct intel_engine_cs *engine = i915->engine[i]; 1483 struct drm_i915_error_engine *ee = &error->engine[i]; 1484 struct i915_request *request; 1485 1486 ee->engine_id = -1; 1487 1488 if (!engine) 1489 continue; 1490 1491 ee->engine_id = i; 1492 1493 error_record_engine_registers(error, engine, ee); 1494 error_record_engine_waiters(engine, ee); 1495 error_record_engine_execlists(engine, ee); 1496 1497 request = i915_gem_find_active_request(engine); 1498 if (request) { 1499 struct i915_gem_context *ctx = request->gem_context; 1500 struct intel_ring *ring; 1501 1502 ee->vm = ctx->ppgtt ? &ctx->ppgtt->vm : &ggtt->vm; 1503 1504 record_context(&ee->context, ctx); 1505 1506 /* We need to copy these to an anonymous buffer 1507 * as the simplest method to avoid being overwritten 1508 * by userspace. 1509 */ 1510 ee->batchbuffer = 1511 i915_error_object_create(i915, request->batch); 1512 1513 if (HAS_BROKEN_CS_TLB(i915)) 1514 ee->wa_batchbuffer = 1515 i915_error_object_create(i915, 1516 engine->scratch); 1517 request_record_user_bo(request, ee); 1518 1519 ee->ctx = 1520 i915_error_object_create(i915, 1521 request->hw_context->state); 1522 1523 error->simulated |= 1524 i915_gem_context_no_error_capture(ctx); 1525 1526 ee->rq_head = request->head; 1527 ee->rq_post = request->postfix; 1528 ee->rq_tail = request->tail; 1529 1530 ring = request->ring; 1531 ee->cpu_ring_head = ring->head; 1532 ee->cpu_ring_tail = ring->tail; 1533 ee->ringbuffer = 1534 i915_error_object_create(i915, ring->vma); 1535 1536 engine_record_requests(engine, request, ee); 1537 } 1538 1539 ee->hws_page = 1540 i915_error_object_create(i915, 1541 engine->status_page.vma); 1542 1543 ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma); 1544 1545 ee->default_state = capture_object(i915, engine->default_state); 1546 } 1547 } 1548 1549 static void gem_capture_vm(struct i915_gpu_state *error, 1550 struct i915_address_space *vm, 1551 int idx) 1552 { 1553 struct drm_i915_error_buffer *active_bo; 1554 struct i915_vma *vma; 1555 int count; 1556 1557 count = 0; 1558 list_for_each_entry(vma, &vm->active_list, vm_link) 1559 count++; 1560 1561 active_bo = NULL; 1562 if (count) 1563 active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC); 1564 if (active_bo) 1565 count = capture_error_bo(active_bo, count, &vm->active_list, false); 1566 else 1567 count = 0; 1568 1569 error->active_vm[idx] = vm; 1570 error->active_bo[idx] = active_bo; 1571 error->active_bo_count[idx] = count; 1572 } 1573 1574 static void capture_active_buffers(struct i915_gpu_state *error) 1575 { 1576 int cnt = 0, i, j; 1577 1578 BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo)); 1579 BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm)); 1580 BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count)); 1581 1582 /* Scan each engine looking for unique active contexts/vm */ 1583 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 1584 struct drm_i915_error_engine *ee = &error->engine[i]; 1585 bool found; 1586 1587 
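/* An engine with no captured VM had no active request at capture time; skip it. */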
if (!ee->vm) 1588 continue; 1589 1590 found = false; 1591 for (j = 0; j < i && !found; j++) 1592 found = error->engine[j].vm == ee->vm; 1593 if (!found) 1594 gem_capture_vm(error, ee->vm, cnt++); 1595 } 1596 } 1597 1598 static void capture_pinned_buffers(struct i915_gpu_state *error) 1599 { 1600 struct i915_address_space *vm = &error->i915->ggtt.vm; 1601 struct drm_i915_error_buffer *bo; 1602 struct i915_vma *vma; 1603 int count_inactive, count_active; 1604 1605 count_inactive = 0; 1606 list_for_each_entry(vma, &vm->inactive_list, vm_link) 1607 count_inactive++; 1608 1609 count_active = 0; 1610 list_for_each_entry(vma, &vm->active_list, vm_link) 1611 count_active++; 1612 1613 bo = NULL; 1614 if (count_inactive + count_active) 1615 bo = kcalloc(count_inactive + count_active, 1616 sizeof(*bo), GFP_ATOMIC); 1617 if (!bo) 1618 return; 1619 1620 count_inactive = capture_error_bo(bo, count_inactive, 1621 &vm->active_list, true); 1622 count_active = capture_error_bo(bo + count_inactive, count_active, 1623 &vm->inactive_list, true); 1624 error->pinned_bo_count = count_inactive + count_active; 1625 error->pinned_bo = bo; 1626 } 1627 1628 static void capture_uc_state(struct i915_gpu_state *error) 1629 { 1630 struct drm_i915_private *i915 = error->i915; 1631 struct i915_error_uc *error_uc = &error->uc; 1632 1633 /* Capturing uC state won't be useful if there is no GuC */ 1634 if (!error->device_info.has_guc) 1635 return; 1636 1637 error_uc->guc_fw = i915->guc.fw; 1638 error_uc->huc_fw = i915->huc.fw; 1639 1640 /* Non-default firmware paths will be specified by the modparam. 1641 * As modparams are generally accessible from userspace, make 1642 * explicit copies of the firmware paths. 1643 */ 1644 error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC); 1645 error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC); 1646 error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma); 1647 } 1648 1649 /* Capture all registers which don't fit into another category. */ 1650 static void capture_reg_state(struct i915_gpu_state *error) 1651 { 1652 struct drm_i915_private *dev_priv = error->i915; 1653 int i; 1654 1655 /* General organization 1656 * 1. Registers specific to a single generation 1657 * 2. Registers which belong to multiple generations 1658 * 3. Feature specific registers. 1659 * 4. Everything else 1660 * Please try to follow the order. 
1661 */ 1662 1663 /* 1: Registers specific to a single generation */ 1664 if (IS_VALLEYVIEW(dev_priv)) { 1665 error->gtier[0] = I915_READ(GTIER); 1666 error->ier = I915_READ(VLV_IER); 1667 error->forcewake = I915_READ_FW(FORCEWAKE_VLV); 1668 } 1669 1670 if (IS_GEN7(dev_priv)) 1671 error->err_int = I915_READ(GEN7_ERR_INT); 1672 1673 if (INTEL_GEN(dev_priv) >= 8) { 1674 error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0); 1675 error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1); 1676 } 1677 1678 if (IS_GEN6(dev_priv)) { 1679 error->forcewake = I915_READ_FW(FORCEWAKE); 1680 error->gab_ctl = I915_READ(GAB_CTL); 1681 error->gfx_mode = I915_READ(GFX_MODE); 1682 } 1683 1684 /* 2: Registers which belong to multiple generations */ 1685 if (INTEL_GEN(dev_priv) >= 7) 1686 error->forcewake = I915_READ_FW(FORCEWAKE_MT); 1687 1688 if (INTEL_GEN(dev_priv) >= 6) { 1689 error->derrmr = I915_READ(DERRMR); 1690 error->error = I915_READ(ERROR_GEN6); 1691 error->done_reg = I915_READ(DONE_REG); 1692 } 1693 1694 if (INTEL_GEN(dev_priv) >= 5) 1695 error->ccid = I915_READ(CCID); 1696 1697 /* 3: Feature specific registers */ 1698 if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) { 1699 error->gam_ecochk = I915_READ(GAM_ECOCHK); 1700 error->gac_eco = I915_READ(GAC_ECO_BITS); 1701 } 1702 1703 /* 4: Everything else */ 1704 if (INTEL_GEN(dev_priv) >= 11) { 1705 error->ier = I915_READ(GEN8_DE_MISC_IER); 1706 error->gtier[0] = I915_READ(GEN11_RENDER_COPY_INTR_ENABLE); 1707 error->gtier[1] = I915_READ(GEN11_VCS_VECS_INTR_ENABLE); 1708 error->gtier[2] = I915_READ(GEN11_GUC_SG_INTR_ENABLE); 1709 error->gtier[3] = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE); 1710 error->gtier[4] = I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE); 1711 error->gtier[5] = I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE); 1712 error->ngtier = 6; 1713 } else if (INTEL_GEN(dev_priv) >= 8) { 1714 error->ier = I915_READ(GEN8_DE_MISC_IER); 1715 for (i = 0; i < 4; i++) 1716 error->gtier[i] = I915_READ(GEN8_GT_IER(i)); 1717 error->ngtier = 4; 1718 } else if (HAS_PCH_SPLIT(dev_priv)) { 1719 error->ier = I915_READ(DEIER); 1720 error->gtier[0] = I915_READ(GTIER); 1721 error->ngtier = 1; 1722 } else if (IS_GEN2(dev_priv)) { 1723 error->ier = I915_READ16(IER); 1724 } else if (!IS_VALLEYVIEW(dev_priv)) { 1725 error->ier = I915_READ(IER); 1726 } 1727 error->eir = I915_READ(EIR); 1728 error->pgtbl_er = I915_READ(PGTBL_ER); 1729 } 1730 1731 static void i915_error_capture_msg(struct drm_i915_private *dev_priv, 1732 struct i915_gpu_state *error, 1733 u32 engine_mask, 1734 const char *error_msg) 1735 { 1736 u32 ecode; 1737 int engine_id = -1, len; 1738 1739 ecode = i915_error_generate_code(dev_priv, error, &engine_id); 1740 1741 len = scnprintf(error->error_msg, sizeof(error->error_msg), 1742 "GPU HANG: ecode %d:%d:0x%08x", 1743 INTEL_GEN(dev_priv), engine_id, ecode); 1744 1745 if (engine_id != -1 && error->engine[engine_id].context.pid) 1746 len += scnprintf(error->error_msg + len, 1747 sizeof(error->error_msg) - len, 1748 ", in %s [%d]", 1749 error->engine[engine_id].context.comm, 1750 error->engine[engine_id].context.pid); 1751 1752 scnprintf(error->error_msg + len, sizeof(error->error_msg) - len, 1753 ", reason: %s, action: %s", 1754 error_msg, 1755 engine_mask ? 
"reset" : "continue"); 1756 } 1757 1758 static void capture_gen_state(struct i915_gpu_state *error) 1759 { 1760 struct drm_i915_private *i915 = error->i915; 1761 1762 error->awake = i915->gt.awake; 1763 error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count); 1764 error->suspended = i915->runtime_pm.suspended; 1765 1766 error->iommu = -1; 1767 #ifdef CONFIG_INTEL_IOMMU 1768 error->iommu = intel_iommu_gfx_mapped; 1769 #endif 1770 error->reset_count = i915_reset_count(&i915->gpu_error); 1771 error->suspend_count = i915->suspend_count; 1772 1773 memcpy(&error->device_info, 1774 INTEL_INFO(i915), 1775 sizeof(error->device_info)); 1776 error->driver_caps = i915->caps; 1777 } 1778 1779 static __always_inline void dup_param(const char *type, void *x) 1780 { 1781 if (!__builtin_strcmp(type, "char *")) 1782 *(void **)x = kstrdup(*(void **)x, GFP_ATOMIC); 1783 } 1784 1785 static void capture_params(struct i915_gpu_state *error) 1786 { 1787 error->params = i915_modparams; 1788 #define DUP(T, x, ...) dup_param(#T, &error->params.x); 1789 I915_PARAMS_FOR_EACH(DUP); 1790 #undef DUP 1791 } 1792 1793 static unsigned long capture_find_epoch(const struct i915_gpu_state *error) 1794 { 1795 unsigned long epoch = error->capture; 1796 int i; 1797 1798 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 1799 const struct drm_i915_error_engine *ee = &error->engine[i]; 1800 1801 if (ee->hangcheck_stalled && 1802 time_before(ee->hangcheck_timestamp, epoch)) 1803 epoch = ee->hangcheck_timestamp; 1804 } 1805 1806 return epoch; 1807 } 1808 1809 static int capture(void *data) 1810 { 1811 struct i915_gpu_state *error = data; 1812 1813 error->time = ktime_get_real(); 1814 error->boottime = ktime_get_boottime(); 1815 error->uptime = ktime_sub(ktime_get(), 1816 error->i915->gt.last_init_time); 1817 error->capture = jiffies; 1818 1819 #ifdef notyet 1820 capture_params(error); 1821 #endif 1822 capture_gen_state(error); 1823 capture_uc_state(error); 1824 capture_reg_state(error); 1825 gem_record_fences(error); 1826 gem_record_rings(error); 1827 capture_active_buffers(error); 1828 capture_pinned_buffers(error); 1829 1830 error->overlay = intel_overlay_capture_error_state(error->i915); 1831 error->display = intel_display_capture_error_state(error->i915); 1832 1833 error->epoch = capture_find_epoch(error); 1834 1835 return 0; 1836 } 1837 1838 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) 1839 1840 struct i915_gpu_state * 1841 i915_capture_gpu_state(struct drm_i915_private *i915) 1842 { 1843 struct i915_gpu_state *error; 1844 1845 error = kzalloc(sizeof(*error), GFP_ATOMIC); 1846 if (!error) 1847 return NULL; 1848 1849 kref_init(&error->ref); 1850 error->i915 = i915; 1851 1852 stop_machine(capture, error, NULL); 1853 1854 return error; 1855 } 1856 1857 /** 1858 * i915_capture_error_state - capture an error record for later analysis 1859 * @i915: i915 device 1860 * @engine_mask: the mask of engines triggering the hang 1861 * @error_msg: a message to insert into the error capture header 1862 * 1863 * Should be called when an error is detected (either a hang or an error 1864 * interrupt) to capture error state from the time of the error. Fills 1865 * out a structure which becomes available in debugfs for user level tools 1866 * to pick up. 
1867 */ 1868 void i915_capture_error_state(struct drm_i915_private *i915, 1869 u32 engine_mask, 1870 const char *error_msg) 1871 { 1872 static bool warned; 1873 struct i915_gpu_state *error; 1874 unsigned long flags; 1875 1876 if (!i915_modparams.error_capture) 1877 return; 1878 1879 if (READ_ONCE(i915->gpu_error.first_error)) 1880 return; 1881 1882 error = i915_capture_gpu_state(i915); 1883 if (!error) { 1884 DRM_DEBUG_DRIVER("out of memory, not capturing error state\n"); 1885 return; 1886 } 1887 1888 i915_error_capture_msg(i915, error, engine_mask, error_msg); 1889 DRM_INFO("%s\n", error->error_msg); 1890 1891 if (!error->simulated) { 1892 spin_lock_irqsave(&i915->gpu_error.lock, flags); 1893 if (!i915->gpu_error.first_error) { 1894 i915->gpu_error.first_error = error; 1895 error = NULL; 1896 } 1897 spin_unlock_irqrestore(&i915->gpu_error.lock, flags); 1898 } 1899 1900 if (error) { 1901 __i915_gpu_state_free(&error->ref); 1902 return; 1903 } 1904 1905 if (!warned && 1906 ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { 1907 #ifdef __linux__ 1908 DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); 1909 DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); 1910 DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); 1911 DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n"); 1912 DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", 1913 i915->drm.primary->index); 1914 #endif 1915 warned = true; 1916 } 1917 } 1918 1919 struct i915_gpu_state * 1920 i915_first_error_state(struct drm_i915_private *i915) 1921 { 1922 struct i915_gpu_state *error; 1923 1924 spin_lock_irq(&i915->gpu_error.lock); 1925 error = i915->gpu_error.first_error; 1926 if (error) 1927 i915_gpu_state_get(error); 1928 spin_unlock_irq(&i915->gpu_error.lock); 1929 1930 return error; 1931 } 1932 1933 void i915_reset_error_state(struct drm_i915_private *i915) 1934 { 1935 struct i915_gpu_state *error; 1936 1937 spin_lock_irq(&i915->gpu_error.lock); 1938 error = i915->gpu_error.first_error; 1939 i915->gpu_error.first_error = NULL; 1940 spin_unlock_irq(&i915->gpu_error.lock); 1941 1942 i915_gpu_state_put(error); 1943 } 1944
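/*
 * Illustrative sketch (not part of this file): how the windowed error-state
 * buffer above is typically consumed. The read handler shape, the @pos/@count
 * arguments and the copy-out step below are assumptions for illustration;
 * the real callers live in the debugfs/sysfs code. The point is why
 * i915_error_vprintf() seeks past output that lands before e->start: only the
 * window of at most @count bytes beginning at offset @pos is rendered per
 * call.
 *
 *	struct drm_i915_error_state_buf buf;
 *	struct i915_gpu_state *error;
 *	int ret;
 *
 *	error = i915_first_error_state(i915);
 *	ret = i915_error_state_buf_init(&buf, i915, count, pos);
 *	if (ret == 0) {
 *		ret = i915_error_state_to_str(&buf, error);
 *		if (ret == 0)
 *			ret = ...;	(copy buf.buf, buf.bytes bytes, out to the reader)
 *		kfree(buf.buf);
 *	}
 *	if (error)
 *		i915_gpu_state_put(error);
 */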