/*
 * Copyright 2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/slab.h>
#include <drm/drm_print.h>

#include "amdgpu_ring_mux.h"
#include "amdgpu_ring.h"
#include "amdgpu.h"

#define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
#define AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US 10000

static const struct ring_info {
	unsigned int hw_pio;
	const char *ring_name;
} sw_ring_info[] = {
	{ AMDGPU_RING_PRIO_DEFAULT, "gfx_low"},
	{ AMDGPU_RING_PRIO_2, "gfx_high"},
};

#ifdef __linux__
static struct kmem_cache *amdgpu_mux_chunk_slab;
#else
static struct pool amdgpu_mux_chunk_slab;
#endif

static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
								struct amdgpu_ring *ring)
{
	return ring->entry_index < mux->ring_entry_size ?
		&mux->ring_entry[ring->entry_index] : NULL;
}

/* Copy packets in the software ring range [s_start, s_end) onto the real ring. */
static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
						  struct amdgpu_ring *ring,
						  u64 s_start, u64 s_end)
{
	u64 start, end;
	struct amdgpu_ring *real_ring = mux->real_ring;

	start = s_start & ring->buf_mask;
	end = s_end & ring->buf_mask;

	if (start == end) {
		DRM_ERROR("no more data copied from sw ring\n");
		return;
	}
	if (start > end) {
		/* The range wraps around the end of the software ring: copy the
		 * tail of the ring first, then the head up to 'end'.
		 */
		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
					   (ring->ring_size >> 2) - start);
		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
	} else {
		amdgpu_ring_alloc(real_ring, end - start);
		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
	}
}

static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux)
{
	struct amdgpu_mux_entry *e = NULL;
	struct amdgpu_mux_chunk *chunk;
	uint32_t seq, last_seq;
	int i;

	/* find the low priority entry */
	if (!mux->s_resubmit)
		return;

	for (i = 0; i < mux->num_ring_entries; i++) {
		if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
			e = &mux->ring_entry[i];
			break;
		}
	}

	if (!e) {
		DRM_ERROR("%s no low priority ring found\n", __func__);
		return;
	}

	last_seq = atomic_read(&e->ring->fence_drv.last_seq);
	seq = mux->seqno_to_resubmit;
	if (last_seq < seq) {
		/* resubmit all the chunks with fences in (last_seq, seq] */
		list_for_each_entry(chunk, &e->list, entry) {
			if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
				amdgpu_fence_update_start_timestamp(e->ring,
								    chunk->sync_seq,
								    ktime_get());
				if (chunk->sync_seq ==
					le32_to_cpu(*(e->ring->fence_drv.cpu_addr + 2))) {
					if (chunk->cntl_offset <= e->ring->buf_mask)
						amdgpu_ring_patch_cntl(e->ring,
								       chunk->cntl_offset);
					if (chunk->ce_offset <= e->ring->buf_mask)
						amdgpu_ring_patch_ce(e->ring, chunk->ce_offset);
					if (chunk->de_offset <= e->ring->buf_mask)
						amdgpu_ring_patch_de(e->ring, chunk->de_offset);
				}
				amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
								      chunk->start,
								      chunk->end);
				mux->wptr_resubmit = chunk->end;
				amdgpu_ring_commit(mux->real_ring);
			}
		}
	}

	del_timer(&mux->resubmit_timer);
	mux->s_resubmit = false;
}

static void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
{
	mod_timer(&mux->resubmit_timer, jiffies + AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
}

#ifdef __linux__
static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
{
	struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);
#else
static void amdgpu_mux_resubmit_fallback(void *arg)
{
	struct amdgpu_ring_mux *mux = arg;
#endif

	if (!spin_trylock(&mux->lock)) {
		amdgpu_ring_mux_schedule_resubmit(mux);
		DRM_ERROR("reschedule resubmit\n");
		return;
	}
	amdgpu_mux_resubmit_chunks(mux);
	spin_unlock(&mux->lock);
}

int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
			 unsigned int entry_size)
{
	mux->real_ring = ring;
	mux->num_ring_entries = 0;

	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
	if (!mux->ring_entry)
		return -ENOMEM;

	mux->ring_entry_size = entry_size;
	mux->s_resubmit = false;

#ifdef __linux__
	amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
						  sizeof(struct amdgpu_mux_chunk), 0,
						  SLAB_HWCACHE_ALIGN, NULL);
	if (!amdgpu_mux_chunk_slab) {
167 DRM_ERROR("create amdgpu_mux_chunk cache failed\n"); 168 return -ENOMEM; 169 } 170 #else 171 pool_init(&amdgpu_mux_chunk_slab, sizeof(struct amdgpu_mux_chunk), 172 CACHELINESIZE, IPL_TTY, 0, "amdgpu_mux_chunk", NULL); 173 #endif 174 175 mtx_init(&mux->lock, IPL_NONE); 176 #ifdef __linux__ 177 timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0); 178 #else 179 timeout_set(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, mux); 180 #endif 181 182 return 0; 183 } 184 185 void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux) 186 { 187 struct amdgpu_mux_entry *e; 188 struct amdgpu_mux_chunk *chunk, *chunk2; 189 int i; 190 191 for (i = 0; i < mux->num_ring_entries; i++) { 192 e = &mux->ring_entry[i]; 193 list_for_each_entry_safe(chunk, chunk2, &e->list, entry) { 194 list_del(&chunk->entry); 195 #ifdef __linux__ 196 kmem_cache_free(amdgpu_mux_chunk_slab, chunk); 197 #else 198 pool_put(&amdgpu_mux_chunk_slab, chunk); 199 #endif 200 } 201 } 202 #ifdef __linux__ 203 kmem_cache_destroy(amdgpu_mux_chunk_slab); 204 #else 205 pool_destroy(&amdgpu_mux_chunk_slab); 206 #endif 207 kfree(mux->ring_entry); 208 mux->ring_entry = NULL; 209 mux->num_ring_entries = 0; 210 mux->ring_entry_size = 0; 211 } 212 213 int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring) 214 { 215 struct amdgpu_mux_entry *e; 216 217 if (mux->num_ring_entries >= mux->ring_entry_size) { 218 DRM_ERROR("add sw ring exceeding max entry size\n"); 219 return -ENOENT; 220 } 221 222 e = &mux->ring_entry[mux->num_ring_entries]; 223 ring->entry_index = mux->num_ring_entries; 224 e->ring = ring; 225 226 INIT_LIST_HEAD(&e->list); 227 mux->num_ring_entries += 1; 228 return 0; 229 } 230 231 void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr) 232 { 233 struct amdgpu_mux_entry *e; 234 235 spin_lock(&mux->lock); 236 237 if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) 238 amdgpu_mux_resubmit_chunks(mux); 239 240 e = amdgpu_ring_mux_sw_entry(mux, ring); 241 if (!e) { 242 DRM_ERROR("cannot find entry for sw ring\n"); 243 spin_unlock(&mux->lock); 244 return; 245 } 246 247 /* We could skip this set wptr as preemption in process. 
	 */
	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
		spin_unlock(&mux->lock);
		return;
	}

	e->sw_cptr = e->sw_wptr;
	/* Update cptr if the packets were already copied by the resubmit path. */
	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
		e->sw_cptr = mux->wptr_resubmit;
	e->sw_wptr = wptr;
	e->start_ptr_in_hw_ring = mux->real_ring->wptr;

	/* Skip copying packets that have already been resubmitted. */
	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
		amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
		amdgpu_ring_commit(mux->real_ring);
	} else {
		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
	}
	spin_unlock(&mux->lock);
}

u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	struct amdgpu_mux_entry *e;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry for sw ring\n");
		return 0;
	}

	return e->sw_wptr;
}

/**
 * amdgpu_ring_mux_get_rptr - get the readptr of the software ring
 * @mux: the multiplexer the software rings attach to
 * @ring: the software ring of which we calculate the readptr
 *
 * The returned readptr is not precise because the other rings could have
 * written data onto the real ring buffer. After such an overwrite we cannot
 * tell whether our packets have already been executed or simply have not been
 * read yet. However, this function is only called by tools such as umr to
 * collect the latest packets for hang analysis, and we assume the hang
 * happened near our latest submit. Thus we use the following logic to give a
 * clue:
 * If the readptr is between start and end, we return the copy pointer plus
 * the distance from start to readptr. If the readptr is before start, we
 * return the copy pointer. Lastly, if the readptr is past end, we return the
 * write pointer.
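 *
 * As a made-up example: with sw_cptr = 0x40, start = 0x100 and end = 0x180,
 * a real-ring readptr of 0x120 maps to 0x40 + (0x120 - 0x100) = 0x60, a
 * readptr of 0x90 (before start) maps to 0x40, and a readptr of 0x1f0 (past
 * end) maps to the software write pointer.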
 */
u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	struct amdgpu_mux_entry *e;
	u64 readp, offset, start, end;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("no sw entry found!\n");
		return 0;
	}

	readp = amdgpu_ring_get_rptr(mux->real_ring);

	start = e->start_ptr_in_hw_ring & mux->real_ring->buf_mask;
	end = e->end_ptr_in_hw_ring & mux->real_ring->buf_mask;
	if (start > end) {
		if (readp <= end)
			readp += mux->real_ring->ring_size >> 2;
		end += mux->real_ring->ring_size >> 2;
	}

	if (start <= readp && readp <= end) {
		offset = readp - start;
		e->sw_rptr = (e->sw_cptr + offset) & ring->buf_mask;
	} else if (readp < start) {
		e->sw_rptr = e->sw_cptr;
	} else {
		/* end < readptr */
		e->sw_rptr = e->sw_wptr;
	}

	return e->sw_rptr;
}

u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	return amdgpu_ring_mux_get_rptr(mux, ring);
}

u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	return amdgpu_ring_mux_get_wptr(mux, ring);
}

void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	amdgpu_ring_mux_set_wptr(mux, ring, ring->wptr);
}

/* Override insert_nop to prevent emitting nops to the software rings */
void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
	WARN_ON(!ring->is_sw_ring);
}

const char *amdgpu_sw_ring_name(int idx)
{
	return idx < ARRAY_SIZE(sw_ring_info) ?
		sw_ring_info[idx].ring_name : NULL;
}

unsigned int amdgpu_sw_ring_priority(int idx)
{
	return idx < ARRAY_SIZE(sw_ring_info) ?
		sw_ring_info[idx].hw_pio : AMDGPU_RING_PRIO_DEFAULT;
}

/*
 * Scan the rings: request a preemption only when a low priority ring has an
 * unsignaled fence older than the threshold, no high priority fences are
 * outstanding, and no resubmit is already pending.
 */
static int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux)
{
	struct amdgpu_ring *ring;
	int i, need_preempt;

	need_preempt = 0;
	for (i = 0; i < mux->num_ring_entries; i++) {
		ring = mux->ring_entry[i].ring;
		if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
		    amdgpu_fence_count_emitted(ring) > 0)
			return 0;
		if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
		    amdgpu_fence_last_unsignaled_time_us(ring) >
		    AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US)
			need_preempt = 1;
	}
	return need_preempt && !mux->s_resubmit;
}

/*
 * Trigger Mid-Command Buffer Preemption (MCBP) and find out later whether we
 * need to resubmit the preempted work.
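 *
 * pending_trailing_fence_signaled is set before asking the real ring to
 * preempt. When the trailing fence interrupt fires,
 * amdgpu_mcbp_handle_trailing_fence_irq() checks whether the low priority
 * ring still has unsignaled fences; if so, it flags s_resubmit and schedules
 * the resubmit timer so the preempted chunks get copied back onto the real
 * ring.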
 */
static int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
{
	int r;

	spin_lock(&mux->lock);
	mux->pending_trailing_fence_signaled = true;
	r = amdgpu_ring_preempt_ib(mux->real_ring);
	spin_unlock(&mux->lock);
	return r;
}

void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	if (adev->gfx.mcbp && ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
		if (amdgpu_mcbp_scan(mux) > 0)
			amdgpu_mcbp_trigger_preempt(mux);
		return;
	}

	amdgpu_ring_mux_start_ib(mux, ring);
}

void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
		return;
	amdgpu_ring_mux_end_ib(mux, ring);
}

void amdgpu_sw_ring_ib_mark_offset(struct amdgpu_ring *ring, enum amdgpu_ring_mux_offset_type type)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
	unsigned offset;

	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
		return;

	offset = ring->wptr & ring->buf_mask;

	amdgpu_ring_mux_ib_mark_offset(mux, ring, offset, type);
}

void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	struct amdgpu_mux_entry *e;
	struct amdgpu_mux_chunk *chunk;

	spin_lock(&mux->lock);
	amdgpu_mux_resubmit_chunks(mux);
	spin_unlock(&mux->lock);

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry!\n");
		return;
	}

#ifdef __linux__
	chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
#else
	chunk = pool_get(&amdgpu_mux_chunk_slab, PR_WAITOK);
#endif
	if (!chunk) {
		DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
		return;
	}

	chunk->start = ring->wptr;
	/* Initialize the offsets out of range so we can tell whether the IB
	 * submission actually set them.
	 */
	chunk->cntl_offset = ring->buf_mask + 1;
	chunk->de_offset = ring->buf_mask + 1;
	chunk->ce_offset = ring->buf_mask + 1;
	list_add_tail(&chunk->entry, &e->list);
}

static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	uint32_t last_seq = 0;
	struct amdgpu_mux_entry *e;
	struct amdgpu_mux_chunk *chunk, *tmp;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry!\n");
		return;
	}

	last_seq = atomic_read(&ring->fence_drv.last_seq);

	list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
		if (chunk->sync_seq <= last_seq) {
			list_del(&chunk->entry);
#ifdef __linux__
			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
#else
			pool_put(&amdgpu_mux_chunk_slab, chunk);
#endif
		}
	}
}

void amdgpu_ring_mux_ib_mark_offset(struct amdgpu_ring_mux *mux,
				    struct amdgpu_ring *ring, u64 offset,
				    enum amdgpu_ring_mux_offset_type type)
{
	struct amdgpu_mux_entry *e;
	struct amdgpu_mux_chunk *chunk;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry!\n");
		return;
	}

	chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
	if (!chunk) {
		DRM_ERROR("cannot find chunk!\n");
		return;
	}

	switch (type) {
	case AMDGPU_MUX_OFFSET_TYPE_CONTROL:
		chunk->cntl_offset = offset;
		break;
	case AMDGPU_MUX_OFFSET_TYPE_DE:
		chunk->de_offset = offset;
		break;
	case AMDGPU_MUX_OFFSET_TYPE_CE:
		chunk->ce_offset = offset;
		break;
	default:
		DRM_ERROR("invalid type (%d)\n", type);
		break;
	}
}

void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	struct amdgpu_mux_entry *e;
	struct amdgpu_mux_chunk *chunk;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry!\n");
		return;
	}

	chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
	if (!chunk) {
		DRM_ERROR("cannot find chunk!\n");
		return;
	}

	chunk->end = ring->wptr;
	chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);

	scan_and_remove_signaled_chunk(mux, ring);
}

bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux)
{
	struct amdgpu_mux_entry *e;
	struct amdgpu_ring *ring = NULL;
	int i;

	if (!mux->pending_trailing_fence_signaled)
		return false;

	if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
		return false;

	for (i = 0; i < mux->num_ring_entries; i++) {
		e = &mux->ring_entry[i];
		if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
			ring = e->ring;
			break;
		}
	}

	if (!ring) {
		DRM_ERROR("cannot find low priority ring\n");
		return false;
	}

	amdgpu_fence_process(ring);
	if (amdgpu_fence_count_emitted(ring) > 0) {
		mux->s_resubmit = true;
		mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
		amdgpu_ring_mux_schedule_resubmit(mux);
	}

	mux->pending_trailing_fence_signaled = false;
	return true;
}