1 /* workitems.c -- The main runtime entry that performs work-item execution in 2 various ways and the builtin functions closely related to the 3 implementation. 4 5 Copyright (C) 2015-2020 Free Software Foundation, Inc. 6 Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com> 7 for General Processor Tech. 8 9 Permission is hereby granted, free of charge, to any person obtaining a 10 copy of this software and associated documentation files 11 (the "Software"), to deal in the Software without restriction, including 12 without limitation the rights to use, copy, modify, merge, publish, 13 distribute, sublicense, and/or sell copies of the Software, and to 14 permit persons to whom the Software is furnished to do so, subject to 15 the following conditions: 16 17 The above copyright notice and this permission notice shall be included 18 in all copies or substantial portions of the Software. 19 20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 24 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 25 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 26 USE OR OTHER DEALINGS IN THE SOFTWARE. 27 */ 28 29 /* The fiber based multiple work-item work-group execution uses ucontext 30 based user mode threading. However, if gccbrig is able to optimize the 31 kernel to a much faster work-group function that implements the multiple 32 WI execution using loops instead of fibers requiring slow context switches, 33 the fiber-based implementation won't be called. 34 */ 35 36 #include <stdlib.h> 37 #include <signal.h> 38 #include <string.h> 39 40 #include "workitems.h" 41 #include "phsa-rt.h" 42 43 #ifdef HAVE_FIBERS 44 #include "fibers.h" 45 #endif 46 47 #ifdef BENCHMARK_PHSA_RT 48 #include <stdio.h> 49 #include <time.h> 50 51 static uint64_t wi_count = 0; 52 static uint64_t wis_skipped = 0; 53 static uint64_t wi_total = 0; 54 static clock_t start_time; 55 56 #endif 57 58 #ifdef DEBUG_PHSA_RT 59 #include <stdio.h> 60 #endif 61 62 #define PRIVATE_SEGMENT_ALIGN 256 63 #define FIBER_STACK_SIZE (64*1024) 64 #define GROUP_SEGMENT_ALIGN 256 65 66 /* Preserve this amount of additional space in the alloca stack as we need to 67 store the alloca frame pointer to the alloca frame, thus must preserve 68 space for it. This thus supports at most 1024 functions with allocas in 69 a call chain. */ 70 #define ALLOCA_OVERHEAD 1024*4 71 72 uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context); 73 74 uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context); 75 76 uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context); 77 78 uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi); 79 80 uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi); 81 82 void 83 phsa_fatal_error (int code) 84 { 85 exit (code); 86 } 87 88 #ifdef HAVE_FIBERS 89 /* ucontext-based work-item thread implementation. Runs all work-items in 90 separate fibers. */ 91 92 static void 93 phsa_work_item_thread (int arg0, int arg1) 94 { 95 void *arg = fiber_int_args_to_ptr (arg0, arg1); 96 97 PHSAWorkItem *wi = (PHSAWorkItem *) arg; 98 volatile PHSAWorkGroup *wg = wi->wg; 99 PHSAKernelLaunchData *l_data = wi->launch_data; 100 101 do 102 { 103 int retcode 104 = fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier); 105 106 /* At this point the threads can assume that either more_wgs is 0 or 107 the current_work_group_* is set to point to the WG executed next. */ 108 if (!wi->wg->more_wgs) 109 break; 110 111 wi->group_x = wg->x; 112 wi->group_y = wg->y; 113 wi->group_z = wg->z; 114 115 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); 116 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); 117 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); 118 119 #ifdef DEBUG_PHSA_RT 120 printf ( 121 "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n", 122 wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z, 123 l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z); 124 #endif 125 126 if (wi->x < __hsail_currentworkgroupsize (0, wi) 127 && wi->y < __hsail_currentworkgroupsize (1, wi) 128 && wi->z < __hsail_currentworkgroupsize (2, wi)) 129 { 130 l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr, 131 wg->initial_group_offset, wg->private_base_ptr); 132 #ifdef DEBUG_PHSA_RT 133 printf ("done.\n"); 134 #endif 135 #ifdef BENCHMARK_PHSA_RT 136 wi_count++; 137 #endif 138 } 139 else 140 { 141 #ifdef DEBUG_PHSA_RT 142 printf ("skipped (partial WG).\n"); 143 #endif 144 #ifdef BENCHMARK_PHSA_RT 145 wis_skipped++; 146 #endif 147 } 148 149 retcode 150 = fiber_barrier_reach ((fiber_barrier_t *) 151 l_data->wg_completion_barrier); 152 153 /* The first thread updates the WG to execute next etc. */ 154 155 if (retcode == 0) 156 { 157 #ifdef EXECUTE_WGS_BACKWARDS 158 if (wg->x == l_data->wg_min_x) 159 { 160 wg->x = l_data->wg_max_x - 1; 161 if (wg->y == l_data->wg_min_y) 162 { 163 wg->y = l_data->wg_max_y - 1; 164 if (wg->z == l_data->wg_min_z) 165 wg->more_wgs = 0; 166 else 167 wg->z--; 168 } 169 else 170 wg->y--; 171 } 172 else 173 wg->x--; 174 #else 175 if (wg->x + 1 >= l_data->wg_max_x) 176 { 177 wg->x = l_data->wg_min_x; 178 if (wg->y + 1 >= l_data->wg_max_y) 179 { 180 wg->y = l_data->wg_min_y; 181 if (wg->z + 1 >= l_data->wg_max_z) 182 wg->more_wgs = 0; 183 else 184 wg->z++; 185 } 186 else 187 wg->y++; 188 } 189 else 190 wg->x++; 191 #endif 192 wi->group_x = wg->x; 193 wi->group_y = wg->y; 194 wi->group_z = wg->z; 195 196 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); 197 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); 198 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); 199 200 /* Reinitialize the work-group barrier according to the new WG's 201 size, which might not be the same as the previous ones, due 202 to "partial WGs". */ 203 size_t wg_size = __hsail_currentworkgroupsize (0, wi) 204 * __hsail_currentworkgroupsize (1, wi) 205 * __hsail_currentworkgroupsize (2, wi); 206 207 #ifdef DEBUG_PHSA_RT 208 printf ("Reinitializing the WG barrier to %lu.\n", wg_size); 209 #endif 210 fiber_barrier_init ((fiber_barrier_t *) 211 wi->launch_data->wg_sync_barrier, 212 wg_size); 213 214 #ifdef BENCHMARK_PHSA_RT 215 if (wi_count % 1000 == 0) 216 { 217 clock_t spent_time = clock () - start_time; 218 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC; 219 double wis_per_sec = wi_count / spent_time_sec; 220 uint64_t eta_sec 221 = (wi_total - wi_count - wis_skipped) / wis_per_sec; 222 223 printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in " 224 "%lu s)\n", 225 wi_count, wis_skipped, (uint64_t) spent_time_sec, 226 (uint64_t) wis_per_sec, (uint64_t) eta_sec); 227 } 228 #endif 229 } 230 } 231 while (1); 232 233 fiber_exit (); 234 } 235 #endif 236 237 #define MIN(a, b) ((a < b) ? a : b) 238 #define MAX(a, b) ((a > b) ? a : b) 239 240 #ifdef HAVE_FIBERS 241 /* Spawns a given number of work-items to execute a set of work-groups, 242 blocks until their completion. */ 243 244 static void 245 phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, 246 uint32_t group_local_offset, size_t wg_size_x, 247 size_t wg_size_y, size_t wg_size_z) 248 { 249 PHSAWorkItem *wi_threads = NULL; 250 PHSAWorkGroup wg; 251 size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z; 252 uint32_t group_x, group_y, group_z; 253 fiber_barrier_t wg_start_barrier; 254 fiber_barrier_t wg_completion_barrier; 255 fiber_barrier_t wg_sync_barrier; 256 257 max_x = wg_size_x == 0 ? 1 : wg_size_x; 258 max_y = wg_size_y == 0 ? 1 : wg_size_y; 259 max_z = wg_size_z == 0 ? 1 : wg_size_z; 260 261 size_t wg_size = max_x * max_y * max_z; 262 if (wg_size > PHSA_MAX_WG_SIZE) 263 phsa_fatal_error (2); 264 265 wg.private_segment_total_size = context->dp->private_segment_size * wg_size; 266 if (wg.private_segment_total_size > 0 267 && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN, 268 wg.private_segment_total_size) 269 != 0) 270 phsa_fatal_error (3); 271 272 wg.alloca_stack_p = wg.private_segment_total_size + ALLOCA_OVERHEAD; 273 wg.alloca_frame_p = wg.alloca_stack_p; 274 wg.initial_group_offset = group_local_offset; 275 276 #ifdef EXECUTE_WGS_BACKWARDS 277 group_x = context->wg_max_x - 1; 278 group_y = context->wg_max_y - 1; 279 group_z = context->wg_max_z - 1; 280 #else 281 group_x = context->wg_min_x; 282 group_y = context->wg_min_y; 283 group_z = context->wg_min_z; 284 #endif 285 286 fiber_barrier_init (&wg_sync_barrier, wg_size); 287 fiber_barrier_init (&wg_start_barrier, wg_size); 288 fiber_barrier_init (&wg_completion_barrier, wg_size); 289 290 context->wg_start_barrier = &wg_start_barrier; 291 context->wg_sync_barrier = &wg_sync_barrier; 292 context->wg_completion_barrier = &wg_completion_barrier; 293 294 wg.more_wgs = 1; 295 wg.group_base_ptr = group_base_ptr; 296 297 #ifdef BENCHMARK_PHSA_RT 298 wi_count = 0; 299 wis_skipped = 0; 300 start_time = clock (); 301 #endif 302 wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z); 303 for (x = 0; x < max_x; ++x) 304 for (y = 0; y < max_y; ++y) 305 for (z = 0; z < max_z; ++z) 306 { 307 PHSAWorkItem *wi = &wi_threads[flat_wi_id]; 308 wi->launch_data = context; 309 wi->wg = &wg; 310 311 wg.x = wi->group_x = group_x; 312 wg.y = wi->group_y = group_y; 313 wg.z = wi->group_z = group_z; 314 315 wi->wg_size_x = context->dp->workgroup_size_x; 316 wi->wg_size_y = context->dp->workgroup_size_y; 317 wi->wg_size_z = context->dp->workgroup_size_z; 318 319 wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); 320 wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); 321 wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); 322 323 wi->x = x; 324 wi->y = y; 325 wi->z = z; 326 327 /* TODO: set the stack size according to the private 328 segment size. Too big stack consumes huge amount of 329 memory in case of huge number of WIs and a too small stack 330 will fail in mysterious and potentially dangerous ways. */ 331 332 fiber_init (&wi->fiber, phsa_work_item_thread, wi, 333 FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN); 334 ++flat_wi_id; 335 } 336 337 do 338 { 339 --flat_wi_id; 340 fiber_join (&wi_threads[flat_wi_id].fiber); 341 } 342 while (flat_wi_id > 0); 343 344 if (wg.private_segment_total_size > 0) 345 free (wg.private_base_ptr); 346 347 free (wi_threads); 348 } 349 350 /* Spawn the work-item threads to execute work-groups and let 351 them execute all the WGs, including a potential partial WG. */ 352 353 static void 354 phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr, 355 uint32_t group_local_offset) 356 { 357 hsa_kernel_dispatch_packet_t *dp = context->dp; 358 size_t x, y, z; 359 360 context->group_segment_start_addr = (size_t) group_base_ptr; 361 362 /* HSA seems to allow the WG size to be larger than the grid size. We need to 363 saturate the effective WG size to the grid size to prevent the extra WIs 364 from executing. */ 365 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size; 366 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x); 367 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y); 368 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z); 369 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z; 370 371 #ifdef BENCHMARK_PHSA_RT 372 wi_total = (uint64_t) dp->grid_size_x 373 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1) 374 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1); 375 #endif 376 377 /* For now execute all work groups in a single coarse thread (does not utilize 378 multicore/multithread). */ 379 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0; 380 381 int dims = dp->setup & 0x3; 382 383 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1) 384 / dp->workgroup_size_x; 385 386 context->wg_max_y 387 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1) 388 / dp->workgroup_size_y; 389 390 context->wg_max_z 391 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1) 392 / dp->workgroup_size_z; 393 394 #ifdef DEBUG_PHSA_RT 395 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with " 396 "wg size %lu/%lu/%lu grid size %u/%u/%u\n", 397 context->wg_min_x, context->wg_min_y, context->wg_min_z, 398 context->wg_max_x, context->wg_max_y, context->wg_max_z, 399 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x, 400 dp->grid_size_y, dp->grid_size_z); 401 #endif 402 403 phsa_execute_wi_gang (context, group_base_ptr, group_local_offset, 404 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z); 405 } 406 #endif 407 408 /* Executes the given work-group function for all work groups in the grid. 409 410 A work-group function is a version of the original kernel which executes 411 the kernel for all work-items in a work-group. It is produced by gccbrig 412 if it can handle the kernel's barrier usage and is much faster way to 413 execute massive numbers of work-items in a non-SPMD machine than fibers 414 (easily 100x faster). */ 415 static void 416 phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr, 417 uint32_t group_local_offset) 418 { 419 hsa_kernel_dispatch_packet_t *dp = context->dp; 420 size_t x, y, z, wg_x, wg_y, wg_z; 421 422 context->group_segment_start_addr = (size_t) group_base_ptr; 423 424 /* HSA seems to allow the WG size to be larger than the grid size. We need 425 to saturate the effective WG size to the grid size to prevent the extra WIs 426 from executing. */ 427 size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size; 428 sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x); 429 sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y); 430 sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z); 431 sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z; 432 433 #ifdef BENCHMARK_PHSA_RT 434 wi_total = (uint64_t) dp->grid_size_x 435 * (dp->grid_size_y > 0 ? dp->grid_size_y : 1) 436 * (dp->grid_size_z > 0 ? dp->grid_size_z : 1); 437 #endif 438 439 context->wg_min_x = context->wg_min_y = context->wg_min_z = 0; 440 441 int dims = dp->setup & 0x3; 442 443 context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1) 444 / dp->workgroup_size_x; 445 446 context->wg_max_y 447 = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1) 448 / dp->workgroup_size_y; 449 450 context->wg_max_z 451 = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1) 452 / dp->workgroup_size_z; 453 454 #ifdef DEBUG_PHSA_RT 455 printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with " 456 "wg size %lu/%lu/%lu grid size %u/%u/%u\n", 457 context->wg_min_x, context->wg_min_y, context->wg_min_z, 458 context->wg_max_x, context->wg_max_y, context->wg_max_z, 459 sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x, 460 dp->grid_size_y, dp->grid_size_z); 461 #endif 462 463 PHSAWorkItem wi; 464 PHSAWorkGroup wg; 465 wi.wg = &wg; 466 wi.x = wi.y = wi.z = 0; 467 wi.launch_data = context; 468 469 #ifdef BENCHMARK_PHSA_RT 470 start_time = clock (); 471 uint64_t wg_count = 0; 472 #endif 473 474 size_t wg_size = __hsail_workgroupsize (0, &wi) 475 * __hsail_workgroupsize (1, &wi) 476 * __hsail_workgroupsize (2, &wi); 477 478 void *private_base_ptr = NULL; 479 if (dp->private_segment_size > 0 480 && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN, 481 dp->private_segment_size * wg_size) 482 != 0) 483 phsa_fatal_error (3); 484 485 wg.alloca_stack_p = dp->private_segment_size * wg_size + ALLOCA_OVERHEAD; 486 wg.alloca_frame_p = wg.alloca_stack_p; 487 488 wg.private_base_ptr = private_base_ptr; 489 wg.group_base_ptr = group_base_ptr; 490 491 #ifdef DEBUG_PHSA_RT 492 printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size, 493 wg_size, private_base_ptr); 494 #endif 495 496 for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z) 497 for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y) 498 for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x) 499 { 500 wi.group_x = wg_x; 501 wi.group_y = wg_y; 502 wi.group_z = wg_z; 503 504 wi.wg_size_x = context->dp->workgroup_size_x; 505 wi.wg_size_y = context->dp->workgroup_size_y; 506 wi.wg_size_z = context->dp->workgroup_size_z; 507 508 wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi); 509 wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi); 510 wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi); 511 512 context->kernel (context->kernarg_addr, &wi, group_base_ptr, 513 group_local_offset, private_base_ptr); 514 515 #if defined (BENCHMARK_PHSA_RT) 516 wg_count++; 517 if (wg_count % 1000000 == 0) 518 { 519 clock_t spent_time = clock () - start_time; 520 uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y 521 + wg_z * sat_wg_size_z; 522 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC; 523 double wis_per_sec = wi_count / spent_time_sec; 524 uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec; 525 526 printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n", 527 wi_count, (uint64_t) spent_time_sec, 528 (uint64_t) wis_per_sec, (uint64_t) eta_sec); 529 } 530 #endif 531 } 532 533 #ifdef BENCHMARK_PHSA_RT 534 clock_t spent_time = clock () - start_time; 535 double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC; 536 double wis_per_sec = wi_total / spent_time_sec; 537 538 printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total, 539 (uint64_t) spent_time_sec, (uint64_t) wis_per_sec); 540 #endif 541 free (private_base_ptr); 542 private_base_ptr = NULL; 543 } 544 545 /* gccbrig generates the following from each HSAIL kernel: 546 547 1) The actual kernel function (a single work-item kernel or a work-group 548 function) generated from HSAIL (BRIG). 549 550 static void _Kernel (void* args, void* context, void* group_base_ptr) 551 { 552 ... 553 } 554 555 2) A public facing kernel function that is called from the PHSA runtime: 556 557 a) A single work-item function (that requires fibers for multi-WI): 558 559 void Kernel (void* context) 560 { 561 __launch_launch_kernel (_Kernel, context); 562 } 563 564 or 565 566 b) a when gccbrig could generate a work-group function: 567 568 void Kernel (void* context) 569 { 570 __hsail_launch_wg_function (_Kernel, context); 571 } 572 */ 573 574 #ifdef HAVE_FIBERS 575 576 void 577 __hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context, 578 void *group_base_ptr, uint32_t group_local_offset) 579 { 580 context->kernel = kernel; 581 phsa_spawn_work_items (context, group_base_ptr, group_local_offset); 582 } 583 #endif 584 585 void 586 __hsail_launch_wg_function (gccbrigKernelFunc kernel, 587 PHSAKernelLaunchData *context, void *group_base_ptr, 588 uint32_t group_local_offset) 589 { 590 context->kernel = kernel; 591 phsa_execute_work_groups (context, group_base_ptr, group_local_offset); 592 } 593 594 uint32_t 595 __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context) 596 { 597 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; 598 599 uint32_t id; 600 switch (dim) 601 { 602 default: 603 case 0: 604 /* Overflow semantics in the case of WG dim > grid dim. */ 605 id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x) 606 % dp->grid_size_x; 607 break; 608 case 1: 609 id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y) 610 % dp->grid_size_y; 611 break; 612 case 2: 613 id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z) 614 % dp->grid_size_z; 615 break; 616 } 617 return id; 618 } 619 620 uint64_t 621 __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context) 622 { 623 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; 624 625 uint64_t id; 626 switch (dim) 627 { 628 default: 629 case 0: 630 /* Overflow semantics in the case of WG dim > grid dim. */ 631 id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x) 632 % dp->grid_size_x; 633 break; 634 case 1: 635 id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y) 636 % dp->grid_size_y; 637 break; 638 case 2: 639 id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z) 640 % dp->grid_size_z; 641 break; 642 } 643 return id; 644 } 645 646 647 uint32_t 648 __hsail_workitemid (uint32_t dim, PHSAWorkItem *context) 649 { 650 PHSAWorkItem *c = (PHSAWorkItem *) context; 651 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; 652 653 /* The number of dimensions is in the two least significant bits. */ 654 int dims = dp->setup & 0x3; 655 656 uint32_t id; 657 switch (dim) 658 { 659 default: 660 case 0: 661 id = c->x; 662 break; 663 case 1: 664 id = dims < 2 ? 0 : c->y; 665 break; 666 case 2: 667 id = dims < 3 ? 0 : c->z; 668 break; 669 } 670 return id; 671 } 672 673 uint32_t 674 __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context) 675 { 676 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; 677 int dims = dp->setup & 0x3; 678 679 uint32_t id; 680 switch (dim) 681 { 682 default: 683 case 0: 684 id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x; 685 break; 686 case 1: 687 id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1) 688 / dp->workgroup_size_y; 689 break; 690 case 2: 691 id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1) 692 / dp->workgroup_size_z; 693 break; 694 } 695 return id; 696 } 697 698 uint32_t 699 __hsail_workitemflatid (PHSAWorkItem *c) 700 { 701 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp; 702 703 return c->x + c->y * dp->workgroup_size_x 704 + c->z * dp->workgroup_size_x * dp->workgroup_size_y; 705 } 706 707 uint32_t 708 __hsail_currentworkitemflatid (PHSAWorkItem *c) 709 { 710 hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp; 711 712 return c->x + c->y * __hsail_currentworkgroupsize (0, c) 713 + c->z * __hsail_currentworkgroupsize (0, c) 714 * __hsail_currentworkgroupsize (1, c); 715 } 716 717 void 718 __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context) 719 { 720 switch (dim) 721 { 722 default: 723 case 0: 724 context->x = id; 725 break; 726 case 1: 727 context->y = id; 728 break; 729 case 2: 730 context->z = id; 731 break; 732 } 733 } 734 735 uint64_t 736 __hsail_workitemflatabsid_u64 (PHSAWorkItem *context) 737 { 738 PHSAWorkItem *c = (PHSAWorkItem *) context; 739 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; 740 741 /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */ 742 uint64_t id0 = __hsail_workitemabsid (0, context); 743 uint64_t id1 = __hsail_workitemabsid (1, context); 744 uint64_t id2 = __hsail_workitemabsid (2, context); 745 746 uint64_t max0 = dp->grid_size_x; 747 uint64_t max1 = dp->grid_size_y; 748 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1; 749 750 return id; 751 } 752 753 uint32_t 754 __hsail_workitemflatabsid_u32 (PHSAWorkItem *context) 755 { 756 PHSAWorkItem *c = (PHSAWorkItem *) context; 757 hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; 758 759 /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */ 760 uint64_t id0 = __hsail_workitemabsid (0, context); 761 uint64_t id1 = __hsail_workitemabsid (1, context); 762 uint64_t id2 = __hsail_workitemabsid (2, context); 763 764 uint64_t max0 = dp->grid_size_x; 765 uint64_t max1 = dp->grid_size_y; 766 uint64_t id = id0 + id1 * max0 + id2 * max0 * max1; 767 return (uint32_t) id; 768 } 769 770 uint32_t 771 __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi) 772 { 773 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp; 774 uint32_t wg_size = 0; 775 switch (dim) 776 { 777 default: 778 case 0: 779 if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x) 780 wg_size = dp->workgroup_size_x; /* Full WG. */ 781 else 782 wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */ 783 break; 784 case 1: 785 if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y) 786 wg_size = dp->workgroup_size_y; /* Full WG. */ 787 else 788 wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */ 789 break; 790 case 2: 791 if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z) 792 wg_size = dp->workgroup_size_z; /* Full WG. */ 793 else 794 wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */ 795 break; 796 } 797 return wg_size; 798 } 799 800 uint32_t 801 __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi) 802 { 803 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp; 804 switch (dim) 805 { 806 default: 807 case 0: 808 return dp->workgroup_size_x; 809 case 1: 810 return dp->workgroup_size_y; 811 case 2: 812 return dp->workgroup_size_z; 813 } 814 } 815 816 uint32_t 817 __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi) 818 { 819 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp; 820 switch (dim) 821 { 822 default: 823 case 0: 824 return dp->grid_size_x; 825 case 1: 826 return dp->grid_size_y; 827 case 2: 828 return dp->grid_size_z; 829 } 830 } 831 832 uint32_t 833 __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi) 834 { 835 switch (dim) 836 { 837 default: 838 case 0: 839 return wi->group_x; 840 case 1: 841 return wi->group_y; 842 case 2: 843 return wi->group_z; 844 } 845 } 846 847 uint32_t 848 __hsail_dim (PHSAWorkItem *wi) 849 { 850 hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp; 851 return dp->setup & 0x3; 852 } 853 854 uint64_t 855 __hsail_packetid (PHSAWorkItem *wi) 856 { 857 return wi->launch_data->packet_id; 858 } 859 860 uint32_t 861 __hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi) 862 { 863 return (uint32_t) wi->launch_data->dp->completion_signal.handle; 864 } 865 866 uint64_t 867 __hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi) 868 { 869 return (uint64_t) (wi->launch_data->dp->completion_signal.handle); 870 } 871 872 #ifdef HAVE_FIBERS 873 void 874 __hsail_barrier (PHSAWorkItem *wi) 875 { 876 fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier); 877 } 878 #endif 879 880 /* Return a 32b private segment address that points to a dynamically 881 allocated chunk of 'size' with 'align'. 882 883 Allocates the space from the end of the private segment allocated 884 for the whole work group. In implementations with separate private 885 memories per WI, we will need to have a stack pointer per WI. But in 886 the current implementation, the segment is shared, so we possibly 887 save some space in case all WIs do not call the alloca. 888 889 The "alloca frames" are organized as follows: 890 891 wg->alloca_stack_p points to the last allocated data (initially 892 outside the private segment) 893 wg->alloca_frame_p points to the first address _outside_ the current 894 function's allocations (initially to the same as alloca_stack_p) 895 896 The data is allocated downwards from the end of the private segment. 897 898 In the beginning of a new function which has allocas, a new alloca 899 frame is pushed which adds the current alloca_frame_p (the current 900 function's frame starting point) to the top of the alloca stack and 901 alloca_frame_p is set to the current stack position. 902 903 At the exit points of a function with allocas, the alloca frame 904 is popped before returning. This involves popping the alloca_frame_p 905 to the one of the previous function in the call stack, and alloca_stack_p 906 similarly, to the position of the last word alloca'd by the previous 907 function. 908 */ 909 910 uint32_t 911 __hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi) 912 { 913 volatile PHSAWorkGroup *wg = wi->wg; 914 int64_t new_pos = wg->alloca_stack_p - size; 915 while (new_pos % align != 0) 916 new_pos--; 917 if (new_pos < 0) 918 phsa_fatal_error (2); 919 920 wg->alloca_stack_p = new_pos; 921 922 #ifdef DEBUG_ALLOCA 923 printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align, 924 wg->alloca_stack_p, wg->alloca_frame_p); 925 #endif 926 return new_pos; 927 } 928 929 /* Initializes a new "alloca frame" in the private segment. 930 This should be called at all the function entry points in case 931 the function contains at least one call to alloca. */ 932 933 void 934 __hsail_alloca_push_frame (PHSAWorkItem *wi) 935 { 936 volatile PHSAWorkGroup *wg = wi->wg; 937 938 /* Store the alloca_frame_p without any alignment padding so 939 we know exactly where the previous frame ended after popping 940 it. */ 941 #ifdef DEBUG_ALLOCA 942 printf ("--- push frame "); 943 #endif 944 uint32_t last_word_offs = __hsail_alloca (4, 1, wi); 945 memcpy (wg->private_base_ptr + last_word_offs, 946 (const void *) &wg->alloca_frame_p, 4); 947 wg->alloca_frame_p = last_word_offs; 948 949 #ifdef DEBUG_ALLOCA 950 printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p); 951 #endif 952 } 953 954 /* Frees the current "alloca frame" and restores the frame 955 pointer. 956 This should be called at all the function return points in case 957 the function contains at least one call to alloca. Restores the 958 alloca stack to the condition it was before pushing the frame 959 the last time. */ 960 void 961 __hsail_alloca_pop_frame (PHSAWorkItem *wi) 962 { 963 volatile PHSAWorkGroup *wg = wi->wg; 964 965 wg->alloca_stack_p = wg->alloca_frame_p; 966 memcpy ((void *) &wg->alloca_frame_p, 967 (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4); 968 /* Now frame_p points to the beginning of the previous function's 969 frame and stack_p to its end. */ 970 971 wg->alloca_stack_p += 4; 972 973 #ifdef DEBUG_ALLOCA 974 printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p, 975 wg->alloca_frame_p); 976 #endif 977 } 978