11b01064fSDurgadoss R; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2*b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s 3*b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s 4*b279f6b0SFangrui Song; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} 5*b279f6b0SFangrui Song; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} 61b01064fSDurgadoss R 71b01064fSDurgadoss Rtarget triple = "nvptx64-nvidia-cuda" 81b01064fSDurgadoss R 91b01064fSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2); 101b01064fSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2); 111b01064fSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2); 121b01064fSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2); 131b01064fSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2); 141b01064fSDurgadoss R 151b01064fSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2); 161b01064fSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2); 171b01064fSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2); 181b01064fSDurgadoss R 191b01064fSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_1d 201b01064fSDurgadoss Rdefine void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) { 211b01064fSDurgadoss R; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_1d( 221b01064fSDurgadoss R; CHECK-PTX64: { 231b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; 241b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b32 %r<2>; 251b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; 261b01064fSDurgadoss R; CHECK-PTX64-EMPTY: 271b01064fSDurgadoss R; CHECK-PTX64-NEXT: // %bb.0: 281b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_0]; 291b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; 301b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; 311b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; 321b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; 331b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; 341b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; 351b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; 361b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1; 371b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; 381b01064fSDurgadoss R; CHECK-PTX64-NEXT: ret; 391b01064fSDurgadoss R; 401b01064fSDurgadoss R; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_1d( 411b01064fSDurgadoss R; CHECK-PTX-SHARED32: { 421b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; 431b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; 441b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; 451b01064fSDurgadoss R; CHECK-PTX-SHARED32-EMPTY: 461b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: // %bb.0: 471b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_0]; 481b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; 491b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; 501b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; 511b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; 521b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; 531b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; 541b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; 551b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1; 561b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; 571b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ret; 581b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 undef, i1 0, i1 0) 591b01064fSDurgadoss R 601b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 %ch, i1 0, i1 1) 611b01064fSDurgadoss R 621b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 undef, i1 1, i1 0) 631b01064fSDurgadoss R 641b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1) 651b01064fSDurgadoss R ret void 661b01064fSDurgadoss R} 671b01064fSDurgadoss R 681b01064fSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_2d 691b01064fSDurgadoss Rdefine void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) { 701b01064fSDurgadoss R; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_2d( 711b01064fSDurgadoss R; CHECK-PTX64: { 721b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; 731b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b32 %r<3>; 741b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; 751b01064fSDurgadoss R; CHECK-PTX64-EMPTY: 761b01064fSDurgadoss R; CHECK-PTX64-NEXT: // %bb.0: 771b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_0]; 781b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_1]; 791b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; 801b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; 811b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; 821b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; 831b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; 841b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; 851b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; 861b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; 871b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; 881b01064fSDurgadoss R; CHECK-PTX64-NEXT: ret; 891b01064fSDurgadoss R; 901b01064fSDurgadoss R; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_2d( 911b01064fSDurgadoss R; CHECK-PTX-SHARED32: { 921b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; 931b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; 941b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; 951b01064fSDurgadoss R; CHECK-PTX-SHARED32-EMPTY: 961b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: // %bb.0: 971b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_0]; 981b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_1]; 991b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; 1001b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; 1011b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; 1021b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; 1031b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; 1041b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; 1051b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; 1061b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; 1071b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; 1081b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ret; 1091b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 undef, i1 0, i1 0) 1101b01064fSDurgadoss R 1111b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 %ch, i1 0, i1 1) 1121b01064fSDurgadoss R 1131b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 undef, i1 1, i1 0) 1141b01064fSDurgadoss R 1151b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1) 1161b01064fSDurgadoss R ret void 1171b01064fSDurgadoss R} 1181b01064fSDurgadoss R 1191b01064fSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_3d 1201b01064fSDurgadoss Rdefine void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) { 1211b01064fSDurgadoss R; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_3d( 1221b01064fSDurgadoss R; CHECK-PTX64: { 1231b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; 1241b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b32 %r<4>; 1251b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; 1261b01064fSDurgadoss R; CHECK-PTX64-EMPTY: 1271b01064fSDurgadoss R; CHECK-PTX64-NEXT: // %bb.0: 1281b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_0]; 1291b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_1]; 1301b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_3d_param_2]; 1311b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; 1321b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; 1331b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; 1341b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; 1351b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; 1361b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; 1371b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; 1381b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; 1391b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; 1401b01064fSDurgadoss R; CHECK-PTX64-NEXT: ret; 1411b01064fSDurgadoss R; 1421b01064fSDurgadoss R; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_3d( 1431b01064fSDurgadoss R; CHECK-PTX-SHARED32: { 1441b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; 1451b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; 1461b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; 1471b01064fSDurgadoss R; CHECK-PTX-SHARED32-EMPTY: 1481b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: // %bb.0: 1491b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_0]; 1501b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_1]; 1511b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_2]; 1521b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; 1531b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; 1541b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; 1551b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; 1561b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; 1571b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; 1581b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; 1591b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1; 1601b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; 1611b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ret; 1621b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 undef, i1 0, i1 0) 1631b01064fSDurgadoss R 1641b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 %ch, i1 0, i1 1) 1651b01064fSDurgadoss R 1661b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0) 1671b01064fSDurgadoss R 1681b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1) 1691b01064fSDurgadoss R ret void 1701b01064fSDurgadoss R} 1711b01064fSDurgadoss R 1721b01064fSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_4d 1731b01064fSDurgadoss Rdefine void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) { 1741b01064fSDurgadoss R; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_4d( 1751b01064fSDurgadoss R; CHECK-PTX64: { 1761b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; 1771b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b32 %r<5>; 1781b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; 1791b01064fSDurgadoss R; CHECK-PTX64-EMPTY: 1801b01064fSDurgadoss R; CHECK-PTX64-NEXT: // %bb.0: 1811b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_0]; 1821b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_1]; 1831b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_4d_param_2]; 1841b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_3]; 1851b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; 1861b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; 1871b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; 1881b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; 1891b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; 1901b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; 1911b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; 1921b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; 1931b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; 1941b01064fSDurgadoss R; CHECK-PTX64-NEXT: ret; 1951b01064fSDurgadoss R; 1961b01064fSDurgadoss R; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_4d( 1971b01064fSDurgadoss R; CHECK-PTX-SHARED32: { 1981b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; 1991b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; 2001b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; 2011b01064fSDurgadoss R; CHECK-PTX-SHARED32-EMPTY: 2021b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: // %bb.0: 2031b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_0]; 2041b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_1]; 2051b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_2]; 2061b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_3]; 2071b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; 2081b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; 2091b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; 2101b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; 2111b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; 2121b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; 2131b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; 2141b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; 2151b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; 2161b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ret; 2171b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i64 undef, i1 0, i1 0) 2181b01064fSDurgadoss R 2191b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i64 %ch, i1 0, i1 1) 2201b01064fSDurgadoss R 2211b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0) 2221b01064fSDurgadoss R 2231b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1) 2241b01064fSDurgadoss R ret void 2251b01064fSDurgadoss R} 2261b01064fSDurgadoss R 2271b01064fSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_g2s_tile_5d 2281b01064fSDurgadoss Rdefine void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { 2291b01064fSDurgadoss R; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_tile_5d( 2301b01064fSDurgadoss R; CHECK-PTX64: { 2311b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; 2321b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b32 %r<6>; 2331b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; 2341b01064fSDurgadoss R; CHECK-PTX64-EMPTY: 2351b01064fSDurgadoss R; CHECK-PTX64-NEXT: // %bb.0: 2361b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_0]; 2371b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_1]; 2381b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_5d_param_2]; 2391b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_3]; 2401b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_4]; 2411b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; 2421b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; 2431b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; 2441b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; 2451b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; 2461b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; 2471b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; 2481b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; 2491b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; 2501b01064fSDurgadoss R; CHECK-PTX64-NEXT: ret; 2511b01064fSDurgadoss R; 2521b01064fSDurgadoss R; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_tile_5d( 2531b01064fSDurgadoss R; CHECK-PTX-SHARED32: { 2541b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; 2551b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; 2561b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; 2571b01064fSDurgadoss R; CHECK-PTX-SHARED32-EMPTY: 2581b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: // %bb.0: 2591b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_0]; 2601b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_1]; 2611b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_2]; 2621b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_3]; 2631b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_4]; 2641b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; 2651b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; 2661b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; 2671b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; 2681b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; 2691b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; 2701b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; 2711b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; 2721b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; 2731b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ret; 2741b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i64 undef, i1 0, i1 0) 2751b01064fSDurgadoss R 2761b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i64 %ch, i1 0, i1 1) 2771b01064fSDurgadoss R 2781b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0) 2791b01064fSDurgadoss R 2801b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1) 2811b01064fSDurgadoss R ret void 2821b01064fSDurgadoss R} 2831b01064fSDurgadoss R 2841b01064fSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2col_3d 2851b01064fSDurgadoss Rdefine void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) { 2861b01064fSDurgadoss R; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2col_3d( 2871b01064fSDurgadoss R; CHECK-PTX64: { 2881b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; 2891b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b32 %r<4>; 2901b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; 2911b01064fSDurgadoss R; CHECK-PTX64-EMPTY: 2921b01064fSDurgadoss R; CHECK-PTX64-NEXT: // %bb.0: 2931b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0]; 2941b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1]; 2951b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_3d_param_2]; 2961b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_3]; 2971b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; 2981b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; 2991b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; 3001b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; 3011b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; 3021b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; 3031b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; 3041b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; 3051b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; 3061b01064fSDurgadoss R; CHECK-PTX64-NEXT: ret; 3071b01064fSDurgadoss R; 3081b01064fSDurgadoss R; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2col_3d( 3091b01064fSDurgadoss R; CHECK-PTX-SHARED32: { 3101b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; 3111b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; 3121b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; 3131b01064fSDurgadoss R; CHECK-PTX-SHARED32-EMPTY: 3141b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: // %bb.0: 3151b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0]; 3161b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1]; 3171b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_2]; 3181b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_3]; 3191b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; 3201b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; 3211b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; 3221b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; 3231b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; 3241b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; 3251b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; 3261b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; 3271b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; 3281b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ret; 3291b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 undef, i1 0, i1 0) 3301b01064fSDurgadoss R 3311b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 %ch, i1 0, i1 1) 3321b01064fSDurgadoss R 3331b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0) 3341b01064fSDurgadoss R 3351b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1) 3361b01064fSDurgadoss R ret void 3371b01064fSDurgadoss R} 3381b01064fSDurgadoss R 3391b01064fSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2col_4d 3401b01064fSDurgadoss Rdefine void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) { 3411b01064fSDurgadoss R; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2col_4d( 3421b01064fSDurgadoss R; CHECK-PTX64: { 3431b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; 3441b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b32 %r<5>; 3451b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; 3461b01064fSDurgadoss R; CHECK-PTX64-EMPTY: 3471b01064fSDurgadoss R; CHECK-PTX64-NEXT: // %bb.0: 3481b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0]; 3491b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1]; 3501b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_4d_param_2]; 3511b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_3]; 3521b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_4]; 3531b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_5]; 3541b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; 3551b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; 3561b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; 3571b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; 3581b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; 3591b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; 3601b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; 3611b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; 3621b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; 3631b01064fSDurgadoss R; CHECK-PTX64-NEXT: ret; 3641b01064fSDurgadoss R; 3651b01064fSDurgadoss R; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2col_4d( 3661b01064fSDurgadoss R; CHECK-PTX-SHARED32: { 3671b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; 3681b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; 3691b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; 3701b01064fSDurgadoss R; CHECK-PTX-SHARED32-EMPTY: 3711b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: // %bb.0: 3721b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0]; 3731b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1]; 3741b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_2]; 3751b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_3]; 3761b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_4]; 3771b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_4d_param_5]; 3781b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; 3791b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; 3801b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; 3811b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; 3821b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; 3831b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; 3841b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; 3851b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; 3861b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; 3871b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ret; 3881b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 undef, i1 0, i1 0) 3891b01064fSDurgadoss R 3901b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 %ch, i1 0, i1 1) 3911b01064fSDurgadoss R 3921b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0) 3931b01064fSDurgadoss R 3941b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1) 3951b01064fSDurgadoss R ret void 3961b01064fSDurgadoss R} 3971b01064fSDurgadoss R 3981b01064fSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2col_5d 3991b01064fSDurgadoss Rdefine void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) { 4001b01064fSDurgadoss R; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2col_5d( 4011b01064fSDurgadoss R; CHECK-PTX64: { 4021b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b16 %rs<5>; 4031b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b32 %r<6>; 4041b01064fSDurgadoss R; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; 4051b01064fSDurgadoss R; CHECK-PTX64-EMPTY: 4061b01064fSDurgadoss R; CHECK-PTX64-NEXT: // %bb.0: 4071b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0]; 4081b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1]; 4091b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_5d_param_2]; 4101b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_3]; 4111b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_4]; 4121b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_5]; 4131b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_6]; 4141b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_7]; 4151b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; 4161b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; 4171b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; 4181b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; 4191b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; 4201b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; 4211b01064fSDurgadoss R; CHECK-PTX64-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; 4221b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; 4231b01064fSDurgadoss R; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; 4241b01064fSDurgadoss R; CHECK-PTX64-NEXT: ret; 4251b01064fSDurgadoss R; 4261b01064fSDurgadoss R; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2col_5d( 4271b01064fSDurgadoss R; CHECK-PTX-SHARED32: { 4281b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<5>; 4291b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; 4301b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; 4311b01064fSDurgadoss R; CHECK-PTX-SHARED32-EMPTY: 4321b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: // %bb.0: 4331b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0]; 4341b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1]; 4351b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_2]; 4361b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_3]; 4371b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_4]; 4381b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_5]; 4391b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_5d_param_6]; 4401b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_im2col_5d_param_7]; 4411b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; 4421b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; 4431b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; 4441b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; 4451b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; 4461b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; 4471b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; 4481b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; 4491b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; 4501b01064fSDurgadoss R; CHECK-PTX-SHARED32-NEXT: ret; 4511b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 undef, i1 0, i1 0) 4521b01064fSDurgadoss R 4531b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 %ch, i1 0, i1 1) 4541b01064fSDurgadoss R 4551b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0) 4561b01064fSDurgadoss R 4571b01064fSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1) 4581b01064fSDurgadoss R ret void 4591b01064fSDurgadoss R} 460