140d0058eSDurgadoss R; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2*b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX %s 3*b279f6b0SFangrui Song; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} 440d0058eSDurgadoss R 540d0058eSDurgadoss Rtarget triple = "nvptx64-nvidia-cuda" 640d0058eSDurgadoss R 740d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.1d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i64 %ch, i1 %flag_ch); 840d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.2d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %flag_ch); 940d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch); 1040d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag_ch); 1140d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag_ch); 1240d0058eSDurgadoss R 1340d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch); 1440d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag_ch); 1540d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag_ch); 1640d0058eSDurgadoss R 1740d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_1d 1840d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch) { 1940d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_1d( 2040d0058eSDurgadoss R; CHECK-PTX: { 2140d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b32 %r<2>; 2240d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b64 %rd<4>; 2340d0058eSDurgadoss R; CHECK-PTX-EMPTY: 2440d0058eSDurgadoss R; CHECK-PTX-NEXT: // %bb.0: 2540d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_1d_param_0]; 2640d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1]; 2740d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2]; 2840d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3]; 2940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; 3040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; 3140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; 3240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; 3340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; 3440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; 3540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; 3640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; 3740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1}], [%rd1]; 3840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1}], [%rd1]; 3940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1}], [%rd1]; 4040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1}], [%rd1]; 4140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1}], [%rd1]; 4240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1}], [%rd1]; 4340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1}], [%rd1]; 4440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1}], [%rd1]; 4540d0058eSDurgadoss R; CHECK-PTX-NEXT: ret; 4640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) 4740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) 4840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) 4940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) 5040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) 5140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) 5240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) 5340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1) 5440d0058eSDurgadoss R 5540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) 5640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) 5740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) 5840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) 5940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) 6040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) 6140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) 6240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) 6340d0058eSDurgadoss R ret void 6440d0058eSDurgadoss R} 6540d0058eSDurgadoss R 6640d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_2d 6740d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) { 6840d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_2d( 6940d0058eSDurgadoss R; CHECK-PTX: { 7040d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b32 %r<3>; 7140d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b64 %rd<4>; 7240d0058eSDurgadoss R; CHECK-PTX-EMPTY: 7340d0058eSDurgadoss R; CHECK-PTX-NEXT: // %bb.0: 7440d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0]; 7540d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1]; 7640d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2]; 7740d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3]; 7840d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4]; 7940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; 8040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; 8140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; 8240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; 8340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; 8440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; 8540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; 8640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; 8740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; 8840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; 8940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; 9040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; 9140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; 9240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; 9340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; 9440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; 9540d0058eSDurgadoss R; CHECK-PTX-NEXT: ret; 9640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) 9740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) 9840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) 9940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) 10040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) 10140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) 10240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) 10340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) 10440d0058eSDurgadoss R 10540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) 10640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) 10740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) 10840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) 10940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) 11040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) 11140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) 11240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) 11340d0058eSDurgadoss R ret void 11440d0058eSDurgadoss R} 11540d0058eSDurgadoss R 11640d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_3d 11740d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) { 11840d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_3d( 11940d0058eSDurgadoss R; CHECK-PTX: { 12040d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b32 %r<4>; 12140d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b64 %rd<4>; 12240d0058eSDurgadoss R; CHECK-PTX-EMPTY: 12340d0058eSDurgadoss R; CHECK-PTX-NEXT: // %bb.0: 12440d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0]; 12540d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1]; 12640d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2]; 12740d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3]; 12840d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4]; 12940d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5]; 13040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 13140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 13240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 13340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 13440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 13540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 13640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 13740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 13840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 13940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 14040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 14140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 14240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 14340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 14440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 14540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 14640d0058eSDurgadoss R; CHECK-PTX-NEXT: ret; 14740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 14840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 14940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 15040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 15140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 15240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 15340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 15440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 15540d0058eSDurgadoss R 15640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 15740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 15840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 15940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 16040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 16140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 16240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 16340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 16440d0058eSDurgadoss R ret void 16540d0058eSDurgadoss R} 16640d0058eSDurgadoss R 16740d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_4d 16840d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) { 16940d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_4d( 17040d0058eSDurgadoss R; CHECK-PTX: { 17140d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b32 %r<5>; 17240d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b64 %rd<4>; 17340d0058eSDurgadoss R; CHECK-PTX-EMPTY: 17440d0058eSDurgadoss R; CHECK-PTX-NEXT: // %bb.0: 17540d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0]; 17640d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1]; 17740d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2]; 17840d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3]; 17940d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4]; 18040d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5]; 18140d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6]; 18240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 18340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 18440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 18540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 18640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 18740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 18840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 18940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 19040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 19140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 19240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 19340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 19440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 19540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 19640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 19740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 19840d0058eSDurgadoss R; CHECK-PTX-NEXT: ret; 19940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 20040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 20140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 20240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 20340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 20440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 20540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 20640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 20740d0058eSDurgadoss R 20840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 20940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 21040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 21140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 21240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 21340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 21440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 21540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 21640d0058eSDurgadoss R ret void 21740d0058eSDurgadoss R} 21840d0058eSDurgadoss R 21940d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_5d 22040d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { 22140d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_5d( 22240d0058eSDurgadoss R; CHECK-PTX: { 22340d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b32 %r<6>; 22440d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b64 %rd<4>; 22540d0058eSDurgadoss R; CHECK-PTX-EMPTY: 22640d0058eSDurgadoss R; CHECK-PTX-NEXT: // %bb.0: 22740d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0]; 22840d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1]; 22940d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2]; 23040d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3]; 23140d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4]; 23240d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5]; 23340d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6]; 23440d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7]; 23540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 23640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 23740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 23840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 23940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 24040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 24140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 24240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 24340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 24440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 24540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 24640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 24740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 24840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 24940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 25040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 25140d0058eSDurgadoss R; CHECK-PTX-NEXT: ret; 25240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 25340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 25440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 25540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 25640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 25740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 25840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 25940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 26040d0058eSDurgadoss R 26140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 26240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 26340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 26440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 26540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 26640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 26740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 26840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 26940d0058eSDurgadoss R ret void 27040d0058eSDurgadoss R} 27140d0058eSDurgadoss R 27240d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_3d 27340d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_im2col_3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) { 27440d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_3d( 27540d0058eSDurgadoss R; CHECK-PTX: { 27640d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b32 %r<4>; 27740d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b64 %rd<4>; 27840d0058eSDurgadoss R; CHECK-PTX-EMPTY: 27940d0058eSDurgadoss R; CHECK-PTX-NEXT: // %bb.0: 28040d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0]; 28140d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1]; 28240d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2]; 28340d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3]; 28440d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4]; 28540d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5]; 28640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 28740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 28840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 28940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 29040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 29140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 29240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 29340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; 29440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 29540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 29640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 29740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 29840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 29940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 30040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 30140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; 30240d0058eSDurgadoss R; CHECK-PTX-NEXT: ret; 30340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 30440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 30540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 30640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 30740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 30840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 30940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 31040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) 31140d0058eSDurgadoss R 31240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 31340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 31440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 31540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 31640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 31740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 31840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 31940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) 32040d0058eSDurgadoss R ret void 32140d0058eSDurgadoss R} 32240d0058eSDurgadoss R 32340d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_4d 32440d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_im2col_4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) { 32540d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_4d( 32640d0058eSDurgadoss R; CHECK-PTX: { 32740d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b32 %r<5>; 32840d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b64 %rd<4>; 32940d0058eSDurgadoss R; CHECK-PTX-EMPTY: 33040d0058eSDurgadoss R; CHECK-PTX-NEXT: // %bb.0: 33140d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0]; 33240d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_4d_param_1]; 33340d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2]; 33440d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3]; 33540d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4]; 33640d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5]; 33740d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6]; 33840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 33940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 34040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 34140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 34240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 34340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 34440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 34540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; 34640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 34740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 34840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 34940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 35040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 35140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 35240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 35340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; 35440d0058eSDurgadoss R; CHECK-PTX-NEXT: ret; 35540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 35640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 35740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 35840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 35940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 36040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 36140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 36240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) 36340d0058eSDurgadoss R 36440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 36540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 36640d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 36740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 36840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 36940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 37040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 37140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) 37240d0058eSDurgadoss R ret void 37340d0058eSDurgadoss R} 37440d0058eSDurgadoss R 37540d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_5d 37640d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_im2col_5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { 37740d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_5d( 37840d0058eSDurgadoss R; CHECK-PTX: { 37940d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b32 %r<6>; 38040d0058eSDurgadoss R; CHECK-PTX-NEXT: .reg .b64 %rd<4>; 38140d0058eSDurgadoss R; CHECK-PTX-EMPTY: 38240d0058eSDurgadoss R; CHECK-PTX-NEXT: // %bb.0: 38340d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0]; 38440d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1]; 38540d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_5d_param_2]; 38640d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3]; 38740d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4]; 38840d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5]; 38940d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6]; 39040d0058eSDurgadoss R; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7]; 39140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 39240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 39340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 39440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 39540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 39640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 39740d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 39840d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; 39940d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 40040d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 40140d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 40240d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 40340d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 40440d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 40540d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 40640d0058eSDurgadoss R; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; 40740d0058eSDurgadoss R; CHECK-PTX-NEXT: ret; 40840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 40940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 41040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 41140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 41240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 41340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 41440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 41540d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) 41640d0058eSDurgadoss R 41740d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 41840d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 41940d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 42040d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 42140d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 42240d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 42340d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 42440d0058eSDurgadoss R tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) 42540d0058eSDurgadoss R ret void 42640d0058eSDurgadoss R} 427