xref: /llvm-project/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll (revision b279f6b098d3849f7f1c1f539b108307d5f8ae2d)
140d0058eSDurgadoss R; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2*b279f6b0SFangrui Song; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX %s
3*b279f6b0SFangrui Song; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
440d0058eSDurgadoss R
540d0058eSDurgadoss Rtarget triple = "nvptx64-nvidia-cuda"
640d0058eSDurgadoss R
740d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.1d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i64 %ch, i1 %flag_ch);
840d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.2d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %flag_ch);
940d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch);
1040d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag_ch);
1140d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.tile.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag_ch);
1240d0058eSDurgadoss R
1340d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.3d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch);
1440d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.4d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag_ch);
1540d0058eSDurgadoss Rdeclare void @llvm.nvvm.cp.async.bulk.tensor.reduce.im2col.5d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag_ch);
1640d0058eSDurgadoss R
1740d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_1d
1840d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch) {
1940d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_1d(
2040d0058eSDurgadoss R; CHECK-PTX:       {
2140d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b32 %r<2>;
2240d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
2340d0058eSDurgadoss R; CHECK-PTX-EMPTY:
2440d0058eSDurgadoss R; CHECK-PTX-NEXT:  // %bb.0:
2540d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_1d_param_0];
2640d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1];
2740d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2];
2840d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3];
2940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
3040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
3140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
3240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
3340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
3440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
3540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
3640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
3740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1}], [%rd1];
3840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1}], [%rd1];
3940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1}], [%rd1];
4040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1}], [%rd1];
4140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1}], [%rd1];
4240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1}], [%rd1];
4340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1}], [%rd1];
4440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1}], [%rd1];
4540d0058eSDurgadoss R; CHECK-PTX-NEXT:    ret;
4640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
4740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
4840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
4940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
5040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
5140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
5240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
5340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
5440d0058eSDurgadoss R
5540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
5640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
5740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
5840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
5940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
6040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
6140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
6240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
6340d0058eSDurgadoss R  ret void
6440d0058eSDurgadoss R}
6540d0058eSDurgadoss R
6640d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_2d
6740d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) {
6840d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_2d(
6940d0058eSDurgadoss R; CHECK-PTX:       {
7040d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b32 %r<3>;
7140d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
7240d0058eSDurgadoss R; CHECK-PTX-EMPTY:
7340d0058eSDurgadoss R; CHECK-PTX-NEXT:  // %bb.0:
7440d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0];
7540d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1];
7640d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2];
7740d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3];
7840d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4];
7940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
8040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
8140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
8240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
8340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
8440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
8540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
8640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
8740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
8840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
8940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
9040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
9140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
9240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
9340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
9440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
9540d0058eSDurgadoss R; CHECK-PTX-NEXT:    ret;
9640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
9740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
9840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
9940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
10040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
10140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
10240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
10340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1)
10440d0058eSDurgadoss R
10540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
10640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
10740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
10840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
10940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
11040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
11140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
11240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
11340d0058eSDurgadoss R  ret void
11440d0058eSDurgadoss R}
11540d0058eSDurgadoss R
11640d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_3d
11740d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
11840d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_3d(
11940d0058eSDurgadoss R; CHECK-PTX:       {
12040d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
12140d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
12240d0058eSDurgadoss R; CHECK-PTX-EMPTY:
12340d0058eSDurgadoss R; CHECK-PTX-NEXT:  // %bb.0:
12440d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0];
12540d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1];
12640d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2];
12740d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3];
12840d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4];
12940d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5];
13040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
13140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
13240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
13340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
13440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
13540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
13640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
13740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
13840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
13940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
14040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
14140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
14240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
14340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
14440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
14540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
14640d0058eSDurgadoss R; CHECK-PTX-NEXT:    ret;
14740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
14840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
14940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
15040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
15140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
15240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
15340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
15440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
15540d0058eSDurgadoss R
15640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
15740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
15840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
15940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
16040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
16140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
16240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
16340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
16440d0058eSDurgadoss R  ret void
16540d0058eSDurgadoss R}
16640d0058eSDurgadoss R
16740d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_4d
16840d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) {
16940d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_4d(
17040d0058eSDurgadoss R; CHECK-PTX:       {
17140d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b32 %r<5>;
17240d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
17340d0058eSDurgadoss R; CHECK-PTX-EMPTY:
17440d0058eSDurgadoss R; CHECK-PTX-NEXT:  // %bb.0:
17540d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0];
17640d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1];
17740d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2];
17840d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3];
17940d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4];
18040d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5];
18140d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6];
18240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
18340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
18440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
18540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
18640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
18740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
18840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
18940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
19040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
19140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
19240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
19340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
19440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
19540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
19640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
19740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
19840d0058eSDurgadoss R; CHECK-PTX-NEXT:    ret;
19940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
20040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
20140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
20240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
20340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
20440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
20540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
20640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
20740d0058eSDurgadoss R
20840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
20940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
21040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
21140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
21240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
21340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
21440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
21540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
21640d0058eSDurgadoss R  ret void
21740d0058eSDurgadoss R}
21840d0058eSDurgadoss R
21940d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_tile_5d
22040d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_tile_5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
22140d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_tile_5d(
22240d0058eSDurgadoss R; CHECK-PTX:       {
22340d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b32 %r<6>;
22440d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
22540d0058eSDurgadoss R; CHECK-PTX-EMPTY:
22640d0058eSDurgadoss R; CHECK-PTX-NEXT:  // %bb.0:
22740d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0];
22840d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1];
22940d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2];
23040d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3];
23140d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4];
23240d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5];
23340d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6];
23440d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7];
23540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
23640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
23740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
23840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
23940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
24040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
24140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
24240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
24340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
24440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
24540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
24640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
24740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
24840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
24940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
25040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
25140d0058eSDurgadoss R; CHECK-PTX-NEXT:    ret;
25240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
25340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
25440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
25540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
25640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
25740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
25840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
25940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
26040d0058eSDurgadoss R
26140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
26240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
26340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
26440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
26540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
26640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
26740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
26840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
26940d0058eSDurgadoss R  ret void
27040d0058eSDurgadoss R}
27140d0058eSDurgadoss R
27240d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_3d
27340d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_im2col_3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
27440d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_3d(
27540d0058eSDurgadoss R; CHECK-PTX:       {
27640d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
27740d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
27840d0058eSDurgadoss R; CHECK-PTX-EMPTY:
27940d0058eSDurgadoss R; CHECK-PTX-NEXT:  // %bb.0:
28040d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0];
28140d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1];
28240d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2];
28340d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3];
28440d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4];
28540d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5];
28640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
28740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
28840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
28940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
29040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
29140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
29240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
29340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
29440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
29540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
29640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
29740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
29840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
29940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
30040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
30140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
30240d0058eSDurgadoss R; CHECK-PTX-NEXT:    ret;
30340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
30440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
30540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
30640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
30740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
30840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
30940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
31040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1)
31140d0058eSDurgadoss R
31240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
31340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
31440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
31540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
31640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
31740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
31840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
31940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0)
32040d0058eSDurgadoss R  ret void
32140d0058eSDurgadoss R}
32240d0058eSDurgadoss R
32340d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_4d
32440d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_im2col_4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) {
32540d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_4d(
32640d0058eSDurgadoss R; CHECK-PTX:       {
32740d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b32 %r<5>;
32840d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
32940d0058eSDurgadoss R; CHECK-PTX-EMPTY:
33040d0058eSDurgadoss R; CHECK-PTX-NEXT:  // %bb.0:
33140d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0];
33240d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_4d_param_1];
33340d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2];
33440d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3];
33540d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4];
33640d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5];
33740d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6];
33840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
33940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
34040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
34140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
34240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
34340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
34440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
34540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
34640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
34740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
34840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
34940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
35040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
35140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
35240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
35340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
35440d0058eSDurgadoss R; CHECK-PTX-NEXT:    ret;
35540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
35640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
35740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
35840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
35940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
36040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
36140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
36240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1)
36340d0058eSDurgadoss R
36440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
36540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
36640d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
36740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
36840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
36940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
37040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
37140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0)
37240d0058eSDurgadoss R  ret void
37340d0058eSDurgadoss R}
37440d0058eSDurgadoss R
37540d0058eSDurgadoss R; CHECK-LABEL: cp_async_bulk_tensor_reduce_im2col_5d
37640d0058eSDurgadoss Rdefine void @cp_async_bulk_tensor_reduce_im2col_5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
37740d0058eSDurgadoss R; CHECK-PTX-LABEL: cp_async_bulk_tensor_reduce_im2col_5d(
37840d0058eSDurgadoss R; CHECK-PTX:       {
37940d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b32 %r<6>;
38040d0058eSDurgadoss R; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
38140d0058eSDurgadoss R; CHECK-PTX-EMPTY:
38240d0058eSDurgadoss R; CHECK-PTX-NEXT:  // %bb.0:
38340d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0];
38440d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1];
38540d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_5d_param_2];
38640d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3];
38740d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4];
38840d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5];
38940d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6];
39040d0058eSDurgadoss R; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7];
39140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
39240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
39340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
39440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
39540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
39640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
39740d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
39840d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
39940d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
40040d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
40140d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
40240d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.inc.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
40340d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.dec.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
40440d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.and.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
40540d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.or.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
40640d0058eSDurgadoss R; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.xor.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
40740d0058eSDurgadoss R; CHECK-PTX-NEXT:    ret;
40840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
40940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
41040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
41140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
41240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
41340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
41440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
41540d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1)
41640d0058eSDurgadoss R
41740d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
41840d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
41940d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
42040d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
42140d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
42240d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
42340d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
42440d0058eSDurgadoss R  tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0)
42540d0058eSDurgadoss R  ret void
42640d0058eSDurgadoss R}
427