xref: /llvm-project/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir (revision 2e6cc79f816d942ab09d6a310cd925c1da148aa9)
1// RUN: mlir-opt --convert-nvvm-to-llvm --convert-arith-to-llvm --split-input-file %s | FileCheck %s
2
3// Same below, but using the `ConvertToLLVMPatternInterface` entry point
4// and the generic `convert-to-llvm` pass.
5// RUN: mlir-opt --convert-to-llvm --split-input-file %s | FileCheck %s
6
7// CHECK-LABEL: @init_mbarrier
8llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) {
9  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.shared.b64 [$0], $1;", "r,r,b"
10  nvvm.mbarrier.init.shared %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1
11  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b"
12  nvvm.mbarrier.init %barrier_gen, %count, predicate = %pred : !llvm.ptr, i32, i1
13  llvm.return
14}
15
16// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
17llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
18  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
19  nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount : !llvm.ptr<3>, i32
20  //CHECK:  llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b"
21  nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1
22  llvm.return
23}
24
25// CHECK-LABEL: @init_mbarrier_arrive_expect_tx_generic
26llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32, %pred : i1) {
27  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r"
28  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32
29  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r,b"
30  nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr, i32, i1
31  llvm.return
32}
33
34// CHECK-LABEL: @init_mbarrier_try_wait_shared
35llvm.func @init_mbarrier_try_wait_shared(%barrier : !llvm.ptr<3>, %ticks : i32, %phase : i32) {
36  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att
37  // CHECK-SAME: "{
38  // CHECK-SAME: .reg .pred       P1;
39  // CHECK-SAME: LAB_WAIT:
40  // CHECK-SAME: mbarrier.try_wait.parity.shared.b64 P1, [$0], $1, $2;
41  // CHECK-SAME: @P1 bra.uni DONE;
42  // CHECK-SAME: bra.uni     LAB_WAIT;
43  // CHECK-SAME: DONE:
44  // CHECK-SAME: }",
45  // CHECK-SAME: "r,r,r"
46   nvvm.mbarrier.try_wait.parity.shared %barrier, %phase, %ticks : !llvm.ptr<3>, i32, i32
47  llvm.return
48}
49
50// CHECK-LABEL: @init_mbarrier_try_wait
51llvm.func @init_mbarrier_try_wait(%barrier : !llvm.ptr, %ticks : i32, %phase : i32){
52  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att
53  // CHECK-SAME: "{
54  // CHECK-SAME: .reg .pred       P1;
55  // CHECK-SAME: LAB_WAIT:
56  // CHECK-SAME: mbarrier.try_wait.parity.b64 P1, [$0], $1, $2;
57  // CHECK-SAME: @P1 bra.uni DONE;
58  // CHECK-SAME: bra.uni     LAB_WAIT;
59  // CHECK-SAME: DONE:
60  // CHECK-SAME: }",
61  // CHECK-SAME: "l,r,r"
62  nvvm.mbarrier.try_wait.parity %barrier, %phase, %ticks : !llvm.ptr, i32, i32
63  llvm.return
64}
65
66// CHECK-LABEL: @async_cp
67func.func @async_cp(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>) {
68  // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache =  ca : !llvm.ptr<3>, !llvm.ptr<1>
69  nvvm.cp.async.shared.global %dst, %src, 16, cache =  ca : !llvm.ptr<3>, !llvm.ptr<1>
70  // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache =  cg : !llvm.ptr<3>, !llvm.ptr<1>
71  nvvm.cp.async.shared.global %dst, %src, 16, cache =  cg : !llvm.ptr<3>, !llvm.ptr<1>
72  return
73}
74
75// CHECK-LABEL: @async_cp_zfill
76func.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32) {
77  // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache =  cg, %{{.*}} : !llvm.ptr<3>, !llvm.ptr<1>, i32
78  nvvm.cp.async.shared.global %dst, %src, 16, cache =  cg, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
79  // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 4, cache =  ca, %{{.*}} : !llvm.ptr<3>, !llvm.ptr<1>, i32
80  nvvm.cp.async.shared.global %dst, %src, 4, cache =  ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
81  return
82}
83
84// CHECK-LABEL: @cp_async_mbarrier_arrive
85func.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) {
86  // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}}
87  nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr
88  // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true}
89  nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr
90  // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}}
91  nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3>
92  // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} {noinc = true}
93  nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3>
94  llvm.return
95}
96
97// CHECK-LABEL: @tma_load_3d_all
98func.func @tma_load_3d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
99  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l"
100  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr
101  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l,b"
102  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr
103  return
104}
105
106// CHECK-LABEL: @tma_load_4d_all
107func.func @tma_load_4d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
108  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l"
109  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr
110  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$11 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l,b"
111  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr
112  return
113}
114
115// CHECK-LABEL: @tma_load_5d_all
116func.func @tma_load_5d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %off0: i16, %off1: i16, %off2: i16, %ctamask : i16, %cacheHint : i64, %p : i1) {
117  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l"
118  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr
119  // CHECK: lvm.inline_asm has_side_effects asm_dialect = att "@$13 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l,b"
120  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr
121  return
122}
123
124// CHECK-LABEL: @tma_load_1d
125func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %p : i1) {
126  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r"
127  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] : !llvm.ptr<3>, !llvm.ptr
128  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r,b"
129  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0] predicate=%p : !llvm.ptr<3>, !llvm.ptr
130  return
131}
132
133// CHECK-LABEL: @tma_load_2d
134func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) {
135  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r"
136  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr
137  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r,b"
138  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
139  return
140}
141
142// CHECK-LABEL: @tma_load_3d
143func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
144  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r"
145  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr
146  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r,b"
147  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
148  return
149}
150
151// CHECK-LABEL: @tma_load_4d
152func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
153  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r"
154  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr
155  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r,b"
156  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
157  return
158}
159
160// CHECK-LABEL: @tma_load_5d
161func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
162  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r"
163  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr
164  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r,b"
165  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] predicate=%p  : !llvm.ptr<3>, !llvm.ptr
166  return
167}
168
169// CHECK-LABEL: @tma_load_multicast1d
170func.func @tma_load_multicast1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %p : i1) {
171  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h"
172  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
173  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h,b"
174  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr
175  return
176}
177
178// CHECK-LABEL: @tma_load_multicast2d
179func.func @tma_load_multicast2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %p : i1) {
180  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h"
181  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
182  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h,b"
183  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask  predicate=%p  : !llvm.ptr<3>, !llvm.ptr
184  return
185}
186
187// CHECK-LABEL: @tma_load_multicast3d
188func.func @tma_load_multicast3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
189  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h"
190  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
191  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h,b"
192  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask  predicate=%p  : !llvm.ptr<3>, !llvm.ptr
193  return
194}
195
196// CHECK-LABEL: @tma_load_multicast4d
197func.func @tma_load_multicast4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
198  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h"
199  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask: !llvm.ptr<3>, !llvm.ptr
200  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h,b"
201  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask predicate=%p  : !llvm.ptr<3>, !llvm.ptr
202  return
203}
204
205// CHECK-LABEL: @tma_load_multicast5d
206func.func @tma_load_multicast5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
207  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h"
208  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr
209  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h,b"
210  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor,  %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask predicate=%p  : !llvm.ptr<3>, !llvm.ptr
211  return
212}
213
214// CHECK-LABEL: @tma_store_1d
215func.func @tma_store_1d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %p : i1) {
216  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r"
217  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0] : !llvm.ptr, !llvm.ptr<3>, i32
218  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$3 cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r,b"
219  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i1
220  return
221}
222
223// CHECK-LABEL: @tma_store_2d
224func.func @tma_store_2d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) {
225  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r"
226  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1] : !llvm.ptr, !llvm.ptr<3>, i32, i32
227  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r,b"
228  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i1
229  return
230}
231
232// CHECK-LABEL: @tma_store_3d
233func.func @tma_store_3d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
234  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r"
235  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32
236  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r,b"
237  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i1
238  return
239}
240
241// CHECK-LABEL: @tma_store_4d
242func.func @tma_store_4d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
243  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r"
244  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32
245  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r,b"
246  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i1
247  return
248}
249
250// CHECK-LABEL: @tma_store_5d
251func.func @tma_store_5d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
252  // CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r"
253  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32
254
255  // CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r,b"
256  nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3,%crd4], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32, i1
257  return
258}
259
260// CHECK-LABEL: @wgmma_execute
261func.func @wgmma_execute() {
262  nvvm.wgmma.fence.aligned
263  nvvm.wgmma.commit.group.sync.aligned
264  nvvm.wgmma.wait.group.sync.aligned 0
265  // CHECK: nvvm.wgmma.fence.aligned
266  // CHECK: nvvm.wgmma.commit.group.sync.aligned
267  // CHECK: nvvm.wgmma.wait.group.sync.aligned 0
268
269
270  nvvm.wgmma.fence.aligned
271  nvvm.wgmma.commit.group.sync.aligned
272  nvvm.wgmma.wait.group.sync.aligned 5
273  // CHECK: nvvm.wgmma.fence.aligned
274  // CHECK: nvvm.wgmma.commit.group.sync.aligned
275  // CHECK: nvvm.wgmma.wait.group.sync.aligned 5
276  return
277}
278
279
280// -----
281
282!mat64f32 = !llvm.struct<(
283  f32, f32, f32, f32, f32, f32, f32, f32,
284  f32, f32, f32, f32, f32, f32, f32, f32)>
285
286// CHECK-LABEL: @wgmma_f32_f16_f16(
287// CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64
288func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{
289  // CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct
290  // CHECK: %[[A1:.*]] = llvm.mlir.constant(0 : i32) : i32
291  // CHECK: %[[A2:.*]] = llvm.mlir.constant(-1 : i32) : i32
292  // CHECK: %[[A3:.*]] = llvm.mlir.constant(-1 : i32) : i32
293  // CHECK: %[[A4:.*]] = llvm.mlir.constant(1 : i32) : i32
294  // CHECK: %[[A5:.*]] = llvm.mlir.constant(0 : i32) : i32
295  // CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
296  // CHECK: %[[V4:.*]] = llvm.extractvalue %[[RES]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
297  // CHECK: %[[V11:.*]] = llvm.extractvalue %[[RES]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
298  // CHECK: %[[V13:.*]] = llvm.extractvalue %[[RES]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
299  // CHECK: %[[RES1:.+]] = llvm.inline_asm has_side_effects asm_dialect = att
300  // CHECK-SAME: "{
301  // CHECK-SAME: reg .pred p;
302  // CHECK-SAME: setp.ne.b32 p, $34, 0;
303  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16
304  // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A",
305  // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n"
306  // CHECK-SAME: %[[V0]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11]], %{{.*}}, %[[V13]], %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]]
307  // CHECK-SAME: : (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, i64, i64, i32, i32, i32, i32, i32)
308  // CHECK-SAME: -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
309  // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
310  // CHECK: %[[DESCa:.+]] = llvm.add %[[ARG0]], %[[C2]] : i64
311  // CHECK: %[[DESCb:.+]] = llvm.add %[[ARG1]], %[[C2]] : i64
312  // CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES1]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
313  // CHECK: %[[V4_2:.*]] = llvm.extractvalue %[[RES1]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
314  // CHECK: %[[V11_2:.*]] = llvm.extractvalue %[[RES1]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
315  // CHECK: %[[V13_2:.*]] = llvm.extractvalue %[[RES1]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
316  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att
317  // CHECK-SAME: "{
318    // CHECK-SAME: .reg .pred p;
319    // CHECK-SAME: setp.ne.b32 p, $34, 0;
320    // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16
321    // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35,  $36, $37,  $38;\0A}\0A",
322    // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n"
323    // CHECK-SAME: %[[V0_2]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4_2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11_2]], %{{.*}}, %[[V13_2]], %{{.*}}, %{{.*}}, %[[DESCa]], %[[DESCb]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
324  %result = llvm.mlir.undef : !mat64f32
325  %result1 = nvvm.wgmma.mma_async
326      %descA, %descB, %result,
327      #nvvm.shape<m = 64, n = 32, k = 16>,
328      D [<f32>, #nvvm.wgmma_scale_out<zero>],
329      A [<f16>, #nvvm.wgmma_scale_in<neg>, <col>],
330      B [<f16>, #nvvm.wgmma_scale_in<neg>, <col>]
331      :!mat64f32 -> !mat64f32
332  %c2 = arith.constant 2 : i64
333  %descAnext = arith.addi %descA, %c2 : i64
334  %descBnext = arith.addi %descB, %c2 : i64
335  %result2 = nvvm.wgmma.mma_async
336      %descAnext, %descBnext, %result1,
337      #nvvm.shape<m = 64, n = 32, k = 16>,
338      D [<f32>, #nvvm.wgmma_scale_out<zero>],
339      A [<f16>, #nvvm.wgmma_scale_in<neg>, <col>],
340      B [<f16>, #nvvm.wgmma_scale_in<neg>, <col>]
341      : !mat64f32 -> !mat64f32
342  return %result2 : !mat64f32
343}
344
345// -----
346
347!mat16i32 = !llvm.struct<(i32, i32, i32, i32)>
348
349// CHECK-LABEL: @wgmma_s32_s8_s8_satfinite(
350// CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64
351func.func @wgmma_s32_s8_s8_satfinite(%descA : i64, %descB : i64) -> !mat16i32{
352  %result = llvm.mlir.undef : !mat16i32
353// CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct
354// CHECK: %[[A1:.*]] = llvm.mlir.constant(1 : i32) : i32
355// CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0]
356// CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1]
357// CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2]
358// CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3]
359// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att
360// CHECK-SAME: "{
361// CHECK-SAME: .reg .pred p;
362// CHECK-SAME: setp.ne.b32 p, $10, 0;
363// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite
364// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n"
365// CHECK-SAME: %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] :
366// CHECK-SAME: (i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
367// CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0]
368// CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1]
369// CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2]
370// CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3]
371// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att
372// CHECK-SAME: "{
373// CHECK-SAME: .reg .pred p;
374// CHECK-SAME: setp.ne.b32 p, $10, 0;
375// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite
376// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A",
377// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n"
378// CHECK-SAME: %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
379// CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0]
380// CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1]
381// CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2]
382// CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3]
383// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att
384// CHECK-SAME:"{
385// CHECK-SAME:.reg .pred p;
386// CHECK-SAME: setp.ne.b32 p, $10, 0;
387// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite
388// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n"
389// CHECK-SAME: %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}}
390  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
391      #nvvm.shape<m = 64, n = 8, k = 32>,
392      D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
393      A [<s8>, #nvvm.wgmma_scale_in<one>, <row>],
394      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
395      : !mat16i32 -> !mat16i32
396  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
397      #nvvm.shape<m = 64, n = 8, k = 32>,
398      D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
399      A [<s8>, #nvvm.wgmma_scale_in<one>, <row>],
400      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
401      : !mat16i32 -> !mat16i32
402  %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2,
403      #nvvm.shape<m = 64, n = 8, k = 32>,
404      D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>],
405      A [<s8>, #nvvm.wgmma_scale_in<one>, <row>],
406      B [<s8>, #nvvm.wgmma_scale_in<one>, <col>]
407      : !mat16i32 -> !mat16i32
408  return %result3 : !mat16i32
409}
410
411// CHECK-LABEL: @wgmma_s32_u8_u8(
412  // CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64
413func.func @wgmma_s32_u8_u8(%descA : i64, %descB : i64) -> !mat16i32 {
414// CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct
415// CHECK: %[[A1:.*]] = llvm.mlir.constant(1 : i32) : i32
416// CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0]
417// CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1]
418// CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2]
419// CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3]
420// CHECK: %[[RES_2:.*]] =  llvm.inline_asm has_side_effects asm_dialect = att
421// CHECK-SAME: "{
422// CHECK-SAME: .reg .pred p;
423// CHECK-SAME: setp.ne.b32 p, $10, 0;
424// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
425// CHECK-SAME: }\0A",
426// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] :
427// CHECK-SAME:(i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)>
428// CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0]
429// CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1]
430// CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2]
431// CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3]
432// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att
433// CHECK-SAME:"{
434// CHECK-SAME: .reg .pred p;
435// CHECK-SAME: setp.ne.b32 p, $10, 0;
436// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
437// CHECK-SAME: }\0A",
438// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}}
439// CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0]
440// CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1]
441// CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2]
442// CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3]
443// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att
444// CHECK-SAME:"{
445// CHECK-SAME: .reg .pred p;
446// CHECK-SAME: setp.ne.b32 p, $10, 0;
447// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p;
448// CHECK-SAME:}\0A",
449// CHECK-SAME:"=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}}
450  %result = llvm.mlir.undef : !mat16i32
451  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
452      #nvvm.shape<m = 64, n = 8, k = 32>,
453      D [<s32>, #nvvm.wgmma_scale_out<one>],
454      A [<u8>, #nvvm.wgmma_scale_in<one>, <row>],
455      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
456      : !mat16i32 -> !mat16i32
457  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
458      #nvvm.shape<m = 64, n = 8, k = 32>,
459      D [<s32>, #nvvm.wgmma_scale_out<one>],
460      A [<u8>, #nvvm.wgmma_scale_in<one>, <row>],
461      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
462      : !mat16i32 -> !mat16i32
463  %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2,
464      #nvvm.shape<m = 64, n = 8, k = 32>,
465      D [<s32>, #nvvm.wgmma_scale_out<one>],
466      A [<u8>, #nvvm.wgmma_scale_in<one>, <row>],
467      B [<u8>, #nvvm.wgmma_scale_in<one>, <col>]
468      : !mat16i32 -> !mat16i32
469  return %result3 : !mat16i32
470}
471
472// -----
473
474!mat32f32 = !llvm.struct<(
475  f32, f32, f32, f32, f32, f32, f32, f32,
476  f32, f32, f32, f32, f32, f32, f32, f32,
477  f32, f32, f32, f32, f32, f32, f32, f32,
478  f32, f32, f32, f32, f32, f32, f32, f32)>
479
480// CHECK-LABEL: @wgmma_f32_tf32_tf32
481func.func @wgmma_f32_tf32_tf32(%descA : i64, %descB : i64) -> !mat32f32 {
482  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att
483  // CHECK-SAME:"{
484  // CHECK-SAME: .reg .pred p;
485  // CHECK-SAME: setp.ne.b32 p, $66, 0;
486  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
487  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att
488  // CHECK-SAME: "{
489  // CHECK-SAME: .reg .pred p;
490  // CHECK-SAME: setp.ne.b32 p, $66, 0;
491  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
492  %result = llvm.mlir.undef : !mat32f32
493  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
494      #nvvm.shape<m = 64, n = 64, k = 8>,
495      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
496      A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>],
497      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
498       : !mat32f32 -> !mat32f32
499  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
500      #nvvm.shape<m = 64, n = 64, k = 8>,
501      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
502      A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>],
503      B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
504      : !mat32f32 -> !mat32f32
505  return %result2 : !mat32f32
506}
507
508
509// -----
510
511!mat32f32 = !llvm.struct<(
512  f32, f32, f32, f32, f32, f32, f32, f32,
513  f32, f32, f32, f32, f32, f32, f32, f32,
514  f32, f32, f32, f32, f32, f32, f32, f32,
515  f32, f32, f32, f32, f32, f32, f32, f32)>
516
517// CHECK-LABEL: @wgmma_f32_e4m3_e4m3
518func.func @wgmma_f32_e4m3_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {
519  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att
520  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
521  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
522  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att
523  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
524  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
525  %result = llvm.mlir.undef : !mat32f32
526  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
527      #nvvm.shape<m = 64, n = 64, k = 32>,
528      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
529      A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>],
530      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
531       : !mat32f32 -> !mat32f32
532  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
533      #nvvm.shape<m = 64, n = 64, k = 32>,
534      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
535      A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>],
536      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
537      : !mat32f32 -> !mat32f32
538  return %result2 : !mat32f32
539}
540
541// -----
542
543!mat32f32 = !llvm.struct<(
544  f32, f32, f32, f32, f32, f32, f32, f32,
545  f32, f32, f32, f32, f32, f32, f32, f32,
546  f32, f32, f32, f32, f32, f32, f32, f32,
547  f32, f32, f32, f32, f32, f32, f32, f32)>
548
549// CHECK-LABEL: @wgmma_f32_e5m2_e4m3
550func.func @wgmma_f32_e5m2_e4m3(%descA : i64, %descB : i64) -> !mat32f32 {
551  // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att
552  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
553  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
554  // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att
555  // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0;
556  // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67,  $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n"
557  %result = llvm.mlir.undef : !mat32f32
558  %result1 = nvvm.wgmma.mma_async %descA, %descB, %result,
559      #nvvm.shape<m = 64, n = 64, k = 32>,
560      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
561      A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>],
562      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
563       : !mat32f32 -> !mat32f32
564  %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1,
565      #nvvm.shape<m = 64, n = 64, k = 32>,
566      D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
567      A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>],
568      B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
569      : !mat32f32 -> !mat32f32
570  return %result2 : !mat32f32
571}
572
573// -----
574
575func.func @elect_one_leader_sync() {
576  // CHECK: %[[RES:.*]] = nvvm.elect.sync -> i1
577  %cnd = nvvm.elect.sync -> i1
578  return
579}
580
581// -----
582
583// CHECK-LABEL: @stmatrix(
584// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !llvm.ptr<3>,
585// CHECK-SAME: %[[arg1:[a-zA-Z0-9_]+]]: i32,
586// CHECK-SAME: %[[arg2:[a-zA-Z0-9_]+]]: i32,
587// CHECK-SAME: %[[arg3:[a-zA-Z0-9_]+]]: i32,
588// CHECK-SAME: %[[arg4:[a-zA-Z0-9_]+]]: i32)
589llvm.func @stmatrix(%arg0 : !llvm.ptr<3>, %m1 : i32, %m2 : i32, %m3 : i32, %m4 : i32) {
590// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x1.m8n8.shared.b16 [$0], {$1};", "r,r" %[[arg0]], %[[arg1]] : (!llvm.ptr<3>, i32) -> ()
591// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x2.m8n8.shared.b16 [$0], {$1, $2};", "r,r,r" %[[arg0]], %[[arg1]], %[[arg2]] : (!llvm.ptr<3>, i32, i32) -> ()
592// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x4.m8n8.shared.b16 [$0], {$1, $2, $3, $4};", "r,r,r,r,r" %[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]], %[[arg4]] : (!llvm.ptr<3>, i32, i32, i32, i32) -> ()
593// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x1.trans.m8n8.shared.b16 [$0], {$1};", "r,r" %[[arg0]], %[[arg1]] : (!llvm.ptr<3>, i32) -> ()
594// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x2.trans.m8n8.shared.b16 [$0], {$1, $2};", "r,r,r" %[[arg0]], %[[arg1]], %[[arg2]] : (!llvm.ptr<3>, i32, i32) -> ()
595// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x4.trans.m8n8.shared.b16 [$0], {$1, $2, $3, $4};", "r,r,r,r,r" %[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]], %[[arg4]] : (!llvm.ptr<3>, i32, i32, i32, i32) -> ()
596  nvvm.stmatrix %arg0, %m1 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32
597  nvvm.stmatrix %arg0, %m1, %m2 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32, i32
598  nvvm.stmatrix %arg0, %m1, %m2, %m3, %m4 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32, i32, i32, i32
599  nvvm.stmatrix %arg0, %m1 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32
600  nvvm.stmatrix %arg0, %m1, %m2 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32, i32
601  nvvm.stmatrix %arg0, %m1, %m2, %m3, %m4 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32, i32, i32, i32
602  llvm.return
603}
604
605// -----
606
607// CHECK-LABEL: @init_mbarrier_arrive_expect_tx
608llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) {
609  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "prefetch.tensormap [$0];", "l"
610  nvvm.prefetch.tensormap %desc : !llvm.ptr
611  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b"
612  nvvm.prefetch.tensormap %desc, predicate = %pred : !llvm.ptr, i1
613  llvm.return
614}
615
616// -----
617
618func.func @set_max_register() {
619  // CHECK: nvvm.setmaxregister increase 232
620  nvvm.setmaxregister increase 232
621
622  // CHECK: nvvm.setmaxregister decrease 40
623  nvvm.setmaxregister decrease 40
624  func.return
625}
626
627// -----
628
629func.func @cp_async_bulk_commit() {
630  // CHECK: nvvm.cp.async.bulk.commit.group
631  nvvm.cp.async.bulk.commit.group
632  func.return
633}
634
635// -----
636
637func.func @cp_async_bulk_wait_group() {
638  // CHECK: nvvm.cp.async.bulk.wait_group 1
639  // CHECK: nvvm.cp.async.bulk.wait_group 0
640  // CHECK: nvvm.cp.async.bulk.wait_group 5 {read}
641  // CHECK: nvvm.cp.async.bulk.wait_group 0 {read}
642  nvvm.cp.async.bulk.wait_group 1
643  nvvm.cp.async.bulk.wait_group 0
644  nvvm.cp.async.bulk.wait_group 5 {read}
645  nvvm.cp.async.bulk.wait_group 0 {read}
646  func.return
647}
648
649// -----
650
651func.func @fence_mbarrier_init() {
652  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;"
653  nvvm.fence.mbarrier.init
654  func.return
655}
656// -----
657
658func.func @fence_proxy() {
659  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.alias;", ""  : () -> ()
660  nvvm.fence.proxy { kind = #nvvm.proxy_kind<alias>}
661  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async;", ""  : () -> ()
662  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async>}
663  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.global;", ""  : () -> ()
664  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.global>}
665  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cta;", ""  : () -> ()
666  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
667  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cluster;", ""  : () -> ()
668  nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cluster>}
669  func.return
670}
671
672// -----
673
674// CHECK-LABEL: @llvm_nvvm_barrier_arrive
675// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
676llvm.func @llvm_nvvm_barrier_arrive(%barID : i32, %numberOfThreads : i32) {
677  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive 0, $0;", "r" %[[numberOfThreads]] : (i32) -> ()
678  nvvm.barrier.arrive number_of_threads = %numberOfThreads
679  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive $0, $1;", "r,r" %[[barId]], %[[numberOfThreads]] : (i32, i32) -> ()
680  nvvm.barrier.arrive id = %barID number_of_threads = %numberOfThreads
681  llvm.return
682}
683