1// RUN: mlir-opt --convert-nvvm-to-llvm --convert-arith-to-llvm --split-input-file %s | FileCheck %s 2 3// Same below, but using the `ConvertToLLVMPatternInterface` entry point 4// and the generic `convert-to-llvm` pass. 5// RUN: mlir-opt --convert-to-llvm --split-input-file %s | FileCheck %s 6 7// CHECK-LABEL: @init_mbarrier 8llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) { 9 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.shared.b64 [$0], $1;", "r,r,b" 10 nvvm.mbarrier.init.shared %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 11 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" 12 nvvm.mbarrier.init %barrier_gen, %count, predicate = %pred : !llvm.ptr, i32, i1 13 llvm.return 14} 15 16// CHECK-LABEL: @init_mbarrier_arrive_expect_tx 17llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) { 18 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r" 19 nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount : !llvm.ptr<3>, i32 20 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b" 21 nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1 22 llvm.return 23} 24 25// CHECK-LABEL: @init_mbarrier_arrive_expect_tx_generic 26llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32, %pred : i1) { 27 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r" 28 nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32 29 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.b64 _, [$0], $1;", "l,r,b" 30 nvvm.mbarrier.arrive.expect_tx %barrier, %txcount, predicate = %pred : !llvm.ptr, i32, i1 31 llvm.return 32} 33 34// CHECK-LABEL: @init_mbarrier_try_wait_shared 35llvm.func @init_mbarrier_try_wait_shared(%barrier : !llvm.ptr<3>, %ticks : i32, %phase : i32) { 36 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 37 // CHECK-SAME: "{ 38 // CHECK-SAME: .reg .pred P1; 39 // CHECK-SAME: LAB_WAIT: 40 // CHECK-SAME: mbarrier.try_wait.parity.shared.b64 P1, [$0], $1, $2; 41 // CHECK-SAME: @P1 bra.uni DONE; 42 // CHECK-SAME: bra.uni LAB_WAIT; 43 // CHECK-SAME: DONE: 44 // CHECK-SAME: }", 45 // CHECK-SAME: "r,r,r" 46 nvvm.mbarrier.try_wait.parity.shared %barrier, %phase, %ticks : !llvm.ptr<3>, i32, i32 47 llvm.return 48} 49 50// CHECK-LABEL: @init_mbarrier_try_wait 51llvm.func @init_mbarrier_try_wait(%barrier : !llvm.ptr, %ticks : i32, %phase : i32){ 52 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att 53 // CHECK-SAME: "{ 54 // CHECK-SAME: .reg .pred P1; 55 // CHECK-SAME: LAB_WAIT: 56 // CHECK-SAME: mbarrier.try_wait.parity.b64 P1, [$0], $1, $2; 57 // CHECK-SAME: @P1 bra.uni DONE; 58 // CHECK-SAME: bra.uni LAB_WAIT; 59 // CHECK-SAME: DONE: 60 // CHECK-SAME: }", 61 // CHECK-SAME: "l,r,r" 62 nvvm.mbarrier.try_wait.parity %barrier, %phase, %ticks : !llvm.ptr, i32, i32 63 llvm.return 64} 65 66// CHECK-LABEL: @async_cp 67func.func @async_cp(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>) { 68 // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1> 69 nvvm.cp.async.shared.global %dst, %src, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1> 70 // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache = cg : !llvm.ptr<3>, !llvm.ptr<1> 71 nvvm.cp.async.shared.global %dst, %src, 16, cache = cg : !llvm.ptr<3>, !llvm.ptr<1> 72 return 73} 74 75// CHECK-LABEL: @async_cp_zfill 76func.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32) { 77 // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache = cg, %{{.*}} : !llvm.ptr<3>, !llvm.ptr<1>, i32 78 nvvm.cp.async.shared.global %dst, %src, 16, cache = cg, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32 79 // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 4, cache = ca, %{{.*}} : !llvm.ptr<3>, !llvm.ptr<1>, i32 80 nvvm.cp.async.shared.global %dst, %src, 4, cache = ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32 81 return 82} 83 84// CHECK-LABEL: @cp_async_mbarrier_arrive 85func.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) { 86 // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} 87 nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr 88 // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} 89 nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr 90 // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} 91 nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3> 92 // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} {noinc = true} 93 nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3> 94 llvm.return 95} 96 97// CHECK-LABEL: @tma_load_3d_all 98func.func @tma_load_3d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { 99 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l" 100 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr 101 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l,b" 102 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr 103 return 104} 105 106// CHECK-LABEL: @tma_load_4d_all 107func.func @tma_load_4d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { 108 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l" 109 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr 110 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$11 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l,b" 111 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr 112 return 113} 114 115// CHECK-LABEL: @tma_load_5d_all 116func.func @tma_load_5d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %off0: i16, %off1: i16, %off2: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { 117 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l" 118 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr 119 // CHECK: lvm.inline_asm has_side_effects asm_dialect = att "@$13 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l,b" 120 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr 121 return 122} 123 124// CHECK-LABEL: @tma_load_1d 125func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %p : i1) { 126 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r" 127 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] : !llvm.ptr<3>, !llvm.ptr 128 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r,b" 129 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] predicate=%p : !llvm.ptr<3>, !llvm.ptr 130 return 131} 132 133// CHECK-LABEL: @tma_load_2d 134func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) { 135 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r" 136 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr 137 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r,b" 138 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] predicate=%p : !llvm.ptr<3>, !llvm.ptr 139 return 140} 141 142// CHECK-LABEL: @tma_load_3d 143func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) { 144 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r" 145 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr 146 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r,b" 147 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] predicate=%p : !llvm.ptr<3>, !llvm.ptr 148 return 149} 150 151// CHECK-LABEL: @tma_load_4d 152func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) { 153 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r" 154 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr 155 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r,b" 156 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] predicate=%p : !llvm.ptr<3>, !llvm.ptr 157 return 158} 159 160// CHECK-LABEL: @tma_load_5d 161func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) { 162 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r" 163 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr 164 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r,b" 165 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] predicate=%p : !llvm.ptr<3>, !llvm.ptr 166 return 167} 168 169// CHECK-LABEL: @tma_load_multicast1d 170func.func @tma_load_multicast1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %p : i1) { 171 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h" 172 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr 173 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h,b" 174 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr 175 return 176} 177 178// CHECK-LABEL: @tma_load_multicast2d 179func.func @tma_load_multicast2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %p : i1) { 180 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h" 181 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr 182 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h,b" 183 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr 184 return 185} 186 187// CHECK-LABEL: @tma_load_multicast3d 188func.func @tma_load_multicast3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) { 189 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h" 190 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr 191 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h,b" 192 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr 193 return 194} 195 196// CHECK-LABEL: @tma_load_multicast4d 197func.func @tma_load_multicast4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) { 198 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h" 199 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask: !llvm.ptr<3>, !llvm.ptr 200 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h,b" 201 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr 202 return 203} 204 205// CHECK-LABEL: @tma_load_multicast5d 206func.func @tma_load_multicast5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) { 207 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h" 208 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr 209 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h,b" 210 nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr 211 return 212} 213 214// CHECK-LABEL: @tma_store_1d 215func.func @tma_store_1d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %p : i1) { 216 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r" 217 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0] : !llvm.ptr, !llvm.ptr<3>, i32 218 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$3 cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r,b" 219 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i1 220 return 221} 222 223// CHECK-LABEL: @tma_store_2d 224func.func @tma_store_2d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) { 225 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r" 226 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1] : !llvm.ptr, !llvm.ptr<3>, i32, i32 227 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [$0, {$2, $3} ], [$1];", "l,r,r,r,b" 228 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i1 229 return 230} 231 232// CHECK-LABEL: @tma_store_3d 233func.func @tma_store_3d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) { 234 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r" 235 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32 236 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [$0, {$2, $3, $4} ], [$1];", "l,r,r,r,r,b" 237 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i1 238 return 239} 240 241// CHECK-LABEL: @tma_store_4d 242func.func @tma_store_4d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) { 243 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r" 244 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32 245 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5} ], [$1];", "l,r,r,r,r,r,b" 246 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i1 247 return 248} 249 250// CHECK-LABEL: @tma_store_5d 251func.func @tma_store_5d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) { 252 // CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r" 253 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32 254 255 // CHECK-NEXT: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [$0, {$2, $3, $4, $5, $6} ], [$1];", "l,r,r,r,r,r,r,b" 256 nvvm.cp.async.bulk.tensor.global.shared.cta %tmaDescriptor, %src, box[%crd0,%crd1,%crd2,%crd3,%crd4], predicate=%p : !llvm.ptr, !llvm.ptr<3>, i32, i32, i32, i32, i32, i1 257 return 258} 259 260// CHECK-LABEL: @wgmma_execute 261func.func @wgmma_execute() { 262 nvvm.wgmma.fence.aligned 263 nvvm.wgmma.commit.group.sync.aligned 264 nvvm.wgmma.wait.group.sync.aligned 0 265 // CHECK: nvvm.wgmma.fence.aligned 266 // CHECK: nvvm.wgmma.commit.group.sync.aligned 267 // CHECK: nvvm.wgmma.wait.group.sync.aligned 0 268 269 270 nvvm.wgmma.fence.aligned 271 nvvm.wgmma.commit.group.sync.aligned 272 nvvm.wgmma.wait.group.sync.aligned 5 273 // CHECK: nvvm.wgmma.fence.aligned 274 // CHECK: nvvm.wgmma.commit.group.sync.aligned 275 // CHECK: nvvm.wgmma.wait.group.sync.aligned 5 276 return 277} 278 279 280// ----- 281 282!mat64f32 = !llvm.struct<( 283 f32, f32, f32, f32, f32, f32, f32, f32, 284 f32, f32, f32, f32, f32, f32, f32, f32)> 285 286// CHECK-LABEL: @wgmma_f32_f16_f16( 287// CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64 288func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{ 289 // CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct 290 // CHECK: %[[A1:.*]] = llvm.mlir.constant(0 : i32) : i32 291 // CHECK: %[[A2:.*]] = llvm.mlir.constant(-1 : i32) : i32 292 // CHECK: %[[A3:.*]] = llvm.mlir.constant(-1 : i32) : i32 293 // CHECK: %[[A4:.*]] = llvm.mlir.constant(1 : i32) : i32 294 // CHECK: %[[A5:.*]] = llvm.mlir.constant(0 : i32) : i32 295 // CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 296 // CHECK: %[[V4:.*]] = llvm.extractvalue %[[RES]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 297 // CHECK: %[[V11:.*]] = llvm.extractvalue %[[RES]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 298 // CHECK: %[[V13:.*]] = llvm.extractvalue %[[RES]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 299 // CHECK: %[[RES1:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 300 // CHECK-SAME: "{ 301 // CHECK-SAME: reg .pred p; 302 // CHECK-SAME: setp.ne.b32 p, $34, 0; 303 // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 304 // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35, $36, $37, $38;\0A}\0A", 305 // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" 306 // CHECK-SAME: %[[V0]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11]], %{{.*}}, %[[V13]], %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[A1]], %[[A2]], %[[A3]], %[[A4]], %[[A5]] 307 // CHECK-SAME: : (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, i64, i64, i32, i32, i32, i32, i32) 308 // CHECK-SAME: -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 309 // CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64 310 // CHECK: %[[DESCa:.+]] = llvm.add %[[ARG0]], %[[C2]] : i64 311 // CHECK: %[[DESCb:.+]] = llvm.add %[[ARG1]], %[[C2]] : i64 312 // CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES1]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 313 // CHECK: %[[V4_2:.*]] = llvm.extractvalue %[[RES1]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 314 // CHECK: %[[V11_2:.*]] = llvm.extractvalue %[[RES1]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 315 // CHECK: %[[V13_2:.*]] = llvm.extractvalue %[[RES1]][13] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> 316 // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 317 // CHECK-SAME: "{ 318 // CHECK-SAME: .reg .pred p; 319 // CHECK-SAME: setp.ne.b32 p, $34, 0; 320 // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n32k16.f32.f16.f16 321 // CHECK-SAME: {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}, $32, $33, p, $35, $36, $37, $38;\0A}\0A", 322 // CHECK-SAME: "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,l,l,n,n,n,n,n" 323 // CHECK-SAME: %[[V0_2]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4_2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11_2]], %{{.*}}, %[[V13_2]], %{{.*}}, %{{.*}}, %[[DESCa]], %[[DESCb]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} 324 %result = llvm.mlir.undef : !mat64f32 325 %result1 = nvvm.wgmma.mma_async 326 %descA, %descB, %result, 327 #nvvm.shape<m = 64, n = 32, k = 16>, 328 D [<f32>, #nvvm.wgmma_scale_out<zero>], 329 A [<f16>, #nvvm.wgmma_scale_in<neg>, <col>], 330 B [<f16>, #nvvm.wgmma_scale_in<neg>, <col>] 331 :!mat64f32 -> !mat64f32 332 %c2 = arith.constant 2 : i64 333 %descAnext = arith.addi %descA, %c2 : i64 334 %descBnext = arith.addi %descB, %c2 : i64 335 %result2 = nvvm.wgmma.mma_async 336 %descAnext, %descBnext, %result1, 337 #nvvm.shape<m = 64, n = 32, k = 16>, 338 D [<f32>, #nvvm.wgmma_scale_out<zero>], 339 A [<f16>, #nvvm.wgmma_scale_in<neg>, <col>], 340 B [<f16>, #nvvm.wgmma_scale_in<neg>, <col>] 341 : !mat64f32 -> !mat64f32 342 return %result2 : !mat64f32 343} 344 345// ----- 346 347!mat16i32 = !llvm.struct<(i32, i32, i32, i32)> 348 349// CHECK-LABEL: @wgmma_s32_s8_s8_satfinite( 350// CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64 351func.func @wgmma_s32_s8_s8_satfinite(%descA : i64, %descB : i64) -> !mat16i32{ 352 %result = llvm.mlir.undef : !mat16i32 353// CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct 354// CHECK: %[[A1:.*]] = llvm.mlir.constant(1 : i32) : i32 355// CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0] 356// CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1] 357// CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2] 358// CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3] 359// CHECK: %[[RES_2:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 360// CHECK-SAME: "{ 361// CHECK-SAME: .reg .pred p; 362// CHECK-SAME: setp.ne.b32 p, $10, 0; 363// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite 364// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" 365// CHECK-SAME: %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : 366// CHECK-SAME: (i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)> 367// CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0] 368// CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1] 369// CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2] 370// CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3] 371// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 372// CHECK-SAME: "{ 373// CHECK-SAME: .reg .pred p; 374// CHECK-SAME: setp.ne.b32 p, $10, 0; 375// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite 376// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", 377// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" 378// CHECK-SAME: %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}} 379// CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0] 380// CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1] 381// CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2] 382// CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3] 383// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 384// CHECK-SAME:"{ 385// CHECK-SAME:.reg .pred p; 386// CHECK-SAME: setp.ne.b32 p, $10, 0; 387// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite 388// CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" 389// CHECK-SAME: %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 390 %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, 391 #nvvm.shape<m = 64, n = 8, k = 32>, 392 D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>], 393 A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 394 B [<s8>, #nvvm.wgmma_scale_in<one>, <col>] 395 : !mat16i32 -> !mat16i32 396 %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, 397 #nvvm.shape<m = 64, n = 8, k = 32>, 398 D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>], 399 A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 400 B [<s8>, #nvvm.wgmma_scale_in<one>, <col>] 401 : !mat16i32 -> !mat16i32 402 %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2, 403 #nvvm.shape<m = 64, n = 8, k = 32>, 404 D [<s32>, #nvvm.wgmma_scale_out<one>, <satfinite>], 405 A [<s8>, #nvvm.wgmma_scale_in<one>, <row>], 406 B [<s8>, #nvvm.wgmma_scale_in<one>, <col>] 407 : !mat16i32 -> !mat16i32 408 return %result3 : !mat16i32 409} 410 411// CHECK-LABEL: @wgmma_s32_u8_u8( 412 // CHECK-SAME: %[[ARG0:.+]]: i64, %[[ARG1:.+]]: i64 413func.func @wgmma_s32_u8_u8(%descA : i64, %descB : i64) -> !mat16i32 { 414// CHECK: %[[RES:.*]] = llvm.mlir.undef : !llvm.struct 415// CHECK: %[[A1:.*]] = llvm.mlir.constant(1 : i32) : i32 416// CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0] 417// CHECK: %[[V1:.*]] = llvm.extractvalue %[[RES]][1] 418// CHECK: %[[V2:.*]] = llvm.extractvalue %[[RES]][2] 419// CHECK: %[[V3:.*]] = llvm.extractvalue %[[RES]][3] 420// CHECK: %[[RES_2:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 421// CHECK-SAME: "{ 422// CHECK-SAME: .reg .pred p; 423// CHECK-SAME: setp.ne.b32 p, $10, 0; 424// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p; 425// CHECK-SAME: }\0A", 426// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0]], %[[V1]], %[[V2]], %[[V3]], %[[ARG0]], %[[ARG1]], %[[A1]] : 427// CHECK-SAME:(i32, i32, i32, i32, i64, i64, i32) -> !llvm.struct<(i32, i32, i32, i32)> 428// CHECK: %[[V0_2:.*]] = llvm.extractvalue %[[RES_2]][0] 429// CHECK: %[[V1_2:.*]] = llvm.extractvalue %[[RES_2]][1] 430// CHECK: %[[V2_2:.*]] = llvm.extractvalue %[[RES_2]][2] 431// CHECK: %[[V3_2:.*]] = llvm.extractvalue %[[RES_2]][3] 432// CHECK: %[[RES_3:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 433// CHECK-SAME:"{ 434// CHECK-SAME: .reg .pred p; 435// CHECK-SAME: setp.ne.b32 p, $10, 0; 436// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p; 437// CHECK-SAME: }\0A", 438// CHECK-SAME: "=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_2]], %[[V1_2]], %[[V2_2]], %[[V3_2]], %[[ARG0]], %[[ARG1]], %{{.*}} 439// CHECK: %[[V0_3:.*]] = llvm.extractvalue %[[RES_3]][0] 440// CHECK: %[[V1_3:.*]] = llvm.extractvalue %[[RES_3]][1] 441// CHECK: %[[V2_3:.*]] = llvm.extractvalue %[[RES_3]][2] 442// CHECK: %[[V3_3:.*]] = llvm.extractvalue %[[RES_3]][3] 443// CHECK: %[[RES1:.*]] = llvm.inline_asm has_side_effects asm_dialect = att 444// CHECK-SAME:"{ 445// CHECK-SAME: .reg .pred p; 446// CHECK-SAME: setp.ne.b32 p, $10, 0; 447// CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.u8.u8 {$0, $1, $2, $3}, $8, $9, p; 448// CHECK-SAME:}\0A", 449// CHECK-SAME:"=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} 450 %result = llvm.mlir.undef : !mat16i32 451 %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, 452 #nvvm.shape<m = 64, n = 8, k = 32>, 453 D [<s32>, #nvvm.wgmma_scale_out<one>], 454 A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 455 B [<u8>, #nvvm.wgmma_scale_in<one>, <col>] 456 : !mat16i32 -> !mat16i32 457 %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, 458 #nvvm.shape<m = 64, n = 8, k = 32>, 459 D [<s32>, #nvvm.wgmma_scale_out<one>], 460 A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 461 B [<u8>, #nvvm.wgmma_scale_in<one>, <col>] 462 : !mat16i32 -> !mat16i32 463 %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2, 464 #nvvm.shape<m = 64, n = 8, k = 32>, 465 D [<s32>, #nvvm.wgmma_scale_out<one>], 466 A [<u8>, #nvvm.wgmma_scale_in<one>, <row>], 467 B [<u8>, #nvvm.wgmma_scale_in<one>, <col>] 468 : !mat16i32 -> !mat16i32 469 return %result3 : !mat16i32 470} 471 472// ----- 473 474!mat32f32 = !llvm.struct<( 475 f32, f32, f32, f32, f32, f32, f32, f32, 476 f32, f32, f32, f32, f32, f32, f32, f32, 477 f32, f32, f32, f32, f32, f32, f32, f32, 478 f32, f32, f32, f32, f32, f32, f32, f32)> 479 480// CHECK-LABEL: @wgmma_f32_tf32_tf32 481func.func @wgmma_f32_tf32_tf32(%descA : i64, %descB : i64) -> !mat32f32 { 482 // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 483 // CHECK-SAME:"{ 484 // CHECK-SAME: .reg .pred p; 485 // CHECK-SAME: setp.ne.b32 p, $66, 0; 486 // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67, $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n" 487 // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 488 // CHECK-SAME: "{ 489 // CHECK-SAME: .reg .pred p; 490 // CHECK-SAME: setp.ne.b32 p, $66, 0; 491 // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67, $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n" 492 %result = llvm.mlir.undef : !mat32f32 493 %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, 494 #nvvm.shape<m = 64, n = 64, k = 8>, 495 D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>], 496 A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 497 B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>] 498 : !mat32f32 -> !mat32f32 499 %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, 500 #nvvm.shape<m = 64, n = 64, k = 8>, 501 D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>], 502 A [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 503 B [#nvvm.wgmma_type<tf32>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>] 504 : !mat32f32 -> !mat32f32 505 return %result2 : !mat32f32 506} 507 508 509// ----- 510 511!mat32f32 = !llvm.struct<( 512 f32, f32, f32, f32, f32, f32, f32, f32, 513 f32, f32, f32, f32, f32, f32, f32, f32, 514 f32, f32, f32, f32, f32, f32, f32, f32, 515 f32, f32, f32, f32, f32, f32, f32, f32)> 516 517// CHECK-LABEL: @wgmma_f32_e4m3_e4m3 518func.func @wgmma_f32_e4m3_e4m3(%descA : i64, %descB : i64) -> !mat32f32 { 519 // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 520 // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0; 521 // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67, $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n" 522 // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 523 // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0; 524 // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67, $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n" 525 %result = llvm.mlir.undef : !mat32f32 526 %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, 527 #nvvm.shape<m = 64, n = 64, k = 32>, 528 D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>], 529 A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 530 B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>] 531 : !mat32f32 -> !mat32f32 532 %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, 533 #nvvm.shape<m = 64, n = 64, k = 32>, 534 D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>], 535 A [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 536 B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>] 537 : !mat32f32 -> !mat32f32 538 return %result2 : !mat32f32 539} 540 541// ----- 542 543!mat32f32 = !llvm.struct<( 544 f32, f32, f32, f32, f32, f32, f32, f32, 545 f32, f32, f32, f32, f32, f32, f32, f32, 546 f32, f32, f32, f32, f32, f32, f32, f32, 547 f32, f32, f32, f32, f32, f32, f32, f32)> 548 549// CHECK-LABEL: @wgmma_f32_e5m2_e4m3 550func.func @wgmma_f32_e5m2_e4m3(%descA : i64, %descB : i64) -> !mat32f32 { 551 // CHECK: %[[RES:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 552 // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0; 553 // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67, $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n" 554 // CHECK: %[[RES_2:.+]] = llvm.inline_asm has_side_effects asm_dialect = att 555 // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0; 556 // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67, $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n" 557 %result = llvm.mlir.undef : !mat32f32 558 %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, 559 #nvvm.shape<m = 64, n = 64, k = 32>, 560 D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>], 561 A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 562 B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>] 563 : !mat32f32 -> !mat32f32 564 %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, 565 #nvvm.shape<m = 64, n = 64, k = 32>, 566 D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>], 567 A [#nvvm.wgmma_type<e5m2>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>], 568 B [#nvvm.wgmma_type<e4m3>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>] 569 : !mat32f32 -> !mat32f32 570 return %result2 : !mat32f32 571} 572 573// ----- 574 575func.func @elect_one_leader_sync() { 576 // CHECK: %[[RES:.*]] = nvvm.elect.sync -> i1 577 %cnd = nvvm.elect.sync -> i1 578 return 579} 580 581// ----- 582 583// CHECK-LABEL: @stmatrix( 584// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: !llvm.ptr<3>, 585// CHECK-SAME: %[[arg1:[a-zA-Z0-9_]+]]: i32, 586// CHECK-SAME: %[[arg2:[a-zA-Z0-9_]+]]: i32, 587// CHECK-SAME: %[[arg3:[a-zA-Z0-9_]+]]: i32, 588// CHECK-SAME: %[[arg4:[a-zA-Z0-9_]+]]: i32) 589llvm.func @stmatrix(%arg0 : !llvm.ptr<3>, %m1 : i32, %m2 : i32, %m3 : i32, %m4 : i32) { 590// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x1.m8n8.shared.b16 [$0], {$1};", "r,r" %[[arg0]], %[[arg1]] : (!llvm.ptr<3>, i32) -> () 591// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x2.m8n8.shared.b16 [$0], {$1, $2};", "r,r,r" %[[arg0]], %[[arg1]], %[[arg2]] : (!llvm.ptr<3>, i32, i32) -> () 592// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x4.m8n8.shared.b16 [$0], {$1, $2, $3, $4};", "r,r,r,r,r" %[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]], %[[arg4]] : (!llvm.ptr<3>, i32, i32, i32, i32) -> () 593// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x1.trans.m8n8.shared.b16 [$0], {$1};", "r,r" %[[arg0]], %[[arg1]] : (!llvm.ptr<3>, i32) -> () 594// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x2.trans.m8n8.shared.b16 [$0], {$1, $2};", "r,r,r" %[[arg0]], %[[arg1]], %[[arg2]] : (!llvm.ptr<3>, i32, i32) -> () 595// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "stmatrix.sync.aligned.x4.trans.m8n8.shared.b16 [$0], {$1, $2, $3, $4};", "r,r,r,r,r" %[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]], %[[arg4]] : (!llvm.ptr<3>, i32, i32, i32, i32) -> () 596 nvvm.stmatrix %arg0, %m1 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32 597 nvvm.stmatrix %arg0, %m1, %m2 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32, i32 598 nvvm.stmatrix %arg0, %m1, %m2, %m3, %m4 {layout = #nvvm.mma_layout<row>} : !llvm.ptr<3>, i32, i32, i32, i32 599 nvvm.stmatrix %arg0, %m1 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32 600 nvvm.stmatrix %arg0, %m1, %m2 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32, i32 601 nvvm.stmatrix %arg0, %m1, %m2, %m3, %m4 {layout = #nvvm.mma_layout<col>} : !llvm.ptr<3>, i32, i32, i32, i32 602 llvm.return 603} 604 605// ----- 606 607// CHECK-LABEL: @init_mbarrier_arrive_expect_tx 608llvm.func @init_mbarrier_arrive_expect_tx(%desc : !llvm.ptr, %pred : i1) { 609 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "prefetch.tensormap [$0];", "l" 610 nvvm.prefetch.tensormap %desc : !llvm.ptr 611 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$1 prefetch.tensormap [$0];", "l,b" 612 nvvm.prefetch.tensormap %desc, predicate = %pred : !llvm.ptr, i1 613 llvm.return 614} 615 616// ----- 617 618func.func @set_max_register() { 619 // CHECK: nvvm.setmaxregister increase 232 620 nvvm.setmaxregister increase 232 621 622 // CHECK: nvvm.setmaxregister decrease 40 623 nvvm.setmaxregister decrease 40 624 func.return 625} 626 627// ----- 628 629func.func @cp_async_bulk_commit() { 630 // CHECK: nvvm.cp.async.bulk.commit.group 631 nvvm.cp.async.bulk.commit.group 632 func.return 633} 634 635// ----- 636 637func.func @cp_async_bulk_wait_group() { 638 // CHECK: nvvm.cp.async.bulk.wait_group 1 639 // CHECK: nvvm.cp.async.bulk.wait_group 0 640 // CHECK: nvvm.cp.async.bulk.wait_group 5 {read} 641 // CHECK: nvvm.cp.async.bulk.wait_group 0 {read} 642 nvvm.cp.async.bulk.wait_group 1 643 nvvm.cp.async.bulk.wait_group 0 644 nvvm.cp.async.bulk.wait_group 5 {read} 645 nvvm.cp.async.bulk.wait_group 0 {read} 646 func.return 647} 648 649// ----- 650 651func.func @fence_mbarrier_init() { 652 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;" 653 nvvm.fence.mbarrier.init 654 func.return 655} 656// ----- 657 658func.func @fence_proxy() { 659 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.alias;", "" : () -> () 660 nvvm.fence.proxy { kind = #nvvm.proxy_kind<alias>} 661 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async;", "" : () -> () 662 nvvm.fence.proxy { kind = #nvvm.proxy_kind<async>} 663 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.global;", "" : () -> () 664 nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.global>} 665 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cta;", "" : () -> () 666 nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>} 667 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cluster;", "" : () -> () 668 nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cluster>} 669 func.return 670} 671 672// ----- 673 674// CHECK-LABEL: @llvm_nvvm_barrier_arrive 675// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32) 676llvm.func @llvm_nvvm_barrier_arrive(%barID : i32, %numberOfThreads : i32) { 677 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive 0, $0;", "r" %[[numberOfThreads]] : (i32) -> () 678 nvvm.barrier.arrive number_of_threads = %numberOfThreads 679 // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive $0, $1;", "r,r" %[[barId]], %[[numberOfThreads]] : (i32, i32) -> () 680 nvvm.barrier.arrive id = %barID number_of_threads = %numberOfThreads 681 llvm.return 682} 683