; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

; Each test below calls one integer WMMA / SWMMAC intrinsic with exactly one
; of its three i1 immediate operands set to 1 and stores the result to %out.
; As the expected output in this file shows:
;   * first  i1 operand set -> neg_lo:[1,0,0] on the selected instruction
;   * second i1 operand set -> neg_lo:[0,1,0] on the selected instruction
;   * last   i1 operand set -> clamp          on the selected instruction

; ---------------------------------------------------------------------------
; wmma.i32.16x16x16.iu8
; ---------------------------------------------------------------------------

define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; ---------------------------------------------------------------------------
; wmma.i32.16x16x16.iu4
; ---------------------------------------------------------------------------

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; ---------------------------------------------------------------------------
; wmma.i32.16x16x32.iu4
; ---------------------------------------------------------------------------

define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; ---------------------------------------------------------------------------
; swmmac.i32.16x16x32.iu8 (takes an additional i8 %Index operand)
; ---------------------------------------------------------------------------

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; ---------------------------------------------------------------------------
; swmmac.i32.16x16x32.iu4 (takes an additional i16 %Index operand)
; ---------------------------------------------------------------------------

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
; GFX12-NEXT:    global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; ---------------------------------------------------------------------------
; swmmac.i32.16x16x64.iu4 (<2 x i32> %B, additional i16 %Index operand)
; ---------------------------------------------------------------------------

define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT:    global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT:    s_endpgm
bb:
  %result = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; Intrinsic declarations used by the tests above. Every i1 operand is an
; immarg, so the tests pass literal 0/1 values for them.
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)