; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

; Source modifier (neg_lo/neg_hi) folding tests for gfx12 wave64 WMMA/SWMMAC:
; fneg/fabs on the A, B or C matrix operands should fold into the
; instruction's neg_lo/neg_hi bits instead of emitting separate v_xor/v_and.

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <4 x half> %C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

; both neg and abs patterns (wmma matrix C f32 or f16)

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %fneg.fabs.C = fneg <4 x float> %fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %fneg.fabs.C = fneg <4 x half> %fabs.C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

; fneg of only one element of C: cannot fold into neg_lo, but the fabs on the
; single lane is still folded via neg_hi after the elementwise and.

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_and_b32_e32 v7, 0x7fffffff, v7
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %el3 = extractelement <4 x float> %C, i32 3
  %el3.fabs = call float @llvm.fabs.f32(float %el3)
  %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

; A or B matrix modifier and constant in C

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
  store <4 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT:    s_endpgm
bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

; pack f16 elements with v_perm_b32 since they don't come from same b32

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    flat_load_b128 v[8:11], v[4:5]
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
; GFX12-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT:    s_endpgm
bb:
  %C = load <8 x half>, ptr %Caddr
  %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %fneg.C_shuffle = fneg <4 x half> %C_shuffle
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}

declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare float @llvm.fabs.f32(float)

declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half>, <4 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16>, <4 x i16>, <4 x float>)
declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32, i32, <4 x float>)
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half>, <8 x half>, <4 x float>, i16)
declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)