; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

; Verifies that fneg/fabs applied to WMMA and SWMMAC matrix operands (A, B, or
; the C accumulator) are folded into the instructions' neg_lo/neg_hi source
; modifiers rather than being emitted as separate VALU instructions.
; fneg on f16 operands sets both neg_lo and neg_hi; fneg on the f32 C operand
; sets only neg_lo; fabs on C sets only neg_hi; fneg(fabs(C)) sets both.

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x half> %C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; fp8/bf8 variants take packed <2 x i32> A/B operands; only the f32 C
; accumulator can carry fneg (neg_lo) / fabs (neg_hi) modifiers here.

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; swmmac (sparse) variants carry an extra i16 %Index operand; fneg on the
; f16 A/B operands still folds into neg_lo/neg_hi the same way.

define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; both neg and abs patterns (wmma matrix C f32 or f16 )

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %fneg.fabs.C = fneg <8 x float> %fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_endpgm
bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %fneg.fabs.C = fneg <8 x half> %fabs.C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; A per-element (partial) fabs cannot fold into neg_hi: the single lane is
; cleared with v_and_b32 0x7fffffff, and only the whole-vector fneg becomes
; neg_lo on the instruction.
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %el3 = extractelement <8 x float> %C, i32 3
  %el3.fabs = call float @llvm.fabs.f32(float %el3)
  %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

; A or B matrix modifier and constant in C

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_endpgm
bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_endpgm
bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

; pack f16 elements with v_perm_b32 since they don't come from same b32

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9]
; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101
; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v14
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v18
; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v8
; GFX12-NEXT: v_lshl_or_b32 v13, v15, 16, v9
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_lshl_or_b32 v14, v17, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v15, v19, 16, v16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off
; GFX12-NEXT: s_endpgm
bb:
  %C = load <16 x half>, ptr %Caddr
  %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %fneg.C_shuffle = fneg <8 x half> %C_shuffle
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
declare float @llvm.fabs.f32(float)

declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)