; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
; GFX12-NEXT:    v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
; GFX12-NEXT:    v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x42004200
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
  store <8 x half> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x3f803f80
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x3fc03fc0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
; GFX12-NEXT:    v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_movk_i32 s0, 0x80
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_movk_i32 s0, 0x80
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
; GFX12-NEXT:    v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
; GFX12-NEXT:    v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
; GFX12-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT:    v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_mov_b32 s0, 0x40400000
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
  store <8 x float> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX12:       ; %bb.0: ; %bb
; GFX12-NEXT:    s_movk_i32 s0, 0x80
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mov_b32 s7, s0
; GFX12-NEXT:    s_mov_b32 s1, s0
; GFX12-NEXT:    s_mov_b32 s2, s0
; GFX12-NEXT:    s_mov_b32 s3, s0
; GFX12-NEXT:    s_mov_b32 s4, s0
; GFX12-NEXT:    s_mov_b32 s5, s0
; GFX12-NEXT:    s_mov_b32 s6, s0
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX12-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX12-NEXT:    v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
; GFX12-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12-NEXT:    v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT:    s_endpgm
bb:
  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
  store <8 x i32> %res, ptr addrspace(1) %out
  ret void
}

declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)