; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s

; GlobalISel codegen test for the bf16 "1k" and f64 MFMA intrinsics on gfx90a.
; Every kernel below calls one intrinsic, stores the result to %arg, and the
; autogenerated check lines pin the complete expected instruction sequence.
; Regenerate the check lines with utils/update_llc_test_checks.py rather than
; editing them by hand.

; Intrinsics under test. In every declaration the first two operands are the
; multiplicands, the third is the accumulator, and the trailing three i32
; operands are the values that the check lines show as cbsz / abid / blgp.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
; NOTE(review): workitem.id.x is declared but no kernel in this file calls it.
declare i32 @llvm.amdgcn.workitem.id.x()

; mfma.f32.32x32x4bf16.1k with a <32 x float> accumulator loaded from %arg.
; A and B are the i64 constants 1 and 2 bitcast to <4 x i16>. The expected
; output copies the 32 loaded SGPRs into a0-a31, issues one v_mfma with
; cbsz:1 abid:2 blgp:3, and writes the result back with eight dwordx4 stores.
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 {
; GCN-LABEL: test_mfma_f32_32x32x4bf16_1k:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
; GCN-NEXT:    s_mov_b64 s[36:37], 1
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
; GCN-NEXT:    s_mov_b32 s36, 2
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx16 s[0:15], s[34:35], 0x0
; GCN-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
; GCN-NEXT:    v_accvgpr_write_b32 a8, s8
; GCN-NEXT:    v_accvgpr_write_b32 a9, s9
; GCN-NEXT:    v_accvgpr_write_b32 a10, s10
; GCN-NEXT:    v_accvgpr_write_b32 a11, s11
; GCN-NEXT:    v_accvgpr_write_b32 a12, s12
; GCN-NEXT:    v_accvgpr_write_b32 a13, s13
; GCN-NEXT:    v_accvgpr_write_b32 a14, s14
; GCN-NEXT:    v_accvgpr_write_b32 a15, s15
; GCN-NEXT:    v_accvgpr_write_b32 a16, s16
; GCN-NEXT:    v_accvgpr_write_b32 a17, s17
; GCN-NEXT:    v_accvgpr_write_b32 a18, s18
; GCN-NEXT:    v_accvgpr_write_b32 a19, s19
; GCN-NEXT:    v_accvgpr_write_b32 a20, s20
; GCN-NEXT:    v_accvgpr_write_b32 a21, s21
; GCN-NEXT:    v_accvgpr_write_b32 a22, s22
; GCN-NEXT:    v_accvgpr_write_b32 a23, s23
; GCN-NEXT:    v_accvgpr_write_b32 a24, s24
; GCN-NEXT:    v_accvgpr_write_b32 a25, s25
; GCN-NEXT:    v_accvgpr_write_b32 a26, s26
; GCN-NEXT:    v_accvgpr_write_b32 a27, s27
; GCN-NEXT:    v_accvgpr_write_b32 a28, s28
; GCN-NEXT:    v_accvgpr_write_b32 a29, s29
; GCN-NEXT:    v_accvgpr_write_b32 a30, s30
; GCN-NEXT:    v_accvgpr_write_b32 a31, s31
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[34:35]
; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
; GCN-NEXT:    global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
; GCN-NEXT:    global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
; GCN-NEXT:    global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
; GCN-NEXT:    global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
; GCN-NEXT:    s_endpgm
bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; mfma.f32.16x16x4bf16.1k with a <16 x float> accumulator loaded from %arg and
; constant A/B operands. Expected: 16 AGPR writes, one v_mfma with
; cbsz:1 abid:2 blgp:3, and four dwordx4 stores.
define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 {
; GCN-LABEL: test_mfma_f32_16x16x4bf16_1k:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
; GCN-NEXT:    s_mov_b64 s[18:19], 1
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT:    s_mov_b32 s18, 2
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
; GCN-NEXT:    v_accvgpr_write_b32 a8, s8
; GCN-NEXT:    v_accvgpr_write_b32 a9, s9
; GCN-NEXT:    v_accvgpr_write_b32 a10, s10
; GCN-NEXT:    v_accvgpr_write_b32 a11, s11
; GCN-NEXT:    v_accvgpr_write_b32 a12, s12
; GCN-NEXT:    v_accvgpr_write_b32 a13, s13
; GCN-NEXT:    v_accvgpr_write_b32 a14, s14
; GCN-NEXT:    v_accvgpr_write_b32 a15, s15
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GCN-NEXT:    s_endpgm
bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; mfma.f32.4x4x4bf16.1k with a <4 x float> accumulator loaded from %arg and
; constant A/B operands. Expected: four AGPR writes, one v_mfma with
; cbsz:1 abid:2 blgp:3, and a single dwordx4 store.
define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 {
; GCN-LABEL: test_mfma_f32_4x4x4bf16_1k:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    s_mov_b64 s[4:5], 1
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GCN-NEXT:    s_mov_b32 s4, 2
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 3
; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; mfma.f32.32x32x8bf16.1k with a <16 x float> accumulator loaded from %arg and
; constant A/B operands. Expected: 16 AGPR writes, one v_mfma with
; cbsz:1 abid:2 blgp:3, and four dwordx4 stores.
define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 {
; GCN-LABEL: test_mfma_f32_32x32x8bf16_1k:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
; GCN-NEXT:    s_mov_b64 s[18:19], 1
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT:    s_mov_b32 s18, 2
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
; GCN-NEXT:    v_accvgpr_write_b32 a8, s8
; GCN-NEXT:    v_accvgpr_write_b32 a9, s9
; GCN-NEXT:    v_accvgpr_write_b32 a10, s10
; GCN-NEXT:    v_accvgpr_write_b32 a11, s11
; GCN-NEXT:    v_accvgpr_write_b32 a12, s12
; GCN-NEXT:    v_accvgpr_write_b32 a13, s13
; GCN-NEXT:    v_accvgpr_write_b32 a14, s14
; GCN-NEXT:    v_accvgpr_write_b32 a15, s15
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
; GCN-NEXT:    s_endpgm
bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; mfma.f32.16x16x16bf16.1k with a <4 x float> accumulator loaded from %arg and
; constant A/B operands. Expected: four AGPR writes, one v_mfma with
; cbsz:1 abid:2 blgp:3, and a single dwordx4 store.
define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 {
; GCN-LABEL: test_mfma_f32_16x16x16bf16_1k:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    s_mov_b64 s[4:5], 1
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GCN-NEXT:    s_mov_b32 s4, 2
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Two chained mfma.f64.4x4x4f64 calls on kernel arguments %a and %b. The first
; uses a 0.0 accumulator with all-zero immediates and is expected to print the
; accumulator as the literal operand 0 with no modifiers; the second consumes
; the first result and is expected to carry cbsz:1 abid:2 blgp:3. The expected
; output separates the two v_mfma instructions with "s_nop 3".
define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_4x4x4f64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
; GCN-NEXT:    s_nop 3
; GCN-NEXT:    v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    global_store_dwordx2 v0, a[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
  %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
  store double %mai.2, ptr addrspace(1) %arg
  ret void
}

; mfma.f64.16x16x4f64 with a <4 x double> accumulator loaded from %arg and the
; multiplicands taken from kernel arguments. Expected: eight AGPR writes, one
; v_mfma with cbsz:1 abid:2 blgp:3, and two dwordx4 stores.
define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GCN-NEXT:    s_endpgm
bb:
  %in.1 = load <4 x double>, ptr addrspace(1) %arg
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
  store <4 x double> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Two chained mfma.f64.16x16x4f64 calls where the first accumulator is an
; all-0.0 splat. The expected output prints that accumulator as the literal
; operand 0 (no AGPR initialisation), then feeds a[0:7] into the second v_mfma
; with cbsz:1 abid:2 blgp:3. Unlike test_mfma_f64_4x4x4f64, the expected
; output has no s_nop between the two v_mfma instructions.
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_imm:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GCN-NEXT:    s_endpgm
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
  %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
  store <4 x double> %mai.2, ptr addrspace(1) %arg
  ret void
}

; mfma.f64.16x16x4f64 with a non-splat constant accumulator <0.0, 0.0, 0.0, 1.0>
; and all-zero immediates. The expected output builds the constant in
; s[0:7] (0 in s[0:5], 1.0 in s[6:7]), copies it into a0-a7, and issues the
; v_mfma with no modifiers printed.
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
; GCN-NEXT:    s_mov_b64 s[0:1], 0
; GCN-NEXT:    s_mov_b64 s[6:7], 1.0
; GCN-NEXT:    s_mov_b64 s[2:3], s[0:1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GCN-NEXT:    s_endpgm
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
  store <4 x double> %mai.1, ptr addrspace(1) %arg
  ret void
}

; mfma.f64.16x16x4f64 with a splat 123.0 accumulator and all-zero immediates.
; Unlike the 0.0 splat in test_mfma_f64_16x16x4f64_splat_imm, the expected
; output does not fold this constant into the v_mfma operand: it is built in
; s[0:1] (low dword 0, high dword 0x405ec000), replicated across s[2:7], and
; copied into a0-a7 before the v_mfma.
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
; GCN-NEXT:    s_mov_b32 s0, 0
; GCN-NEXT:    s_mov_b32 s1, 0x405ec000
; GCN-NEXT:    s_mov_b64 s[2:3], s[0:1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT:    s_mov_b64 s[6:7], s[0:1]
; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GCN-NEXT:    s_endpgm
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
  store <4 x double> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Attribute group applied to every kernel in this file.
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }