; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefix=GCN %s

; Codegen test for a kernel that combines two independent dependency chains:
;   * eight MFMA calls, each consuming the previous call's result as its
;     accumulator operand (the first one starts from %in1), and
;   * eight exp calls, each consuming the previous call's result (the first
;     one starts from %in0),
; and then calls the iglp.opt intrinsic with argument 3. The autogenerated
; assertions below pin the exact instruction order llc emits for this input,
; so any edit to the IR requires regenerating them with the script named in
; the first line of this file.

define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspace(1) %out1, float %in0, <4 x float> %in1) {
; GCN-LABEL: MFMAExpInterleave:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x10
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
; GCN-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
; GCN-NEXT:    s_mov_b32 s7, 0x42b17218
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mul_f32_e32 v2, s6, v1
; GCN-NEXT:    v_rndne_f32_e32 v3, v2
; GCN-NEXT:    v_sub_f32_e32 v4, v2, v3
; GCN-NEXT:    v_fma_f32 v1, s6, v1, -v2
; GCN-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
; GCN-NEXT:    v_fmac_f32_e32 v1, s6, v2
; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
; GCN-NEXT:    v_add_f32_e32 v1, v4, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v3
; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT:    v_exp_f32_e32 v1, v1
; GCN-NEXT:    s_mov_b32 s0, 0x3fb8aa3b
; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT:    ; iglp_opt mask(0x00000003)
; GCN-NEXT:    v_ldexp_f32 v1, v1, v2
; GCN-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v2
; GCN-NEXT:    v_mov_b32_e32 v2, 0x42b17218
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v2
; GCN-NEXT:    v_mov_b32_e32 v2, 0x7f800000
; GCN-NEXT:    s_mov_b32 s6, 0xc2ce8ed0
; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT:    v_rndne_f32_e32 v5, v3
; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
; GCN-NEXT:    v_exp_f32_e32 v3, v3
; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT:    v_rndne_f32_e32 v5, v3
; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
; GCN-NEXT:    v_exp_f32_e32 v3, v3
; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT:    v_rndne_f32_e32 v5, v3
; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
; GCN-NEXT:    v_exp_f32_e32 v3, v3
; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT:    v_rndne_f32_e32 v5, v3
; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
; GCN-NEXT:    v_exp_f32_e32 v3, v3
; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT:    v_rndne_f32_e32 v5, v3
; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
; GCN-NEXT:    v_exp_f32_e32 v3, v3
; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
; GCN-NEXT:    v_rndne_f32_e32 v5, v3
; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
; GCN-NEXT:    v_exp_f32_e32 v3, v3
; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
; GCN-NEXT:    v_ldexp_f32 v0, v3, v4
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GCN-NEXT:    v_fma_f32 v3, v0, s0, -v1
; GCN-NEXT:    v_rndne_f32_e32 v4, v1
; GCN-NEXT:    v_fmac_f32_e32 v3, 0x32a5705f, v0
; GCN-NEXT:    v_sub_f32_e32 v1, v1, v4
; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
; GCN-NEXT:    v_exp_f32_e32 v1, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v4
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_ldexp_f32 v1, v1, v3
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_store_dwordx4 v4, a[0:3], s[0:1]
; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT:    global_store_dword v4, v0, s[2:3]
; GCN-NEXT:    s_endpgm
  ; MFMA chain: every call uses the constant 1.0 for both multiplicand
  ; operands and threads the previous result through as the accumulator.
  %mai0 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in1, i32 0, i32 0, i32 0)
  %mai1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai0, i32 0, i32 0, i32 0)
  %mai2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai1, i32 0, i32 0, i32 0)
  %mai3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai2, i32 0, i32 0, i32 0)
  %mai4 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai3, i32 0, i32 0, i32 0)
  %mai5 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai4, i32 0, i32 0, i32 0)
  %mai6 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai5, i32 0, i32 0, i32 0)
  %mai7 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai6, i32 0, i32 0, i32 0)
  ; exp chain: independent of the MFMA chain; each call takes the previous
  ; call's result, starting from the scalar kernel argument %in0.
  %exp0 = call float @llvm.exp.f32(float %in0)
  %exp1 = call float @llvm.exp.f32(float %exp0)
  %exp2 = call float @llvm.exp.f32(float %exp1)
  %exp3 = call float @llvm.exp.f32(float %exp2)
  %exp4 = call float @llvm.exp.f32(float %exp3)
  %exp5 = call float @llvm.exp.f32(float %exp4)
  %exp6 = call float @llvm.exp.f32(float %exp5)
  %exp7 = call float @llvm.exp.f32(float %exp6)
  ; Store the final value of each chain so both stay live.
  store <4 x float> %mai7, ptr addrspace(1) %out0
  store float %exp7, ptr addrspace(1) %out1
  ; The argument 3 shows up as "iglp_opt mask(0x00000003)" in the expected
  ; output above. NOTE(review): id 3 is presumably the simple MFMA/exp
  ; interleave strategy - confirm against the AMDGPU backend documentation.
  tail call void @llvm.amdgcn.iglp.opt(i32 3)
  ret void
}
