xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll (revision 9ac52ce8d6cb7adcb5f3981952e39207c5b9588a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942  -o - %s | FileCheck -check-prefix=GCN %s
3
; Kernel under test for the llvm.amdgcn.iglp.opt scheduling hint with mask 3.
; The IR body builds two independent serial dependency chains:
;   * eight llvm.amdgcn.mfma.f32.4x4x1f32 calls, each accumulating into the
;     result of the previous one (seeded with %in1), stored to %out0;
;   * eight llvm.exp.f32 calls, each consuming the previous result (seeded
;     with %in0), stored to %out1.
; The autogenerated assertions below pin the exact gfx942 instruction order,
; which contains eight v_mfma_f32_4x4x1_16b_f32 and eight v_exp_f32_e32
; instructions plus an "iglp_opt mask(0x00000003)" marker.
; NOTE(review): mask 3 presumably selects the "exp simple interleave" IGLP
; strategy (per the test file name) - confirm against AMDGPUIGroupLP.cpp.
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
4define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspace(1) %out1, float %in0, <4 x float> %in1) {
5; GCN-LABEL: MFMAExpInterleave:
6; GCN:       ; %bb.0:
7; GCN-NEXT:    s_load_dword s6, s[4:5], 0x10
8; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
9; GCN-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
10; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
11; GCN-NEXT:    s_mov_b32 s7, 0x42b17218
12; GCN-NEXT:    s_waitcnt lgkmcnt(0)
13; GCN-NEXT:    v_mul_f32_e32 v2, s6, v1
14; GCN-NEXT:    v_rndne_f32_e32 v3, v2
15; GCN-NEXT:    v_sub_f32_e32 v4, v2, v3
16; GCN-NEXT:    v_fma_f32 v1, s6, v1, -v2
17; GCN-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
18; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
19; GCN-NEXT:    v_fmac_f32_e32 v1, s6, v2
20; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
21; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
22; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
23; GCN-NEXT:    v_add_f32_e32 v1, v4, v1
24; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v3
25; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
26; GCN-NEXT:    v_exp_f32_e32 v1, v1
27; GCN-NEXT:    s_mov_b32 s0, 0x3fb8aa3b
28; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
29; GCN-NEXT:    ; iglp_opt mask(0x00000003)
30; GCN-NEXT:    v_ldexp_f32 v1, v1, v2
31; GCN-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
32; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s6, v2
33; GCN-NEXT:    v_mov_b32_e32 v2, 0x42b17218
34; GCN-NEXT:    s_nop 0
35; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
36; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v2
37; GCN-NEXT:    v_mov_b32_e32 v2, 0x7f800000
38; GCN-NEXT:    s_mov_b32 s6, 0xc2ce8ed0
39; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
40; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
41; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
42; GCN-NEXT:    v_rndne_f32_e32 v5, v3
43; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
44; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
45; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
46; GCN-NEXT:    v_exp_f32_e32 v3, v3
47; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
48; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
49; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
50; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
51; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
52; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
53; GCN-NEXT:    s_nop 1
54; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
55; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
56; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
57; GCN-NEXT:    v_rndne_f32_e32 v5, v3
58; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
59; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
60; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
61; GCN-NEXT:    v_exp_f32_e32 v3, v3
62; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
63; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
64; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
65; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
66; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
67; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
68; GCN-NEXT:    s_nop 1
69; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
70; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
71; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
72; GCN-NEXT:    v_rndne_f32_e32 v5, v3
73; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
74; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
75; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
76; GCN-NEXT:    v_exp_f32_e32 v3, v3
77; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
78; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
79; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
80; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
81; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
82; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
83; GCN-NEXT:    s_nop 1
84; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
85; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
86; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
87; GCN-NEXT:    v_rndne_f32_e32 v5, v3
88; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
89; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
90; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
91; GCN-NEXT:    v_exp_f32_e32 v3, v3
92; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
93; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
94; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
95; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
96; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
97; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
98; GCN-NEXT:    s_nop 1
99; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
100; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
101; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
102; GCN-NEXT:    v_rndne_f32_e32 v5, v3
103; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
104; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
105; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
106; GCN-NEXT:    v_exp_f32_e32 v3, v3
107; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
108; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
109; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
110; GCN-NEXT:    v_ldexp_f32 v3, v3, v4
111; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
112; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
113; GCN-NEXT:    s_nop 1
114; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
115; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v1
116; GCN-NEXT:    v_fma_f32 v4, v1, s0, -v3
117; GCN-NEXT:    v_rndne_f32_e32 v5, v3
118; GCN-NEXT:    v_fmac_f32_e32 v4, 0x32a5705f, v1
119; GCN-NEXT:    v_sub_f32_e32 v3, v3, v5
120; GCN-NEXT:    v_add_f32_e32 v3, v3, v4
121; GCN-NEXT:    v_exp_f32_e32 v3, v3
122; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v5
123; GCN-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
124; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v1
125; GCN-NEXT:    v_ldexp_f32 v0, v3, v4
126; GCN-NEXT:    s_nop 0
127; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
128; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v1
129; GCN-NEXT:    s_nop 1
130; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
131; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
132; GCN-NEXT:    v_fma_f32 v3, v0, s0, -v1
133; GCN-NEXT:    v_rndne_f32_e32 v4, v1
134; GCN-NEXT:    v_fmac_f32_e32 v3, 0x32a5705f, v0
135; GCN-NEXT:    v_sub_f32_e32 v1, v1, v4
136; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
137; GCN-NEXT:    v_exp_f32_e32 v1, v1
138; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v4
139; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
140; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, s6, v0
141; GCN-NEXT:    v_mov_b32_e32 v4, 0
142; GCN-NEXT:    v_ldexp_f32 v1, v1, v3
143; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
144; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s7, v0
145; GCN-NEXT:    s_waitcnt lgkmcnt(0)
146; GCN-NEXT:    global_store_dwordx4 v4, a[0:3], s[0:1]
147; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
148; GCN-NEXT:    global_store_dword v4, v0, s[2:3]
149; GCN-NEXT:    s_endpgm
  ; MFMA chain: every call multiplies the constants 1.0 x 1.0 and accumulates
  ; into the previous call's result, so the eight calls are serially dependent.
150  %mai0 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in1, i32 0, i32 0, i32 0)
151  %mai1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai0, i32 0, i32 0, i32 0)
152  %mai2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai1, i32 0, i32 0, i32 0)
153  %mai3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai2, i32 0, i32 0, i32 0)
154  %mai4 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai3, i32 0, i32 0, i32 0)
155  %mai5 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai4, i32 0, i32 0, i32 0)
156  %mai6 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai5, i32 0, i32 0, i32 0)
157  %mai7 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai6, i32 0, i32 0, i32 0)
  ; exp chain: each llvm.exp.f32 call takes the previous call's result as its
  ; operand; it shares no values with the MFMA chain above.
158  %exp0 = call float @llvm.exp.f32(float %in0)
159  %exp1 = call float @llvm.exp.f32(float %exp0)
160  %exp2 = call float @llvm.exp.f32(float %exp1)
161  %exp3 = call float @llvm.exp.f32(float %exp2)
162  %exp4 = call float @llvm.exp.f32(float %exp3)
163  %exp5 = call float @llvm.exp.f32(float %exp4)
164  %exp6 = call float @llvm.exp.f32(float %exp5)
165  %exp7 = call float @llvm.exp.f32(float %exp6)
  ; Store the final value of each chain so both chains have a use.
166  store <4 x float> %mai7, ptr addrspace(1) %out0
167  store float %exp7, ptr addrspace(1) %out1
  ; Scheduling hint with mask 3. It is written after the stores here, while
  ; the expected output places the iglp_opt marker between MFMA instructions.
  ; NOTE(review): presumably the hint applies to the whole scheduling region
  ; regardless of its position in the block - confirm.
168  tail call void @llvm.amdgcn.iglp.opt(i32 3)
169  ret void
170}
171