xref: /llvm-project/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
4; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600,EG %s
5; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=R600,CM %s
6
7define float @v_rcp_f32_ieee(float %x) #3 {
8; SI-LABEL: v_rcp_f32_ieee:
9; SI:       ; %bb.0:
10; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
12; SI-NEXT:    v_rcp_f32_e32 v2, v1
13; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
14; SI-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
15; SI-NEXT:    v_fma_f32 v2, v4, v2, v2
16; SI-NEXT:    v_mul_f32_e32 v4, v3, v2
17; SI-NEXT:    v_fma_f32 v5, -v1, v4, v3
18; SI-NEXT:    v_fma_f32 v4, v5, v2, v4
19; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
20; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
21; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
22; SI-NEXT:    s_setpc_b64 s[30:31]
23;
24; VI-LABEL: v_rcp_f32_ieee:
25; VI:       ; %bb.0:
26; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
28; VI-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
29; VI-NEXT:    v_rcp_f32_e32 v3, v1
30; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
31; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
32; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
33; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
34; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
35; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
36; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
37; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
38; VI-NEXT:    s_setpc_b64 s[30:31]
39;
40; R600-LABEL: v_rcp_f32_ieee:
41; R600:       ; %bb.0:
42; R600-NEXT:    CF_END
43; R600-NEXT:    PAD
; 1.0 / x under attrs #3 (defined outside this excerpt; the _ieee name suggests
; IEEE denormal handling — confirm with the attribute group): GCN keeps the full
; div_scale + fma-refinement + div_fmas/div_fixup expansion rather than folding
; to a bare v_rcp_f32. R600 output here is only CF_END/PAD.
44  %rcp = fdiv float 1.0, %x
45  ret float %rcp
46}
47
48define float @v_rcp_f32_ieee_unsafe(float %x) #4 {
49; GCN-LABEL: v_rcp_f32_ieee_unsafe:
50; GCN:       ; %bb.0:
51; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52; GCN-NEXT:    v_rcp_f32_e32 v0, v0
53; GCN-NEXT:    s_setpc_b64 s[30:31]
54;
55; R600-LABEL: v_rcp_f32_ieee_unsafe:
56; R600:       ; %bb.0:
57; R600-NEXT:    CF_END
58; R600-NEXT:    PAD
; Under attrs #4 (defined outside this excerpt; presumably unsafe-fp-math —
; confirm) the same 1.0 / x collapses to a single v_rcp_f32 on all GCN targets.
59  %rcp = fdiv float 1.0, %x
60  ret float %rcp
61}
62
63define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 {
64; SI-LABEL: v_rcp_f32_ieee_known_not_denormal:
65; SI:       ; %bb.0:
66; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
68; SI-NEXT:    v_rcp_f32_e32 v2, v1
69; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
70; SI-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
71; SI-NEXT:    v_fma_f32 v2, v4, v2, v2
72; SI-NEXT:    v_mul_f32_e32 v4, v3, v2
73; SI-NEXT:    v_fma_f32 v5, -v1, v4, v3
74; SI-NEXT:    v_fma_f32 v4, v5, v2, v4
75; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
76; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
77; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
78; SI-NEXT:    s_setpc_b64 s[30:31]
79;
80; VI-LABEL: v_rcp_f32_ieee_known_not_denormal:
81; VI:       ; %bb.0:
82; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
84; VI-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
85; VI-NEXT:    v_rcp_f32_e32 v3, v1
86; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
87; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
88; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
89; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
90; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
91; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
92; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
93; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
94; VI-NEXT:    s_setpc_b64 s[30:31]
95;
96; R600-LABEL: v_rcp_f32_ieee_known_not_denormal:
97; R600:       ; %bb.0:
98; R600-NEXT:    CF_END
99; R600-NEXT:    PAD
; nofpclass(sub) on the input does not enable an rcp fold here: the checks
; match the plain v_rcp_f32_ieee expansion instruction-for-instruction.
100  %rcp = fdiv float 1.0, %x
101  ret float %rcp
102}
103
104define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 {
105; SI-LABEL: v_rcp_f32_ieee_nnan_ninf:
106; SI:       ; %bb.0:
107; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
109; SI-NEXT:    v_rcp_f32_e32 v2, v1
110; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
111; SI-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
112; SI-NEXT:    v_fma_f32 v2, v4, v2, v2
113; SI-NEXT:    v_mul_f32_e32 v4, v3, v2
114; SI-NEXT:    v_fma_f32 v5, -v1, v4, v3
115; SI-NEXT:    v_fma_f32 v4, v5, v2, v4
116; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
117; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
118; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
119; SI-NEXT:    s_setpc_b64 s[30:31]
120;
121; VI-LABEL: v_rcp_f32_ieee_nnan_ninf:
122; VI:       ; %bb.0:
123; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
125; VI-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
126; VI-NEXT:    v_rcp_f32_e32 v3, v1
127; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
128; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
129; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
130; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
131; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
132; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
133; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
134; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
135; VI-NEXT:    s_setpc_b64 s[30:31]
136;
137; R600-LABEL: v_rcp_f32_ieee_nnan_ninf:
138; R600:       ; %bb.0:
139; R600-NEXT:    CF_END
140; R600-NEXT:    PAD
; nnan ninf on the fdiv alone (no !fpmath) still produces the full
; correctly-rounded division expansion — identical checks to v_rcp_f32_ieee.
141  %rcp = fdiv nnan ninf float 1.0, %x
142  ret float %rcp
143}
144
145define float @v_neg_rcp_f32_ieee(float %x) #3 {
146; SI-LABEL: v_neg_rcp_f32_ieee:
147; SI:       ; %bb.0:
148; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
150; SI-NEXT:    v_rcp_f32_e32 v2, v1
151; SI-NEXT:    v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
152; SI-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
153; SI-NEXT:    v_fma_f32 v2, v4, v2, v2
154; SI-NEXT:    v_mul_f32_e32 v4, v3, v2
155; SI-NEXT:    v_fma_f32 v5, -v1, v4, v3
156; SI-NEXT:    v_fma_f32 v4, v5, v2, v4
157; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
158; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
159; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
160; SI-NEXT:    s_setpc_b64 s[30:31]
161;
162; VI-LABEL: v_neg_rcp_f32_ieee:
163; VI:       ; %bb.0:
164; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
166; VI-NEXT:    v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
167; VI-NEXT:    v_rcp_f32_e32 v3, v1
168; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
169; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
170; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
171; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
172; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
173; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
174; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
175; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
176; VI-NEXT:    s_setpc_b64 s[30:31]
177;
178; R600-LABEL: v_neg_rcp_f32_ieee:
179; R600:       ; %bb.0:
180; R600-NEXT:    CF_END
181; R600-NEXT:    PAD
; Same expansion as v_rcp_f32_ieee, with the -1.0 numerator threaded through
; v_div_scale and v_div_fixup as an inline constant.
182  %rcp = fdiv float -1.0, %x
183  ret float %rcp
184}
185
186define float @v_rcp_f32_daz(float %x) #0 {
187; SI-LABEL: v_rcp_f32_daz:
188; SI:       ; %bb.0:
189; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
191; SI-NEXT:    v_rcp_f32_e32 v2, v1
192; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
193; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
194; SI-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
195; SI-NEXT:    v_fma_f32 v2, v4, v2, v2
196; SI-NEXT:    v_mul_f32_e32 v4, v3, v2
197; SI-NEXT:    v_fma_f32 v5, -v1, v4, v3
198; SI-NEXT:    v_fma_f32 v4, v5, v2, v4
199; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
200; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
201; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
202; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
203; SI-NEXT:    s_setpc_b64 s[30:31]
204;
205; VI-LABEL: v_rcp_f32_daz:
206; VI:       ; %bb.0:
207; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
208; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
209; VI-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
210; VI-NEXT:    v_rcp_f32_e32 v3, v1
211; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
212; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
213; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
214; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
215; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
216; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
217; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
218; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
219; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
220; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
221; VI-NEXT:    s_setpc_b64 s[30:31]
222;
223; R600-LABEL: v_rcp_f32_daz:
224; R600:       ; %bb.0:
225; R600-NEXT:    CF_END
226; R600-NEXT:    PAD
; DAZ variant (attrs #0, defined outside this excerpt): s_setreg_imm32_b32
; toggles the MODE register's FP denorm field (3 then back to 0) around the
; fma refinement steps; otherwise the expansion matches the IEEE case.
227  %rcp = fdiv float 1.0, %x
228  ret float %rcp
229}
230
231define float @v_neg_rcp_f32_daz(float %x) #0 {
232; SI-LABEL: v_neg_rcp_f32_daz:
233; SI:       ; %bb.0:
234; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
236; SI-NEXT:    v_rcp_f32_e32 v2, v1
237; SI-NEXT:    v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
238; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
239; SI-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
240; SI-NEXT:    v_fma_f32 v2, v4, v2, v2
241; SI-NEXT:    v_mul_f32_e32 v4, v3, v2
242; SI-NEXT:    v_fma_f32 v5, -v1, v4, v3
243; SI-NEXT:    v_fma_f32 v4, v5, v2, v4
244; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
245; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
246; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
247; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
248; SI-NEXT:    s_setpc_b64 s[30:31]
249;
250; VI-LABEL: v_neg_rcp_f32_daz:
251; VI:       ; %bb.0:
252; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
254; VI-NEXT:    v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
255; VI-NEXT:    v_rcp_f32_e32 v3, v1
256; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
257; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
258; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
259; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
260; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
261; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
262; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
263; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
264; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
265; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
266; VI-NEXT:    s_setpc_b64 s[30:31]
267;
268; R600-LABEL: v_neg_rcp_f32_daz:
269; R600:       ; %bb.0:
270; R600-NEXT:    CF_END
271; R600-NEXT:    PAD
; -1.0 / x with DAZ: denorm-mode setreg toggles plus -1.0 inline constants
; on div_scale/div_fixup.
272  %rcp = fdiv float -1.0, %x
273  ret float %rcp
274}
275
276define float @v_rcp_f32_ieee_ulp25(float %x) #3 {
277; SI-LABEL: v_rcp_f32_ieee_ulp25:
278; SI:       ; %bb.0:
279; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; SI-NEXT:    s_mov_b32 s4, 0x7f800000
281; SI-NEXT:    v_frexp_mant_f32_e32 v1, v0
282; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
283; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
284; SI-NEXT:    v_rcp_f32_e32 v1, v1
285; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
286; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
287; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
288; SI-NEXT:    s_setpc_b64 s[30:31]
289;
290; VI-LABEL: v_rcp_f32_ieee_ulp25:
291; VI:       ; %bb.0:
292; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293; VI-NEXT:    v_frexp_mant_f32_e32 v1, v0
294; VI-NEXT:    v_rcp_f32_e32 v1, v1
295; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
296; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
297; VI-NEXT:    v_ldexp_f32 v0, v1, v0
298; VI-NEXT:    s_setpc_b64 s[30:31]
299;
300; R600-LABEL: v_rcp_f32_ieee_ulp25:
301; R600:       ; %bb.0:
302; R600-NEXT:    CF_END
303; R600-NEXT:    PAD
; With !fpmath !0 (defined outside this excerpt; the _ulp25 name suggests a
; 2.5 ulp error bound — confirm) the div lowers to the cheaper
; frexp_mant / v_rcp / ldexp scaling sequence. SI additionally selects the raw
; input when |x| >= 0x7f800000 (inf/nan) via cmp+cndmask; VI emits no guard.
304  %rcp = fdiv float 1.0, %x, !fpmath !0
305  ret float %rcp
306}
307
308define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
309; SI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
310; SI:       ; %bb.0:
311; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312; SI-NEXT:    s_mov_b32 s4, 0x7f800000
313; SI-NEXT:    v_frexp_mant_f32_e32 v1, v0
314; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
315; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
316; SI-NEXT:    v_rcp_f32_e32 v1, v1
317; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
318; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
319; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
320; SI-NEXT:    s_setpc_b64 s[30:31]
321;
322; VI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
323; VI:       ; %bb.0:
324; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; VI-NEXT:    v_frexp_mant_f32_e32 v1, v0
326; VI-NEXT:    v_rcp_f32_e32 v1, v1
327; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
328; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
329; VI-NEXT:    v_ldexp_f32 v0, v1, v0
330; VI-NEXT:    s_setpc_b64 s[30:31]
331;
332; R600-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
333; R600:       ; %bb.0:
334; R600-NEXT:    CF_END
335; R600-NEXT:    PAD
; nofpclass(sub) does not change the ulp25 lowering: checks are identical to
; v_rcp_f32_ieee_ulp25 (the frexp scaling sequence is still emitted).
336  %rcp = fdiv float 1.0, %x, !fpmath !0
337  ret float %rcp
338}
339
340define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
341; SI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
342; SI:       ; %bb.0:
343; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344; SI-NEXT:    s_mov_b32 s4, 0x7f800000
345; SI-NEXT:    v_frexp_mant_f32_e64 v1, -v0
346; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
347; SI-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
348; SI-NEXT:    v_rcp_f32_e32 v1, v1
349; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
350; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
351; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
352; SI-NEXT:    s_setpc_b64 s[30:31]
353;
354; VI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
355; VI:       ; %bb.0:
356; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
357; VI-NEXT:    v_frexp_mant_f32_e64 v1, -v0
358; VI-NEXT:    v_rcp_f32_e32 v1, v1
359; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
360; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
361; VI-NEXT:    v_ldexp_f32 v0, v1, v0
362; VI-NEXT:    s_setpc_b64 s[30:31]
363;
364; R600-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
365; R600:       ; %bb.0:
366; R600-NEXT:    CF_END
367; R600-NEXT:    PAD
; The -1.0 numerator is folded as a -v0 source modifier on frexp_mant/cndmask;
; on SI the compare result moves to s[4:5] (the _e64 form) instead of vcc.
368  %rcp = fdiv float -1.0, %x, !fpmath !0
369  ret float %rcp
370}
371
372define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 {
373; SI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
374; SI:       ; %bb.0:
375; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376; SI-NEXT:    s_mov_b32 s4, 0x7f800000
377; SI-NEXT:    v_frexp_mant_f32_e32 v1, v0
378; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
379; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
380; SI-NEXT:    v_rcp_f32_e32 v1, v1
381; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
382; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
383; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
384; SI-NEXT:    s_setpc_b64 s[30:31]
385;
386; VI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
387; VI:       ; %bb.0:
388; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
389; VI-NEXT:    v_frexp_mant_f32_e32 v1, v0
390; VI-NEXT:    v_rcp_f32_e32 v1, v1
391; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
392; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
393; VI-NEXT:    v_ldexp_f32 v0, v1, v0
394; VI-NEXT:    s_setpc_b64 s[30:31]
395;
396; R600-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
397; R600:       ; %bb.0:
398; R600-NEXT:    CF_END
399; R600-NEXT:    PAD
; ninf nnan plus !fpmath !0: checks match the plain ulp25 case exactly — the
; flags do not remove SI's finite-|x| cndmask guard.
400  %rcp = fdiv ninf nnan float 1.0, %x, !fpmath !0
401  ret float %rcp
402}
403
404define float @v_rcp_f32_daz_ulp25(float %x) #0 {
405; GCN-LABEL: v_rcp_f32_daz_ulp25:
406; GCN:       ; %bb.0:
407; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
408; GCN-NEXT:    v_rcp_f32_e32 v0, v0
409; GCN-NEXT:    s_setpc_b64 s[30:31]
410;
411; R600-LABEL: v_rcp_f32_daz_ulp25:
412; R600:       ; %bb.0:
413; R600-NEXT:    CF_END
414; R600-NEXT:    PAD
; DAZ (attrs #0) combined with !fpmath !0 reduces 1.0 / x to a single
; v_rcp_f32 on all GCN targets.
415  %rcp = fdiv float 1.0, %x, !fpmath !0
416  ret float %rcp
417}
418
419define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 {
420; SI-LABEL: v_neg_rcp_f32_ieee_ulp25:
421; SI:       ; %bb.0:
422; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
423; SI-NEXT:    s_mov_b32 s4, 0x7f800000
424; SI-NEXT:    v_frexp_mant_f32_e64 v1, -v0
425; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
426; SI-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
427; SI-NEXT:    v_rcp_f32_e32 v1, v1
428; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
429; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
430; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
431; SI-NEXT:    s_setpc_b64 s[30:31]
432;
433; VI-LABEL: v_neg_rcp_f32_ieee_ulp25:
434; VI:       ; %bb.0:
435; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436; VI-NEXT:    v_frexp_mant_f32_e64 v1, -v0
437; VI-NEXT:    v_rcp_f32_e32 v1, v1
438; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
439; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
440; VI-NEXT:    v_ldexp_f32 v0, v1, v0
441; VI-NEXT:    s_setpc_b64 s[30:31]
442;
443; R600-LABEL: v_neg_rcp_f32_ieee_ulp25:
444; R600:       ; %bb.0:
445; R600-NEXT:    CF_END
446; R600-NEXT:    PAD
; IEEE ulp25 with -1.0 numerator: negation folded into frexp_mant/cndmask as
; -v0 source modifiers (same checks as the nofpclass(sub) variant).
447  %rcp = fdiv float -1.0, %x, !fpmath !0
448  ret float %rcp
449}
450
451define float @v_neg_rcp_f32_daz_ulp25(float %x) #0 {
452; GCN-LABEL: v_neg_rcp_f32_daz_ulp25:
453; GCN:       ; %bb.0:
454; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455; GCN-NEXT:    v_rcp_f32_e64 v0, -v0
456; GCN-NEXT:    s_setpc_b64 s[30:31]
457;
458; R600-LABEL: v_neg_rcp_f32_daz_ulp25:
459; R600:       ; %bb.0:
460; R600-NEXT:    CF_END
461; R600-NEXT:    PAD
; DAZ ulp25 with -1.0 numerator folds to a single v_rcp_f32_e64 carrying a
; -v0 source-negation modifier.
462  %rcp = fdiv float -1.0, %x, !fpmath !0
463  ret float %rcp
464}
465
466define float @v_rcp_fabs_f32_ieee(float %x) #3 {
467; SI-LABEL: v_rcp_fabs_f32_ieee:
468; SI:       ; %bb.0:
469; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
471; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
472; SI-NEXT:    v_rcp_f32_e32 v3, v2
473; SI-NEXT:    v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
474; SI-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
475; SI-NEXT:    v_fma_f32 v3, v4, v3, v3
476; SI-NEXT:    v_mul_f32_e32 v4, v1, v3
477; SI-NEXT:    v_fma_f32 v5, -v2, v4, v1
478; SI-NEXT:    v_fma_f32 v4, v5, v3, v4
479; SI-NEXT:    v_fma_f32 v1, -v2, v4, v1
480; SI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
481; SI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, 1.0
482; SI-NEXT:    s_setpc_b64 s[30:31]
483;
484; VI-LABEL: v_rcp_fabs_f32_ieee:
485; VI:       ; %bb.0:
486; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
487; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
488; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
489; VI-NEXT:    v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
490; VI-NEXT:    v_rcp_f32_e32 v3, v2
491; VI-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
492; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
493; VI-NEXT:    v_mul_f32_e32 v4, v1, v3
494; VI-NEXT:    v_fma_f32 v5, -v2, v4, v1
495; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
496; VI-NEXT:    v_fma_f32 v1, -v2, v4, v1
497; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
498; VI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, 1.0
499; VI-NEXT:    s_setpc_b64 s[30:31]
500;
501; R600-LABEL: v_rcp_fabs_f32_ieee:
502; R600:       ; %bb.0:
503; R600-NEXT:    CF_END
504; R600-NEXT:    PAD
; 1.0 / |x|: the fabs is materialized once with v_and 0x7fffffff (sign-bit
; clear) for the div_scale chain, while v_div_fixup consumes |v0| directly
; via a source modifier.
505  %fabs.x = call float @llvm.fabs.f32(float %x)
506  %rcp = fdiv float 1.0, %fabs.x
507  ret float %rcp
508}
509
510define float @v_rcp_fabs_f32_daz(float %x) #0 {
511; SI-LABEL: v_rcp_fabs_f32_daz:
512; SI:       ; %bb.0:
513; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
514; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
515; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
516; SI-NEXT:    v_rcp_f32_e32 v3, v2
517; SI-NEXT:    v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
518; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
519; SI-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
520; SI-NEXT:    v_fma_f32 v3, v4, v3, v3
521; SI-NEXT:    v_mul_f32_e32 v4, v1, v3
522; SI-NEXT:    v_fma_f32 v5, -v2, v4, v1
523; SI-NEXT:    v_fma_f32 v4, v5, v3, v4
524; SI-NEXT:    v_fma_f32 v1, -v2, v4, v1
525; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
526; SI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
527; SI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, 1.0
528; SI-NEXT:    s_setpc_b64 s[30:31]
529;
530; VI-LABEL: v_rcp_fabs_f32_daz:
531; VI:       ; %bb.0:
532; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
533; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
534; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
535; VI-NEXT:    v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
536; VI-NEXT:    v_rcp_f32_e32 v3, v2
537; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
538; VI-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
539; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
540; VI-NEXT:    v_mul_f32_e32 v4, v1, v3
541; VI-NEXT:    v_fma_f32 v5, -v2, v4, v1
542; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
543; VI-NEXT:    v_fma_f32 v1, -v2, v4, v1
544; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
545; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
546; VI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, 1.0
547; VI-NEXT:    s_setpc_b64 s[30:31]
548;
549; R600-LABEL: v_rcp_fabs_f32_daz:
550; R600:       ; %bb.0:
551; R600-NEXT:    CF_END
552; R600-NEXT:    PAD
; Same shape as v_rcp_fabs_f32_ieee but with the DAZ denorm-mode setreg
; toggles around the fma refinement steps.
553  %fabs.x = call float @llvm.fabs.f32(float %x)
554  %rcp = fdiv float 1.0, %fabs.x
555  ret float %rcp
556}
557
558define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
559; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
560; SI:       ; %bb.0:
561; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
562; SI-NEXT:    s_mov_b32 s4, 0x7f800000
563; SI-NEXT:    v_frexp_mant_f32_e64 v1, |v0|
564; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
565; SI-NEXT:    v_cndmask_b32_e64 v1, |v0|, v1, s[4:5]
566; SI-NEXT:    v_rcp_f32_e32 v1, v1
567; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
568; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
569; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
570; SI-NEXT:    s_setpc_b64 s[30:31]
571;
572; VI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
573; VI:       ; %bb.0:
574; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
575; VI-NEXT:    v_frexp_mant_f32_e64 v1, |v0|
576; VI-NEXT:    v_rcp_f32_e32 v1, v1
577; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
578; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
579; VI-NEXT:    v_ldexp_f32 v0, v1, v0
580; VI-NEXT:    s_setpc_b64 s[30:31]
581;
582; R600-LABEL: v_rcp_fabs_f32_ieee_ulp25:
583; R600:       ; %bb.0:
584; R600-NEXT:    CF_END
585; R600-NEXT:    PAD
; ulp25 path with fabs: |v0| is applied as a source modifier on
; frexp_mant/cndmask — no separate v_and is needed.
586  %fabs.x = call float @llvm.fabs.f32(float %x)
587  %rcp = fdiv float 1.0, %fabs.x, !fpmath !0
588  ret float %rcp
589}
590
591define float @v_rcp_fabs_f32_daz_ulp25(float %x) #0 {
592; GCN-LABEL: v_rcp_fabs_f32_daz_ulp25:
593; GCN:       ; %bb.0:
594; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595; GCN-NEXT:    v_rcp_f32_e64 v0, |v0|
596; GCN-NEXT:    s_setpc_b64 s[30:31]
597;
598; R600-LABEL: v_rcp_fabs_f32_daz_ulp25:
599; R600:       ; %bb.0:
600; R600-NEXT:    CF_END
601; R600-NEXT:    PAD
; DAZ ulp25 of 1.0 / |x| folds to a single v_rcp_f32_e64 with an |v0|
; source-abs modifier.
602  %fabs.x = call float @llvm.fabs.f32(float %x)
603  %rcp = fdiv float 1.0, %fabs.x, !fpmath !0
604  ret float %rcp
605}
606
607define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 {
608; SI-LABEL: v_rcp_neg_fabs_f32_ieee:
609; SI:       ; %bb.0:
610; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
611; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
612; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
613; SI-NEXT:    v_rcp_f32_e32 v3, v2
614; SI-NEXT:    v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
615; SI-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
616; SI-NEXT:    v_fma_f32 v3, v4, v3, v3
617; SI-NEXT:    v_mul_f32_e32 v4, v1, v3
618; SI-NEXT:    v_fma_f32 v5, -v2, v4, v1
619; SI-NEXT:    v_fma_f32 v4, v5, v3, v4
620; SI-NEXT:    v_fma_f32 v1, -v2, v4, v1
621; SI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
622; SI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, -1.0
623; SI-NEXT:    s_setpc_b64 s[30:31]
624;
625; VI-LABEL: v_rcp_neg_fabs_f32_ieee:
626; VI:       ; %bb.0:
627; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
629; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
630; VI-NEXT:    v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
631; VI-NEXT:    v_rcp_f32_e32 v3, v2
632; VI-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
633; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
634; VI-NEXT:    v_mul_f32_e32 v4, v1, v3
635; VI-NEXT:    v_fma_f32 v5, -v2, v4, v1
636; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
637; VI-NEXT:    v_fma_f32 v1, -v2, v4, v1
638; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
639; VI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, -1.0
640; VI-NEXT:    s_setpc_b64 s[30:31]
641;
642; R600-LABEL: v_rcp_neg_fabs_f32_ieee:
643; R600:       ; %bb.0:
644; R600-NEXT:    CF_END
645; R600-NEXT:    PAD
; -1.0 / |x|: full expansion with -1.0 inline constants on div_scale and an
; |v0| modifier on div_fixup.
646  %fabs.x = call float @llvm.fabs.f32(float %x)
647  %rcp = fdiv float -1.0, %fabs.x
648  ret float %rcp
649}
650
651define float @v_rcp_neg_fabs_f32_daz(float %x) #0 {
652; SI-LABEL: v_rcp_neg_fabs_f32_daz:
653; SI:       ; %bb.0:
654; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
655; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
656; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
657; SI-NEXT:    v_rcp_f32_e32 v3, v2
658; SI-NEXT:    v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
659; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
660; SI-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
661; SI-NEXT:    v_fma_f32 v3, v4, v3, v3
662; SI-NEXT:    v_mul_f32_e32 v4, v1, v3
663; SI-NEXT:    v_fma_f32 v5, -v2, v4, v1
664; SI-NEXT:    v_fma_f32 v4, v5, v3, v4
665; SI-NEXT:    v_fma_f32 v1, -v2, v4, v1
666; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
667; SI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
668; SI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, -1.0
669; SI-NEXT:    s_setpc_b64 s[30:31]
670;
671; VI-LABEL: v_rcp_neg_fabs_f32_daz:
672; VI:       ; %bb.0:
673; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
674; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
675; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
676; VI-NEXT:    v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
677; VI-NEXT:    v_rcp_f32_e32 v3, v2
678; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
679; VI-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
680; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
681; VI-NEXT:    v_mul_f32_e32 v4, v1, v3
682; VI-NEXT:    v_fma_f32 v5, -v2, v4, v1
683; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
684; VI-NEXT:    v_fma_f32 v1, -v2, v4, v1
685; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
686; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
687; VI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, -1.0
688; VI-NEXT:    s_setpc_b64 s[30:31]
689;
690; R600-LABEL: v_rcp_neg_fabs_f32_daz:
691; R600:       ; %bb.0:
692; R600-NEXT:    CF_END
693; R600-NEXT:    PAD
; -1.0 / |x| with DAZ: adds the denorm-mode setreg toggles around the fma
; refinement steps.
694  %fabs.x = call float @llvm.fabs.f32(float %x)
695  %rcp = fdiv float -1.0, %fabs.x
696  ret float %rcp
697}
698
699define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 {
700; SI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
701; SI:       ; %bb.0:
702; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703; SI-NEXT:    s_mov_b32 s4, 0x7f800000
704; SI-NEXT:    v_frexp_mant_f32_e64 v1, -|v0|
705; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
706; SI-NEXT:    v_cndmask_b32_e64 v1, -|v0|, v1, s[4:5]
707; SI-NEXT:    v_rcp_f32_e32 v1, v1
708; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
709; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
710; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
711; SI-NEXT:    s_setpc_b64 s[30:31]
712;
713; VI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
714; VI:       ; %bb.0:
715; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716; VI-NEXT:    v_frexp_mant_f32_e64 v1, -|v0|
717; VI-NEXT:    v_rcp_f32_e32 v1, v1
718; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
719; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
720; VI-NEXT:    v_ldexp_f32 v0, v1, v0
721; VI-NEXT:    s_setpc_b64 s[30:31]
722;
723; R600-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
724; R600:       ; %bb.0:
725; R600-NEXT:    CF_END
726; R600-NEXT:    PAD
; -1.0 / |x| ulp25: fabs and the negated numerator combine into -|v0| source
; modifiers on frexp_mant/cndmask.
727  %fabs.x = call float @llvm.fabs.f32(float %x)
728  %rcp = fdiv float -1.0, %fabs.x, !fpmath !0
729  ret float %rcp
730}
731
732define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 {
733; GCN-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
734; GCN:       ; %bb.0:
735; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
736; GCN-NEXT:    v_rcp_f32_e64 v0, -|v0|
737; GCN-NEXT:    s_setpc_b64 s[30:31]
738;
739; R600-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
740; R600:       ; %bb.0:
741; R600-NEXT:    CF_END
742; R600-NEXT:    PAD
; Folds completely to a single v_rcp_f32_e64 with a combined -|v0|
; (neg-of-abs) source modifier.
743  %fabs.x = call float @llvm.fabs.f32(float %x)
744  %rcp = fdiv float -1.0, %fabs.x, !fpmath !0
745  ret float %rcp
746}
747
748define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
749; SI-LABEL: s_rcp_pat_f32_daz:
750; SI:       ; %bb.0:
751; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
752; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
753; SI-NEXT:    s_mov_b32 s3, 0xf000
754; SI-NEXT:    s_waitcnt lgkmcnt(0)
755; SI-NEXT:    v_rcp_f32_e32 v0, s2
756; SI-NEXT:    s_mov_b32 s2, -1
757; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
758; SI-NEXT:    s_endpgm
759;
760; VI-LABEL: s_rcp_pat_f32_daz:
761; VI:       ; %bb.0:
762; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
763; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
764; VI-NEXT:    s_waitcnt lgkmcnt(0)
765; VI-NEXT:    v_rcp_f32_e32 v2, s2
766; VI-NEXT:    v_mov_b32_e32 v0, s0
767; VI-NEXT:    v_mov_b32_e32 v1, s1
768; VI-NEXT:    flat_store_dword v[0:1], v2
769; VI-NEXT:    s_endpgm
770;
771; EG-LABEL: s_rcp_pat_f32_daz:
772; EG:       ; %bb.0:
773; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
774; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
775; EG-NEXT:    CF_END
776; EG-NEXT:    PAD
777; EG-NEXT:    ALU clause starting at 4:
778; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
779; EG-NEXT:     RECIP_IEEE * T1.X, KC0[2].Z,
780; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
781;
782; CM-LABEL: s_rcp_pat_f32_daz:
783; CM:       ; %bb.0:
784; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
785; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
786; CM-NEXT:    CF_END
787; CM-NEXT:    PAD
788; CM-NEXT:    ALU clause starting at 4:
789; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
790; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
791; CM-NEXT:     RECIP_IEEE T1.X, KC0[2].Z,
792; CM-NEXT:     RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
793; CM-NEXT:     RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
794; CM-NEXT:     RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
; Kernel with a uniform (SGPR) source: GCN folds to a single v_rcp_f32 on the
; scalar input and stores the result; EG/Cayman lower to RECIP_IEEE (Cayman
; writes only the X lane, with Y/Z/W masked).
795  %rcp = fdiv float 1.0, %src, !fpmath !0
796  store float %rcp, ptr addrspace(1) %out, align 4
797  ret void
798}
799
800define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
801; SI-LABEL: s_rcp_ulp25_pat_f32_daz:
802; SI:       ; %bb.0:
803; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
804; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
805; SI-NEXT:    s_mov_b32 s3, 0xf000
806; SI-NEXT:    s_waitcnt lgkmcnt(0)
807; SI-NEXT:    v_rcp_f32_e32 v0, s2
808; SI-NEXT:    s_mov_b32 s2, -1
809; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
810; SI-NEXT:    s_endpgm
811;
812; VI-LABEL: s_rcp_ulp25_pat_f32_daz:
813; VI:       ; %bb.0:
814; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
815; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
816; VI-NEXT:    s_waitcnt lgkmcnt(0)
817; VI-NEXT:    v_rcp_f32_e32 v2, s2
818; VI-NEXT:    v_mov_b32_e32 v0, s0
819; VI-NEXT:    v_mov_b32_e32 v1, s1
820; VI-NEXT:    flat_store_dword v[0:1], v2
821; VI-NEXT:    s_endpgm
822;
823; EG-LABEL: s_rcp_ulp25_pat_f32_daz:
824; EG:       ; %bb.0:
825; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
826; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
827; EG-NEXT:    CF_END
828; EG-NEXT:    PAD
829; EG-NEXT:    ALU clause starting at 4:
830; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
831; EG-NEXT:     RECIP_IEEE * T1.X, KC0[2].Z,
832; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
833;
834; CM-LABEL: s_rcp_ulp25_pat_f32_daz:
835; CM:       ; %bb.0:
836; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
837; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
838; CM-NEXT:    CF_END
839; CM-NEXT:    PAD
840; CM-NEXT:    ALU clause starting at 4:
841; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
842; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
843; CM-NEXT:     RECIP_IEEE T1.X, KC0[2].Z,
844; CM-NEXT:     RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
845; CM-NEXT:     RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
846; CM-NEXT:     RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
; Identical lowering to s_rcp_pat_f32_daz: the IR body is the same fdiv with
; the same !fpmath !0 metadata (defined outside this excerpt).
847  %rcp = fdiv float 1.0, %src, !fpmath !0
848  store float %rcp, ptr addrspace(1) %out, align 4
849  ret void
850}
851
852define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
853; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
854; SI:       ; %bb.0:
855; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
856; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
857; SI-NEXT:    s_mov_b32 s3, 0xf000
858; SI-NEXT:    s_waitcnt lgkmcnt(0)
859; SI-NEXT:    v_rcp_f32_e32 v0, s2
860; SI-NEXT:    s_mov_b32 s2, -1
861; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
862; SI-NEXT:    s_endpgm
863;
864; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
865; VI:       ; %bb.0:
866; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
867; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
868; VI-NEXT:    s_waitcnt lgkmcnt(0)
869; VI-NEXT:    v_rcp_f32_e32 v2, s2
870; VI-NEXT:    v_mov_b32_e32 v0, s0
871; VI-NEXT:    v_mov_b32_e32 v1, s1
872; VI-NEXT:    flat_store_dword v[0:1], v2
873; VI-NEXT:    s_endpgm
874;
875; EG-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
876; EG:       ; %bb.0:
877; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
878; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
879; EG-NEXT:    CF_END
880; EG-NEXT:    PAD
881; EG-NEXT:    ALU clause starting at 4:
882; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
883; EG-NEXT:     RECIP_IEEE * T1.X, KC0[2].Z,
884; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
885;
886; CM-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
887; CM:       ; %bb.0:
888; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
889; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
890; CM-NEXT:    CF_END
891; CM-NEXT:    PAD
892; CM-NEXT:    ALU clause starting at 4:
893; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
894; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
895; CM-NEXT:     RECIP_IEEE T1.X, KC0[2].Z,
896; CM-NEXT:     RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
897; CM-NEXT:     RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
898; CM-NEXT:     RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
; 'fast' fdiv of 1.0 / x, still carrying 2.5 ulp !fpmath (!0), under flushed
; f32 denormals (attr #0): expected to stay a bare v_rcp_f32 / RECIP_IEEE.
899  %rcp = fdiv fast float 1.0, %src, !fpmath !0
900  store float %rcp, ptr addrspace(1) %out, align 4
901  ret void
902}
903
904define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
905; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
906; SI:       ; %bb.0:
907; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
908; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
909; SI-NEXT:    s_mov_b32 s3, 0xf000
910; SI-NEXT:    s_waitcnt lgkmcnt(0)
911; SI-NEXT:    v_rcp_f32_e32 v0, s2
912; SI-NEXT:    s_mov_b32 s2, -1
913; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
914; SI-NEXT:    s_endpgm
915;
916; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
917; VI:       ; %bb.0:
918; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
919; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
920; VI-NEXT:    s_waitcnt lgkmcnt(0)
921; VI-NEXT:    v_rcp_f32_e32 v2, s2
922; VI-NEXT:    v_mov_b32_e32 v0, s0
923; VI-NEXT:    v_mov_b32_e32 v1, s1
924; VI-NEXT:    flat_store_dword v[0:1], v2
925; VI-NEXT:    s_endpgm
926;
927; EG-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
928; EG:       ; %bb.0:
929; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
930; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
931; EG-NEXT:    CF_END
932; EG-NEXT:    PAD
933; EG-NEXT:    ALU clause starting at 4:
934; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
935; EG-NEXT:     RECIP_IEEE * T1.X, KC0[2].Z,
936; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
937;
938; CM-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
939; CM:       ; %bb.0:
940; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
941; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
942; CM-NEXT:    CF_END
943; CM-NEXT:    PAD
944; CM-NEXT:    ALU clause starting at 4:
945; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
946; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
947; CM-NEXT:     RECIP_IEEE T1.X, KC0[2].Z,
948; CM-NEXT:     RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
949; CM-NEXT:     RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
950; CM-NEXT:     RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
; 1.0 / x with only the 'arcp' flag plus 2.5 ulp !fpmath (!0), f32 denormals
; flushed (attr #0): expected to fold to a bare v_rcp_f32 / RECIP_IEEE.
951  %rcp = fdiv arcp float 1.0, %src, !fpmath !0
952  store float %rcp, ptr addrspace(1) %out, align 4
953  ret void
954}
955
956define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 {
957; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
958; SI:       ; %bb.0:
959; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
960; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
961; SI-NEXT:    s_mov_b32 s3, 0xf000
962; SI-NEXT:    s_waitcnt lgkmcnt(0)
963; SI-NEXT:    v_rcp_f32_e32 v0, s2
964; SI-NEXT:    s_mov_b32 s2, -1
965; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
966; SI-NEXT:    s_endpgm
967;
968; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
969; VI:       ; %bb.0:
970; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
971; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
972; VI-NEXT:    s_waitcnt lgkmcnt(0)
973; VI-NEXT:    v_rcp_f32_e32 v2, s2
974; VI-NEXT:    v_mov_b32_e32 v0, s0
975; VI-NEXT:    v_mov_b32_e32 v1, s1
976; VI-NEXT:    flat_store_dword v[0:1], v2
977; VI-NEXT:    s_endpgm
978;
979; EG-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
980; EG:       ; %bb.0:
981; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
982; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
983; EG-NEXT:    CF_END
984; EG-NEXT:    PAD
985; EG-NEXT:    ALU clause starting at 4:
986; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
987; EG-NEXT:     RECIP_IEEE * T1.X, KC0[2].Z,
988; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
989;
990; CM-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
991; CM:       ; %bb.0:
992; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
993; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
994; CM-NEXT:    CF_END
995; CM-NEXT:    PAD
996; CM-NEXT:    ALU clause starting at 4:
997; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
998; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
999; CM-NEXT:     RECIP_IEEE T1.X, KC0[2].Z,
1000; CM-NEXT:     RECIP_IEEE T1.Y (MASKED), KC0[2].Z,
1001; CM-NEXT:     RECIP_IEEE T1.Z (MASKED), KC0[2].Z,
1002; CM-NEXT:     RECIP_IEEE * T1.W (MASKED), KC0[2].Z,
; Same 1.0 / x pattern, but the relaxation comes from the function-level
; "unsafe-fp-math"="true" attribute (attr #2, still daz) rather than
; instruction flags; expected to fold to a bare v_rcp_f32 / RECIP_IEEE.
1003  %rcp = fdiv float 1.0, %src, !fpmath !0
1004  store float %rcp, ptr addrspace(1) %out, align 4
1005  ret void
1006}
1007
1008define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
1009; SI-LABEL: s_rcp_fabs_pat_f32_daz:
1010; SI:       ; %bb.0:
1011; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
1012; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1013; SI-NEXT:    s_mov_b32 s3, 0xf000
1014; SI-NEXT:    s_waitcnt lgkmcnt(0)
1015; SI-NEXT:    v_rcp_f32_e64 v0, |s2|
1016; SI-NEXT:    s_mov_b32 s2, -1
1017; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1018; SI-NEXT:    s_endpgm
1019;
1020; VI-LABEL: s_rcp_fabs_pat_f32_daz:
1021; VI:       ; %bb.0:
1022; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
1023; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1024; VI-NEXT:    s_waitcnt lgkmcnt(0)
1025; VI-NEXT:    v_rcp_f32_e64 v2, |s2|
1026; VI-NEXT:    v_mov_b32_e32 v0, s0
1027; VI-NEXT:    v_mov_b32_e32 v1, s1
1028; VI-NEXT:    flat_store_dword v[0:1], v2
1029; VI-NEXT:    s_endpgm
1030;
1031; EG-LABEL: s_rcp_fabs_pat_f32_daz:
1032; EG:       ; %bb.0:
1033; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
1034; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1035; EG-NEXT:    CF_END
1036; EG-NEXT:    PAD
1037; EG-NEXT:    ALU clause starting at 4:
1038; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
1039; EG-NEXT:     RECIP_IEEE * T1.X, |KC0[2].Z|,
1040; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1041;
1042; CM-LABEL: s_rcp_fabs_pat_f32_daz:
1043; CM:       ; %bb.0:
1044; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1045; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1046; CM-NEXT:    CF_END
1047; CM-NEXT:    PAD
1048; CM-NEXT:    ALU clause starting at 4:
1049; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1050; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1051; CM-NEXT:     RECIP_IEEE T1.X, |KC0[2].Z|,
1052; CM-NEXT:     RECIP_IEEE T1.Y (MASKED), |KC0[2].Z|,
1053; CM-NEXT:     RECIP_IEEE T1.Z (MASKED), |KC0[2].Z|,
1054; CM-NEXT:     RECIP_IEEE * T1.W (MASKED), |KC0[2].Z|,
; 1.0 / fabs(x): the fabs should be absorbed into the reciprocal's source
; modifier (|s2| on GCN, |KC0[2].Z| on R600) rather than emitted separately.
1055  %src.fabs = call float @llvm.fabs.f32(float %src)
1056  %rcp = fdiv float 1.0, %src.fabs, !fpmath !0
1057  store float %rcp, ptr addrspace(1) %out, align 4
1058  ret void
1059}
1060
1061define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
1062; SI-LABEL: s_neg_rcp_pat_f32_daz:
1063; SI:       ; %bb.0:
1064; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
1065; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1066; SI-NEXT:    s_mov_b32 s3, 0xf000
1067; SI-NEXT:    s_waitcnt lgkmcnt(0)
1068; SI-NEXT:    v_rcp_f32_e64 v0, -s2
1069; SI-NEXT:    s_mov_b32 s2, -1
1070; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1071; SI-NEXT:    s_endpgm
1072;
1073; VI-LABEL: s_neg_rcp_pat_f32_daz:
1074; VI:       ; %bb.0:
1075; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
1076; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1077; VI-NEXT:    s_waitcnt lgkmcnt(0)
1078; VI-NEXT:    v_rcp_f32_e64 v2, -s2
1079; VI-NEXT:    v_mov_b32_e32 v0, s0
1080; VI-NEXT:    v_mov_b32_e32 v1, s1
1081; VI-NEXT:    flat_store_dword v[0:1], v2
1082; VI-NEXT:    s_endpgm
1083;
1084; EG-LABEL: s_neg_rcp_pat_f32_daz:
1085; EG:       ; %bb.0:
1086; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
1087; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1088; EG-NEXT:    CF_END
1089; EG-NEXT:    PAD
1090; EG-NEXT:    ALU clause starting at 4:
1091; EG-NEXT:     RECIP_IEEE * T0.X, KC0[2].Z,
1092; EG-NEXT:     MUL_IEEE T0.X, literal.x, PS,
1093; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1094; EG-NEXT:    -1082130432(-1.000000e+00), 2(2.802597e-45)
1095;
1096; CM-LABEL: s_neg_rcp_pat_f32_daz:
1097; CM:       ; %bb.0:
1098; CM-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1099; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1100; CM-NEXT:    CF_END
1101; CM-NEXT:    PAD
1102; CM-NEXT:    ALU clause starting at 4:
1103; CM-NEXT:     RECIP_IEEE T0.X, KC0[2].Z,
1104; CM-NEXT:     RECIP_IEEE T0.Y (MASKED), KC0[2].Z,
1105; CM-NEXT:     RECIP_IEEE T0.Z (MASKED), KC0[2].Z,
1106; CM-NEXT:     RECIP_IEEE * T0.W (MASKED), KC0[2].Z,
1107; CM-NEXT:     MUL_IEEE * T0.X, literal.x, PV.X,
1108; CM-NEXT:    -1082130432(-1.000000e+00), 0(0.000000e+00)
1109; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1110; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; -1.0 / x: GCN folds the negation into the rcp source modifier (-s2), while
; R600 emits RECIP_IEEE followed by a MUL_IEEE with the -1.0 literal.
1111  %rcp = fdiv float -1.0, %src, !fpmath !0
1112  store float %rcp, ptr addrspace(1) %out, align 4
1113  ret void
1114}
1115
1116define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 {
1117; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
1118; SI:       ; %bb.0:
1119; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
1120; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1121; SI-NEXT:    s_mov_b32 s3, 0xf000
1122; SI-NEXT:    s_waitcnt lgkmcnt(0)
1123; SI-NEXT:    v_rcp_f32_e64 v0, -|s2|
1124; SI-NEXT:    s_mov_b32 s2, -1
1125; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1126; SI-NEXT:    s_endpgm
1127;
1128; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
1129; VI:       ; %bb.0:
1130; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
1131; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1132; VI-NEXT:    s_waitcnt lgkmcnt(0)
1133; VI-NEXT:    v_rcp_f32_e64 v2, -|s2|
1134; VI-NEXT:    v_mov_b32_e32 v0, s0
1135; VI-NEXT:    v_mov_b32_e32 v1, s1
1136; VI-NEXT:    flat_store_dword v[0:1], v2
1137; VI-NEXT:    s_endpgm
1138;
1139; EG-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
1140; EG:       ; %bb.0:
1141; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
1142; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1143; EG-NEXT:    CF_END
1144; EG-NEXT:    PAD
1145; EG-NEXT:    ALU clause starting at 4:
1146; EG-NEXT:     RECIP_IEEE * T0.X, |KC0[2].Z|,
1147; EG-NEXT:     MUL_IEEE T0.X, literal.x, PS,
1148; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1149; EG-NEXT:    -1082130432(-1.000000e+00), 2(2.802597e-45)
1150;
1151; CM-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
1152; CM:       ; %bb.0:
1153; CM-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1154; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1155; CM-NEXT:    CF_END
1156; CM-NEXT:    PAD
1157; CM-NEXT:    ALU clause starting at 4:
1158; CM-NEXT:     RECIP_IEEE T0.X, |KC0[2].Z|,
1159; CM-NEXT:     RECIP_IEEE T0.Y (MASKED), |KC0[2].Z|,
1160; CM-NEXT:     RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|,
1161; CM-NEXT:     RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|,
1162; CM-NEXT:     MUL_IEEE * T0.X, literal.x, PV.X,
1163; CM-NEXT:    -1082130432(-1.000000e+00), 0(0.000000e+00)
1164; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1165; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; 1.0 / (-fabs(x)): GCN folds both fneg and fabs into the rcp source
; modifiers (-|s2|); R600 folds only the fabs and multiplies by -1.0 after.
1166  %src.fabs = call float @llvm.fabs.f32(float %src)
1167  %src.fabs.fneg = fneg float %src.fabs
1168  %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0
1169  store float %rcp, ptr addrspace(1) %out, align 4
1170  ret void
1171}
1172
1173define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 {
1174; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
1175; SI:       ; %bb.0:
1176; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
1177; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1178; SI-NEXT:    s_mov_b32 s3, 0xf000
1179; SI-NEXT:    s_mov_b32 s2, -1
1180; SI-NEXT:    s_waitcnt lgkmcnt(0)
1181; SI-NEXT:    v_rcp_f32_e64 v0, -|s6|
1182; SI-NEXT:    v_mul_f32_e64 v1, s6, -|s6|
1183; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1184; SI-NEXT:    s_waitcnt vmcnt(0)
1185; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1186; SI-NEXT:    s_waitcnt vmcnt(0)
1187; SI-NEXT:    s_endpgm
1188;
1189; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
1190; VI:       ; %bb.0:
1191; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
1192; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1193; VI-NEXT:    s_waitcnt lgkmcnt(0)
1194; VI-NEXT:    v_rcp_f32_e64 v2, -|s2|
1195; VI-NEXT:    v_mov_b32_e32 v0, s0
1196; VI-NEXT:    v_mov_b32_e32 v1, s1
1197; VI-NEXT:    v_mul_f32_e64 v3, s2, -|s2|
1198; VI-NEXT:    flat_store_dword v[0:1], v2
1199; VI-NEXT:    s_waitcnt vmcnt(0)
1200; VI-NEXT:    flat_store_dword v[0:1], v3
1201; VI-NEXT:    s_waitcnt vmcnt(0)
1202; VI-NEXT:    s_endpgm
1203;
1204; EG-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
1205; EG:       ; %bb.0:
1206; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
1207; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
1208; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
1209; EG-NEXT:    CF_END
1210; EG-NEXT:    ALU clause starting at 4:
1211; EG-NEXT:     MUL_IEEE T0.X, KC0[2].Z, -|KC0[2].Z|,
1212; EG-NEXT:     RECIP_IEEE * T0.Y, |KC0[2].Z|,
1213; EG-NEXT:     MUL_IEEE T1.X, literal.x, PS,
1214; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.y,
1215; EG-NEXT:    -1082130432(-1.000000e+00), 2(2.802597e-45)
1216;
1217; CM-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
1218; CM:       ; %bb.0:
1219; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1220; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
1221; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
1222; CM-NEXT:    CF_END
1223; CM-NEXT:    ALU clause starting at 4:
1224; CM-NEXT:     MUL_IEEE * T0.X, KC0[2].Z, -|KC0[2].Z|,
1225; CM-NEXT:     RECIP_IEEE T0.X (MASKED), |KC0[2].Z|,
1226; CM-NEXT:     RECIP_IEEE T0.Y, |KC0[2].Z|,
1227; CM-NEXT:     RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|,
1228; CM-NEXT:     RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|,
1229; CM-NEXT:     MUL_IEEE * T1.X, literal.x, PV.Y,
1230; CM-NEXT:    -1082130432(-1.000000e+00), 0(0.000000e+00)
1231; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
1232; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; -fabs(x) has two users (the rcp and an fmul); the -| | modifiers are folded
; into each use independently. The volatile stores keep both results live.
1233  %src.fabs = call float @llvm.fabs.f32(float %src)
1234  %src.fabs.fneg = fneg float %src.fabs
1235  %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0
1236  store volatile float %rcp, ptr addrspace(1) %out, align 4
1237
1238  %other = fmul float %src, %src.fabs.fneg
1239  store volatile float %other, ptr addrspace(1) %out, align 4
1240  ret void
1241}
1242
1243define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 {
1244; SI-LABEL: s_div_arcp_2_x_pat_f32_daz:
1245; SI:       ; %bb.0:
1246; SI-NEXT:    s_load_dword s6, s[0:1], 0x0
1247; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1248; SI-NEXT:    s_mov_b32 s3, 0xf000
1249; SI-NEXT:    s_mov_b32 s2, -1
1250; SI-NEXT:    s_waitcnt lgkmcnt(0)
1251; SI-NEXT:    v_mul_f32_e64 v0, s6, 0.5
1252; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1253; SI-NEXT:    s_endpgm
1254;
1255; VI-LABEL: s_div_arcp_2_x_pat_f32_daz:
1256; VI:       ; %bb.0:
1257; VI-NEXT:    s_load_dword s2, s[0:1], 0x0
1258; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1259; VI-NEXT:    s_waitcnt lgkmcnt(0)
1260; VI-NEXT:    v_mul_f32_e64 v2, s2, 0.5
1261; VI-NEXT:    v_mov_b32_e32 v0, s0
1262; VI-NEXT:    v_mov_b32_e32 v1, s1
1263; VI-NEXT:    flat_store_dword v[0:1], v2
1264; VI-NEXT:    s_endpgm
1265;
1266; EG-LABEL: s_div_arcp_2_x_pat_f32_daz:
1267; EG:       ; %bb.0:
1268; EG-NEXT:    TEX 0 @4
1269; EG-NEXT:    ALU 2, @6, KC0[CB0:0-32], KC1[]
1270; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1271; EG-NEXT:    CF_END
1272; EG-NEXT:    Fetch clause starting at 4:
1273; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1274; EG-NEXT:    ALU clause starting at 6:
1275; EG-NEXT:     MUL_IEEE T0.X, T0.X, 0.5,
1276; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1277; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1278;
1279; CM-LABEL: s_div_arcp_2_x_pat_f32_daz:
1280; CM:       ; %bb.0:
1281; CM-NEXT:    TEX 0 @4
1282; CM-NEXT:    ALU 2, @6, KC0[CB0:0-32], KC1[]
1283; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1284; CM-NEXT:    CF_END
1285; CM-NEXT:    Fetch clause starting at 4:
1286; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1287; CM-NEXT:    ALU clause starting at 6:
1288; CM-NEXT:     MUL_IEEE * T0.X, T0.X, 0.5,
1289; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1290; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; x / 2.0 with arcp becomes a multiply by the inline constant 0.5 — no rcp or
; division sequence needed for a power-of-two divisor.
1291  %x = load float, ptr addrspace(1) undef
1292  %rcp = fdiv arcp float %x, 2.0
1293  store float %rcp, ptr addrspace(1) %out, align 4
1294  ret void
1295}
1296
1297define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 {
1298; SI-LABEL: s_div_arcp_k_x_pat_f32_daz:
1299; SI:       ; %bb.0:
1300; SI-NEXT:    s_load_dword s6, s[0:1], 0x0
1301; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1302; SI-NEXT:    v_mov_b32_e32 v0, 0x3dcccccd
1303; SI-NEXT:    s_mov_b32 s3, 0xf000
1304; SI-NEXT:    s_mov_b32 s2, -1
1305; SI-NEXT:    s_waitcnt lgkmcnt(0)
1306; SI-NEXT:    v_mul_f32_e32 v0, s6, v0
1307; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1308; SI-NEXT:    s_endpgm
1309;
1310; VI-LABEL: s_div_arcp_k_x_pat_f32_daz:
1311; VI:       ; %bb.0:
1312; VI-NEXT:    s_load_dword s2, s[0:1], 0x0
1313; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1314; VI-NEXT:    v_mov_b32_e32 v0, 0x3dcccccd
1315; VI-NEXT:    s_waitcnt lgkmcnt(0)
1316; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
1317; VI-NEXT:    v_mov_b32_e32 v0, s0
1318; VI-NEXT:    v_mov_b32_e32 v1, s1
1319; VI-NEXT:    flat_store_dword v[0:1], v2
1320; VI-NEXT:    s_endpgm
1321;
1322; EG-LABEL: s_div_arcp_k_x_pat_f32_daz:
1323; EG:       ; %bb.0:
1324; EG-NEXT:    TEX 0 @4
1325; EG-NEXT:    ALU 2, @6, KC0[CB0:0-32], KC1[]
1326; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1327; EG-NEXT:    CF_END
1328; EG-NEXT:    Fetch clause starting at 4:
1329; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1330; EG-NEXT:    ALU clause starting at 6:
1331; EG-NEXT:     MUL_IEEE T0.X, T0.X, literal.x,
1332; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1333; EG-NEXT:    1036831949(1.000000e-01), 2(2.802597e-45)
1334;
1335; CM-LABEL: s_div_arcp_k_x_pat_f32_daz:
1336; CM:       ; %bb.0:
1337; CM-NEXT:    TEX 0 @4
1338; CM-NEXT:    ALU 3, @6, KC0[CB0:0-32], KC1[]
1339; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1340; CM-NEXT:    CF_END
1341; CM-NEXT:    Fetch clause starting at 4:
1342; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1343; CM-NEXT:    ALU clause starting at 6:
1344; CM-NEXT:     MUL_IEEE * T0.X, T0.X, literal.x,
1345; CM-NEXT:    1036831949(1.000000e-01), 0(0.000000e+00)
1346; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1347; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; x / 10.0 with arcp becomes a multiply by the reciprocal literal
; 0x3dcccccd (0.1) — the constant divisor is inverted at compile time.
1348  %x = load float, ptr addrspace(1) undef
1349  %rcp = fdiv arcp float %x, 10.0
1350  store float %rcp, ptr addrspace(1) %out, align 4
1351  ret void
1352}
1353
1354define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 {
1355; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
1356; SI:       ; %bb.0:
1357; SI-NEXT:    s_load_dword s6, s[0:1], 0x0
1358; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1359; SI-NEXT:    v_mov_b32_e32 v0, 0xbdcccccd
1360; SI-NEXT:    s_mov_b32 s3, 0xf000
1361; SI-NEXT:    s_mov_b32 s2, -1
1362; SI-NEXT:    s_waitcnt lgkmcnt(0)
1363; SI-NEXT:    v_mul_f32_e32 v0, s6, v0
1364; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1365; SI-NEXT:    s_endpgm
1366;
1367; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
1368; VI:       ; %bb.0:
1369; VI-NEXT:    s_load_dword s2, s[0:1], 0x0
1370; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1371; VI-NEXT:    v_mov_b32_e32 v0, 0xbdcccccd
1372; VI-NEXT:    s_waitcnt lgkmcnt(0)
1373; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
1374; VI-NEXT:    v_mov_b32_e32 v0, s0
1375; VI-NEXT:    v_mov_b32_e32 v1, s1
1376; VI-NEXT:    flat_store_dword v[0:1], v2
1377; VI-NEXT:    s_endpgm
1378;
1379; EG-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
1380; EG:       ; %bb.0:
1381; EG-NEXT:    TEX 0 @4
1382; EG-NEXT:    ALU 2, @6, KC0[CB0:0-32], KC1[]
1383; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1384; EG-NEXT:    CF_END
1385; EG-NEXT:    Fetch clause starting at 4:
1386; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1387; EG-NEXT:    ALU clause starting at 6:
1388; EG-NEXT:     MUL_IEEE T0.X, T0.X, literal.x,
1389; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1390; EG-NEXT:    -1110651699(-1.000000e-01), 2(2.802597e-45)
1391;
1392; CM-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
1393; CM:       ; %bb.0:
1394; CM-NEXT:    TEX 0 @4
1395; CM-NEXT:    ALU 3, @6, KC0[CB0:0-32], KC1[]
1396; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1397; CM-NEXT:    CF_END
1398; CM-NEXT:    Fetch clause starting at 4:
1399; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1400; CM-NEXT:    ALU clause starting at 6:
1401; CM-NEXT:     MUL_IEEE * T0.X, T0.X, literal.x,
1402; CM-NEXT:    -1110651699(-1.000000e-01), 0(0.000000e+00)
1403; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1404; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; x / -10.0 with arcp becomes a multiply by the negated reciprocal literal
; 0xbdcccccd (-0.1) — sign and inversion both folded into the constant.
1405  %x = load float, ptr addrspace(1) undef
1406  %rcp = fdiv arcp float %x, -10.0
1407  store float %rcp, ptr addrspace(1) %out, align 4
1408  ret void
1409}
1410
1411declare float @llvm.fabs.f32(float) #1
1412declare float @llvm.sqrt.f32(float) #1
1413
1414attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1415attributes #1 = { nounwind readnone }
1416attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1417attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" }
1418attributes #4 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="ieee,ieee" }
1419
1420!0 = !{float 2.500000e+00}
1421