; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink | FileCheck %s

declare hidden float @_Z3powff(float, float)
declare hidden double @_Z3powdd(double, double)
declare hidden half @_Z3powDhDh(half, half)

declare hidden float @_Z4powrff(float, float)
declare hidden double @_Z4powrdd(double, double)
declare hidden half @_Z4powrDhDh(half, half)

declare hidden float @_Z4pownfi(float, i32)
declare hidden double @_Z4powndi(double, i32)
declare hidden half @_Z4pownDhi(half, i32)

; --------------------------------------------------------------------
; test pow
; --------------------------------------------------------------------

define half @test_pow_fast_f16(half %x, half %y) {
; CHECK-LABEL: test_pow_fast_f16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powDhDh@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powDhDh@rel32@hi+12
; CHECK-NEXT:    s_setpc_b64 s[16:17]
  %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
  ret half %pow
}

define float @test_pow_fast_f32(float %x, float %y) {
; CHECK-LABEL: test_pow_fast_f32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powff@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powff@rel32@hi+12
; CHECK-NEXT:    s_setpc_b64 s[16:17]
  %pow = tail call fast float @_Z3powff(float %x, float %y)
  ret float %pow
}

define double @test_pow_fast_f64(double %x, double %y) {
; CHECK-LABEL: test_pow_fast_f64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_getpc_b64 s[16:17]
; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powdd@rel32@lo+4
; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powdd@rel32@hi+12
; CHECK-NEXT:    s_setpc_b64 s[16:17]
  %pow = tail call fast double @_Z3powdd(double %x, double %y)
  ret double %pow
}

define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f16__integral_y:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v1
; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
; CHECK-NEXT:    v_exp_f16_e32 v2, v2
; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = sitofp i32 %y.i to half
  %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
  ret half %pow
}

define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f32__integral_y:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v1
; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
; CHECK-NEXT:    v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT:    v_log_f32_e32 v3, v3
; CHECK-NEXT:    v_cvt_f32_i32_e32 v4, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v4
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_mov_b32_e32 v5, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
; CHECK-NEXT:    v_fma_f32 v2, v2, v4, v3
; CHECK-NEXT:    v_exp_f32_e32 v2, v2
; CHECK-NEXT:    v_not_b32_e32 v3, 63
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT:    v_ldexp_f32 v2, v2, v3
; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = sitofp i32 %y.i to float
  %pow = tail call fast float @_Z3powff(float %x, float %y)
  ret float %pow
}

define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f64__integral_y:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x800
; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
; CHECK-NEXT:    v_mov_b32_e32 v42, v1
; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v40, v31
; CHECK-NEXT:    v_mov_b32_e32 v41, v2
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v40
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
; CHECK-NEXT:    s_mov_b32 s32, s33
; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = sitofp i32 %y.i to double
  %pow = tail call fast double @_Z3powdd(double %x, double %y)
  ret double %pow
}

; --------------------------------------------------------------------
; test powr
; --------------------------------------------------------------------

define half @test_powr_fast_f16(half %x, half %y) {
; CHECK-LABEL: test_powr_fast_f16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_log_f16_e32 v0, v0
; CHECK-NEXT:    v_mul_f16_e32 v0, v1, v0
; CHECK-NEXT:    v_exp_f16_e32 v0, v0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %powr = tail call fast half @_Z4powrDhDh(half %x, half %y)
  ret half %powr
}

define float @test_powr_fast_f32(float %x, float %y) {
; CHECK-LABEL: test_powr_fast_f32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
; CHECK-NEXT:    v_ldexp_f32 v0, v0, v3
; CHECK-NEXT:    v_log_f32_e32 v0, v0
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
; CHECK-NEXT:    v_mul_f32_e32 v2, v1, v0
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
; CHECK-NEXT:    v_fma_f32 v0, v1, v0, v2
; CHECK-NEXT:    v_exp_f32_e32 v0, v0
; CHECK-NEXT:    v_not_b32_e32 v1, 63
; CHECK-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; CHECK-NEXT:    v_ldexp_f32 v0, v0, v1
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %powr = tail call fast float @_Z4powrff(float %x, float %y)
  ret float %powr
}

define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-LABEL: test_powr_fast_f64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x800
; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v42, v31
; CHECK-NEXT:    v_mov_b32_e32 v41, v3
; CHECK-NEXT:    v_mov_b32_e32 v40, v2
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[40:41], v[0:1]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v42
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
; CHECK-NEXT:    s_mov_b32 s32, s33
; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %powr = tail call fast double @_Z4powrdd(double %x, double %y)
  ret double %powr
}

; --------------------------------------------------------------------
; test pown
; --------------------------------------------------------------------

define half @test_pown_fast_f16(half %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
; CHECK-NEXT:    v_exp_f16_e32 v2, v2
; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
  ret half %call
}

define float @test_pown_fast_f32(float %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
; CHECK-NEXT:    v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT:    v_log_f32_e32 v3, v3
; CHECK-NEXT:    v_cvt_f32_i32_e32 v4, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v4
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_mov_b32_e32 v5, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
; CHECK-NEXT:    v_fma_f32 v2, v2, v4, v3
; CHECK-NEXT:    v_exp_f32_e32 v2, v2
; CHECK-NEXT:    v_not_b32_e32 v3, 63
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT:    v_ldexp_f32 v2, v2, v3
; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
  ret float %call
}

define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f64:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x800
; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
; CHECK-NEXT:    v_mov_b32_e32 v42, v1
; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v40, v31
; CHECK-NEXT:    v_mov_b32_e32 v41, v2
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v40
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
; CHECK-NEXT:    s_mov_b32 s32, s33
; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
  ret double %call
}

define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f16_known_even:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_log_f16_e64 v0, |v0|
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
; CHECK-NEXT:    v_exp_f16_e32 v0, v0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = shl i32 %y.arg, 1
  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
  ret half %call
}

define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f32_known_even:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
; CHECK-NEXT:    v_ldexp_f32 v0, |v0|, v3
; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
; CHECK-NEXT:    v_log_f32_e32 v0, v0
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
; CHECK-NEXT:    v_mul_f32_e32 v2, v0, v1
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
; CHECK-NEXT:    v_fma_f32 v0, v0, v1, v2
; CHECK-NEXT:    v_exp_f32_e32 v0, v0
; CHECK-NEXT:    v_not_b32_e32 v1, 63
; CHECK-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; CHECK-NEXT:    v_ldexp_f32 v0, v0, v1
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = shl i32 %y.arg, 1
  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
  ret float %call
}

define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f64_known_even:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v42, s16, 14
; CHECK-NEXT:    v_writelane_b32 v42, s30, 0
; CHECK-NEXT:    v_writelane_b32 v42, s31, 1
; CHECK-NEXT:    v_writelane_b32 v42, s34, 2
; CHECK-NEXT:    v_writelane_b32 v42, s35, 3
; CHECK-NEXT:    v_writelane_b32 v42, s36, 4
; CHECK-NEXT:    v_writelane_b32 v42, s37, 5
; CHECK-NEXT:    v_writelane_b32 v42, s38, 6
; CHECK-NEXT:    v_writelane_b32 v42, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x400
; CHECK-NEXT:    v_writelane_b32 v42, s40, 8
; CHECK-NEXT:    v_writelane_b32 v42, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v42, s42, 10
; CHECK-NEXT:    v_writelane_b32 v42, s43, 11
; CHECK-NEXT:    v_writelane_b32 v42, s44, 12
; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v42, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v40, v31
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    v_lshlrev_b32_e32 v41, 1, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v40
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    v_readlane_b32 s45, v42, 13
; CHECK-NEXT:    v_readlane_b32 s44, v42, 12
; CHECK-NEXT:    v_readlane_b32 s43, v42, 11
; CHECK-NEXT:    v_readlane_b32 s42, v42, 10
; CHECK-NEXT:    v_readlane_b32 s41, v42, 9
; CHECK-NEXT:    v_readlane_b32 s40, v42, 8
; CHECK-NEXT:    v_readlane_b32 s39, v42, 7
; CHECK-NEXT:    v_readlane_b32 s38, v42, 6
; CHECK-NEXT:    v_readlane_b32 s37, v42, 5
; CHECK-NEXT:    v_readlane_b32 s36, v42, 4
; CHECK-NEXT:    v_readlane_b32 s35, v42, 3
; CHECK-NEXT:    v_readlane_b32 s34, v42, 2
; CHECK-NEXT:    v_readlane_b32 s31, v42, 1
; CHECK-NEXT:    v_readlane_b32 s30, v42, 0
; CHECK-NEXT:    s_mov_b32 s32, s33
; CHECK-NEXT:    v_readlane_b32 s4, v42, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = shl i32 %y.arg, 1
  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
  ret double %call
}

define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f16_known_odd:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_log_f16_e64 v2, |v0|
; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT:    v_mul_f16_e32 v1, v2, v1
; CHECK-NEXT:    v_exp_f16_e32 v1, v1
; CHECK-NEXT:    v_bfi_b32 v0, s4, v1, v0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = or i32 %y.arg, 1
  %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
  ret half %call
}

define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f32_known_odd:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s4, 0x800000
; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 5, v3
; CHECK-NEXT:    v_ldexp_f32 v3, |v0|, v3
; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
; CHECK-NEXT:    v_log_f32_e32 v3, v3
; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v1
; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
; CHECK-NEXT:    v_mov_b32_e32 v4, 0x42800000
; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CHECK-NEXT:    v_fma_f32 v1, v2, v1, v3
; CHECK-NEXT:    v_exp_f32_e32 v1, v1
; CHECK-NEXT:    v_not_b32_e32 v2, 63
; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; CHECK-NEXT:    s_brev_b32 s4, -2
; CHECK-NEXT:    v_ldexp_f32 v1, v1, v2
; CHECK-NEXT:    v_bfi_b32 v0, s4, v1, v0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = or i32 %y.arg, 1
  %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
  ret float %call
}

define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f64_known_odd:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_mov_b32 s16, s33
; CHECK-NEXT:    s_mov_b32 s33, s32
; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
; CHECK-NEXT:    s_addk_i32 s32, 0x800
; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
; CHECK-NEXT:    v_mov_b32_e32 v41, v1
; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v41
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
; CHECK-NEXT:    v_mov_b32_e32 v40, v31
; CHECK-NEXT:    s_mov_b32 s42, s15
; CHECK-NEXT:    s_mov_b32 s43, s14
; CHECK-NEXT:    s_mov_b32 s44, s13
; CHECK-NEXT:    s_mov_b32 s45, s12
; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT:    v_or_b32_e32 v42, 1, v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
; CHECK-NEXT:    s_getpc_b64 s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT:    s_mov_b32 s12, s45
; CHECK-NEXT:    s_mov_b32 s13, s44
; CHECK-NEXT:    s_mov_b32 s14, s43
; CHECK-NEXT:    s_mov_b32 s15, s42
; CHECK-NEXT:    v_mov_b32_e32 v31, v40
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT:    v_and_b32_e32 v2, 0x80000000, v41
; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
; CHECK-NEXT:    s_mov_b32 s32, s33
; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
; CHECK-NEXT:    s_mov_b32 s33, s4
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %y = or i32 %y.arg, 1
  %call = tail call fast double @_Z4powndi(double %x, i32 %y)
  ret double %call
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}