xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (revision 88a239d292da80f260788c0817a07cbc0a8ac758)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
5
; Integer unsigned max/min intrinsics; called by the umax and umin tests
; further down in this file.
6declare i16 @llvm.umax.i16(i16, i16)
7declare i64 @llvm.umin.i64(i64, i64)
8
; Vector ldexp: <4 x float> value with a per-lane i32 exponent.
9declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
10
; Per lane: 9.0 * uitofp(1 << %i) on <4 x i32> / <4 x float>.
; The expected code for all three targets keeps the straightforward
; sequence: variable shift of 1, unsigned int-to-float convert, then a
; multiply by the literal 0x41100000 (the f32 bit pattern of 9.0).
; gfx1100 additionally pairs the multiplies into v_dual_mul_f32.
11define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
12; VI-LABEL: fmul_pow2_4xfloat:
13; VI:       ; %bb.0:
14; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
16; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
17; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
18; VI-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
19; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
20; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
21; VI-NEXT:    v_cvt_f32_u32_e32 v2, v2
22; VI-NEXT:    v_cvt_f32_u32_e32 v3, v3
23; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
24; VI-NEXT:    v_mul_f32_e32 v1, 0x41100000, v1
25; VI-NEXT:    v_mul_f32_e32 v2, 0x41100000, v2
26; VI-NEXT:    v_mul_f32_e32 v3, 0x41100000, v3
27; VI-NEXT:    s_setpc_b64 s[30:31]
28;
29; GFX10-LABEL: fmul_pow2_4xfloat:
30; GFX10:       ; %bb.0:
31; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
33; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
34; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
35; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
36; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
37; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
38; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
39; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
40; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
41; GFX10-NEXT:    v_mul_f32_e32 v1, 0x41100000, v1
42; GFX10-NEXT:    v_mul_f32_e32 v2, 0x41100000, v2
43; GFX10-NEXT:    v_mul_f32_e32 v3, 0x41100000, v3
44; GFX10-NEXT:    s_setpc_b64 s[30:31]
45;
46; GFX11-LABEL: fmul_pow2_4xfloat:
47; GFX11:       ; %bb.0:
48; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
50; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
51; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
52; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
53; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
54; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
55; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
56; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
57; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
58; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
59; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
60; GFX11-NEXT:    v_dual_mul_f32 v0, 0x41100000, v0 :: v_dual_mul_f32 v1, 0x41100000, v1
61; GFX11-NEXT:    v_dual_mul_f32 v2, 0x41100000, v2 :: v_dual_mul_f32 v3, 0x41100000, v3
62; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: splat(1) << %i, converted unsigned to float, times splat(9.0).
63  %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
64  %p2_f = uitofp <4 x i32> %p2 to <4 x float>
65  %r = fmul <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
66  ret <4 x float> %r
67}
68
; Per lane: llvm.ldexp(9.0, %i) on <4 x float> with <4 x i32> exponents.
; Every target is expected to emit one v_ldexp_f32 per lane. The fiji
; output first copies the 9.0 bit pattern (0x41100000) into s4, while the
; gfx1010 and gfx1100 outputs use it directly as a literal operand.
69define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
70; VI-LABEL: fmul_pow2_ldexp_4xfloat:
71; VI:       ; %bb.0:
72; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73; VI-NEXT:    s_mov_b32 s4, 0x41100000
74; VI-NEXT:    v_ldexp_f32 v0, s4, v0
75; VI-NEXT:    v_ldexp_f32 v1, s4, v1
76; VI-NEXT:    v_ldexp_f32 v2, s4, v2
77; VI-NEXT:    v_ldexp_f32 v3, s4, v3
78; VI-NEXT:    s_setpc_b64 s[30:31]
79;
80; GFX10-LABEL: fmul_pow2_ldexp_4xfloat:
81; GFX10:       ; %bb.0:
82; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX10-NEXT:    v_ldexp_f32 v0, 0x41100000, v0
84; GFX10-NEXT:    v_ldexp_f32 v1, 0x41100000, v1
85; GFX10-NEXT:    v_ldexp_f32 v2, 0x41100000, v2
86; GFX10-NEXT:    v_ldexp_f32 v3, 0x41100000, v3
87; GFX10-NEXT:    s_setpc_b64 s[30:31]
88;
89; GFX11-LABEL: fmul_pow2_ldexp_4xfloat:
90; GFX11:       ; %bb.0:
91; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92; GFX11-NEXT:    v_ldexp_f32 v0, 0x41100000, v0
93; GFX11-NEXT:    v_ldexp_f32 v1, 0x41100000, v1
94; GFX11-NEXT:    v_ldexp_f32 v2, 0x41100000, v2
95; GFX11-NEXT:    v_ldexp_f32 v3, 0x41100000, v3
96; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: direct call to the vector ldexp intrinsic with a 9.0 splat.
97  %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
98  ret <4 x float> %r
99}
100
; Per lane: 9.0 / uitofp(1 << %i) on <4 x i32> / <4 x float>.
; Here the expected code contains no floating-point instruction at all:
; each lane shifts %i left by 23 (placing it at the f32 exponent field)
; and subtracts the result from 0x41100000, the f32 bit pattern of 9.0.
; In other words the divide by a power of two is done as an integer
; subtraction on the exponent bits.
101define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) {
102; VI-LABEL: fdiv_pow2_4xfloat:
103; VI:       ; %bb.0:
104; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
106; VI-NEXT:    v_lshlrev_b32_e32 v1, 23, v1
107; VI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
108; VI-NEXT:    v_lshlrev_b32_e32 v3, 23, v3
109; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0x41100000, v0
110; VI-NEXT:    v_sub_u32_e32 v1, vcc, 0x41100000, v1
111; VI-NEXT:    v_sub_u32_e32 v2, vcc, 0x41100000, v2
112; VI-NEXT:    v_sub_u32_e32 v3, vcc, 0x41100000, v3
113; VI-NEXT:    s_setpc_b64 s[30:31]
114;
115; GFX10-LABEL: fdiv_pow2_4xfloat:
116; GFX10:       ; %bb.0:
117; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
119; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 23, v1
120; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
121; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 23, v3
122; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x41100000, v0
123; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x41100000, v1
124; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0x41100000, v2
125; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 0x41100000, v3
126; GFX10-NEXT:    s_setpc_b64 s[30:31]
127;
128; GFX11-LABEL: fdiv_pow2_4xfloat:
129; GFX11:       ; %bb.0:
130; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
132; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 23, v1
133; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
134; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 23, v3
135; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
136; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x41100000, v0
137; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x41100000, v1
138; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
139; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x41100000, v2
140; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 0x41100000, v3
141; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: splat(9.0) divided by uitofp(splat(1) << %i).
142  %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
143  %p2_f = uitofp <4 x i32> %p2 to <4 x float>
144  %r = fdiv <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
145  ret <4 x float> %r
146}
147
; Vector ldexp: <8 x half> value with a per-lane i16 exponent.
148declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
149
; Per lane: 0xH7000 * uitofp(1 << %i) on <8 x i16> / <8 x half>.
; The expected code keeps shift, u16-to-f16 convert and multiply by
; 0x7000 on every target. The fiji output handles the high half of each
; dword with SDWA forms and recombines with v_or_b32; the gfx1010 and
; gfx1100 outputs use packed shifts, repack with v_pack_b32_f16 and
; multiply with v_pk_mul_f16.
150define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
151; VI-LABEL: fmul_pow2_8xhalf:
152; VI:       ; %bb.0:
153; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154; VI-NEXT:    v_mov_b32_e32 v5, 1
155; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, 1
156; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
157; VI-NEXT:    v_lshlrev_b16_e64 v6, v2, 1
158; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
159; VI-NEXT:    v_lshlrev_b16_e64 v7, v1, 1
160; VI-NEXT:    v_lshlrev_b16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
161; VI-NEXT:    v_lshlrev_b16_e64 v8, v0, 1
162; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
163; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
164; VI-NEXT:    v_cvt_f16_u16_e32 v5, v8
165; VI-NEXT:    v_cvt_f16_u16_e32 v1, v1
166; VI-NEXT:    v_cvt_f16_u16_e32 v7, v7
167; VI-NEXT:    v_cvt_f16_u16_e32 v2, v2
168; VI-NEXT:    v_cvt_f16_u16_e32 v6, v6
169; VI-NEXT:    v_cvt_f16_u16_e32 v3, v3
170; VI-NEXT:    v_cvt_f16_u16_e32 v4, v4
171; VI-NEXT:    v_mov_b32_e32 v8, 0x7000
172; VI-NEXT:    v_mul_f16_e32 v4, 0x7000, v4
173; VI-NEXT:    v_mul_f16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
174; VI-NEXT:    v_mul_f16_e32 v6, 0x7000, v6
175; VI-NEXT:    v_mul_f16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
176; VI-NEXT:    v_mul_f16_e32 v7, 0x7000, v7
177; VI-NEXT:    v_mul_f16_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
178; VI-NEXT:    v_mul_f16_e32 v5, 0x7000, v5
179; VI-NEXT:    v_mul_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
180; VI-NEXT:    v_or_b32_e32 v0, v5, v0
181; VI-NEXT:    v_or_b32_e32 v1, v7, v1
182; VI-NEXT:    v_or_b32_e32 v2, v6, v2
183; VI-NEXT:    v_or_b32_e32 v3, v4, v3
184; VI-NEXT:    s_setpc_b64 s[30:31]
185;
186; GFX10-LABEL: fmul_pow2_8xhalf:
187; GFX10:       ; %bb.0:
188; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
190; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
191; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
192; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
193; GFX10-NEXT:    v_cvt_f16_u16_e32 v4, v3
194; GFX10-NEXT:    v_cvt_f16_u16_e32 v5, v2
195; GFX10-NEXT:    v_cvt_f16_u16_e32 v6, v1
196; GFX10-NEXT:    v_cvt_f16_u16_e32 v7, v0
197; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
198; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
199; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
200; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
201; GFX10-NEXT:    v_pack_b32_f16 v0, v7, v0
202; GFX10-NEXT:    v_pack_b32_f16 v1, v6, v1
203; GFX10-NEXT:    v_pack_b32_f16 v2, v5, v2
204; GFX10-NEXT:    v_pack_b32_f16 v3, v4, v3
205; GFX10-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
206; GFX10-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
207; GFX10-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
208; GFX10-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
209; GFX10-NEXT:    s_setpc_b64 s[30:31]
210;
211; GFX11-LABEL: fmul_pow2_8xhalf:
212; GFX11:       ; %bb.0:
213; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
214; GFX11-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
215; GFX11-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
216; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
217; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
218; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
219; GFX11-NEXT:    v_cvt_f16_u16_e32 v4, v3
220; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
221; GFX11-NEXT:    v_cvt_f16_u16_e32 v5, v2
222; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
223; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
224; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
225; GFX11-NEXT:    v_cvt_f16_u16_e32 v1, v1
226; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
227; GFX11-NEXT:    v_cvt_f16_u16_e32 v6, v6
228; GFX11-NEXT:    v_cvt_f16_u16_e32 v7, v7
229; GFX11-NEXT:    v_cvt_f16_u16_e32 v2, v2
230; GFX11-NEXT:    v_cvt_f16_u16_e32 v3, v3
231; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
232; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v6
233; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v7
234; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
235; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
236; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
237; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
238; GFX11-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
239; GFX11-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
240; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
241; GFX11-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
242; GFX11-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
243; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: splat(1) << %i, converted unsigned to half, times splat(0xH7000).
244  %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
245  %p2_f = uitofp <8 x i16> %p2 to <8 x half>
246  %r = fmul <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
247  ret <8 x half> %r
248}
249
; Per lane: llvm.ldexp(0xH7000, %i) on <8 x half> with <8 x i16> exponents.
; Each target is expected to emit eight 16-bit ldexp operations, one per
; element. The fiji and gfx1010 outputs reach the high half of each dword
; via SDWA source selects; the gfx1100 output extracts it with a 16-bit
; right shift first. Results are recombined with v_or_b32 on fiji and
; v_pack_b32_f16 on the other two.
250define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
251; VI-LABEL: fmul_pow2_ldexp_8xhalf:
252; VI:       ; %bb.0:
253; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254; VI-NEXT:    v_mov_b32_e32 v5, 0x7000
255; VI-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
256; VI-NEXT:    v_ldexp_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
257; VI-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
258; VI-NEXT:    v_ldexp_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
259; VI-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v1
260; VI-NEXT:    v_ldexp_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
261; VI-NEXT:    v_ldexp_f16_e32 v8, 0x7000, v0
262; VI-NEXT:    v_ldexp_f16_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
263; VI-NEXT:    v_or_b32_e32 v0, v8, v0
264; VI-NEXT:    v_or_b32_e32 v1, v7, v1
265; VI-NEXT:    v_or_b32_e32 v2, v6, v2
266; VI-NEXT:    v_or_b32_e32 v3, v4, v3
267; VI-NEXT:    s_setpc_b64 s[30:31]
268;
269; GFX10-LABEL: fmul_pow2_ldexp_8xhalf:
270; GFX10:       ; %bb.0:
271; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7000
273; GFX10-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v3
274; GFX10-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
275; GFX10-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v1
276; GFX10-NEXT:    v_ldexp_f16_e32 v8, 0x7000, v0
277; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
278; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
279; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
280; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
281; GFX10-NEXT:    v_pack_b32_f16 v0, v8, v0
282; GFX10-NEXT:    v_pack_b32_f16 v1, v7, v1
283; GFX10-NEXT:    v_pack_b32_f16 v2, v6, v2
284; GFX10-NEXT:    v_pack_b32_f16 v3, v5, v3
285; GFX10-NEXT:    s_setpc_b64 s[30:31]
286;
287; GFX11-LABEL: fmul_pow2_ldexp_8xhalf:
288; GFX11:       ; %bb.0:
289; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290; GFX11-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
291; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
292; GFX11-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v2
293; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
294; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
295; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
296; GFX11-NEXT:    v_ldexp_f16_e32 v1, 0x7000, v1
297; GFX11-NEXT:    v_ldexp_f16_e32 v0, 0x7000, v0
298; GFX11-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v6
299; GFX11-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v7
300; GFX11-NEXT:    v_ldexp_f16_e32 v2, 0x7000, v2
301; GFX11-NEXT:    v_ldexp_f16_e32 v3, 0x7000, v3
302; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
303; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v6
304; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v7
305; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
306; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
307; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
308; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: direct call to the vector ldexp intrinsic with a 0xH7000 splat.
309  %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
310  ret <8 x half> %r
311}
312
; Per lane: 0xH7000 / uitofp(1 << %i) on <8 x i16> / <8 x half>.
; As in the <4 x float> fdiv test, the expected code has no
; floating-point instruction: each element shifts %i left by 10 (placing
; it at the f16 exponent field) and subtracts that from 0x7000 with a
; 16-bit integer subtract. The fiji output uses scalar-per-half SDWA
; forms plus v_or_b32; the gfx1010 and gfx1100 outputs use packed
; v_pk_lshlrev_b16 / v_pk_sub_i16.
313define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
314; VI-LABEL: fdiv_pow2_8xhalf:
315; VI:       ; %bb.0:
316; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
317; VI-NEXT:    v_mov_b32_e32 v4, 10
318; VI-NEXT:    v_lshlrev_b16_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
319; VI-NEXT:    v_mov_b32_e32 v6, 0x7000
320; VI-NEXT:    v_lshlrev_b16_e32 v3, 10, v3
321; VI-NEXT:    v_lshlrev_b16_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
322; VI-NEXT:    v_lshlrev_b16_e32 v2, 10, v2
323; VI-NEXT:    v_lshlrev_b16_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
324; VI-NEXT:    v_lshlrev_b16_e32 v1, 10, v1
325; VI-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
326; VI-NEXT:    v_lshlrev_b16_e32 v0, 10, v0
327; VI-NEXT:    v_sub_u16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
328; VI-NEXT:    v_sub_u16_sdwa v7, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
329; VI-NEXT:    v_sub_u16_sdwa v8, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
330; VI-NEXT:    v_sub_u16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
331; VI-NEXT:    v_sub_u16_e32 v0, 0x7000, v0
332; VI-NEXT:    v_sub_u16_e32 v1, 0x7000, v1
333; VI-NEXT:    v_sub_u16_e32 v2, 0x7000, v2
334; VI-NEXT:    v_sub_u16_e32 v3, 0x7000, v3
335; VI-NEXT:    v_or_b32_e32 v0, v0, v4
336; VI-NEXT:    v_or_b32_e32 v1, v1, v8
337; VI-NEXT:    v_or_b32_e32 v2, v2, v7
338; VI-NEXT:    v_or_b32_e32 v3, v3, v5
339; VI-NEXT:    s_setpc_b64 s[30:31]
340;
341; GFX10-LABEL: fdiv_pow2_8xhalf:
342; GFX10:       ; %bb.0:
343; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
345; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
346; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
347; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1]
348; GFX10-NEXT:    v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1]
349; GFX10-NEXT:    v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1]
350; GFX10-NEXT:    v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
351; GFX10-NEXT:    v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
352; GFX10-NEXT:    s_setpc_b64 s[30:31]
353;
354; GFX11-LABEL: fdiv_pow2_8xhalf:
355; GFX11:       ; %bb.0:
356; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
357; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
358; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
359; GFX11-NEXT:    v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
360; GFX11-NEXT:    v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1]
361; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
362; GFX11-NEXT:    v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1]
363; GFX11-NEXT:    v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1]
364; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
365; GFX11-NEXT:    v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
366; GFX11-NEXT:    v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
367; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: splat(0xH7000) divided by uitofp(splat(1) << %i).
368  %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
369  %p2_f = uitofp <8 x i16> %p2 to <8 x half>
370  %r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
371  ret <8 x half> %r
372}
373
; Scalar double: 9.0 * uitofp(i64 (shl nuw 1, %cnt)).
; The expected code is not folded on any target: a 64-bit shift, then the
; u64-to-f64 conversion expanded as two v_cvt_f64_u32 (high and low
; dwords), v_ldexp_f64 by 32 on the high part and a v_add_f64, and
; finally a v_mul_f64 by 9.0 (high dword 0x40220000, low dword 0).
; The fiji output builds that constant in s[4:5]; the other two use a
; literal operand.
374define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
375; VI-LABEL: fmul_pow_shl_cnt:
376; VI:       ; %bb.0:
377; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
379; VI-NEXT:    s_mov_b32 s4, 0
380; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
381; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
382; VI-NEXT:    s_mov_b32 s5, 0x40220000
383; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
384; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
385; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
386; VI-NEXT:    s_setpc_b64 s[30:31]
387;
388; GFX10-LABEL: fmul_pow_shl_cnt:
389; GFX10:       ; %bb.0:
390; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
392; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
393; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
394; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
395; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
396; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
397; GFX10-NEXT:    s_setpc_b64 s[30:31]
398;
399; GFX11-LABEL: fmul_pow_shl_cnt:
400; GFX11:       ; %bb.0:
401; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
403; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
404; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
405; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
406; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
407; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
408; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
409; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
410; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
411; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: (1 << %cnt) with nuw, converted unsigned to double, times 9.0.
412  %shl = shl nuw i64 1, %cnt
413  %conv = uitofp i64 %shl to double
414  %mul = fmul double 9.000000e+00, %conv
415  ret double %mul
416}
417
; Variant of fmul_pow_shl_cnt with a shifted constant of 2 and a negative
; multiplier: -9.0 * uitofp(i64 (shl nuw 2, %cnt)).
; The expected code has the same unfolded shape (64-bit shift, expanded
; u64-to-f64 conversion, v_mul_f64); only the shift operand (2) and the
; constant's high dword (0xc0220000, i.e. the sign bit set) differ.
418define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
419; VI-LABEL: fmul_pow_shl_cnt2:
420; VI:       ; %bb.0:
421; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
422; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
423; VI-NEXT:    s_mov_b32 s4, 0
424; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
425; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
426; VI-NEXT:    s_mov_b32 s5, 0xc0220000
427; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
428; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
429; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
430; VI-NEXT:    s_setpc_b64 s[30:31]
431;
432; GFX10-LABEL: fmul_pow_shl_cnt2:
433; GFX10:       ; %bb.0:
434; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
435; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
436; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
437; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
438; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
439; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
440; GFX10-NEXT:    v_mul_f64 v[0:1], 0xc0220000, v[0:1]
441; GFX10-NEXT:    s_setpc_b64 s[30:31]
442;
443; GFX11-LABEL: fmul_pow_shl_cnt2:
444; GFX11:       ; %bb.0:
445; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
446; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
447; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
448; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
449; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
450; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
451; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
452; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
453; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
454; GFX11-NEXT:    v_mul_f64 v[0:1], 0xc0220000, v[0:1]
455; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: (2 << %cnt) with nuw, converted unsigned to double, times -9.0.
456  %shl = shl nuw i64 2, %cnt
457  %conv = uitofp i64 %shl to double
458  %mul = fmul double -9.000000e+00, %conv
459  ret double %mul
460}
461
; Scalar float: 9.0 * uitofp(select(%c, 1 << %cnt, 2 << %cnt)).
; In the expected code the two shifts are merged into one: the i1 %c is
; masked and compared, v_cndmask picks the constant 1 or 2, and a single
; v_lshlrev_b32 shifts it by %cnt. The convert and the multiply by
; 0x41100000 (9.0) remain as separate instructions.
462define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
463; VI-LABEL: fmul_pow_select:
464; VI:       ; %bb.0:
465; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
466; VI-NEXT:    v_and_b32_e32 v1, 1, v1
467; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
468; VI-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc
469; VI-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
470; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
471; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
472; VI-NEXT:    s_setpc_b64 s[30:31]
473;
474; GFX10-LABEL: fmul_pow_select:
475; GFX10:       ; %bb.0:
476; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
477; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
478; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
479; GFX10-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
480; GFX10-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
481; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
482; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
483; GFX10-NEXT:    s_setpc_b64 s[30:31]
484;
485; GFX11-LABEL: fmul_pow_select:
486; GFX11:       ; %bb.0:
487; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
488; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
489; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
490; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
491; GFX11-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
492; GFX11-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
493; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
494; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
495; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
496; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: both select arms are shifts of a constant by the same %cnt;
; %c true picks (1 << %cnt), false picks (2 << %cnt).
497  %shl2 = shl nuw i32 2, %cnt
498  %shl1 = shl nuw i32 1, %cnt
499  %shl = select i1 %c, i32 %shl1, i32 %shl2
500  %conv = uitofp i32 %shl to float
501  %mul = fmul float 9.000000e+00, %conv
502  ret float %mul
503}
504
; Scalar float: 9.0 * uitofp(i64 umin(shl nuw 8, %cnt; 8192)).
; The expected code is not folded: a 64-bit shift of 8, the umin against
; 0x2000 done as a 64-bit compare plus two v_cndmask, the expanded
; u64-to-f32 conversion (leading-zero count, clamp to 32, normalize
; shift, sticky-bit min/or, v_cvt_f32_u32, v_ldexp_f32), and finally a
; multiply by 0x41100000 (9.0). The leading-zero count is spelled
; v_ffbh_u32 on fiji and gfx1010 and v_clz_i32_u32 on gfx1100.
505define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
506; VI-LABEL: fmul_fly_pow_mul_min_pow2:
507; VI:       ; %bb.0:
508; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
509; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
510; VI-NEXT:    s_mov_b64 s[4:5], 0x2000
511; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
512; VI-NEXT:    v_mov_b32_e32 v2, 0x2000
513; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
514; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
515; VI-NEXT:    v_ffbh_u32_e32 v2, v1
516; VI-NEXT:    v_min_u32_e32 v2, 32, v2
517; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
518; VI-NEXT:    v_min_u32_e32 v0, 1, v0
519; VI-NEXT:    v_or_b32_e32 v0, v1, v0
520; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
521; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
522; VI-NEXT:    v_ldexp_f32 v0, v0, v1
523; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
524; VI-NEXT:    s_setpc_b64 s[30:31]
525;
526; GFX10-LABEL: fmul_fly_pow_mul_min_pow2:
527; GFX10:       ; %bb.0:
528; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
530; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
531; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
532; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
533; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
534; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
535; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
536; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
537; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
538; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
539; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
540; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
541; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
542; GFX10-NEXT:    s_setpc_b64 s[30:31]
543;
544; GFX11-LABEL: fmul_fly_pow_mul_min_pow2:
545; GFX11:       ; %bb.0:
546; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
548; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
549; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
550; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
551; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
552; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
553; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
554; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
555; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
556; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
557; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
558; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
559; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
560; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
561; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
562; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
563; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
564; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
565; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: unsigned min of (8 << %cnt) and the constant 8192 (0x2000),
; converted unsigned i64 to float, times 9.0.
566  %shl8 = shl nuw i64 8, %cnt
567  %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192)
568  %conv = uitofp i64 %shl to float
569  %mul = fmul float 9.000000e+00, %conv
570  ret float %mul
571}
572
; Scalar double: 3.0 * uitofp(i16 umax(1 << %cnt, 2 << %cnt)).
; In the expected code no max instruction remains: only a 16-bit shift of
; the constant 2 by %cnt is emitted, followed by v_cvt_f64_u32 and a
; v_mul_f64 by 3.0 (high dword 0x40080000, low dword 0). The gfx1010 and
; gfx1100 outputs also mask the shift result with 0xffff before the
; convert; the fiji output has no such mask.
573define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
574; VI-LABEL: fmul_pow_mul_max_pow2:
575; VI:       ; %bb.0:
576; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 2
578; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
579; VI-NEXT:    s_mov_b32 s4, 0
580; VI-NEXT:    s_mov_b32 s5, 0x40080000
581; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
582; VI-NEXT:    s_setpc_b64 s[30:31]
583;
584; GFX10-LABEL: fmul_pow_mul_max_pow2:
585; GFX10:       ; %bb.0:
586; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
587; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 2
588; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
589; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
590; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
591; GFX10-NEXT:    s_setpc_b64 s[30:31]
592;
593; GFX11-LABEL: fmul_pow_mul_max_pow2:
594; GFX11:       ; %bb.0:
595; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
596; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 2
597; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
598; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
599; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
600; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
601; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
602; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: unsigned max of two nuw shifts of constants (1 and 2) by the
; same %cnt, converted unsigned i16 to double, times 3.0.
603  %shl2 = shl nuw i16 2, %cnt
604  %shl1 = shl nuw i16 1, %cnt
605  %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2)
606  %conv = uitofp i16 %shl to double
607  %mul = fmul double 3.000000e+00, %conv
608  ret double %mul
609}
610
; Scalar double: 9.0 * uitofp(i64 (shl nuw %v, %cnt)).
; Unlike the other shl tests, the shifted value is the argument %v rather
; than a constant, so the shift result need not be a power of two (the
; function name marks this as an expected non-fold case).
; The expected code is the plain sequence: 64-bit shift of v[0:1] by v2,
; expanded u64-to-f64 conversion, then v_mul_f64 by 9.0 (high dword
; 0x40220000).
611define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
612; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
613; VI:       ; %bb.0:
614; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
615; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
616; VI-NEXT:    s_mov_b32 s4, 0
617; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
618; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
619; VI-NEXT:    s_mov_b32 s5, 0x40220000
620; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
621; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
622; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
623; VI-NEXT:    s_setpc_b64 s[30:31]
624;
625; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
626; GFX10:       ; %bb.0:
627; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
629; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
630; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
631; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
632; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
633; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
634; GFX10-NEXT:    s_setpc_b64 s[30:31]
635;
636; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
637; GFX11:       ; %bb.0:
638; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
639; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
640; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
641; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
642; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
643; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
644; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
645; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
646; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
647; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
648; GFX11-NEXT:    s_setpc_b64 s[30:31]
; Input pattern: (%v << %cnt) with nuw, converted unsigned to double, times 9.0.
649  %shl = shl nuw i64 %v, %cnt
650  %conv = uitofp i64 %shl to double
651  %mul = fmul double 9.000000e+00, %conv
652  ret double %mul
653}
654
; <2 x i64> shift of splat 2, converted to <2 x float> and multiplied by splat
; 15.0. On every checked target the expected output keeps the full u64->f32
; conversion expansion (leading-zero count, normalize, cvt, ldexp) followed by
; a multiply with 0x41700000 (v_mul_f32, or v_dual_mul_f32 on GFX11), i.e. the
; fmul is not replaced.
655define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nounwind {
656; VI-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
657; VI:       ; %bb.0:
658; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
659; VI-NEXT:    v_lshlrev_b64 v[1:2], v2, 2
660; VI-NEXT:    v_ffbh_u32_e32 v3, v2
661; VI-NEXT:    v_min_u32_e32 v5, 32, v3
662; VI-NEXT:    v_lshlrev_b64 v[1:2], v5, v[1:2]
663; VI-NEXT:    v_lshlrev_b64 v[3:4], v0, 2
664; VI-NEXT:    v_min_u32_e32 v0, 1, v1
665; VI-NEXT:    v_or_b32_e32 v0, v2, v0
666; VI-NEXT:    v_cvt_f32_u32_e32 v2, v0
667; VI-NEXT:    v_ffbh_u32_e32 v0, v4
668; VI-NEXT:    v_min_u32_e32 v6, 32, v0
669; VI-NEXT:    v_lshlrev_b64 v[0:1], v6, v[3:4]
670; VI-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
671; VI-NEXT:    v_min_u32_e32 v0, 1, v0
672; VI-NEXT:    v_or_b32_e32 v0, v1, v0
673; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
674; VI-NEXT:    v_ldexp_f32 v1, v2, v3
675; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
676; VI-NEXT:    v_ldexp_f32 v0, v0, v2
677; VI-NEXT:    v_mul_f32_e32 v0, 0x41700000, v0
678; VI-NEXT:    v_mul_f32_e32 v1, 0x41700000, v1
679; VI-NEXT:    s_setpc_b64 s[30:31]
680;
681; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
682; GFX10:       ; %bb.0:
683; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
684; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
685; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
686; GFX10-NEXT:    v_ffbh_u32_e32 v4, v1
687; GFX10-NEXT:    v_ffbh_u32_e32 v5, v3
688; GFX10-NEXT:    v_min_u32_e32 v4, 32, v4
689; GFX10-NEXT:    v_min_u32_e32 v5, 32, v5
690; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
691; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
692; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
693; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
694; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
695; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
696; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 32, v5
697; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v4
698; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
699; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
700; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
701; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
702; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41700000, v0
703; GFX10-NEXT:    v_mul_f32_e32 v1, 0x41700000, v1
704; GFX10-NEXT:    s_setpc_b64 s[30:31]
705;
706; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
707; GFX11:       ; %bb.0:
708; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
709; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
710; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
711; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
712; GFX11-NEXT:    v_clz_i32_u32_e32 v4, v1
713; GFX11-NEXT:    v_clz_i32_u32_e32 v5, v3
714; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
715; GFX11-NEXT:    v_min_u32_e32 v4, 32, v4
716; GFX11-NEXT:    v_min_u32_e32 v5, 32, v5
717; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
718; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
719; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
720; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
721; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
722; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
723; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
724; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
725; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
726; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 32, v5
727; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v4
728; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
729; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
730; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
731; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
732; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
733; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
734; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
735; GFX11-NEXT:    v_dual_mul_f32 v0, 0x41700000, v0 :: v_dual_mul_f32 v1, 0x41700000, v1
736; GFX11-NEXT:    s_setpc_b64 s[30:31]
737  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
738  %conv = uitofp <2 x i64> %shl to <2 x float>
739  %mul = fmul <2 x float> <float 15.000000e+00, float 15.000000e+00>, %conv
740  ret <2 x float> %mul
741}
742
; <2 x i64> shift of splat 2, converted to <2 x double> and multiplied by splat
; 15.0. The expected output on all three targets still performs the per-lane
; u64->f64 conversion (two v_cvt_f64_u32, v_ldexp_f64 by 32, v_add_f64) and a
; v_mul_f64 by the constant whose high dword is 0x402e0000.
743define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
744; VI-LABEL: fmul_pow_shl_cnt_vec:
745; VI:       ; %bb.0:
746; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
747; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
748; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
749; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
750; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
751; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
752; VI-NEXT:    s_mov_b32 s4, 0
753; VI-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
754; VI-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
755; VI-NEXT:    v_cvt_f64_u32_e32 v[7:8], v2
756; VI-NEXT:    s_mov_b32 s5, 0x402e0000
757; VI-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
758; VI-NEXT:    v_add_f64 v[2:3], v[5:6], v[7:8]
759; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
760; VI-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
761; VI-NEXT:    s_setpc_b64 s[30:31]
762;
763; GFX10-LABEL: fmul_pow_shl_cnt_vec:
764; GFX10:       ; %bb.0:
765; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
766; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
767; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
768; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
769; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
770; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
771; GFX10-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
772; GFX10-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
773; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
774; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
775; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
776; GFX10-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
777; GFX10-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
778; GFX10-NEXT:    s_setpc_b64 s[30:31]
779;
780; GFX11-LABEL: fmul_pow_shl_cnt_vec:
781; GFX11:       ; %bb.0:
782; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
783; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
784; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
785; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
786; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
787; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
788; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
789; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
790; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
791; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
792; GFX11-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
793; GFX11-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
794; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
795; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
796; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
797; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
798; GFX11-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
799; GFX11-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
800; GFX11-NEXT:    s_setpc_b64 s[30:31]
801  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
802  %conv = uitofp <2 x i64> %shl to <2 x double>
803  %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv
804  ret <2 x double> %mul
805}
806
; The fmul of uitofp(2 << %cnt) by splat 5.0 feeds an fadd with %add. The
; checks expect, per lane, a 32-bit shift, v_cvt_f32_u32, a multiply by
; 0x40a00000 and then a separate add with the corresponding %add register.
; NOTE(review): the "preserve_fma" name suggests the fmul is kept so it could
; fuse with the fadd; no fused instruction appears in these checks -- confirm.
807define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float> %add) nounwind {
808; VI-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
809; VI:       ; %bb.0:
810; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
811; VI-NEXT:    v_lshlrev_b32_e64 v3, v3, 2
812; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, 2
813; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, 2
814; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
815; VI-NEXT:    v_cvt_f32_u32_e32 v3, v3
816; VI-NEXT:    v_cvt_f32_u32_e32 v2, v2
817; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
818; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
819; VI-NEXT:    v_mul_f32_e32 v3, 0x40a00000, v3
820; VI-NEXT:    v_mul_f32_e32 v2, 0x40a00000, v2
821; VI-NEXT:    v_mul_f32_e32 v1, 0x40a00000, v1
822; VI-NEXT:    v_mul_f32_e32 v0, 0x40a00000, v0
823; VI-NEXT:    v_add_f32_e32 v0, v0, v4
824; VI-NEXT:    v_add_f32_e32 v1, v1, v5
825; VI-NEXT:    v_add_f32_e32 v2, v2, v6
826; VI-NEXT:    v_add_f32_e32 v3, v3, v7
827; VI-NEXT:    s_setpc_b64 s[30:31]
828;
829; GFX10-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
830; GFX10:       ; %bb.0:
831; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
832; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
833; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, 2
834; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, 2
835; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v3, 2
836; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
837; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
838; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
839; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
840; GFX10-NEXT:    v_mul_f32_e32 v0, 0x40a00000, v0
841; GFX10-NEXT:    v_mul_f32_e32 v1, 0x40a00000, v1
842; GFX10-NEXT:    v_mul_f32_e32 v2, 0x40a00000, v2
843; GFX10-NEXT:    v_mul_f32_e32 v3, 0x40a00000, v3
844; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
845; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
846; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
847; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
848; GFX10-NEXT:    s_setpc_b64 s[30:31]
849;
850; GFX11-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
851; GFX11:       ; %bb.0:
852; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
853; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
854; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 2
855; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, 2
856; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v3, 2
857; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
858; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
859; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
860; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
861; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
862; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
863; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
864; GFX11-NEXT:    v_dual_mul_f32 v0, 0x40a00000, v0 :: v_dual_mul_f32 v1, 0x40a00000, v1
865; GFX11-NEXT:    v_dual_mul_f32 v2, 0x40a00000, v2 :: v_dual_mul_f32 v3, 0x40a00000, v3
866; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
867; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
868; GFX11-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
869; GFX11-NEXT:    s_setpc_b64 s[30:31]
870  %shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt
871  %conv = uitofp <4 x i32> %shl to <4 x float>
872  %mul = fmul <4 x float> <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>, %conv
873  %res = fadd <4 x float> %mul, %add
874  ret <4 x float> %res
875}
876
; The shifted base is splat 2, but the fmul constant is the non-splat vector
; <15.0, 14.0>. The checks expect the per-lane u64->f64 conversion followed by
; a v_mul_f64 with a different constant per lane (high dwords 0x402e0000 and
; 0x402c0000). NOTE(review): the "todo" suffix presumably marks a case the fold
; does not handle yet -- confirm against the combine.
877define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwind {
878; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
879; VI:       ; %bb.0:
880; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
882; VI-NEXT:    s_mov_b32 s4, 0
883; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v1
884; VI-NEXT:    v_lshlrev_b64 v[1:2], v2, 2
885; VI-NEXT:    s_mov_b32 s5, 0x402e0000
886; VI-NEXT:    v_cvt_f64_u32_e32 v[5:6], v2
887; VI-NEXT:    v_ldexp_f64 v[2:3], v[3:4], 32
888; VI-NEXT:    v_ldexp_f64 v[4:5], v[5:6], 32
889; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v0
890; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v1
891; VI-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
892; VI-NEXT:    v_add_f64 v[4:5], v[4:5], v[0:1]
893; VI-NEXT:    v_mul_f64 v[0:1], v[2:3], s[4:5]
894; VI-NEXT:    s_mov_b32 s4, 0
895; VI-NEXT:    s_mov_b32 s5, 0x402c0000
896; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], s[4:5]
897; VI-NEXT:    s_setpc_b64 s[30:31]
898;
899; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
900; GFX10:       ; %bb.0:
901; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
902; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
903; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
904; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
905; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
906; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
907; GFX10-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
908; GFX10-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
909; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
910; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
911; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
912; GFX10-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
913; GFX10-NEXT:    v_mul_f64 v[2:3], 0x402c0000, v[2:3]
914; GFX10-NEXT:    s_setpc_b64 s[30:31]
915;
916; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
917; GFX11:       ; %bb.0:
918; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
919; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
920; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
921; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
922; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
923; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
924; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
925; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
926; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
927; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
928; GFX11-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
929; GFX11-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
930; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
931; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
932; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
933; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
934; GFX11-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
935; GFX11-NEXT:    v_mul_f64 v[2:3], 0x402c0000, v[2:3]
936; GFX11-NEXT:    s_setpc_b64 s[30:31]
937  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
938  %conv = uitofp <2 x i64> %shl to <2 x double>
939  %mul = fmul <2 x double> <double 15.000000e+00, double 14.000000e+00>, %conv
940  ret <2 x double> %mul
941}
942
; The fmul constant is splat 15.0, but the shifted base is the non-splat vector
; <2, 1>. The checks expect per-lane 64-bit shifts of 2 and 1 respectively,
; then the u64->f64 conversion and a v_mul_f64 by the constant with high dword
; 0x402e0000. NOTE(review): the "todo" suffix presumably marks a case the fold
; does not handle yet -- confirm against the combine.
943define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwind {
944; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
945; VI:       ; %bb.0:
946; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
947; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
948; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
949; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
950; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
951; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
952; VI-NEXT:    s_mov_b32 s4, 0
953; VI-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
954; VI-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
955; VI-NEXT:    v_cvt_f64_u32_e32 v[7:8], v2
956; VI-NEXT:    s_mov_b32 s5, 0x402e0000
957; VI-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
958; VI-NEXT:    v_add_f64 v[2:3], v[5:6], v[7:8]
959; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
960; VI-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
961; VI-NEXT:    s_setpc_b64 s[30:31]
962;
963; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
964; GFX10:       ; %bb.0:
965; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
966; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
967; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
968; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
969; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
970; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
971; GFX10-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
972; GFX10-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
973; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
974; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
975; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
976; GFX10-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
977; GFX10-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
978; GFX10-NEXT:    s_setpc_b64 s[30:31]
979;
980; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
981; GFX11:       ; %bb.0:
982; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
983; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
984; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
985; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
986; GFX11-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
987; GFX11-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
988; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
989; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
990; GFX11-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
991; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
992; GFX11-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
993; GFX11-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
994; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
995; GFX11-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
996; GFX11-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
997; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
998; GFX11-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
999; GFX11-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
1000; GFX11-NEXT:    s_setpc_b64 s[30:31]
1001  %shl = shl nsw nuw <2 x i64> <i64 2, i64 1>, %cnt
1002  %conv = uitofp <2 x i64> %shl to <2 x double>
1003  %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv
1004  ret <2 x double> %mul
1005}
1006
; <2 x i16> shift of splat 2, converted to <2 x half> and multiplied by splat
; 15.0 (0x4b80). The checks expect the 16-bit shifts, the u16->f16 conversions
; and an f16 multiply (v_mul_f16 on VI, v_pk_mul_f16 on GFX10/GFX11) to remain.
; NOTE(review): "fail_to_large" presumably refers to the scaled result possibly
; exceeding what the half exponent can represent -- confirm.
1007define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
1008; VI-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
1009; VI:       ; %bb.0:
1010; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1011; VI-NEXT:    v_mov_b32_e32 v1, 2
1012; VI-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1013; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 2
1014; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
1015; VI-NEXT:    v_cvt_f16_u16_e32 v1, v1
1016; VI-NEXT:    v_mov_b32_e32 v2, 0x4b80
1017; VI-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1018; VI-NEXT:    v_mul_f16_e32 v0, 0x4b80, v0
1019; VI-NEXT:    v_or_b32_e32 v0, v0, v1
1020; VI-NEXT:    s_setpc_b64 s[30:31]
1021;
1022; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
1023; GFX10:       ; %bb.0:
1024; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1025; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
1026; GFX10-NEXT:    v_cvt_f16_u16_e32 v1, v0
1027; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1028; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
1029; GFX10-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
1030; GFX10-NEXT:    s_setpc_b64 s[30:31]
1031;
1032; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
1033; GFX11:       ; %bb.0:
1034; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1035; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
1036; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1037; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1038; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
1039; GFX11-NEXT:    v_cvt_f16_u16_e32 v1, v1
1040; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1041; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
1042; GFX11-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
1043; GFX11-NEXT:    s_setpc_b64 s[30:31]
1044  %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt
1045  %conv = uitofp <2 x i16> %shl to <2 x half>
1046  %mul = fmul <2 x half> <half 15.000000e+00, half 15.000000e+00>, %conv
1047  ret <2 x half> %mul
1048}
1049
; 1 << %cnt computed in i64, converted to double and multiplied by
; 9.745314e+288. The checks expect the 64-bit shift, the u64->f64 conversion
; and a v_mul_f64 against the constant materialized in an SGPR pair
; (0xff5f3992 low, 0x7befffff high). NOTE(review): per the "maybe_bad_exp"
; name, this large multiplier combined with a 64-bit shift range presumably
; could push the result exponent out of range -- confirm.
1050define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
1051; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
1052; VI:       ; %bb.0:
1053; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1054; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
1055; VI-NEXT:    s_mov_b32 s4, 0xff5f3992
1056; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
1057; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
1058; VI-NEXT:    s_mov_b32 s5, 0x7befffff
1059; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
1060; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
1061; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
1062; VI-NEXT:    s_setpc_b64 s[30:31]
1063;
1064; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
1065; GFX10:       ; %bb.0:
1066; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1067; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
1068; GFX10-NEXT:    s_mov_b32 s4, 0xff5f3992
1069; GFX10-NEXT:    s_mov_b32 s5, 0x7befffff
1070; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
1071; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
1072; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
1073; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
1074; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
1075; GFX10-NEXT:    s_setpc_b64 s[30:31]
1076;
1077; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
1078; GFX11:       ; %bb.0:
1079; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1080; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
1081; GFX11-NEXT:    s_mov_b32 s0, 0xff5f3992
1082; GFX11-NEXT:    s_mov_b32 s1, 0x7befffff
1083; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1084; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
1085; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
1086; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1087; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
1088; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
1089; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1090; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
1091; GFX11-NEXT:    s_setpc_b64 s[30:31]
1092  %shl = shl nuw i64 1, %cnt
1093  %conv = uitofp i64 %shl to double
1094  %mul = fmul double 9.745314e+288, %conv
1095  ret double %mul
1096}
1097
; 1 << %cnt computed in i16, converted to double and multiplied by
; 9.745314e+288. The expected output on every target still contains the 16-bit
; shift, a single v_cvt_f64_u32 and a v_mul_f64 against the constant held in an
; SGPR pair; GFX10/GFX11 additionally mask the shift result with 0xffff before
; converting. NOTE(review): "safe" presumably refers to the i16 shift range
; keeping the result exponent in range -- confirm.
1098define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
1099; VI-LABEL: fmul_pow_shl_cnt_safe:
1100; VI:       ; %bb.0:
1101; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 1
1103; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
1104; VI-NEXT:    s_mov_b32 s4, 0xff5f3992
1105; VI-NEXT:    s_mov_b32 s5, 0x7befffff
1106; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
1107; VI-NEXT:    s_setpc_b64 s[30:31]
1108;
1109; GFX10-LABEL: fmul_pow_shl_cnt_safe:
1110; GFX10:       ; %bb.0:
1111; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1112; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 1
1113; GFX10-NEXT:    s_mov_b32 s4, 0xff5f3992
1114; GFX10-NEXT:    s_mov_b32 s5, 0x7befffff
1115; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1116; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
1117; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
1118; GFX10-NEXT:    s_setpc_b64 s[30:31]
1119;
1120; GFX11-LABEL: fmul_pow_shl_cnt_safe:
1121; GFX11:       ; %bb.0:
1122; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1123; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
1124; GFX11-NEXT:    s_mov_b32 s0, 0xff5f3992
1125; GFX11-NEXT:    s_mov_b32 s1, 0x7befffff
1126; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1127; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1128; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
1129; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1130; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
1131; GFX11-NEXT:    s_setpc_b64 s[30:31]
1132  %shl = shl nuw i16 1, %cnt
1133  %conv = uitofp i16 %shl to double
1134  %mul = fmul double 9.745314e+288, %conv
1135  ret double %mul
1136}
1137
; 1.0 / uitofp(1 << %cnt) on <2 x double>. The checks contain no conversion or
; divide expansion: for each lane the count register (v0 / v2) is shifted left
; by 20 and subtracted from 0x3ff00000 to form the high dword of the result,
; while the low dword comes from a 0 - 0 subtract whose borrow feeds the
; high-half subtract-with-borrow.
1138define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
1139; VI-LABEL: fdiv_pow_shl_cnt_vec:
1140; VI:       ; %bb.0:
1141; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1142; VI-NEXT:    v_lshlrev_b32_e32 v1, 20, v0
1143; VI-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
1144; VI-NEXT:    v_sub_u32_e64 v0, vcc, 0, 0
1145; VI-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
1146; VI-NEXT:    v_lshlrev_b32_e32 v4, 20, v2
1147; VI-NEXT:    v_sub_u32_e64 v2, vcc, 0, 0
1148; VI-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
1149; VI-NEXT:    s_setpc_b64 s[30:31]
1150;
1151; GFX10-LABEL: fdiv_pow_shl_cnt_vec:
1152; GFX10:       ; %bb.0:
1153; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1154; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 20, v0
1155; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
1156; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, 0, 0
1157; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
1158; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, 0, 0
1159; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
1160; GFX10-NEXT:    s_setpc_b64 s[30:31]
1161;
1162; GFX11-LABEL: fdiv_pow_shl_cnt_vec:
1163; GFX11:       ; %bb.0:
1164; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1165; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 20, v0
1166; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
1167; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, 0, 0
1168; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
1169; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
1170; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, 0, 0
1171; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
1172; GFX11-NEXT:    s_setpc_b64 s[30:31]
1173  %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
1174  %conv = uitofp <2 x i64> %shl to <2 x double>
1175  %mul = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %conv
1176  ret <2 x double> %mul
1177}
1178
; 1.0 / uitofp(1 << %cnt) with a <2 x i64> source and a <2 x float> result. The
; checks contain no i64->f32 conversion or divide expansion: for each lane the
; count register (v0 / v2) is shifted left by 23 and integer-subtracted from
; the 1.0 inline operand.
1179define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nounwind {
1180; VI-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
1181; VI:       ; %bb.0:
1182; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1183; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
1184; VI-NEXT:    v_lshlrev_b32_e32 v1, 23, v2
1185; VI-NEXT:    v_sub_u32_e32 v0, vcc, 1.0, v0
1186; VI-NEXT:    v_sub_u32_e32 v1, vcc, 1.0, v1
1187; VI-NEXT:    s_setpc_b64 s[30:31]
1188;
1189; GFX10-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
1190; GFX10:       ; %bb.0:
1191; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1192; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
1193; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 23, v2
1194; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 1.0, v0
1195; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 1.0, v1
1196; GFX10-NEXT:    s_setpc_b64 s[30:31]
1197;
1198; GFX11-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
1199; GFX11:       ; %bb.0:
1200; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1201; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
1202; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 23, v2
1203; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1204; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 1.0, v0
1205; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 1.0, v1
1206; GFX11-NEXT:    s_setpc_b64 s[30:31]
1207  %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
1208  %conv = uitofp <2 x i64> %shl to <2 x float>
1209  %mul = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %conv
1210  ret <2 x float> %mul
1211}
1212
; -9.0 / uitofp(8 << %cnt), where the shl carries no nuw/nsw flags. The checks
; expect the u64->f32 conversion followed by the full f32 divide expansion
; (v_div_scale / v_rcp / v_fma or v_fmac / v_div_fmas / v_div_fixup) against
; 0xc1100000, i.e. the fdiv is not replaced. NOTE(review): "maybe_z" presumably
; means the shifted value may be zero without nuw -- confirm.
1213define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
1214; VI-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
1215; VI:       ; %bb.0:
1216; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1217; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
1218; VI-NEXT:    s_mov_b32 s6, 0xc1100000
1219; VI-NEXT:    v_ffbh_u32_e32 v2, v1
1220; VI-NEXT:    v_min_u32_e32 v2, 32, v2
1221; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
1222; VI-NEXT:    v_min_u32_e32 v0, 1, v0
1223; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1224; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1225; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
1226; VI-NEXT:    v_ldexp_f32 v0, v0, v1
1227; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
1228; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
1229; VI-NEXT:    v_rcp_f32_e32 v3, v1
1230; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
1231; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
1232; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
1233; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
1234; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
1235; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
1236; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
1237; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
1238; VI-NEXT:    s_setpc_b64 s[30:31]
1239;
1240; GFX10-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
1241; GFX10:       ; %bb.0:
1242; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1243; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
1244; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
1245; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
1246; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
1247; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
1248; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
1249; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
1250; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
1251; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
1252; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0xc1100000
1253; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
1254; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
1255; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
1256; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
1257; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v2
1258; GFX10-NEXT:    v_fma_f32 v5, -v1, v4, v3
1259; GFX10-NEXT:    v_fmac_f32_e32 v4, v5, v2
1260; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
1261; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
1262; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
1263; GFX10-NEXT:    s_setpc_b64 s[30:31]
1264;
1265; GFX11-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
1266; GFX11:       ; %bb.0:
1267; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1268; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
1269; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1270; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
1271; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
1272; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1273; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
1274; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
1275; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1276; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
1277; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
1278; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
1279; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1280; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
1281; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0xc1100000
1282; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1283; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
1284; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1285; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
1286; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
1287; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
1288; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1289; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v2
1290; GFX11-NEXT:    v_fma_f32 v5, -v1, v4, v3
1291; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1292; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v2
1293; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
1294; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1295; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
1296; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
1297; GFX11-NEXT:    s_setpc_b64 s[30:31]
1298  %shl = shl i64 8, %cnt
1299  %conv = uitofp i64 %shl to float
1300  %mul = fdiv float -9.000000e+00, %conv
1301  ret float %mul
1302}
1303
; Negative test: -9.0 / sitofp(8 << %cnt). The shift carries no nuw/nsw flags
; and %cnt is not masked. The VI, GFX10 and GFX11 checks all show the full f32
; division expansion (v_div_scale / v_rcp / v_div_fmas / v_div_fixup) instead
; of an integer adjustment of the exponent field.
; 0xc1100000 in the checks is the f32 bit pattern of the -9.0 dividend.
; NOTE(review): going by the "fail_neg_int" name, the fold is presumably
; rejected because the signed conversion input may be negative -- confirm
; against the combine that implements the fold.
1304define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
1305; VI-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
1306; VI:       ; %bb.0:
1307; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1308; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
1309; VI-NEXT:    s_mov_b32 s6, 0xc1100000
1310; VI-NEXT:    v_xor_b32_e32 v2, v0, v1
1311; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
1312; VI-NEXT:    v_ffbh_i32_e32 v3, v1
1313; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
1314; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
1315; VI-NEXT:    v_min_u32_e32 v2, v3, v2
1316; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
1317; VI-NEXT:    v_min_u32_e32 v0, 1, v0
1318; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1319; VI-NEXT:    v_cvt_f32_i32_e32 v0, v0
1320; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
1321; VI-NEXT:    v_ldexp_f32 v0, v0, v1
1322; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
1323; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
1324; VI-NEXT:    v_rcp_f32_e32 v3, v1
1325; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
1326; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
1327; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
1328; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
1329; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
1330; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
1331; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
1332; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
1333; VI-NEXT:    s_setpc_b64 s[30:31]
1334;
1335; GFX10-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
1336; GFX10:       ; %bb.0:
1337; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1338; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
1339; GFX10-NEXT:    v_xor_b32_e32 v2, v0, v1
1340; GFX10-NEXT:    v_ffbh_i32_e32 v3, v1
1341; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
1342; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
1343; GFX10-NEXT:    v_add_nc_u32_e32 v2, 32, v2
1344; GFX10-NEXT:    v_min_u32_e32 v2, v3, v2
1345; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
1346; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
1347; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
1348; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
1349; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
1350; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
1351; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0xc1100000
1352; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
1353; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
1354; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
1355; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
1356; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v2
1357; GFX10-NEXT:    v_fma_f32 v5, -v1, v4, v3
1358; GFX10-NEXT:    v_fmac_f32_e32 v4, v5, v2
1359; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
1360; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
1361; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
1362; GFX10-NEXT:    s_setpc_b64 s[30:31]
1363;
1364; GFX11-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
1365; GFX11:       ; %bb.0:
1366; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1367; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
1368; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1369; GFX11-NEXT:    v_xor_b32_e32 v2, v0, v1
1370; GFX11-NEXT:    v_cls_i32_e32 v3, v1
1371; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
1372; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1373; GFX11-NEXT:    v_add_nc_u32_e32 v3, -1, v3
1374; GFX11-NEXT:    v_add_nc_u32_e32 v2, 32, v2
1375; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1376; GFX11-NEXT:    v_min_u32_e32 v2, v3, v2
1377; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
1378; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1379; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
1380; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
1381; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
1382; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1383; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
1384; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
1385; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1386; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0xc1100000
1387; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
1388; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1389; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
1390; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1391; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
1392; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
1393; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v2
1394; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1395; GFX11-NEXT:    v_fma_f32 v5, -v1, v4, v3
1396; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v2
1397; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1398; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
1399; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
1400; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1401; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
1402; GFX11-NEXT:    s_setpc_b64 s[30:31]
1403  %shl = shl i64 8, %cnt
1404  %conv = sitofp i64 %shl to float
1405  %mul = fdiv float -9.000000e+00, %conv
1406  ret float %mul
1407}
1408
; Positive test: -0.5 / sitofp(8 << (%cnt_in & 31)). With the shift count
; masked to [0, 31], the checks show the division replaced by integer
; arithmetic on the f32 bit pattern: the masked count is shifted left by 23
; (into the f32 exponent field) and subtracted from 0xbd800000, which is the
; bit pattern of -0.0625 (= -0.5 / 8).
1409define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
1410; VI-LABEL: fdiv_pow_shl_cnt:
1411; VI:       ; %bb.0:
1412; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1413; VI-NEXT:    v_and_b32_e32 v0, 31, v0
1414; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
1415; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0xbd800000, v0
1416; VI-NEXT:    s_setpc_b64 s[30:31]
1417;
1418; GFX10-LABEL: fdiv_pow_shl_cnt:
1419; GFX10:       ; %bb.0:
1420; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1421; GFX10-NEXT:    v_and_b32_e32 v0, 31, v0
1422; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
1423; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0xbd800000, v0
1424; GFX10-NEXT:    s_setpc_b64 s[30:31]
1425;
1426; GFX11-LABEL: fdiv_pow_shl_cnt:
1427; GFX11:       ; %bb.0:
1428; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1429; GFX11-NEXT:    v_and_b32_e32 v0, 31, v0
1430; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1431; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
1432; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0xbd800000, v0
1433; GFX11-NEXT:    s_setpc_b64 s[30:31]
1434  %cnt = and i64 %cnt_in, 31
1435  %shl = shl i64 8, %cnt
1436  %conv = sitofp i64 %shl to float
1437  %mul = fdiv float -0.500000e+00, %conv
1438  ret float %mul
1439}
1440
; Negative test: 0xH7000 (8192.0) / uitofp(1 << %cnt), computed in half with an
; i32 shift count. The checks show the generic f16 division sequence (an f32
; reciprocal with refinement steps, then v_div_fixup_f16), i.e. the division
; is not folded into integer arithmetic.
; 0x46000000 in the checks is 8192.0 as f32; 0x7000 is the same value as f16.
; NOTE(review): presumably rejected because a 32-bit power of two can exceed
; what half can represent -- confirm against the combine. Compare
; fdiv_pow_shl_cnt_in_bounds, which uses the same dividend with an i16 count
; and is folded.
1441define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
1442; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
1443; VI:       ; %bb.0:
1444; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1445; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
1446; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1447; VI-NEXT:    s_mov_b32 s4, 0x46000000
1448; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1449; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1450; VI-NEXT:    v_rcp_f32_e32 v2, v1
1451; VI-NEXT:    v_mul_f32_e32 v3, 0x46000000, v2
1452; VI-NEXT:    v_mad_f32 v4, -v1, v3, s4
1453; VI-NEXT:    v_mac_f32_e32 v3, v4, v2
1454; VI-NEXT:    v_mad_f32 v1, -v1, v3, s4
1455; VI-NEXT:    v_mul_f32_e32 v1, v1, v2
1456; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
1457; VI-NEXT:    v_add_f32_e32 v1, v1, v3
1458; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1459; VI-NEXT:    s_movk_i32 s4, 0x7000
1460; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, s4
1461; VI-NEXT:    s_setpc_b64 s[30:31]
1462;
1463; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
1464; GFX10:       ; %bb.0:
1465; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1466; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
1467; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
1468; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
1469; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
1470; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
1471; GFX10-NEXT:    v_mul_f32_e32 v3, 0x46000000, v2
1472; GFX10-NEXT:    v_mad_f32 v4, -v1, v3, 0x46000000
1473; GFX10-NEXT:    v_mac_f32_e32 v3, v4, v2
1474; GFX10-NEXT:    v_mad_f32 v1, -v1, v3, 0x46000000
1475; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
1476; GFX10-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
1477; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
1478; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
1479; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
1480; GFX10-NEXT:    s_setpc_b64 s[30:31]
1481;
1482; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
1483; GFX11:       ; %bb.0:
1484; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1485; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
1486; GFX11-NEXT:    s_mov_b32 s0, 0x46000000
1487; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1488; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
1489; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
1490; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1491; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
1492; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
1493; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1494; GFX11-NEXT:    v_mul_f32_e32 v2, 0x46000000, v1
1495; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1496; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
1497; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v1
1498; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1499; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
1500; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
1501; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1502; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
1503; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
1504; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1505; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
1506; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
1507; GFX11-NEXT:    s_setpc_b64 s[30:31]
1508  %shl = shl nuw i32 1, %cnt
1509  %conv = uitofp i32 %shl to half
1510  %mul = fdiv half 0xH7000, %conv
1511  ret half %mul
1512}
1513
; Positive test: 0xH7000 (8192.0) / uitofp(1 << %cnt) in half with an i16
; shift count. The checks show the division reduced to two 16-bit integer
; operations: the count is shifted left by 10 (into the f16 exponent field)
; and subtracted from the dividend's bit pattern 0x7000.
1514define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
1515; VI-LABEL: fdiv_pow_shl_cnt_in_bounds:
1516; VI:       ; %bb.0:
1517; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1518; VI-NEXT:    v_lshlrev_b16_e32 v0, 10, v0
1519; VI-NEXT:    v_sub_u16_e32 v0, 0x7000, v0
1520; VI-NEXT:    s_setpc_b64 s[30:31]
1521;
1522; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds:
1523; GFX10:       ; %bb.0:
1524; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1525; GFX10-NEXT:    v_lshlrev_b16 v0, 10, v0
1526; GFX10-NEXT:    v_sub_nc_u16 v0, 0x7000, v0
1527; GFX10-NEXT:    s_setpc_b64 s[30:31]
1528;
1529; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds:
1530; GFX11:       ; %bb.0:
1531; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1532; GFX11-NEXT:    v_lshlrev_b16 v0, 10, v0
1533; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1534; GFX11-NEXT:    v_sub_nc_u16 v0, 0x7000, v0
1535; GFX11-NEXT:    s_setpc_b64 s[30:31]
1536  %shl = shl nuw i16 1, %cnt
1537  %conv = uitofp i16 %shl to half
1538  %mul = fdiv half 0xH7000, %conv
1539  ret half %mul
1540}
1541
; Positive test: 0xH4800 (8.0) / uitofp(1 << %cnt) in half with an i16 shift
; count. As the checks show, the division is still folded with this smaller
; dividend: the count is shifted left by 10 (into the f16 exponent field) and
; subtracted from the dividend's bit pattern 0x4800.
1542define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind {
1543; VI-LABEL: fdiv_pow_shl_cnt_in_bounds2:
1544; VI:       ; %bb.0:
1545; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1546; VI-NEXT:    v_lshlrev_b16_e32 v0, 10, v0
1547; VI-NEXT:    v_sub_u16_e32 v0, 0x4800, v0
1548; VI-NEXT:    s_setpc_b64 s[30:31]
1549;
1550; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds2:
1551; GFX10:       ; %bb.0:
1552; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1553; GFX10-NEXT:    v_lshlrev_b16 v0, 10, v0
1554; GFX10-NEXT:    v_sub_nc_u16 v0, 0x4800, v0
1555; GFX10-NEXT:    s_setpc_b64 s[30:31]
1556;
1557; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds2:
1558; GFX11:       ; %bb.0:
1559; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1560; GFX11-NEXT:    v_lshlrev_b16 v0, 10, v0
1561; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1562; GFX11-NEXT:    v_sub_nc_u16 v0, 0x4800, v0
1563; GFX11-NEXT:    s_setpc_b64 s[30:31]
1564  %shl = shl nuw i16 1, %cnt
1565  %conv = uitofp i16 %shl to half
1566  %mul = fdiv half 0xH4800, %conv
1567  ret half %mul
1568}
1569
; Negative test: 0xH4000 (2.0) / uitofp(1 << %cnt) in half with an i16 shift
; count. Unlike fdiv_pow_shl_cnt_in_bounds2 (dividend 8.0), the checks here
; show the generic f16 division sequence (f32 reciprocal with refinement,
; then v_div_fixup_f16 against 2.0), i.e. no fold.
; NOTE(review): presumably the dividend's exponent is too small for the
; exponent subtraction to be provably safe across all 16-bit counts --
; confirm against the combine's bounds check.
1570define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
1571; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
1572; VI:       ; %bb.0:
1573; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1574; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 1
1575; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
1576; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1577; VI-NEXT:    v_rcp_f32_e32 v2, v1
1578; VI-NEXT:    v_add_f32_e32 v3, v2, v2
1579; VI-NEXT:    v_mad_f32 v4, -v1, v3, 2.0
1580; VI-NEXT:    v_mac_f32_e32 v3, v4, v2
1581; VI-NEXT:    v_mad_f32 v1, -v1, v3, 2.0
1582; VI-NEXT:    v_mul_f32_e32 v1, v1, v2
1583; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
1584; VI-NEXT:    v_add_f32_e32 v1, v1, v3
1585; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1586; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
1587; VI-NEXT:    s_setpc_b64 s[30:31]
1588;
1589; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
1590; GFX10:       ; %bb.0:
1591; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1592; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 1
1593; GFX10-NEXT:    v_cvt_f16_u16_e32 v0, v0
1594; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
1595; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
1596; GFX10-NEXT:    v_add_f32_e32 v3, v2, v2
1597; GFX10-NEXT:    v_mad_f32 v4, -v1, v3, 2.0
1598; GFX10-NEXT:    v_mac_f32_e32 v3, v4, v2
1599; GFX10-NEXT:    v_mad_f32 v1, -v1, v3, 2.0
1600; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
1601; GFX10-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
1602; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
1603; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
1604; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
1605; GFX10-NEXT:    s_setpc_b64 s[30:31]
1606;
1607; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
1608; GFX11:       ; %bb.0:
1609; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1610; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
1611; GFX11-NEXT:    s_mov_b32 s0, 2.0
1612; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1613; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
1614; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
1615; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1616; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
1617; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1618; GFX11-NEXT:    v_add_f32_e32 v2, v1, v1
1619; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
1620; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1621; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v1
1622; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
1623; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1624; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
1625; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
1626; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1627; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
1628; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
1629; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1630; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
1631; GFX11-NEXT:    s_setpc_b64 s[30:31]
1632  %shl = shl nuw i16 1, %cnt
1633  %conv = uitofp i16 %shl to half
1634  %mul = fdiv half 0xH4000, %conv
1635  ret half %mul
1636}
1637
; Positive test: double 0x36A0000000000000 / uitofp(1 << %cnt) with an i32
; shift count. The checks show the division folded into a 64-bit integer
; subtraction on the f64 bit pattern, split across two 32-bit halves: the
; count is shifted left by 20 (the exponent position within the high word),
; a 0 - 0 subtract produces the borrow for the low word, and the shifted count
; is subtracted with borrow from the dividend's high word 0x36a00000. The low
; word of the result is the constant 0.
1638define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
1639; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
1640; VI:       ; %bb.0:
1641; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1642; VI-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
1643; VI-NEXT:    v_mov_b32_e32 v1, 0x36a00000
1644; VI-NEXT:    v_sub_u32_e64 v2, vcc, 0, 0
1645; VI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v0, vcc
1646; VI-NEXT:    v_mov_b32_e32 v0, 0
1647; VI-NEXT:    s_setpc_b64 s[30:31]
1648;
1649; GFX10-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
1650; GFX10:       ; %bb.0:
1651; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1652; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
1653; GFX10-NEXT:    v_sub_co_u32 v1, vcc_lo, 0, 0
1654; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
1655; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1656; GFX10-NEXT:    s_setpc_b64 s[30:31]
1657;
1658; GFX11-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
1659; GFX11:       ; %bb.0:
1660; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1661; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
1662; GFX11-NEXT:    v_sub_co_u32 v1, vcc_lo, 0, 0
1663; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1664; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
1665; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1666; GFX11-NEXT:    s_setpc_b64 s[30:31]
1667  %shl = shl nuw i32 1, %cnt
1668  %conv = uitofp i32 %shl to double
1669  %mul = fdiv double 0x36A0000000000000, %conv
1670  ret double %mul
1671}
1672
; Negative test: float 0x3a1fffff00000000 / uitofp(1 << %cnt) with an i32
; shift count. The dividend appears in the checks as the f32 bit pattern
; 0x10fffff8, slightly below the 0x11000000 dividend that
; fdiv_pow_shl_cnt32_okay uses. Here the checks show the full f32 division
; expansion (v_div_scale / v_rcp / v_div_fmas / v_div_fixup), i.e. no fold.
; NOTE(review): presumably this dividend sits just under the threshold the
; combine requires to rule out leaving the normal range -- confirm.
1673define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
1674; VI-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
1675; VI:       ; %bb.0:
1676; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1677; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
1678; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1679; VI-NEXT:    s_mov_b32 s6, 0x10fffff8
1680; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
1681; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
1682; VI-NEXT:    v_rcp_f32_e32 v3, v1
1683; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
1684; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
1685; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
1686; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
1687; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
1688; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
1689; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
1690; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
1691; VI-NEXT:    s_setpc_b64 s[30:31]
1692;
1693; GFX10-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
1694; GFX10:       ; %bb.0:
1695; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1696; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
1697; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
1698; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0x10fffff8
1699; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
1700; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
1701; GFX10-NEXT:    v_fmac_f32_e32 v2, v3, v2
1702; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
1703; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v2
1704; GFX10-NEXT:    v_fma_f32 v5, -v1, v4, v3
1705; GFX10-NEXT:    v_fmac_f32_e32 v4, v5, v2
1706; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
1707; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
1708; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x10fffff8
1709; GFX10-NEXT:    s_setpc_b64 s[30:31]
1710;
1711; GFX11-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
1712; GFX11:       ; %bb.0:
1713; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1714; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
1715; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1716; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
1717; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0x10fffff8
1718; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1719; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
1720; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1721; GFX11-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
1722; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v2
1723; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
1724; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1725; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v2
1726; GFX11-NEXT:    v_fma_f32 v5, -v1, v4, v3
1727; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1728; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v2
1729; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
1730; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1731; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
1732; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x10fffff8
1733; GFX11-NEXT:    s_setpc_b64 s[30:31]
1734  %shl = shl nuw i32 1, %cnt
1735  %conv = uitofp i32 %shl to float
1736  %mul = fdiv float 0x3a1fffff00000000, %conv
1737  ret float %mul
1738}
1739
; Positive test: float 0x3a20000000000000 / uitofp(1 << %cnt) with an i32
; shift count. The dividend appears in the checks as the f32 bit pattern
; 0x11000000. The checks show the division folded into integer arithmetic:
; the count is shifted left by 23 (into the f32 exponent field) and subtracted
; from 0x11000000. The sibling fdiv_pow_shl_cnt32_out_of_bounds2 uses a
; slightly smaller dividend and is not folded.
1740define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
1741; VI-LABEL: fdiv_pow_shl_cnt32_okay:
1742; VI:       ; %bb.0:
1743; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1744; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
1745; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0x11000000, v0
1746; VI-NEXT:    s_setpc_b64 s[30:31]
1747;
1748; GFX10-LABEL: fdiv_pow_shl_cnt32_okay:
1749; GFX10:       ; %bb.0:
1750; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
1752; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x11000000, v0
1753; GFX10-NEXT:    s_setpc_b64 s[30:31]
1754;
1755; GFX11-LABEL: fdiv_pow_shl_cnt32_okay:
1756; GFX11:       ; %bb.0:
1757; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1758; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
1759; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1760; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x11000000, v0
1761; GFX11-NEXT:    s_setpc_b64 s[30:31]
1762  %shl = shl nuw i32 1, %cnt
1763  %conv = uitofp i32 %shl to float
1764  %mul = fdiv float 0x3a20000000000000, %conv
1765  ret float %mul
1766}
1767