xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7
8; Make sure fdiv is promoted to f32.
9
; v_fdiv_f16 computes r[tid] = a[tid] / b[tid] on half values, where tid is the
; x workitem id and both operands are read through volatile loads.
; Lowering recorded by the autogenerated checks below.
; On tahiti, both operands are converted to f32 and divided with the
; v_div_scale_f32 / v_rcp_f32 / v_fma_f32 / v_div_fmas_f32 / v_div_fixup_f32
; sequence, and the quotient is converted back with v_cvt_f16_f32.
; On fiji, gfx900, gfx1010 and gfx1100, the converted denominator goes through
; v_rcp_f32 followed by an f32 mad/fma correction sequence, and the result is
; converted to f16 and passed to v_div_fixup_f16.
10define amdgpu_kernel void @v_fdiv_f16(
11; SI-LABEL: v_fdiv_f16:
12; SI:       ; %bb.0: ; %entry
13; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
14; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
15; SI-NEXT:    s_mov_b32 s7, 0xf000
16; SI-NEXT:    s_mov_b32 s6, 0
17; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
18; SI-NEXT:    s_waitcnt lgkmcnt(0)
19; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
20; SI-NEXT:    v_mov_b32_e32 v1, 0
21; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
22; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
23; SI-NEXT:    s_waitcnt vmcnt(0)
24; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc
25; SI-NEXT:    s_waitcnt vmcnt(0)
26; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
27; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
28; SI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, v2
29; SI-NEXT:    v_rcp_f32_e32 v5, v4
30; SI-NEXT:    v_div_scale_f32 v6, vcc, v2, v3, v2
31; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
32; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
33; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
34; SI-NEXT:    v_mul_f32_e32 v7, v6, v5
35; SI-NEXT:    v_fma_f32 v8, -v4, v7, v6
36; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
37; SI-NEXT:    v_fma_f32 v4, -v4, v7, v6
38; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
39; SI-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
40; SI-NEXT:    v_div_fixup_f32 v2, v4, v3, v2
41; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
42; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
43; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
44; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
45; SI-NEXT:    s_endpgm
46;
47; GFX8-LABEL: v_fdiv_f16:
48; GFX8:       ; %bb.0: ; %entry
49; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
50; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
51; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
52; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX8-NEXT:    v_mov_b32_e32 v1, s3
54; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
55; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
56; GFX8-NEXT:    v_mov_b32_e32 v3, s5
57; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
58; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
59; GFX8-NEXT:    flat_load_ushort v5, v[0:1] glc
60; GFX8-NEXT:    s_waitcnt vmcnt(0)
61; GFX8-NEXT:    flat_load_ushort v2, v[2:3] glc
62; GFX8-NEXT:    s_waitcnt vmcnt(0)
63; GFX8-NEXT:    v_mov_b32_e32 v6, s1
64; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v5
65; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v2
66; GFX8-NEXT:    v_rcp_f32_e32 v3, v0
67; GFX8-NEXT:    v_mul_f32_e32 v7, v1, v3
68; GFX8-NEXT:    v_mad_f32 v8, -v0, v7, v1
69; GFX8-NEXT:    v_mac_f32_e32 v7, v8, v3
70; GFX8-NEXT:    v_mad_f32 v0, -v0, v7, v1
71; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v3
72; GFX8-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
73; GFX8-NEXT:    v_add_f32_e32 v0, v0, v7
74; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v0
75; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
76; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v6, vcc
77; GFX8-NEXT:    v_div_fixup_f16 v2, v3, v2, v5
78; GFX8-NEXT:    flat_store_short v[0:1], v2
79; GFX8-NEXT:    s_endpgm
80;
81; GFX9-LABEL: v_fdiv_f16:
82; GFX9:       ; %bb.0: ; %entry
83; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
84; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
85; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
86; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
87; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
88; GFX9-NEXT:    s_waitcnt vmcnt(0)
89; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] glc
90; GFX9-NEXT:    s_waitcnt vmcnt(0)
91; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v1
92; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
93; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
94; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v3
95; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
96; GFX9-NEXT:    v_mac_f32_e32 v4, v5, v3
97; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
98; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
99; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
100; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
101; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
102; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
103; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
104; GFX9-NEXT:    s_endpgm
105;
106; GFX10-LABEL: v_fdiv_f16:
107; GFX10:       ; %bb.0: ; %entry
108; GFX10-NEXT:    s_clause 0x1
109; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
110; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
111; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
112; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
113; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
114; GFX10-NEXT:    s_waitcnt vmcnt(0)
115; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
116; GFX10-NEXT:    s_waitcnt vmcnt(0)
117; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
118; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v2
119; GFX10-NEXT:    v_rcp_f32_e32 v4, v3
120; GFX10-NEXT:    v_mul_f32_e32 v6, v5, v4
121; GFX10-NEXT:    v_mad_f32 v7, -v3, v6, v5
122; GFX10-NEXT:    v_mac_f32_e32 v6, v7, v4
123; GFX10-NEXT:    v_mad_f32 v3, -v3, v6, v5
124; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
125; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
126; GFX10-NEXT:    v_add_f32_e32 v3, v3, v6
127; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
128; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
129; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
130; GFX10-NEXT:    s_endpgm
131;
132; GFX11-LABEL: v_fdiv_f16:
133; GFX11:       ; %bb.0: ; %entry
134; GFX11-NEXT:    s_clause 0x1
135; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
136; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
137; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
138; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
139; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
140; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
141; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
142; GFX11-NEXT:    s_waitcnt vmcnt(0)
143; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] glc dlc
144; GFX11-NEXT:    s_waitcnt vmcnt(0)
145; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v1
146; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v2
147; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
148; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
149; GFX11-NEXT:    s_waitcnt_depctr 0xfff
150; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v3
151; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
152; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
153; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v3
154; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
155; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
156; GFX11-NEXT:    v_mul_f32_e32 v3, v5, v3
157; GFX11-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
158; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
159; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
160; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
161; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
162; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
163; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
164; GFX11-NEXT:    s_endpgm
165    ptr addrspace(1) %r,
166    ptr addrspace(1) %a,
167    ptr addrspace(1) %b) #0 {
168entry:
169  %tid = call i32 @llvm.amdgcn.workitem.id.x()
170  %tid.ext = sext i32 %tid to i64
171  %gep.a = getelementptr inbounds half, ptr addrspace(1) %a, i64 %tid.ext
172  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
173  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
174  %a.val = load volatile half, ptr addrspace(1) %gep.a
175  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; The division under test; it carries no fast-math flags.
176  %r.val = fdiv half %a.val, %b.val
177  store half %r.val, ptr addrspace(1) %gep.r
178  ret void
179}
180
; v_rcp_f16 computes r[tid] = 1.0 / b[tid] on half values with no fast-math
; flags; b is read through a volatile load.
; Lowering recorded by the autogenerated checks below.
; On tahiti, the operand is converted to f32 and the full f32 division
; expansion is emitted with a 1.0 numerator.
; On fiji, gfx900, gfx1010 and gfx1100, a single v_rcp_f16 is selected.
181define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
182; SI-LABEL: v_rcp_f16:
183; SI:       ; %bb.0: ; %entry
184; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
185; SI-NEXT:    s_mov_b32 s7, 0xf000
186; SI-NEXT:    s_mov_b32 s6, 0
187; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
188; SI-NEXT:    v_mov_b32_e32 v1, 0
189; SI-NEXT:    s_waitcnt lgkmcnt(0)
190; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
191; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
192; SI-NEXT:    s_waitcnt vmcnt(0)
193; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
194; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
195; SI-NEXT:    v_rcp_f32_e32 v4, v3
196; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
197; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
198; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
199; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
200; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
201; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
202; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
203; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
204; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
205; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
206; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
207; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
208; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
209; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
210; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
211; SI-NEXT:    s_endpgm
212;
213; GFX8-LABEL: v_rcp_f16:
214; GFX8:       ; %bb.0: ; %entry
215; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
216; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
217; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX8-NEXT:    v_mov_b32_e32 v1, s3
219; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
220; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
221; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
222; GFX8-NEXT:    s_waitcnt vmcnt(0)
223; GFX8-NEXT:    v_mov_b32_e32 v1, s1
224; GFX8-NEXT:    v_rcp_f16_e32 v3, v0
225; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
226; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
227; GFX8-NEXT:    flat_store_short v[0:1], v3
228; GFX8-NEXT:    s_endpgm
229;
230; GFX9-LABEL: v_rcp_f16:
231; GFX9:       ; %bb.0: ; %entry
232; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
233; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
234; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
235; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
236; GFX9-NEXT:    s_waitcnt vmcnt(0)
237; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
238; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
239; GFX9-NEXT:    s_endpgm
240;
241; GFX10-LABEL: v_rcp_f16:
242; GFX10:       ; %bb.0: ; %entry
243; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
244; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
245; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
246; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
247; GFX10-NEXT:    s_waitcnt vmcnt(0)
248; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
249; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
250; GFX10-NEXT:    s_endpgm
251;
252; GFX11-LABEL: v_rcp_f16:
253; GFX11:       ; %bb.0: ; %entry
254; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
255; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
256; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
257; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
258; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
260; GFX11-NEXT:    s_waitcnt vmcnt(0)
261; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
262; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
263; GFX11-NEXT:    s_endpgm
264entry:
265  %tid = call i32 @llvm.amdgcn.workitem.id.x()
266  %tid.ext = sext i32 %tid to i64
267  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
268  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
269  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; Reciprocal written as a division by a constant 1.0 numerator.
270  %r.val = fdiv half 1.0, %b.val
271  store half %r.val, ptr addrspace(1) %gep.r
272  ret void
273}
274
; v_rcp_f16_abs computes r[tid] = 1.0 / fabs(b[tid]) on half values; b is read
; through a volatile load.
; The checks below expect the fabs to fold into a |v| source modifier.
; On tahiti, the modifier lands on v_cvt_f32_f16_e64, ahead of the f32
; division expansion.
; On fiji, gfx900, gfx1010 and gfx1100, it lands on a single v_rcp_f16_e64.
275define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
276; SI-LABEL: v_rcp_f16_abs:
277; SI:       ; %bb.0: ; %entry
278; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
279; SI-NEXT:    s_mov_b32 s7, 0xf000
280; SI-NEXT:    s_mov_b32 s6, 0
281; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
282; SI-NEXT:    v_mov_b32_e32 v1, 0
283; SI-NEXT:    s_waitcnt lgkmcnt(0)
284; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
285; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
286; SI-NEXT:    s_waitcnt vmcnt(0)
287; SI-NEXT:    v_cvt_f32_f16_e64 v2, |v2|
288; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
289; SI-NEXT:    v_rcp_f32_e32 v4, v3
290; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
291; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
292; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
293; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
294; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
295; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
296; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
297; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
298; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
299; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
300; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
301; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
302; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
303; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
304; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
305; SI-NEXT:    s_endpgm
306;
307; GFX8-LABEL: v_rcp_f16_abs:
308; GFX8:       ; %bb.0: ; %entry
309; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
310; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
311; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX8-NEXT:    v_mov_b32_e32 v1, s3
313; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
314; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
315; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
316; GFX8-NEXT:    s_waitcnt vmcnt(0)
317; GFX8-NEXT:    v_mov_b32_e32 v1, s1
318; GFX8-NEXT:    v_rcp_f16_e64 v3, |v0|
319; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
320; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
321; GFX8-NEXT:    flat_store_short v[0:1], v3
322; GFX8-NEXT:    s_endpgm
323;
324; GFX9-LABEL: v_rcp_f16_abs:
325; GFX9:       ; %bb.0: ; %entry
326; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
327; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
328; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
329; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
330; GFX9-NEXT:    s_waitcnt vmcnt(0)
331; GFX9-NEXT:    v_rcp_f16_e64 v1, |v1|
332; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
333; GFX9-NEXT:    s_endpgm
334;
335; GFX10-LABEL: v_rcp_f16_abs:
336; GFX10:       ; %bb.0: ; %entry
337; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
338; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
339; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
341; GFX10-NEXT:    s_waitcnt vmcnt(0)
342; GFX10-NEXT:    v_rcp_f16_e64 v1, |v1|
343; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
344; GFX10-NEXT:    s_endpgm
345;
346; GFX11-LABEL: v_rcp_f16_abs:
347; GFX11:       ; %bb.0: ; %entry
348; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
349; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
350; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
351; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
352; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
354; GFX11-NEXT:    s_waitcnt vmcnt(0)
355; GFX11-NEXT:    v_rcp_f16_e64 v1, |v1|
356; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
357; GFX11-NEXT:    s_endpgm
358entry:
359  %tid = call i32 @llvm.amdgcn.workitem.id.x()
360  %tid.ext = sext i32 %tid to i64
361  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
362  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
363  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; The fabs feeds the denominator of the reciprocal directly.
364  %b.abs = call half @llvm.fabs.f16(half %b.val)
365  %r.val = fdiv half 1.0, %b.abs
366  store half %r.val, ptr addrspace(1) %gep.r
367  ret void
368}
369
370; We cannot lower 1/b to rcp_f32(b) when !fpmath requires accuracy better than 1 ulp.
371
; reciprocal_f16_rounded computes r[tid] = 1.0 / b[tid] on half values; b is
; read through a volatile load.
; The IR body is the same as @v_rcp_f16 (a plain fdiv with no fast-math
; flags), and the checks below record the same lowering. On tahiti that is the
; full f32 division expansion; on fiji, gfx900, gfx1010 and gfx1100 it is a
; single v_rcp_f16.
; NOTE(review): no !fpmath metadata is attached to the fdiv in this function,
; although the comment preceding it talks about !fpmath. Confirm whether the
; metadata was dropped intentionally.
372define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
373; SI-LABEL: reciprocal_f16_rounded:
374; SI:       ; %bb.0: ; %entry
375; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
376; SI-NEXT:    s_mov_b32 s7, 0xf000
377; SI-NEXT:    s_mov_b32 s6, 0
378; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
379; SI-NEXT:    v_mov_b32_e32 v1, 0
380; SI-NEXT:    s_waitcnt lgkmcnt(0)
381; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
382; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
383; SI-NEXT:    s_waitcnt vmcnt(0)
384; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
385; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
386; SI-NEXT:    v_rcp_f32_e32 v4, v3
387; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
388; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
389; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
390; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
391; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
392; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
393; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
394; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
395; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
396; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
397; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
398; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
399; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
400; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
401; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
402; SI-NEXT:    s_endpgm
403;
404; GFX8-LABEL: reciprocal_f16_rounded:
405; GFX8:       ; %bb.0: ; %entry
406; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
407; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
408; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
409; GFX8-NEXT:    v_mov_b32_e32 v1, s3
410; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
411; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
412; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
413; GFX8-NEXT:    s_waitcnt vmcnt(0)
414; GFX8-NEXT:    v_mov_b32_e32 v1, s1
415; GFX8-NEXT:    v_rcp_f16_e32 v3, v0
416; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
417; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
418; GFX8-NEXT:    flat_store_short v[0:1], v3
419; GFX8-NEXT:    s_endpgm
420;
421; GFX9-LABEL: reciprocal_f16_rounded:
422; GFX9:       ; %bb.0: ; %entry
423; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
424; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
425; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
426; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
427; GFX9-NEXT:    s_waitcnt vmcnt(0)
428; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
429; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
430; GFX9-NEXT:    s_endpgm
431;
432; GFX10-LABEL: reciprocal_f16_rounded:
433; GFX10:       ; %bb.0: ; %entry
434; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
435; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
436; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
438; GFX10-NEXT:    s_waitcnt vmcnt(0)
439; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
440; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
441; GFX10-NEXT:    s_endpgm
442;
443; GFX11-LABEL: reciprocal_f16_rounded:
444; GFX11:       ; %bb.0: ; %entry
445; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
446; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
447; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
448; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
449; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
451; GFX11-NEXT:    s_waitcnt vmcnt(0)
452; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
453; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
454; GFX11-NEXT:    s_endpgm
455entry:
456  %tid = call i32 @llvm.amdgcn.workitem.id.x()
457  %tid.ext = sext i32 %tid to i64
458  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
459  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
460  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; Plain fdiv: no fast-math flags and no metadata on this instruction.
461  %r.val = fdiv half 1.0, %b.val
462  store half %r.val, ptr addrspace(1) %gep.r
463  ret void
464}
465
; v_rcp_f16_afn computes r[tid] = 1.0 / b[tid] on half values with the afn
; fast-math flag on the fdiv; b is read through a volatile load.
; Lowering recorded by the autogenerated checks below.
; On tahiti, the division becomes v_cvt_f32_f16, a bare v_rcp_f32 and
; v_cvt_f16_f32, with no div_scale / div_fmas / div_fixup sequence.
; On fiji, gfx900, gfx1010 and gfx1100, a single v_rcp_f16 is selected.
466define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
467; SI-LABEL: v_rcp_f16_afn:
468; SI:       ; %bb.0: ; %entry
469; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
470; SI-NEXT:    s_mov_b32 s7, 0xf000
471; SI-NEXT:    s_mov_b32 s6, 0
472; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
473; SI-NEXT:    v_mov_b32_e32 v1, 0
474; SI-NEXT:    s_waitcnt lgkmcnt(0)
475; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
476; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
477; SI-NEXT:    s_waitcnt vmcnt(0)
478; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
479; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
480; SI-NEXT:    v_rcp_f32_e32 v2, v2
481; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
482; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
483; SI-NEXT:    s_endpgm
484;
485; GFX8-LABEL: v_rcp_f16_afn:
486; GFX8:       ; %bb.0: ; %entry
487; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
488; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
489; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX8-NEXT:    v_mov_b32_e32 v1, s3
491; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
492; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
493; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
494; GFX8-NEXT:    s_waitcnt vmcnt(0)
495; GFX8-NEXT:    v_mov_b32_e32 v1, s1
496; GFX8-NEXT:    v_rcp_f16_e32 v3, v0
497; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
498; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
499; GFX8-NEXT:    flat_store_short v[0:1], v3
500; GFX8-NEXT:    s_endpgm
501;
502; GFX9-LABEL: v_rcp_f16_afn:
503; GFX9:       ; %bb.0: ; %entry
504; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
505; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
506; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
507; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
508; GFX9-NEXT:    s_waitcnt vmcnt(0)
509; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
510; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
511; GFX9-NEXT:    s_endpgm
512;
513; GFX10-LABEL: v_rcp_f16_afn:
514; GFX10:       ; %bb.0: ; %entry
515; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
516; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
517; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
519; GFX10-NEXT:    s_waitcnt vmcnt(0)
520; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
521; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
522; GFX10-NEXT:    s_endpgm
523;
524; GFX11-LABEL: v_rcp_f16_afn:
525; GFX11:       ; %bb.0: ; %entry
526; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
527; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
528; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
529; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
530; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
531; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
532; GFX11-NEXT:    s_waitcnt vmcnt(0)
533; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
534; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
535; GFX11-NEXT:    s_endpgm
536entry:
537  %tid = call i32 @llvm.amdgcn.workitem.id.x()
538  %tid.ext = sext i32 %tid to i64
539  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
540  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
541  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; afn is the only fast-math flag on this fdiv.
542  %r.val = fdiv afn half 1.0, %b.val
543  store half %r.val, ptr addrspace(1) %gep.r
544  ret void
545}
546
; v_rcp_f16_neg computes r[tid] = -1.0 / b[tid] on half values with no
; fast-math flags; b is read through a volatile load.
; Lowering recorded by the autogenerated checks below.
; On tahiti, the full f32 division expansion is emitted with -1.0 as the
; numerator operand of v_div_scale_f32 and v_div_fixup_f32.
; On fiji, gfx900, gfx1010 and gfx1100, a single v_rcp_f16_e64 is selected
; with a negated source operand (-v).
547define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
548; SI-LABEL: v_rcp_f16_neg:
549; SI:       ; %bb.0: ; %entry
550; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
551; SI-NEXT:    s_mov_b32 s7, 0xf000
552; SI-NEXT:    s_mov_b32 s6, 0
553; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
554; SI-NEXT:    v_mov_b32_e32 v1, 0
555; SI-NEXT:    s_waitcnt lgkmcnt(0)
556; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
557; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
558; SI-NEXT:    s_waitcnt vmcnt(0)
559; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
560; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, -1.0
561; SI-NEXT:    v_rcp_f32_e32 v4, v3
562; SI-NEXT:    v_div_scale_f32 v5, vcc, -1.0, v2, -1.0
563; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
564; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
565; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
566; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
567; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
568; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
569; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
570; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
571; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
572; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, -1.0
573; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
574; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
575; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
576; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
577; SI-NEXT:    s_endpgm
578;
579; GFX8-LABEL: v_rcp_f16_neg:
580; GFX8:       ; %bb.0: ; %entry
581; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
582; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
583; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
584; GFX8-NEXT:    v_mov_b32_e32 v1, s3
585; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
586; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
587; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
588; GFX8-NEXT:    s_waitcnt vmcnt(0)
589; GFX8-NEXT:    v_mov_b32_e32 v1, s1
590; GFX8-NEXT:    v_rcp_f16_e64 v3, -v0
591; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
592; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
593; GFX8-NEXT:    flat_store_short v[0:1], v3
594; GFX8-NEXT:    s_endpgm
595;
596; GFX9-LABEL: v_rcp_f16_neg:
597; GFX9:       ; %bb.0: ; %entry
598; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
599; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
600; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
601; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
602; GFX9-NEXT:    s_waitcnt vmcnt(0)
603; GFX9-NEXT:    v_rcp_f16_e64 v1, -v1
604; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
605; GFX9-NEXT:    s_endpgm
606;
607; GFX10-LABEL: v_rcp_f16_neg:
608; GFX10:       ; %bb.0: ; %entry
609; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
610; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
611; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
612; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
613; GFX10-NEXT:    s_waitcnt vmcnt(0)
614; GFX10-NEXT:    v_rcp_f16_e64 v1, -v1
615; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
616; GFX10-NEXT:    s_endpgm
617;
618; GFX11-LABEL: v_rcp_f16_neg:
619; GFX11:       ; %bb.0: ; %entry
620; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
621; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
622; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
623; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
624; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
626; GFX11-NEXT:    s_waitcnt vmcnt(0)
627; GFX11-NEXT:    v_rcp_f16_e64 v1, -v1
628; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
629; GFX11-NEXT:    s_endpgm
630entry:
631  %tid = call i32 @llvm.amdgcn.workitem.id.x()
632  %tid.ext = sext i32 %tid to i64
633  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
634  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
635  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; Negated reciprocal written as a division with a constant -1.0 numerator.
636  %r.val = fdiv half -1.0, %b.val
637  store half %r.val, ptr addrspace(1) %gep.r
638  ret void
639}
640
; v_rsq_f16 computes r[tid] = 1.0 / sqrt(b[tid]) on half values, with the
; contract flag on both the sqrt call and the fdiv; b is read through a
; volatile load.
; Lowering recorded by the autogenerated checks below.
; On tahiti, v_sqrt_f32 runs on the converted input, the result is converted
; to f16 and back to f32, and the full f32 division expansion follows with a
; 1.0 numerator.
; On fiji, gfx900, gfx1010 and gfx1100, the sqrt and the division are selected
; as a single v_rsq_f16.
641define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
642; SI-LABEL: v_rsq_f16:
643; SI:       ; %bb.0: ; %entry
644; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
645; SI-NEXT:    s_mov_b32 s7, 0xf000
646; SI-NEXT:    s_mov_b32 s6, 0
647; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
648; SI-NEXT:    v_mov_b32_e32 v1, 0
649; SI-NEXT:    s_waitcnt lgkmcnt(0)
650; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
651; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
652; SI-NEXT:    s_waitcnt vmcnt(0)
653; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
654; SI-NEXT:    v_sqrt_f32_e32 v2, v2
655; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
656; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
657; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
658; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
659; SI-NEXT:    v_rcp_f32_e32 v4, v3
660; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
661; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
662; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
663; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
664; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
665; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
666; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
667; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
668; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
669; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
670; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
671; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
672; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
673; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
674; SI-NEXT:    s_endpgm
675;
676; GFX8-LABEL: v_rsq_f16:
677; GFX8:       ; %bb.0: ; %entry
678; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
679; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
680; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
681; GFX8-NEXT:    v_mov_b32_e32 v1, s3
682; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
683; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
684; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
685; GFX8-NEXT:    s_waitcnt vmcnt(0)
686; GFX8-NEXT:    v_mov_b32_e32 v1, s1
687; GFX8-NEXT:    v_rsq_f16_e32 v3, v0
688; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
689; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
690; GFX8-NEXT:    flat_store_short v[0:1], v3
691; GFX8-NEXT:    s_endpgm
692;
693; GFX9-LABEL: v_rsq_f16:
694; GFX9:       ; %bb.0: ; %entry
695; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
696; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
697; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
699; GFX9-NEXT:    s_waitcnt vmcnt(0)
700; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
701; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
702; GFX9-NEXT:    s_endpgm
703;
704; GFX10-LABEL: v_rsq_f16:
705; GFX10:       ; %bb.0: ; %entry
706; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
707; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
708; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
709; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
710; GFX10-NEXT:    s_waitcnt vmcnt(0)
711; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
712; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
713; GFX10-NEXT:    s_endpgm
714;
715; GFX11-LABEL: v_rsq_f16:
716; GFX11:       ; %bb.0: ; %entry
717; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
718; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
719; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
720; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
721; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
722; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
723; GFX11-NEXT:    s_waitcnt vmcnt(0)
724; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
725; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
726; GFX11-NEXT:    s_endpgm
727entry:
728  %tid = call i32 @llvm.amdgcn.workitem.id.x()
729  %tid.ext = sext i32 %tid to i64
730  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
731  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
732  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; Both the sqrt and the fdiv carry the contract flag.
733  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
734  %r.val = fdiv contract half 1.0, %b.sqrt
735  store half %r.val, ptr addrspace(1) %gep.r
736  ret void
737}
738
; Computes -1.0 / sqrt(x) on a volatile-loaded half, with `contract` on both the sqrt call
; and the fdiv. Per the checks below, the tahiti run converts to f32 and emits the full
; division expansion with a -1.0 numerator, while the fiji, gfx900, gfx1010 and gfx1100 runs
; select v_rsq_f16 and then flip the sign bit with v_xor_b32 0x8000.
739define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
740; SI-LABEL: v_rsq_f16_neg:
741; SI:       ; %bb.0: ; %entry
742; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
743; SI-NEXT:    s_mov_b32 s7, 0xf000
744; SI-NEXT:    s_mov_b32 s6, 0
745; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
746; SI-NEXT:    v_mov_b32_e32 v1, 0
747; SI-NEXT:    s_waitcnt lgkmcnt(0)
748; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
749; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
750; SI-NEXT:    s_waitcnt vmcnt(0)
751; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
752; SI-NEXT:    v_sqrt_f32_e32 v2, v2
753; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
754; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
755; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
756; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, -1.0
757; SI-NEXT:    v_rcp_f32_e32 v4, v3
758; SI-NEXT:    v_div_scale_f32 v5, vcc, -1.0, v2, -1.0
759; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
760; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
761; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
762; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
763; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
764; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
765; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
766; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
767; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
768; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, -1.0
769; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
770; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
771; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
772; SI-NEXT:    s_endpgm
773;
774; GFX8-LABEL: v_rsq_f16_neg:
775; GFX8:       ; %bb.0: ; %entry
776; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
777; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
778; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
779; GFX8-NEXT:    v_mov_b32_e32 v1, s3
780; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
781; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
782; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
783; GFX8-NEXT:    s_waitcnt vmcnt(0)
784; GFX8-NEXT:    v_mov_b32_e32 v1, s1
785; GFX8-NEXT:    v_rsq_f16_e32 v3, v0
786; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
787; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
788; GFX8-NEXT:    v_xor_b32_e32 v2, 0x8000, v3
789; GFX8-NEXT:    flat_store_short v[0:1], v2
790; GFX8-NEXT:    s_endpgm
791;
792; GFX9-LABEL: v_rsq_f16_neg:
793; GFX9:       ; %bb.0: ; %entry
794; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
795; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
796; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
797; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
798; GFX9-NEXT:    s_waitcnt vmcnt(0)
799; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
800; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
801; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
802; GFX9-NEXT:    s_endpgm
803;
804; GFX10-LABEL: v_rsq_f16_neg:
805; GFX10:       ; %bb.0: ; %entry
806; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
807; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
808; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
809; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
810; GFX10-NEXT:    s_waitcnt vmcnt(0)
811; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
812; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
813; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
814; GFX10-NEXT:    s_endpgm
815;
816; GFX11-LABEL: v_rsq_f16_neg:
817; GFX11:       ; %bb.0: ; %entry
818; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
819; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
820; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
821; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
822; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
824; GFX11-NEXT:    s_waitcnt vmcnt(0)
825; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
826; GFX11-NEXT:    s_waitcnt_depctr 0xfff
827; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
828; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
829; GFX11-NEXT:    s_endpgm
830entry:
831  %tid = call i32 @llvm.amdgcn.workitem.id.x()
832  %tid.ext = sext i32 %tid to i64
833  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
834  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
835  %b.val = load volatile half, ptr addrspace(1) %gep.b
836  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
  ; Negative-one numerator; both operations carry `contract`.
837  %r.val = fdiv contract half -1.0, %b.sqrt
838  store half %r.val, ptr addrspace(1) %gep.r
839  ret void
840}
841
; Computes 1.0 / sqrt(x) with `contract` on both operations, where the loaded input has a
; second use: it is also written back with a volatile store to the result address before the
; reciprocal square root is stored there. Per the checks below, the fiji, gfx900, gfx1010 and
; gfx1100 runs still select a single v_rsq_f16 and emit both stores; the tahiti run uses the
; f32 division expansion.
842define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
843; SI-LABEL: v_rsq_f16_multi_use:
844; SI:       ; %bb.0: ; %entry
845; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
846; SI-NEXT:    s_mov_b32 s7, 0xf000
847; SI-NEXT:    s_mov_b32 s6, 0
848; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
849; SI-NEXT:    v_mov_b32_e32 v1, 0
850; SI-NEXT:    s_waitcnt lgkmcnt(0)
851; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
852; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
853; SI-NEXT:    s_waitcnt vmcnt(0)
854; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
855; SI-NEXT:    v_sqrt_f32_e32 v3, v3
856; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
857; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
858; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
859; SI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, 1.0
860; SI-NEXT:    v_rcp_f32_e32 v5, v4
861; SI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
862; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
863; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
864; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
865; SI-NEXT:    v_mul_f32_e32 v7, v6, v5
866; SI-NEXT:    v_fma_f32 v8, -v4, v7, v6
867; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
868; SI-NEXT:    v_fma_f32 v4, -v4, v7, v6
869; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
870; SI-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
871; SI-NEXT:    v_div_fixup_f32 v3, v4, v3, 1.0
872; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
873; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
874; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
875; SI-NEXT:    s_waitcnt vmcnt(0)
876; SI-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
877; SI-NEXT:    s_endpgm
878;
879; GFX8-LABEL: v_rsq_f16_multi_use:
880; GFX8:       ; %bb.0: ; %entry
881; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
882; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
883; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
884; GFX8-NEXT:    v_mov_b32_e32 v1, s3
885; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
886; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
887; GFX8-NEXT:    flat_load_ushort v3, v[0:1] glc
888; GFX8-NEXT:    s_waitcnt vmcnt(0)
889; GFX8-NEXT:    v_mov_b32_e32 v1, s1
890; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
891; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
892; GFX8-NEXT:    v_rsq_f16_e32 v4, v3
893; GFX8-NEXT:    flat_store_short v[0:1], v3
894; GFX8-NEXT:    s_waitcnt vmcnt(0)
895; GFX8-NEXT:    flat_store_short v[0:1], v4
896; GFX8-NEXT:    s_endpgm
897;
898; GFX9-LABEL: v_rsq_f16_multi_use:
899; GFX9:       ; %bb.0: ; %entry
900; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
901; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
902; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
904; GFX9-NEXT:    s_waitcnt vmcnt(0)
905; GFX9-NEXT:    v_rsq_f16_e32 v2, v1
906; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
907; GFX9-NEXT:    s_waitcnt vmcnt(0)
908; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
909; GFX9-NEXT:    s_endpgm
910;
911; GFX10-LABEL: v_rsq_f16_multi_use:
912; GFX10:       ; %bb.0: ; %entry
913; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
914; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
915; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
917; GFX10-NEXT:    s_waitcnt vmcnt(0)
918; GFX10-NEXT:    v_rsq_f16_e32 v2, v1
919; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
920; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
921; GFX10-NEXT:    global_store_short v0, v2, s[0:1]
922; GFX10-NEXT:    s_endpgm
923;
924; GFX11-LABEL: v_rsq_f16_multi_use:
925; GFX11:       ; %bb.0: ; %entry
926; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
927; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
928; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
929; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
930; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
932; GFX11-NEXT:    s_waitcnt vmcnt(0)
933; GFX11-NEXT:    v_rsq_f16_e32 v2, v1
934; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
935; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
936; GFX11-NEXT:    global_store_b16 v0, v2, s[0:1]
937; GFX11-NEXT:    s_endpgm
938entry:
939  %tid = call i32 @llvm.amdgcn.workitem.id.x()
940  %tid.ext = sext i32 %tid to i64
941  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
942  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
943  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; Extra use of the loaded value, independent of the sqrt/fdiv chain.
944  store volatile half %b.val, ptr addrspace(1) %gep.r
945  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
946  %r.val = fdiv contract half 1.0, %b.sqrt
947  store half %r.val, ptr addrspace(1) %gep.r
948  ret void
949}
950
; Computes 1.0 / sqrt(x) where only the fdiv carries `contract`; the sqrt call has no
; fast-math flags. Per the checks below, the fiji, gfx900, gfx1010 and gfx1100 runs emit a
; separate v_sqrt_f16 followed by v_rcp_f16 instead of v_rsq_f16; the tahiti run uses the f32
; division expansion.
951define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
952; SI-LABEL: v_rsq_f16_missing_contract0:
953; SI:       ; %bb.0: ; %entry
954; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
955; SI-NEXT:    s_mov_b32 s7, 0xf000
956; SI-NEXT:    s_mov_b32 s6, 0
957; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
958; SI-NEXT:    v_mov_b32_e32 v1, 0
959; SI-NEXT:    s_waitcnt lgkmcnt(0)
960; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
961; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
962; SI-NEXT:    s_waitcnt vmcnt(0)
963; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
964; SI-NEXT:    v_sqrt_f32_e32 v2, v2
965; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
966; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
967; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
968; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
969; SI-NEXT:    v_rcp_f32_e32 v4, v3
970; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
971; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
972; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
973; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
974; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
975; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
976; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
977; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
978; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
979; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
980; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
981; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
982; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
983; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
984; SI-NEXT:    s_endpgm
985;
986; GFX8-LABEL: v_rsq_f16_missing_contract0:
987; GFX8:       ; %bb.0: ; %entry
988; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
989; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
990; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
991; GFX8-NEXT:    v_mov_b32_e32 v1, s3
992; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
993; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
994; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
995; GFX8-NEXT:    s_waitcnt vmcnt(0)
996; GFX8-NEXT:    v_mov_b32_e32 v1, s1
997; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
998; GFX8-NEXT:    v_rcp_f16_e32 v3, v0
999; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1000; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1001; GFX8-NEXT:    flat_store_short v[0:1], v3
1002; GFX8-NEXT:    s_endpgm
1003;
1004; GFX9-LABEL: v_rsq_f16_missing_contract0:
1005; GFX9:       ; %bb.0: ; %entry
1006; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1007; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1008; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1009; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
1010; GFX9-NEXT:    s_waitcnt vmcnt(0)
1011; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
1012; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
1013; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1014; GFX9-NEXT:    s_endpgm
1015;
1016; GFX10-LABEL: v_rsq_f16_missing_contract0:
1017; GFX10:       ; %bb.0: ; %entry
1018; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1019; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1020; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1021; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1022; GFX10-NEXT:    s_waitcnt vmcnt(0)
1023; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
1024; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
1025; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1026; GFX10-NEXT:    s_endpgm
1027;
1028; GFX11-LABEL: v_rsq_f16_missing_contract0:
1029; GFX11:       ; %bb.0: ; %entry
1030; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1031; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1032; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1033; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1034; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1035; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1036; GFX11-NEXT:    s_waitcnt vmcnt(0)
1037; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
1038; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1039; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
1040; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1041; GFX11-NEXT:    s_endpgm
1042entry:
1043  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1044  %tid.ext = sext i32 %tid to i64
1045  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
1046  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
1047  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; Deliberately no `contract` flag on the sqrt; only the fdiv below has it.
1048  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
1049  %r.val = fdiv contract half 1.0, %b.sqrt
1050  store half %r.val, ptr addrspace(1) %gep.r
1051  ret void
1052}
1053
; Computes 1.0 / sqrt(x) where only the sqrt call carries `contract`; the fdiv has no
; fast-math flags. Per the checks below, the fiji, gfx900, gfx1010 and gfx1100 runs emit a
; separate v_sqrt_f16 followed by v_rcp_f16 instead of v_rsq_f16; the tahiti run uses the f32
; division expansion.
1054define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
1055; SI-LABEL: v_rsq_f16_missing_contract1:
1056; SI:       ; %bb.0: ; %entry
1057; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1058; SI-NEXT:    s_mov_b32 s7, 0xf000
1059; SI-NEXT:    s_mov_b32 s6, 0
1060; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1061; SI-NEXT:    v_mov_b32_e32 v1, 0
1062; SI-NEXT:    s_waitcnt lgkmcnt(0)
1063; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1064; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
1065; SI-NEXT:    s_waitcnt vmcnt(0)
1066; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1067; SI-NEXT:    v_sqrt_f32_e32 v2, v2
1068; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1069; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1070; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1071; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
1072; SI-NEXT:    v_rcp_f32_e32 v4, v3
1073; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
1074; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1075; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
1076; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
1077; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
1078; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
1079; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
1080; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
1081; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1082; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
1083; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
1084; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1085; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1086; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
1087; SI-NEXT:    s_endpgm
1088;
1089; GFX8-LABEL: v_rsq_f16_missing_contract1:
1090; GFX8:       ; %bb.0: ; %entry
1091; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1092; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1093; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1094; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1095; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1096; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1097; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
1098; GFX8-NEXT:    s_waitcnt vmcnt(0)
1099; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1100; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
1101; GFX8-NEXT:    v_rcp_f16_e32 v3, v0
1102; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1103; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1104; GFX8-NEXT:    flat_store_short v[0:1], v3
1105; GFX8-NEXT:    s_endpgm
1106;
1107; GFX9-LABEL: v_rsq_f16_missing_contract1:
1108; GFX9:       ; %bb.0: ; %entry
1109; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1110; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1111; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1112; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
1113; GFX9-NEXT:    s_waitcnt vmcnt(0)
1114; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
1115; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
1116; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1117; GFX9-NEXT:    s_endpgm
1118;
1119; GFX10-LABEL: v_rsq_f16_missing_contract1:
1120; GFX10:       ; %bb.0: ; %entry
1121; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1122; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1123; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1124; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1125; GFX10-NEXT:    s_waitcnt vmcnt(0)
1126; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
1127; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
1128; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1129; GFX10-NEXT:    s_endpgm
1130;
1131; GFX11-LABEL: v_rsq_f16_missing_contract1:
1132; GFX11:       ; %bb.0: ; %entry
1133; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1134; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1135; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1136; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1137; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1138; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1139; GFX11-NEXT:    s_waitcnt vmcnt(0)
1140; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
1141; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1142; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
1143; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1144; GFX11-NEXT:    s_endpgm
1145entry:
1146  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1147  %tid.ext = sext i32 %tid to i64
1148  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
1149  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
1150  %b.val = load volatile half, ptr addrspace(1) %gep.b
1151  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
  ; Deliberately no `contract` flag on the fdiv; only the sqrt above has it.
1152  %r.val = fdiv half 1.0, %b.sqrt
1153  store half %r.val, ptr addrspace(1) %gep.r
1154  ret void
1155}
1156
; Computes -1.0 / sqrt(x) where only the sqrt call carries `contract`; the fdiv has no
; fast-math flags. Per the checks below, the fiji, gfx900, gfx1010 and gfx1100 runs emit
; v_sqrt_f16 followed by v_rcp_f16_e64 with a negated source operand rather than v_rsq_f16;
; the tahiti run uses the f32 division expansion with a -1.0 numerator.
1157define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
1158; SI-LABEL: v_neg_rsq_f16_missing_contract1:
1159; SI:       ; %bb.0: ; %entry
1160; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1161; SI-NEXT:    s_mov_b32 s7, 0xf000
1162; SI-NEXT:    s_mov_b32 s6, 0
1163; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1164; SI-NEXT:    v_mov_b32_e32 v1, 0
1165; SI-NEXT:    s_waitcnt lgkmcnt(0)
1166; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1167; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
1168; SI-NEXT:    s_waitcnt vmcnt(0)
1169; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1170; SI-NEXT:    v_sqrt_f32_e32 v2, v2
1171; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1172; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1173; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1174; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, -1.0
1175; SI-NEXT:    v_rcp_f32_e32 v4, v3
1176; SI-NEXT:    v_div_scale_f32 v5, vcc, -1.0, v2, -1.0
1177; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1178; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
1179; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
1180; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
1181; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
1182; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
1183; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
1184; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1185; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
1186; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, -1.0
1187; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1188; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1189; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
1190; SI-NEXT:    s_endpgm
1191;
1192; GFX8-LABEL: v_neg_rsq_f16_missing_contract1:
1193; GFX8:       ; %bb.0: ; %entry
1194; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1195; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1196; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1197; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1198; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1199; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1200; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
1201; GFX8-NEXT:    s_waitcnt vmcnt(0)
1202; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1203; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
1204; GFX8-NEXT:    v_rcp_f16_e64 v3, -v0
1205; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1206; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1207; GFX8-NEXT:    flat_store_short v[0:1], v3
1208; GFX8-NEXT:    s_endpgm
1209;
1210; GFX9-LABEL: v_neg_rsq_f16_missing_contract1:
1211; GFX9:       ; %bb.0: ; %entry
1212; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1213; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1215; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
1216; GFX9-NEXT:    s_waitcnt vmcnt(0)
1217; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
1218; GFX9-NEXT:    v_rcp_f16_e64 v1, -v1
1219; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1220; GFX9-NEXT:    s_endpgm
1221;
1222; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
1223; GFX10:       ; %bb.0: ; %entry
1224; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1225; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1226; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1227; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1228; GFX10-NEXT:    s_waitcnt vmcnt(0)
1229; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
1230; GFX10-NEXT:    v_rcp_f16_e64 v1, -v1
1231; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1232; GFX10-NEXT:    s_endpgm
1233;
1234; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
1235; GFX11:       ; %bb.0: ; %entry
1236; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1237; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1238; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1239; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1240; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1241; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1242; GFX11-NEXT:    s_waitcnt vmcnt(0)
1243; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
1244; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1245; GFX11-NEXT:    v_rcp_f16_e64 v1, -v1
1246; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1247; GFX11-NEXT:    s_endpgm
1248entry:
1249  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1250  %tid.ext = sext i32 %tid to i64
1251  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
1252  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
1253  %b.val = load volatile half, ptr addrspace(1) %gep.b
1254  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
  ; Negative-one numerator and deliberately no `contract` flag on the fdiv.
1255  %r.val = fdiv half -1.0, %b.sqrt
1256  store half %r.val, ptr addrspace(1) %gep.r
1257  ret void
1258}
1259
; Divides two volatile-loaded halves with the `afn` flag on the fdiv. Per the checks below,
; every run lowers the division to a reciprocal followed by a multiply: the tahiti run does
; it in f32 (v_rcp_f32 + v_mul_f32 between f16/f32 conversions), and the fiji, gfx900,
; gfx1010 and gfx1100 runs use v_rcp_f16 + v_mul_f16.
1260define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 {
1261; SI-LABEL: v_fdiv_f16_afn:
1262; SI:       ; %bb.0: ; %entry
1263; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1264; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1265; SI-NEXT:    s_mov_b32 s7, 0xf000
1266; SI-NEXT:    s_mov_b32 s6, 0
1267; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1268; SI-NEXT:    s_waitcnt lgkmcnt(0)
1269; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1270; SI-NEXT:    v_mov_b32_e32 v1, 0
1271; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
1272; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
1273; SI-NEXT:    s_waitcnt vmcnt(0)
1274; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc
1275; SI-NEXT:    s_waitcnt vmcnt(0)
1276; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1277; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1278; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1279; SI-NEXT:    v_rcp_f32_e32 v3, v3
1280; SI-NEXT:    v_mul_f32_e32 v2, v2, v3
1281; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1282; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
1283; SI-NEXT:    s_endpgm
1284;
1285; GFX8-LABEL: v_fdiv_f16_afn:
1286; GFX8:       ; %bb.0: ; %entry
1287; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1288; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1289; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
1290; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1291; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1292; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1293; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1294; GFX8-NEXT:    v_mov_b32_e32 v3, s5
1295; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1296; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1297; GFX8-NEXT:    flat_load_ushort v5, v[0:1] glc
1298; GFX8-NEXT:    s_waitcnt vmcnt(0)
1299; GFX8-NEXT:    flat_load_ushort v0, v[2:3] glc
1300; GFX8-NEXT:    s_waitcnt vmcnt(0)
1301; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1302; GFX8-NEXT:    v_rcp_f16_e32 v2, v0
1303; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
1304; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1305; GFX8-NEXT:    v_mul_f16_e32 v2, v5, v2
1306; GFX8-NEXT:    flat_store_short v[0:1], v2
1307; GFX8-NEXT:    s_endpgm
1308;
1309; GFX9-LABEL: v_fdiv_f16_afn:
1310; GFX9:       ; %bb.0: ; %entry
1311; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1312; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1313; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1314; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1315; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
1316; GFX9-NEXT:    s_waitcnt vmcnt(0)
1317; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] glc
1318; GFX9-NEXT:    s_waitcnt vmcnt(0)
1319; GFX9-NEXT:    v_rcp_f16_e32 v2, v2
1320; GFX9-NEXT:    v_mul_f16_e32 v1, v1, v2
1321; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1322; GFX9-NEXT:    s_endpgm
1323;
1324; GFX10-LABEL: v_fdiv_f16_afn:
1325; GFX10:       ; %bb.0: ; %entry
1326; GFX10-NEXT:    s_clause 0x1
1327; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1328; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1329; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1330; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1332; GFX10-NEXT:    s_waitcnt vmcnt(0)
1333; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
1334; GFX10-NEXT:    s_waitcnt vmcnt(0)
1335; GFX10-NEXT:    v_rcp_f16_e32 v2, v2
1336; GFX10-NEXT:    v_mul_f16_e32 v1, v1, v2
1337; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1338; GFX10-NEXT:    s_endpgm
1339;
1340; GFX11-LABEL: v_fdiv_f16_afn:
1341; GFX11:       ; %bb.0: ; %entry
1342; GFX11-NEXT:    s_clause 0x1
1343; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1344; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1345; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1346; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1347; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1348; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1349; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1350; GFX11-NEXT:    s_waitcnt vmcnt(0)
1351; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] glc dlc
1352; GFX11-NEXT:    s_waitcnt vmcnt(0)
1353; GFX11-NEXT:    v_rcp_f16_e32 v2, v2
1354; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1355; GFX11-NEXT:    v_mul_f16_e32 v1, v1, v2
1356; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1357; GFX11-NEXT:    s_endpgm
1358entry:
1359  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1360  %tid.ext = sext i32 %tid to i64
1361  %gep.a = getelementptr inbounds half, ptr addrspace(1) %a, i64 %tid.ext
1362  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
1363  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
1364  %a.val = load volatile half, ptr addrspace(1) %gep.a
1365  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; `afn` is the only fast-math flag on the division.
1366  %r.val = fdiv afn half %a.val, %b.val
1367  store half %r.val, ptr addrspace(1) %gep.r
1368  ret void
1369}
1370
; Divides two volatile-loaded halves with no fast-math flags on the fdiv itself; the
; function uses attribute group #2 instead of #0. The expected output below matches the
; reciprocal-plus-multiply lowering of @v_fdiv_f16_afn on every run.
; NOTE(review): attribute group #2 is defined outside this part of the file; presumably it
; enables unsafe FP math for the function - confirm against the attribute list.
1371define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #2 {
1372; SI-LABEL: v_fdiv_f16_unsafe:
1373; SI:       ; %bb.0: ; %entry
1374; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1375; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1376; SI-NEXT:    s_mov_b32 s7, 0xf000
1377; SI-NEXT:    s_mov_b32 s6, 0
1378; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1379; SI-NEXT:    s_waitcnt lgkmcnt(0)
1380; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1381; SI-NEXT:    v_mov_b32_e32 v1, 0
1382; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
1383; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
1384; SI-NEXT:    s_waitcnt vmcnt(0)
1385; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc
1386; SI-NEXT:    s_waitcnt vmcnt(0)
1387; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1388; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1389; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1390; SI-NEXT:    v_rcp_f32_e32 v3, v3
1391; SI-NEXT:    v_mul_f32_e32 v2, v2, v3
1392; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1393; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
1394; SI-NEXT:    s_endpgm
1395;
1396; GFX8-LABEL: v_fdiv_f16_unsafe:
1397; GFX8:       ; %bb.0: ; %entry
1398; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1399; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1400; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
1401; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1402; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1403; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1404; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1405; GFX8-NEXT:    v_mov_b32_e32 v3, s5
1406; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1407; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1408; GFX8-NEXT:    flat_load_ushort v5, v[0:1] glc
1409; GFX8-NEXT:    s_waitcnt vmcnt(0)
1410; GFX8-NEXT:    flat_load_ushort v0, v[2:3] glc
1411; GFX8-NEXT:    s_waitcnt vmcnt(0)
1412; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1413; GFX8-NEXT:    v_rcp_f16_e32 v2, v0
1414; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
1415; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1416; GFX8-NEXT:    v_mul_f16_e32 v2, v5, v2
1417; GFX8-NEXT:    flat_store_short v[0:1], v2
1418; GFX8-NEXT:    s_endpgm
1419;
1420; GFX9-LABEL: v_fdiv_f16_unsafe:
1421; GFX9:       ; %bb.0: ; %entry
1422; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1423; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1424; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1425; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1426; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
1427; GFX9-NEXT:    s_waitcnt vmcnt(0)
1428; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] glc
1429; GFX9-NEXT:    s_waitcnt vmcnt(0)
1430; GFX9-NEXT:    v_rcp_f16_e32 v2, v2
1431; GFX9-NEXT:    v_mul_f16_e32 v1, v1, v2
1432; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1433; GFX9-NEXT:    s_endpgm
1434;
1435; GFX10-LABEL: v_fdiv_f16_unsafe:
1436; GFX10:       ; %bb.0: ; %entry
1437; GFX10-NEXT:    s_clause 0x1
1438; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1439; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1440; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1441; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1442; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1443; GFX10-NEXT:    s_waitcnt vmcnt(0)
1444; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
1445; GFX10-NEXT:    s_waitcnt vmcnt(0)
1446; GFX10-NEXT:    v_rcp_f16_e32 v2, v2
1447; GFX10-NEXT:    v_mul_f16_e32 v1, v1, v2
1448; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1449; GFX10-NEXT:    s_endpgm
1450;
1451; GFX11-LABEL: v_fdiv_f16_unsafe:
1452; GFX11:       ; %bb.0: ; %entry
1453; GFX11-NEXT:    s_clause 0x1
1454; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1455; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1456; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1457; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1458; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1459; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1460; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1461; GFX11-NEXT:    s_waitcnt vmcnt(0)
1462; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] glc dlc
1463; GFX11-NEXT:    s_waitcnt vmcnt(0)
1464; GFX11-NEXT:    v_rcp_f16_e32 v2, v2
1465; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1466; GFX11-NEXT:    v_mul_f16_e32 v1, v1, v2
1467; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1468; GFX11-NEXT:    s_endpgm
1469entry:
1470  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1471  %tid.ext = sext i32 %tid to i64
1472  %gep.a = getelementptr inbounds half, ptr addrspace(1) %a, i64 %tid.ext
1473  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
1474  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
1475  %a.val = load volatile half, ptr addrspace(1) %gep.a
1476  %b.val = load volatile half, ptr addrspace(1) %gep.b
  ; No per-instruction fast-math flags here; any relaxation comes from the function attributes.
1477  %r.val = fdiv half %a.val, %b.val
1478  store half %r.val, ptr addrspace(1) %gep.r
1479  ret void
1480}
1481
; fdiv with the afn flag by the constant 2.0.
; The assertions expect the divide to be emitted as a multiply by 0.5:
; a single v_mul_f16 on fiji/gfx900/gfx1010/gfx1100, and on tahiti a
; v_cvt_f32_f16 / v_mul_f32 / v_cvt_f16_f32 sequence.
; NOTE(review): the operand is loaded from an undef address, so its value is
; arbitrary - consider loading through a real pointer argument and
; regenerating the assertions.
1482define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
1483; SI-LABEL: div_afn_2_x_pat_f16:
1484; SI:       ; %bb.0:
1485; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1486; SI-NEXT:    s_mov_b32 s3, 0xf000
1487; SI-NEXT:    s_mov_b32 s2, -1
1488; SI-NEXT:    s_waitcnt lgkmcnt(0)
1489; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1490; SI-NEXT:    s_waitcnt vmcnt(0)
1491; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1492; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
1493; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1494; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1495; SI-NEXT:    s_endpgm
1496;
1497; GFX8-LABEL: div_afn_2_x_pat_f16:
1498; GFX8:       ; %bb.0:
1499; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
1500; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1501; GFX8-NEXT:    s_waitcnt vmcnt(0)
1502; GFX8-NEXT:    v_mul_f16_e32 v2, 0.5, v0
1503; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1504; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1505; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1506; GFX8-NEXT:    flat_store_short v[0:1], v2
1507; GFX8-NEXT:    s_endpgm
1508;
1509; GFX9-LABEL: div_afn_2_x_pat_f16:
1510; GFX9:       ; %bb.0:
1511; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
1512; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1513; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1514; GFX9-NEXT:    s_waitcnt vmcnt(0)
1515; GFX9-NEXT:    v_mul_f16_e32 v0, 0.5, v0
1516; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1517; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
1518; GFX9-NEXT:    s_endpgm
1519;
1520; GFX10-LABEL: div_afn_2_x_pat_f16:
1521; GFX10:       ; %bb.0:
1522; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
1523; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1524; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1525; GFX10-NEXT:    s_waitcnt vmcnt(0)
1526; GFX10-NEXT:    v_mul_f16_e32 v0, 0.5, v0
1527; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1528; GFX10-NEXT:    global_store_short v1, v0, s[0:1]
1529; GFX10-NEXT:    s_endpgm
1530;
1531; GFX11-LABEL: div_afn_2_x_pat_f16:
1532; GFX11:       ; %bb.0:
1533; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
1534; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1535; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1536; GFX11-NEXT:    s_waitcnt vmcnt(0)
1537; GFX11-NEXT:    v_mul_f16_e32 v0, 0.5, v0
1538; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1539; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
1540; GFX11-NEXT:    s_endpgm
1541  %x = load half, ptr addrspace(1) undef
1542  %rcp = fdiv afn half %x, 2.0
1543  store half %rcp, ptr addrspace(1) %out, align 4
1544  ret void
1545}
1546
; fdiv with the afn flag by the constant 10.0.
; The assertions expect a multiply by a literal constant instead of a divide:
; v_mul_f16 with 0x2e66 on fiji/gfx900/gfx1010/gfx1100, and on tahiti a
; v_mul_f32 with 0x3dcccccd between f16<->f32 conversions. Both literals
; encode approximately 0.1, i.e. the reciprocal of the divisor.
; NOTE(review): the operand is loaded from an undef address, so its value is
; arbitrary - consider loading through a real pointer argument and
; regenerating the assertions.
1547define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
1548; SI-LABEL: div_afn_k_x_pat_f16:
1549; SI:       ; %bb.0:
1550; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1551; SI-NEXT:    s_mov_b32 s3, 0xf000
1552; SI-NEXT:    s_mov_b32 s2, -1
1553; SI-NEXT:    s_waitcnt lgkmcnt(0)
1554; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1555; SI-NEXT:    s_waitcnt vmcnt(0)
1556; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1557; SI-NEXT:    v_mul_f32_e32 v0, 0x3dcccccd, v0
1558; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1559; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1560; SI-NEXT:    s_endpgm
1561;
1562; GFX8-LABEL: div_afn_k_x_pat_f16:
1563; GFX8:       ; %bb.0:
1564; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
1565; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1566; GFX8-NEXT:    s_waitcnt vmcnt(0)
1567; GFX8-NEXT:    v_mul_f16_e32 v2, 0x2e66, v0
1568; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1569; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1570; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1571; GFX8-NEXT:    flat_store_short v[0:1], v2
1572; GFX8-NEXT:    s_endpgm
1573;
1574; GFX9-LABEL: div_afn_k_x_pat_f16:
1575; GFX9:       ; %bb.0:
1576; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
1577; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1578; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1579; GFX9-NEXT:    s_waitcnt vmcnt(0)
1580; GFX9-NEXT:    v_mul_f16_e32 v0, 0x2e66, v0
1581; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1582; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
1583; GFX9-NEXT:    s_endpgm
1584;
1585; GFX10-LABEL: div_afn_k_x_pat_f16:
1586; GFX10:       ; %bb.0:
1587; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
1588; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1589; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1590; GFX10-NEXT:    s_waitcnt vmcnt(0)
1591; GFX10-NEXT:    v_mul_f16_e32 v0, 0x2e66, v0
1592; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1593; GFX10-NEXT:    global_store_short v1, v0, s[0:1]
1594; GFX10-NEXT:    s_endpgm
1595;
1596; GFX11-LABEL: div_afn_k_x_pat_f16:
1597; GFX11:       ; %bb.0:
1598; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
1599; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1600; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1601; GFX11-NEXT:    s_waitcnt vmcnt(0)
1602; GFX11-NEXT:    v_mul_f16_e32 v0, 0x2e66, v0
1603; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1604; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
1605; GFX11-NEXT:    s_endpgm
1606  %x = load half, ptr addrspace(1) undef
1607  %rcp = fdiv afn half %x, 10.0
1608  store half %rcp, ptr addrspace(1) %out, align 4
1609  ret void
1610}
1611
; fdiv with the afn flag by the negative constant -10.0.
; Same shape as the positive-constant case, but the multiplier literals have
; the sign bit set: 0xae66 for v_mul_f16 on fiji/gfx900/gfx1010/gfx1100 and
; 0xbdcccccd for v_mul_f32 on tahiti (both approximately -0.1).
; NOTE(review): the operand is loaded from an undef address, so its value is
; arbitrary - consider loading through a real pointer argument and
; regenerating the assertions.
1612define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
1613; SI-LABEL: div_afn_neg_k_x_pat_f16:
1614; SI:       ; %bb.0:
1615; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1616; SI-NEXT:    s_mov_b32 s3, 0xf000
1617; SI-NEXT:    s_mov_b32 s2, -1
1618; SI-NEXT:    s_waitcnt lgkmcnt(0)
1619; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
1620; SI-NEXT:    s_waitcnt vmcnt(0)
1621; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1622; SI-NEXT:    v_mul_f32_e32 v0, 0xbdcccccd, v0
1623; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1624; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1625; SI-NEXT:    s_endpgm
1626;
1627; GFX8-LABEL: div_afn_neg_k_x_pat_f16:
1628; GFX8:       ; %bb.0:
1629; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
1630; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1631; GFX8-NEXT:    s_waitcnt vmcnt(0)
1632; GFX8-NEXT:    v_mul_f16_e32 v2, 0xae66, v0
1633; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1634; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1635; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1636; GFX8-NEXT:    flat_store_short v[0:1], v2
1637; GFX8-NEXT:    s_endpgm
1638;
1639; GFX9-LABEL: div_afn_neg_k_x_pat_f16:
1640; GFX9:       ; %bb.0:
1641; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
1642; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1643; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1644; GFX9-NEXT:    s_waitcnt vmcnt(0)
1645; GFX9-NEXT:    v_mul_f16_e32 v0, 0xae66, v0
1646; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1647; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
1648; GFX9-NEXT:    s_endpgm
1649;
1650; GFX10-LABEL: div_afn_neg_k_x_pat_f16:
1651; GFX10:       ; %bb.0:
1652; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
1653; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1654; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1655; GFX10-NEXT:    s_waitcnt vmcnt(0)
1656; GFX10-NEXT:    v_mul_f16_e32 v0, 0xae66, v0
1657; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1658; GFX10-NEXT:    global_store_short v1, v0, s[0:1]
1659; GFX10-NEXT:    s_endpgm
1660;
1661; GFX11-LABEL: div_afn_neg_k_x_pat_f16:
1662; GFX11:       ; %bb.0:
1663; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
1664; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1665; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1666; GFX11-NEXT:    s_waitcnt vmcnt(0)
1667; GFX11-NEXT:    v_mul_f16_e32 v0, 0xae66, v0
1668; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
1670; GFX11-NEXT:    s_endpgm
1671  %x = load half, ptr addrspace(1) undef
1672  %rcp = fdiv afn half %x, -10.0
1673  store half %rcp, ptr addrspace(1) %out, align 4
1674  ret void
1675}
1676
; fdiv carrying only the arcp flag, in a non-kernel function.
; On fiji/gfx900/gfx1010/gfx1100 the assertions expect v_rcp_f16 followed by
; v_mul_f16 (gfx1100 also has an s_waitcnt_depctr between the two).
; On tahiti the arguments are round-tripped through f16, then the full f32
; v_div_scale / v_fma chain / v_div_fmas / v_div_fixup expansion is expected,
; with s_setreg_imm32_b32 writes to HW_REG_MODE around the v_fma chain
; (presumably toggling the f32 denormal mode - confirm against the RUN
; options).
1677define half @v_fdiv_f16_arcp(half %x, half %y) {
1678; SI-LABEL: v_fdiv_f16_arcp:
1679; SI:       ; %bb.0:
1680; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1681; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1682; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1683; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1684; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1685; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1686; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
1687; SI-NEXT:    v_rcp_f32_e32 v3, v2
1688; SI-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
1689; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1690; SI-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
1691; SI-NEXT:    v_fma_f32 v3, v5, v3, v3
1692; SI-NEXT:    v_mul_f32_e32 v5, v4, v3
1693; SI-NEXT:    v_fma_f32 v6, -v2, v5, v4
1694; SI-NEXT:    v_fma_f32 v5, v6, v3, v5
1695; SI-NEXT:    v_fma_f32 v2, -v2, v5, v4
1696; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1697; SI-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
1698; SI-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
1699; SI-NEXT:    s_setpc_b64 s[30:31]
1700;
1701; GFX8-LABEL: v_fdiv_f16_arcp:
1702; GFX8:       ; %bb.0:
1703; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1704; GFX8-NEXT:    v_rcp_f16_e32 v1, v1
1705; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
1706; GFX8-NEXT:    s_setpc_b64 s[30:31]
1707;
1708; GFX9-LABEL: v_fdiv_f16_arcp:
1709; GFX9:       ; %bb.0:
1710; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1711; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
1712; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
1713; GFX9-NEXT:    s_setpc_b64 s[30:31]
1714;
1715; GFX10-LABEL: v_fdiv_f16_arcp:
1716; GFX10:       ; %bb.0:
1717; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1718; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
1719; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
1720; GFX10-NEXT:    s_setpc_b64 s[30:31]
1721;
1722; GFX11-LABEL: v_fdiv_f16_arcp:
1723; GFX11:       ; %bb.0:
1724; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1725; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
1726; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1727; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
1728; GFX11-NEXT:    s_setpc_b64 s[30:31]
1729  %fdiv = fdiv arcp half %x, %y
1730  ret half %fdiv
1731}
1732
; fdiv carrying the afn and nsz flags, in a non-kernel function.
; Every target is expected to emit a reciprocal followed by a multiply:
; v_rcp_f16 + v_mul_f16 on fiji/gfx900/gfx1010/gfx1100 (with an
; s_waitcnt_depctr in between on gfx1100), and v_rcp_f32 + v_mul_f32 on
; tahiti after the arguments are round-tripped through f16.
1733define half @v_fdiv_f16_afn_nsz(half %x, half %y) {
1734; SI-LABEL: v_fdiv_f16_afn_nsz:
1735; SI:       ; %bb.0:
1736; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1737; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1738; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1739; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1740; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1741; SI-NEXT:    v_rcp_f32_e32 v1, v1
1742; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
1743; SI-NEXT:    s_setpc_b64 s[30:31]
1744;
1745; GFX8-LABEL: v_fdiv_f16_afn_nsz:
1746; GFX8:       ; %bb.0:
1747; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1748; GFX8-NEXT:    v_rcp_f16_e32 v1, v1
1749; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
1750; GFX8-NEXT:    s_setpc_b64 s[30:31]
1751;
1752; GFX9-LABEL: v_fdiv_f16_afn_nsz:
1753; GFX9:       ; %bb.0:
1754; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1755; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
1756; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
1757; GFX9-NEXT:    s_setpc_b64 s[30:31]
1758;
1759; GFX10-LABEL: v_fdiv_f16_afn_nsz:
1760; GFX10:       ; %bb.0:
1761; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1762; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
1763; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
1764; GFX10-NEXT:    s_setpc_b64 s[30:31]
1765;
1766; GFX11-LABEL: v_fdiv_f16_afn_nsz:
1767; GFX11:       ; %bb.0:
1768; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1769; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
1770; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1771; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
1772; GFX11-NEXT:    s_setpc_b64 s[30:31]
1773  %fdiv = fdiv afn nsz half %x, %y
1774  ret half %fdiv
1775}
1776
; 1.0 / sqrt(a) on <2 x half>, with the contract flag on both the sqrt call
; and the fdiv.
; On fiji/gfx900/gfx1010/gfx1100 the assertions expect the pair to fold into
; one v_rsq_f16 per lane, with the two results repacked into a single VGPR.
; On tahiti the sqrt and the divide stay separate: v_sqrt_f32 per lane, then
; the full f32 v_div_scale / v_fma chain / v_div_fmas / v_div_fixup expansion
; with a numerator of 1.0.
1777define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
1861; SI-LABEL: v_rsq_v2f16:
1862; SI:       ; %bb.0:
1863; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1864; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1865; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1866; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1867; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1868; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1869; SI-NEXT:    v_sqrt_f32_e32 v0, v0
1870; SI-NEXT:    v_sqrt_f32_e32 v1, v1
1871; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1872; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1873; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1874; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1875; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
1876; SI-NEXT:    v_rcp_f32_e32 v3, v2
1877; SI-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
1878; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1879; SI-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
1880; SI-NEXT:    v_fma_f32 v3, v5, v3, v3
1881; SI-NEXT:    v_mul_f32_e32 v5, v4, v3
1882; SI-NEXT:    v_fma_f32 v6, -v2, v5, v4
1883; SI-NEXT:    v_fma_f32 v5, v6, v3, v5
1884; SI-NEXT:    v_fma_f32 v2, -v2, v5, v4
1885; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1886; SI-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
1887; SI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
1888; SI-NEXT:    v_rcp_f32_e32 v4, v3
1889; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v1, 1.0
1890; SI-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
1891; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1892; SI-NEXT:    v_fma_f32 v2, -v3, v4, 1.0
1893; SI-NEXT:    v_fma_f32 v2, v2, v4, v4
1894; SI-NEXT:    v_mul_f32_e32 v4, v5, v2
1895; SI-NEXT:    v_fma_f32 v6, -v3, v4, v5
1896; SI-NEXT:    v_fma_f32 v4, v6, v2, v4
1897; SI-NEXT:    v_fma_f32 v3, -v3, v4, v5
1898; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1899; SI-NEXT:    v_div_fmas_f32 v2, v3, v2, v4
1900; SI-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
1901; SI-NEXT:    s_setpc_b64 s[30:31]
1902;
1903; GFX8-LABEL: v_rsq_v2f16:
1904; GFX8:       ; %bb.0:
1905; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1906; GFX8-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
1907; GFX8-NEXT:    v_rsq_f16_e32 v0, v0
1908; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
1909; GFX8-NEXT:    s_setpc_b64 s[30:31]
1910;
1911; GFX9-LABEL: v_rsq_v2f16:
1912; GFX9:       ; %bb.0:
1913; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1914; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1915; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
1916; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
1917; GFX9-NEXT:    s_setpc_b64 s[30:31]
1918;
1919; GFX10-LABEL: v_rsq_v2f16:
1920; GFX10:       ; %bb.0:
1921; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1922; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1923; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
1924; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
1925; GFX10-NEXT:    s_setpc_b64 s[30:31]
1926;
1927; GFX11-LABEL: v_rsq_v2f16:
1928; GFX11:       ; %bb.0:
1929; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1930; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1931; GFX11-NEXT:    v_rsq_f16_e32 v0, v0
1932; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1933; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
1934; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1935; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
1936; GFX11-NEXT:    s_setpc_b64 s[30:31]
1970  %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
1971  %fdiv = fdiv contract <2 x half> <half 1.0, half 1.0>, %sqrt
1972  ret <2 x half> %fdiv
1973}
1974
; -1.0 / sqrt(a) on <2 x half>, with the contract flag on both the sqrt call
; and the fdiv.
; On fiji/gfx900/gfx1010/gfx1100 the assertions expect one v_rsq_f16 per lane
; with the negation applied afterwards: v_xor with 0x8000 on fiji, and source
; negate modifiers on v_pack_b32_f16 on gfx900/gfx1010/gfx1100.
; On tahiti the sqrt and the divide stay separate: v_sqrt_f32 per lane, then
; the full f32 v_div_scale / v_fma chain / v_div_fmas / v_div_fixup expansion
; with a numerator of -1.0.
1975define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
2059; SI-LABEL: v_neg_rsq_v2f16:
2060; SI:       ; %bb.0:
2061; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2062; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2063; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2064; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2065; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2066; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2067; SI-NEXT:    v_sqrt_f32_e32 v0, v0
2068; SI-NEXT:    v_sqrt_f32_e32 v1, v1
2069; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2070; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2071; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2072; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2073; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
2074; SI-NEXT:    v_rcp_f32_e32 v3, v2
2075; SI-NEXT:    v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
2076; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2077; SI-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
2078; SI-NEXT:    v_fma_f32 v3, v5, v3, v3
2079; SI-NEXT:    v_mul_f32_e32 v5, v4, v3
2080; SI-NEXT:    v_fma_f32 v6, -v2, v5, v4
2081; SI-NEXT:    v_fma_f32 v5, v6, v3, v5
2082; SI-NEXT:    v_fma_f32 v2, -v2, v5, v4
2083; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2084; SI-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
2085; SI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, -1.0
2086; SI-NEXT:    v_rcp_f32_e32 v4, v3
2087; SI-NEXT:    v_div_scale_f32 v5, vcc, -1.0, v1, -1.0
2088; SI-NEXT:    v_div_fixup_f32 v0, v2, v0, -1.0
2089; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2090; SI-NEXT:    v_fma_f32 v2, -v3, v4, 1.0
2091; SI-NEXT:    v_fma_f32 v2, v2, v4, v4
2092; SI-NEXT:    v_mul_f32_e32 v4, v5, v2
2093; SI-NEXT:    v_fma_f32 v6, -v3, v4, v5
2094; SI-NEXT:    v_fma_f32 v4, v6, v2, v4
2095; SI-NEXT:    v_fma_f32 v3, -v3, v4, v5
2096; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2097; SI-NEXT:    v_div_fmas_f32 v2, v3, v2, v4
2098; SI-NEXT:    v_div_fixup_f32 v1, v2, v1, -1.0
2099; SI-NEXT:    s_setpc_b64 s[30:31]
2100;
2101; GFX8-LABEL: v_neg_rsq_v2f16:
2102; GFX8:       ; %bb.0:
2103; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2104; GFX8-NEXT:    v_rsq_f16_e32 v1, v0
2105; GFX8-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2106; GFX8-NEXT:    v_mov_b32_e32 v2, 0x8000
2107; GFX8-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
2108; GFX8-NEXT:    v_xor_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2109; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2110; GFX8-NEXT:    s_setpc_b64 s[30:31]
2111;
2112; GFX9-LABEL: v_neg_rsq_v2f16:
2113; GFX9:       ; %bb.0:
2114; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2115; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2116; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
2117; GFX9-NEXT:    v_pack_b32_f16 v0, -v0, -v1
2118; GFX9-NEXT:    s_setpc_b64 s[30:31]
2119;
2120; GFX10-LABEL: v_neg_rsq_v2f16:
2121; GFX10:       ; %bb.0:
2122; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2123; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2124; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
2125; GFX10-NEXT:    v_pack_b32_f16 v0, -v0, -v1
2126; GFX10-NEXT:    s_setpc_b64 s[30:31]
2127;
2128; GFX11-LABEL: v_neg_rsq_v2f16:
2129; GFX11:       ; %bb.0:
2130; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2131; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
2132; GFX11-NEXT:    v_rsq_f16_e32 v0, v0
2133; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2134; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
2135; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2136; GFX11-NEXT:    v_pack_b32_f16 v0, -v0, -v1
2137; GFX11-NEXT:    s_setpc_b64 s[30:31]
2171  %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
2172  %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
2173  ret <2 x half> %fdiv
2174}
2175
; Intrinsic declarations for the tests in this file. workitem.id.x and
; sqrt.v2f16 are called by functions near the end of the file.
; NOTE(review): no caller of sqrt.f16 or fabs.f16 is visible in the reviewed
; part of the file - confirm they are still used earlier on.
2176declare i32 @llvm.amdgcn.workitem.id.x() #2
2177declare half @llvm.sqrt.f16(half) #2
2178declare half @llvm.fabs.f16(half) #2
2179declare <2 x half> @llvm.sqrt.v2f16(<2 x half>) #2
2180
; Attribute groups. #0 is attached to the div_afn_* kernels and #2 to the
; intrinsic declarations above.
; NOTE(review): no use of #1 is visible in the reviewed part of the file -
; confirm it is referenced elsewhere.
2181attributes #0 = { nounwind }
2182attributes #1 = { nounwind readnone }
2183attributes #2 = { nounwind "unsafe-fp-math"="true" }
2184