xref: /llvm-project/llvm/test/CodeGen/AMDGPU/frem.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs  < %s | FileCheck --check-prefix=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
8; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
9
; frem_f16: kernel that loads one half from %in1 and one half from element 4 of
; %in2, computes their remainder with a plain `frem` (no fast-math flags on the
; instruction) and stores the half result to %out.
;
; The autogenerated checks below cover each RUN prefix (SI, CI, VI, GFX9, GFX10,
; GFX11, GFX1150). The SI/CI checks convert both operands to f32, expect the
; v_div_scale/v_div_fmas/v_div_fixup f32 sequence and convert the result back
; to f16; the remaining prefixes expect an f32 reciprocal refinement followed by
; v_div_fixup_f16 and f16 trunc/fma (GFX1150: v_xor_b32 + v_fmac_f16).
; NOTE(review): attribute group #0 is defined outside this excerpt -- confirm
; its contents before relying on it.
10define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
11; SI-LABEL: frem_f16:
12; SI:       ; %bb.0:
13; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
14; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
15; SI-NEXT:    s_mov_b32 s11, 0xf000
16; SI-NEXT:    s_mov_b32 s10, -1
17; SI-NEXT:    s_waitcnt lgkmcnt(0)
18; SI-NEXT:    s_mov_b32 s8, s0
19; SI-NEXT:    s_mov_b32 s9, s1
20; SI-NEXT:    s_mov_b32 s0, s2
21; SI-NEXT:    s_mov_b32 s1, s3
22; SI-NEXT:    s_mov_b32 s2, s10
23; SI-NEXT:    s_mov_b32 s3, s11
24; SI-NEXT:    s_mov_b32 s6, s10
25; SI-NEXT:    s_mov_b32 s7, s11
26; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
27; SI-NEXT:    s_waitcnt vmcnt(0)
28; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
29; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
32; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
33; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
34; SI-NEXT:    v_rcp_f32_e32 v4, v3
35; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
36; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
37; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
38; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
39; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
40; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
41; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
42; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
43; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
44; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
45; SI-NEXT:    v_trunc_f32_e32 v2, v2
46; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
47; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
48; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
49; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
50; SI-NEXT:    s_endpgm
51;
52; CI-LABEL: frem_f16:
53; CI:       ; %bb.0:
54; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
55; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
56; CI-NEXT:    s_mov_b32 s11, 0xf000
57; CI-NEXT:    s_mov_b32 s10, -1
58; CI-NEXT:    s_mov_b32 s6, s10
59; CI-NEXT:    s_waitcnt lgkmcnt(0)
60; CI-NEXT:    s_mov_b32 s8, s0
61; CI-NEXT:    s_mov_b32 s9, s1
62; CI-NEXT:    s_mov_b32 s0, s2
63; CI-NEXT:    s_mov_b32 s1, s3
64; CI-NEXT:    s_mov_b32 s2, s10
65; CI-NEXT:    s_mov_b32 s3, s11
66; CI-NEXT:    s_mov_b32 s7, s11
67; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
68; CI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
69; CI-NEXT:    s_waitcnt vmcnt(1)
70; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
71; CI-NEXT:    s_waitcnt vmcnt(0)
72; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
73; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
74; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
75; CI-NEXT:    v_rcp_f32_e32 v4, v3
76; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
77; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
78; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
79; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
80; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
81; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
82; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
83; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
84; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
85; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
86; CI-NEXT:    v_trunc_f32_e32 v2, v2
87; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
88; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
89; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
90; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
91; CI-NEXT:    s_endpgm
92;
93; VI-LABEL: frem_f16:
94; VI:       ; %bb.0:
95; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
96; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
97; VI-NEXT:    s_waitcnt lgkmcnt(0)
98; VI-NEXT:    v_mov_b32_e32 v0, s0
99; VI-NEXT:    s_add_u32 s0, s4, 8
100; VI-NEXT:    v_mov_b32_e32 v1, s1
101; VI-NEXT:    v_mov_b32_e32 v2, s2
102; VI-NEXT:    v_mov_b32_e32 v3, s3
103; VI-NEXT:    s_addc_u32 s1, s5, 0
104; VI-NEXT:    flat_load_ushort v4, v[2:3]
105; VI-NEXT:    v_mov_b32_e32 v3, s1
106; VI-NEXT:    v_mov_b32_e32 v2, s0
107; VI-NEXT:    flat_load_ushort v2, v[2:3]
108; VI-NEXT:    s_waitcnt vmcnt(1)
109; VI-NEXT:    v_cvt_f32_f16_e32 v3, v4
110; VI-NEXT:    s_waitcnt vmcnt(0)
111; VI-NEXT:    v_cvt_f32_f16_e32 v5, v2
112; VI-NEXT:    v_rcp_f32_e32 v6, v5
113; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
114; VI-NEXT:    v_mad_f32 v8, -v5, v7, v3
115; VI-NEXT:    v_mac_f32_e32 v7, v8, v6
116; VI-NEXT:    v_mad_f32 v3, -v5, v7, v3
117; VI-NEXT:    v_mul_f32_e32 v3, v3, v6
118; VI-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
119; VI-NEXT:    v_add_f32_e32 v3, v3, v7
120; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
121; VI-NEXT:    v_div_fixup_f16 v3, v3, v2, v4
122; VI-NEXT:    v_trunc_f16_e32 v3, v3
123; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
124; VI-NEXT:    flat_store_short v[0:1], v2
125; VI-NEXT:    s_endpgm
126;
127; GFX9-LABEL: frem_f16:
128; GFX9:       ; %bb.0:
129; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
130; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
131; GFX9-NEXT:    v_mov_b32_e32 v0, 0
132; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
133; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
134; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
135; GFX9-NEXT:    s_waitcnt vmcnt(1)
136; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
137; GFX9-NEXT:    s_waitcnt vmcnt(0)
138; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
139; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
140; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
141; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
142; GFX9-NEXT:    v_mac_f32_e32 v3, v5, v4
143; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
144; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
145; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
146; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
147; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
148; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
149; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
150; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
151; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
152; GFX9-NEXT:    s_endpgm
153;
154; GFX10-LABEL: frem_f16:
155; GFX10:       ; %bb.0:
156; GFX10-NEXT:    s_clause 0x1
157; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
158; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
159; GFX10-NEXT:    v_mov_b32_e32 v0, 0
160; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
161; GFX10-NEXT:    s_clause 0x1
162; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
163; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
164; GFX10-NEXT:    s_waitcnt vmcnt(1)
165; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
166; GFX10-NEXT:    s_waitcnt vmcnt(0)
167; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
168; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
169; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
170; GFX10-NEXT:    v_mad_f32 v7, -v4, v6, v3
171; GFX10-NEXT:    v_mac_f32_e32 v6, v7, v5
172; GFX10-NEXT:    v_mad_f32 v3, -v4, v6, v3
173; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v5
174; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
175; GFX10-NEXT:    v_add_f32_e32 v3, v3, v6
176; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
177; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
178; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
179; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
180; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
181; GFX10-NEXT:    s_endpgm
182;
183; GFX11-LABEL: frem_f16:
184; GFX11:       ; %bb.0:
185; GFX11-NEXT:    s_clause 0x1
186; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
187; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
188; GFX11-NEXT:    v_mov_b32_e32 v0, 0
189; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX11-NEXT:    s_clause 0x1
191; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
192; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
193; GFX11-NEXT:    s_waitcnt vmcnt(1)
194; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
195; GFX11-NEXT:    s_waitcnt vmcnt(0)
196; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
197; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
198; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
199; GFX11-NEXT:    s_waitcnt_depctr 0xfff
200; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
201; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
202; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
203; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v4
204; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
205; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
206; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
207; GFX11-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
208; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
209; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
210; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
211; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
212; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
213; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
214; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
215; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
216; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
217; GFX11-NEXT:    s_endpgm
218;
219; GFX1150-LABEL: frem_f16:
220; GFX1150:       ; %bb.0:
221; GFX1150-NEXT:    s_clause 0x1
222; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
223; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
224; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
225; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
226; GFX1150-NEXT:    s_clause 0x1
227; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
228; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
229; GFX1150-NEXT:    s_waitcnt vmcnt(1)
230; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v1
231; GFX1150-NEXT:    s_waitcnt vmcnt(0)
232; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v2
233; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
234; GFX1150-NEXT:    v_rcp_f32_e32 v4, v4
235; GFX1150-NEXT:    v_mul_f32_e32 v3, v3, v4
236; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
237; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
238; GFX1150-NEXT:    v_fmac_f32_e32 v3, v5, v4
239; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
240; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
241; GFX1150-NEXT:    v_mul_f32_e32 v4, v5, v4
242; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
243; GFX1150-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
244; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
245; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
246; GFX1150-NEXT:    v_cvt_f16_f32_e32 v3, v3
247; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
248; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
249; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
250; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
251; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
252; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
253; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
254; GFX1150-NEXT:    s_endpgm
255                      ptr addrspace(1) %in2) #0 {
; The divisor is read from %in2 at half-element index 4; the checks above
; expect that access as an 8-byte offset (offset:8, or s_add_u32 ..., 8 on VI).
256   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
257   %r0 = load half, ptr addrspace(1) %in1, align 4
258   %r1 = load half, ptr addrspace(1) %gep2, align 4
; Operation under test: frem with no fast-math flags.
259   %r2 = frem half %r0, %r1
260   store half %r2, ptr addrspace(1) %out, align 4
261   ret void
262}
263
; fast_frem_f16: same load/store shape as a plain half frem kernel (dividend
; from %in1, divisor from element 4 of %in2, result stored to %out), but the
; instruction under test carries the `fast` flag.
;
; The autogenerated checks below expect a reciprocal-multiply expansion with no
; v_div_scale/v_div_fmas/v_div_fixup: rcp, mul, trunc, then fma. The SI/CI
; checks do this in f32 (v_cvt_f32_f16 ... v_cvt_f16_f32); VI, GFX9, GFX10 and
; GFX11 expect v_rcp_f16/v_mul_f16/v_trunc_f16/v_fma_f16; GFX1150 expects the
; final step as v_xor_b32 0x8000 followed by v_fmac_f16.
; NOTE(review): attribute group #0 is defined outside this excerpt -- confirm
; its contents before relying on it.
264define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
265; SI-LABEL: fast_frem_f16:
266; SI:       ; %bb.0:
267; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
268; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
269; SI-NEXT:    s_mov_b32 s11, 0xf000
270; SI-NEXT:    s_mov_b32 s10, -1
271; SI-NEXT:    s_waitcnt lgkmcnt(0)
272; SI-NEXT:    s_mov_b32 s8, s0
273; SI-NEXT:    s_mov_b32 s9, s1
274; SI-NEXT:    s_mov_b32 s0, s2
275; SI-NEXT:    s_mov_b32 s1, s3
276; SI-NEXT:    s_mov_b32 s2, s10
277; SI-NEXT:    s_mov_b32 s3, s11
278; SI-NEXT:    s_mov_b32 s6, s10
279; SI-NEXT:    s_mov_b32 s7, s11
280; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
281; SI-NEXT:    s_waitcnt vmcnt(0)
282; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
283; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
284; SI-NEXT:    s_waitcnt vmcnt(0)
285; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
286; SI-NEXT:    v_rcp_f32_e32 v2, v1
287; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
288; SI-NEXT:    v_trunc_f32_e32 v2, v2
289; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
290; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
291; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
292; SI-NEXT:    s_endpgm
293;
294; CI-LABEL: fast_frem_f16:
295; CI:       ; %bb.0:
296; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
297; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
298; CI-NEXT:    s_mov_b32 s11, 0xf000
299; CI-NEXT:    s_mov_b32 s10, -1
300; CI-NEXT:    s_mov_b32 s6, s10
301; CI-NEXT:    s_mov_b32 s7, s11
302; CI-NEXT:    s_waitcnt lgkmcnt(0)
303; CI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
304; CI-NEXT:    s_mov_b32 s8, s0
305; CI-NEXT:    s_mov_b32 s9, s1
306; CI-NEXT:    s_mov_b32 s0, s2
307; CI-NEXT:    s_mov_b32 s1, s3
308; CI-NEXT:    s_mov_b32 s2, s10
309; CI-NEXT:    s_mov_b32 s3, s11
310; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
311; CI-NEXT:    s_waitcnt vmcnt(1)
312; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
313; CI-NEXT:    v_rcp_f32_e32 v2, v1
314; CI-NEXT:    s_waitcnt vmcnt(0)
315; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
316; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
317; CI-NEXT:    v_trunc_f32_e32 v2, v2
318; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
319; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
320; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
321; CI-NEXT:    s_endpgm
322;
323; VI-LABEL: fast_frem_f16:
324; VI:       ; %bb.0:
325; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
326; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
327; VI-NEXT:    s_waitcnt lgkmcnt(0)
328; VI-NEXT:    v_mov_b32_e32 v0, s0
329; VI-NEXT:    s_add_u32 s0, s4, 8
330; VI-NEXT:    v_mov_b32_e32 v1, s1
331; VI-NEXT:    v_mov_b32_e32 v2, s2
332; VI-NEXT:    v_mov_b32_e32 v3, s3
333; VI-NEXT:    s_addc_u32 s1, s5, 0
334; VI-NEXT:    flat_load_ushort v4, v[2:3]
335; VI-NEXT:    v_mov_b32_e32 v3, s1
336; VI-NEXT:    v_mov_b32_e32 v2, s0
337; VI-NEXT:    flat_load_ushort v2, v[2:3]
338; VI-NEXT:    s_waitcnt vmcnt(0)
339; VI-NEXT:    v_rcp_f16_e32 v3, v2
340; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
341; VI-NEXT:    v_trunc_f16_e32 v3, v3
342; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
343; VI-NEXT:    flat_store_short v[0:1], v2
344; VI-NEXT:    s_endpgm
345;
346; GFX9-LABEL: fast_frem_f16:
347; GFX9:       ; %bb.0:
348; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
349; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
350; GFX9-NEXT:    v_mov_b32_e32 v0, 0
351; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
352; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
353; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
354; GFX9-NEXT:    s_waitcnt vmcnt(0)
355; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
356; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
357; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
358; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
359; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
360; GFX9-NEXT:    s_endpgm
361;
362; GFX10-LABEL: fast_frem_f16:
363; GFX10:       ; %bb.0:
364; GFX10-NEXT:    s_clause 0x1
365; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
366; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
367; GFX10-NEXT:    v_mov_b32_e32 v0, 0
368; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX10-NEXT:    s_clause 0x1
370; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
371; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
372; GFX10-NEXT:    s_waitcnt vmcnt(0)
373; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
374; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
375; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
376; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
377; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
378; GFX10-NEXT:    s_endpgm
379;
380; GFX11-LABEL: fast_frem_f16:
381; GFX11:       ; %bb.0:
382; GFX11-NEXT:    s_clause 0x1
383; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
384; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
385; GFX11-NEXT:    v_mov_b32_e32 v0, 0
386; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
387; GFX11-NEXT:    s_clause 0x1
388; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
389; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
390; GFX11-NEXT:    s_waitcnt vmcnt(0)
391; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
392; GFX11-NEXT:    s_waitcnt_depctr 0xfff
393; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
394; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
395; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
396; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
397; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
398; GFX11-NEXT:    s_endpgm
399;
400; GFX1150-LABEL: fast_frem_f16:
401; GFX1150:       ; %bb.0:
402; GFX1150-NEXT:    s_clause 0x1
403; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
404; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
405; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
406; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
407; GFX1150-NEXT:    s_clause 0x1
408; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
409; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
410; GFX1150-NEXT:    s_waitcnt vmcnt(0)
411; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
412; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
413; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
414; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
415; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
416; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
417; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
418; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
419; GFX1150-NEXT:    s_endpgm
420                      ptr addrspace(1) %in2) #0 {
; The divisor is read from %in2 at half-element index 4; the checks above
; expect that access as an 8-byte offset (offset:8, or s_add_u32 ..., 8 on VI).
421   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
422   %r0 = load half, ptr addrspace(1) %in1, align 4
423   %r1 = load half, ptr addrspace(1) %gep2, align 4
; Operation under test: frem carrying the `fast` flag.
424   %r2 = frem fast half %r0, %r1
425   store half %r2, ptr addrspace(1) %out, align 4
426   ret void
427}
428
; unsafe_frem_f16: half frem kernel (dividend from %in1, divisor from element 4
; of %in2, result stored to %out) where the instruction under test carries only
; the `afn` flag and the function uses attribute group #1 instead of #0.
;
; The autogenerated checks below expect a reciprocal-multiply expansion with no
; v_div_scale/v_div_fmas/v_div_fixup: rcp, mul, trunc, then fma. The SI/CI
; checks do this in f32 (v_cvt_f32_f16 ... v_cvt_f16_f32); VI, GFX9, GFX10 and
; GFX11 expect v_rcp_f16/v_mul_f16/v_trunc_f16/v_fma_f16; GFX1150 expects the
; final step as v_xor_b32 0x8000 followed by v_fmac_f16.
; NOTE(review): attribute group #1 is defined outside this excerpt -- presumably
; it enables an unsafe/approximate FP mode given the function name; confirm.
429define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
430; SI-LABEL: unsafe_frem_f16:
431; SI:       ; %bb.0:
432; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
433; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
434; SI-NEXT:    s_mov_b32 s11, 0xf000
435; SI-NEXT:    s_mov_b32 s10, -1
436; SI-NEXT:    s_waitcnt lgkmcnt(0)
437; SI-NEXT:    s_mov_b32 s8, s0
438; SI-NEXT:    s_mov_b32 s9, s1
439; SI-NEXT:    s_mov_b32 s0, s2
440; SI-NEXT:    s_mov_b32 s1, s3
441; SI-NEXT:    s_mov_b32 s2, s10
442; SI-NEXT:    s_mov_b32 s3, s11
443; SI-NEXT:    s_mov_b32 s6, s10
444; SI-NEXT:    s_mov_b32 s7, s11
445; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
446; SI-NEXT:    s_waitcnt vmcnt(0)
447; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
448; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
449; SI-NEXT:    s_waitcnt vmcnt(0)
450; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
451; SI-NEXT:    v_rcp_f32_e32 v2, v1
452; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
453; SI-NEXT:    v_trunc_f32_e32 v2, v2
454; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
455; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
456; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
457; SI-NEXT:    s_endpgm
458;
459; CI-LABEL: unsafe_frem_f16:
460; CI:       ; %bb.0:
461; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
462; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
463; CI-NEXT:    s_mov_b32 s11, 0xf000
464; CI-NEXT:    s_mov_b32 s10, -1
465; CI-NEXT:    s_mov_b32 s6, s10
466; CI-NEXT:    s_mov_b32 s7, s11
467; CI-NEXT:    s_waitcnt lgkmcnt(0)
468; CI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
469; CI-NEXT:    s_mov_b32 s8, s0
470; CI-NEXT:    s_mov_b32 s9, s1
471; CI-NEXT:    s_mov_b32 s0, s2
472; CI-NEXT:    s_mov_b32 s1, s3
473; CI-NEXT:    s_mov_b32 s2, s10
474; CI-NEXT:    s_mov_b32 s3, s11
475; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
476; CI-NEXT:    s_waitcnt vmcnt(1)
477; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
478; CI-NEXT:    v_rcp_f32_e32 v2, v1
479; CI-NEXT:    s_waitcnt vmcnt(0)
480; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
481; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
482; CI-NEXT:    v_trunc_f32_e32 v2, v2
483; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
484; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
485; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
486; CI-NEXT:    s_endpgm
487;
488; VI-LABEL: unsafe_frem_f16:
489; VI:       ; %bb.0:
490; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
491; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
492; VI-NEXT:    s_waitcnt lgkmcnt(0)
493; VI-NEXT:    v_mov_b32_e32 v0, s0
494; VI-NEXT:    s_add_u32 s0, s4, 8
495; VI-NEXT:    v_mov_b32_e32 v1, s1
496; VI-NEXT:    v_mov_b32_e32 v2, s2
497; VI-NEXT:    v_mov_b32_e32 v3, s3
498; VI-NEXT:    s_addc_u32 s1, s5, 0
499; VI-NEXT:    flat_load_ushort v4, v[2:3]
500; VI-NEXT:    v_mov_b32_e32 v3, s1
501; VI-NEXT:    v_mov_b32_e32 v2, s0
502; VI-NEXT:    flat_load_ushort v2, v[2:3]
503; VI-NEXT:    s_waitcnt vmcnt(0)
504; VI-NEXT:    v_rcp_f16_e32 v3, v2
505; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
506; VI-NEXT:    v_trunc_f16_e32 v3, v3
507; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
508; VI-NEXT:    flat_store_short v[0:1], v2
509; VI-NEXT:    s_endpgm
510;
511; GFX9-LABEL: unsafe_frem_f16:
512; GFX9:       ; %bb.0:
513; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
514; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
515; GFX9-NEXT:    v_mov_b32_e32 v0, 0
516; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
517; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
518; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
519; GFX9-NEXT:    s_waitcnt vmcnt(0)
520; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
521; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
522; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
523; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
524; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
525; GFX9-NEXT:    s_endpgm
526;
527; GFX10-LABEL: unsafe_frem_f16:
528; GFX10:       ; %bb.0:
529; GFX10-NEXT:    s_clause 0x1
530; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
531; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
532; GFX10-NEXT:    v_mov_b32_e32 v0, 0
533; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX10-NEXT:    s_clause 0x1
535; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
536; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
537; GFX10-NEXT:    s_waitcnt vmcnt(0)
538; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
539; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
540; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
541; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
542; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
543; GFX10-NEXT:    s_endpgm
544;
545; GFX11-LABEL: unsafe_frem_f16:
546; GFX11:       ; %bb.0:
547; GFX11-NEXT:    s_clause 0x1
548; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
549; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
550; GFX11-NEXT:    v_mov_b32_e32 v0, 0
551; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
552; GFX11-NEXT:    s_clause 0x1
553; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
554; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
555; GFX11-NEXT:    s_waitcnt vmcnt(0)
556; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
557; GFX11-NEXT:    s_waitcnt_depctr 0xfff
558; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
559; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
560; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
561; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
562; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
563; GFX11-NEXT:    s_endpgm
564;
565; GFX1150-LABEL: unsafe_frem_f16:
566; GFX1150:       ; %bb.0:
567; GFX1150-NEXT:    s_clause 0x1
568; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
569; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
570; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
571; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
572; GFX1150-NEXT:    s_clause 0x1
573; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
574; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
575; GFX1150-NEXT:    s_waitcnt vmcnt(0)
576; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
577; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
578; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
579; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
580; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
581; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
582; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
583; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
584; GFX1150-NEXT:    s_endpgm
585                             ptr addrspace(1) %in2) #1 {
; The divisor is read from %in2 at half-element index 4; the checks above
; expect that access as an 8-byte offset (offset:8, or s_add_u32 ..., 8 on VI).
586   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
587   %r0 = load half, ptr addrspace(1) %in1, align 4
588   %r1 = load half, ptr addrspace(1) %gep2, align 4
; Operation under test: frem carrying only the `afn` flag.
589   %r2 = frem afn half %r0, %r1
590   store half %r2, ptr addrspace(1) %out, align 4
591   ret void
592}
593
594define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
595; SI-LABEL: frem_f32:
596; SI:       ; %bb.0:
597; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
598; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
599; SI-NEXT:    s_mov_b32 s11, 0xf000
600; SI-NEXT:    s_mov_b32 s10, -1
601; SI-NEXT:    s_waitcnt lgkmcnt(0)
602; SI-NEXT:    s_mov_b32 s8, s0
603; SI-NEXT:    s_mov_b32 s9, s1
604; SI-NEXT:    s_mov_b32 s0, s2
605; SI-NEXT:    s_mov_b32 s1, s3
606; SI-NEXT:    s_mov_b32 s2, s10
607; SI-NEXT:    s_mov_b32 s3, s11
608; SI-NEXT:    s_mov_b32 s6, s10
609; SI-NEXT:    s_mov_b32 s7, s11
610; SI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
611; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
612; SI-NEXT:    s_waitcnt vmcnt(0)
613; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
614; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
615; SI-NEXT:    v_rcp_f32_e32 v4, v3
616; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
617; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
618; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
619; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
620; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
621; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
622; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
623; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
624; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
625; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
626; SI-NEXT:    v_trunc_f32_e32 v2, v2
627; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
628; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
629; SI-NEXT:    s_endpgm
630;
631; CI-LABEL: frem_f32:
632; CI:       ; %bb.0:
633; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
634; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
635; CI-NEXT:    s_mov_b32 s11, 0xf000
636; CI-NEXT:    s_mov_b32 s10, -1
637; CI-NEXT:    s_mov_b32 s6, s10
638; CI-NEXT:    s_waitcnt lgkmcnt(0)
639; CI-NEXT:    s_mov_b32 s8, s0
640; CI-NEXT:    s_mov_b32 s9, s1
641; CI-NEXT:    s_mov_b32 s0, s2
642; CI-NEXT:    s_mov_b32 s1, s3
643; CI-NEXT:    s_mov_b32 s2, s10
644; CI-NEXT:    s_mov_b32 s3, s11
645; CI-NEXT:    s_mov_b32 s7, s11
646; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
647; CI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
648; CI-NEXT:    s_waitcnt vmcnt(0)
649; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
650; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
651; CI-NEXT:    v_rcp_f32_e32 v4, v3
652; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
653; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
654; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
655; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
656; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
657; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
658; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
659; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
660; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
661; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
662; CI-NEXT:    v_trunc_f32_e32 v2, v2
663; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
664; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
665; CI-NEXT:    s_endpgm
666;
667; VI-LABEL: frem_f32:
668; VI:       ; %bb.0:
669; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
670; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
671; VI-NEXT:    s_waitcnt lgkmcnt(0)
672; VI-NEXT:    v_mov_b32_e32 v0, s0
673; VI-NEXT:    s_add_u32 s0, s4, 16
674; VI-NEXT:    v_mov_b32_e32 v1, s1
675; VI-NEXT:    v_mov_b32_e32 v2, s2
676; VI-NEXT:    v_mov_b32_e32 v3, s3
677; VI-NEXT:    s_addc_u32 s1, s5, 0
678; VI-NEXT:    flat_load_dword v4, v[2:3]
679; VI-NEXT:    v_mov_b32_e32 v3, s1
680; VI-NEXT:    v_mov_b32_e32 v2, s0
681; VI-NEXT:    flat_load_dword v2, v[2:3]
682; VI-NEXT:    s_waitcnt vmcnt(0)
683; VI-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v4
684; VI-NEXT:    v_div_scale_f32 v3, vcc, v4, v2, v4
685; VI-NEXT:    v_rcp_f32_e32 v6, v5
686; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
687; VI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
688; VI-NEXT:    v_fma_f32 v6, v7, v6, v6
689; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
690; VI-NEXT:    v_fma_f32 v8, -v5, v7, v3
691; VI-NEXT:    v_fma_f32 v7, v8, v6, v7
692; VI-NEXT:    v_fma_f32 v3, -v5, v7, v3
693; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
694; VI-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
695; VI-NEXT:    v_div_fixup_f32 v3, v3, v2, v4
696; VI-NEXT:    v_trunc_f32_e32 v3, v3
697; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
698; VI-NEXT:    flat_store_dword v[0:1], v2
699; VI-NEXT:    s_endpgm
700;
701; GFX9-LABEL: frem_f32:
702; GFX9:       ; %bb.0:
703; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
704; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
705; GFX9-NEXT:    v_mov_b32_e32 v0, 0
706; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
708; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
709; GFX9-NEXT:    s_waitcnt vmcnt(0)
710; GFX9-NEXT:    v_div_scale_f32 v4, s[2:3], v2, v2, v1
711; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v1, v2, v1
712; GFX9-NEXT:    v_rcp_f32_e32 v5, v4
713; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
714; GFX9-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
715; GFX9-NEXT:    v_fma_f32 v5, v6, v5, v5
716; GFX9-NEXT:    v_mul_f32_e32 v6, v3, v5
717; GFX9-NEXT:    v_fma_f32 v7, -v4, v6, v3
718; GFX9-NEXT:    v_fma_f32 v6, v7, v5, v6
719; GFX9-NEXT:    v_fma_f32 v3, -v4, v6, v3
720; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
721; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
722; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
723; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
724; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
725; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
726; GFX9-NEXT:    s_endpgm
727;
728; GFX10-LABEL: frem_f32:
729; GFX10:       ; %bb.0:
730; GFX10-NEXT:    s_clause 0x1
731; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
732; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
733; GFX10-NEXT:    v_mov_b32_e32 v0, 0
734; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
735; GFX10-NEXT:    s_clause 0x1
736; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
737; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
738; GFX10-NEXT:    s_waitcnt vmcnt(0)
739; GFX10-NEXT:    v_div_scale_f32 v4, s2, v2, v2, v1
740; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
741; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
742; GFX10-NEXT:    s_denorm_mode 15
743; GFX10-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
744; GFX10-NEXT:    v_fmac_f32_e32 v5, v6, v5
745; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
746; GFX10-NEXT:    v_fma_f32 v7, -v4, v6, v3
747; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v5
748; GFX10-NEXT:    v_fma_f32 v3, -v4, v6, v3
749; GFX10-NEXT:    s_denorm_mode 12
750; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
751; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
752; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
753; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
754; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
755; GFX10-NEXT:    s_endpgm
756;
757; GFX11-LABEL: frem_f32:
758; GFX11:       ; %bb.0:
759; GFX11-NEXT:    s_clause 0x1
760; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
761; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
762; GFX11-NEXT:    v_mov_b32_e32 v0, 0
763; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX11-NEXT:    s_clause 0x1
765; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
766; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
767; GFX11-NEXT:    s_waitcnt vmcnt(0)
768; GFX11-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
769; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
770; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
771; GFX11-NEXT:    v_rcp_f32_e32 v5, v4
772; GFX11-NEXT:    s_denorm_mode 15
773; GFX11-NEXT:    s_waitcnt_depctr 0xfff
774; GFX11-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
775; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v5
776; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
777; GFX11-NEXT:    v_mul_f32_e32 v6, v3, v5
778; GFX11-NEXT:    v_fma_f32 v7, -v4, v6, v3
779; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
780; GFX11-NEXT:    v_fmac_f32_e32 v6, v7, v5
781; GFX11-NEXT:    v_fma_f32 v3, -v4, v6, v3
782; GFX11-NEXT:    s_denorm_mode 12
783; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
784; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
785; GFX11-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
786; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
787; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
788; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
789; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
790; GFX11-NEXT:    s_endpgm
791;
792; GFX1150-LABEL: frem_f32:
793; GFX1150:       ; %bb.0:
794; GFX1150-NEXT:    s_clause 0x1
795; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
796; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
797; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
798; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
799; GFX1150-NEXT:    s_clause 0x1
800; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
801; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
802; GFX1150-NEXT:    s_waitcnt vmcnt(0)
803; GFX1150-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
804; GFX1150-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
805; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
806; GFX1150-NEXT:    v_rcp_f32_e32 v5, v4
807; GFX1150-NEXT:    s_denorm_mode 15
808; GFX1150-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
809; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
810; GFX1150-NEXT:    v_fmac_f32_e32 v5, v6, v5
811; GFX1150-NEXT:    v_mul_f32_e32 v6, v3, v5
812; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
813; GFX1150-NEXT:    v_fma_f32 v7, -v4, v6, v3
814; GFX1150-NEXT:    v_fmac_f32_e32 v6, v7, v5
815; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
816; GFX1150-NEXT:    v_fma_f32 v3, -v4, v6, v3
817; GFX1150-NEXT:    s_denorm_mode 12
818; GFX1150-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
819; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
820; GFX1150-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
821; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
822; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
823; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
824; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
825; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
826; GFX1150-NEXT:    s_endpgm
827                      ptr addrspace(1) %in2) #0 {
828   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
829   %r0 = load float, ptr addrspace(1) %in1, align 4
830   %r1 = load float, ptr addrspace(1) %gep2, align 4
831   %r2 = frem float %r0, %r1
832   store float %r2, ptr addrspace(1) %out, align 4
833   ret void
834}
835
; fast_frem_f32: f32 frem carrying the `fast` flag. The dividend is loaded from
; the base of %in1, the divisor from float element 4 of %in2, and the remainder
; is stored to %out.
; For every check prefix the expected code is v_rcp_f32 -> v_mul_f32 ->
; v_trunc_f32 -> v_fma_f32, with no v_div_scale / v_div_fmas / v_div_fixup
; sequence. The GFX1150 checks differ only at the end, where the truncated
; quotient is sign-flipped with v_xor_b32 and accumulated with v_fmac_f32.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them rather than editing them by hand.
836define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
837; SI-LABEL: fast_frem_f32:
838; SI:       ; %bb.0:
839; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
840; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
841; SI-NEXT:    s_mov_b32 s11, 0xf000
842; SI-NEXT:    s_mov_b32 s10, -1
843; SI-NEXT:    s_waitcnt lgkmcnt(0)
844; SI-NEXT:    s_mov_b32 s8, s0
845; SI-NEXT:    s_mov_b32 s9, s1
846; SI-NEXT:    s_mov_b32 s0, s2
847; SI-NEXT:    s_mov_b32 s1, s3
848; SI-NEXT:    s_mov_b32 s2, s10
849; SI-NEXT:    s_mov_b32 s3, s11
850; SI-NEXT:    s_mov_b32 s6, s10
851; SI-NEXT:    s_mov_b32 s7, s11
852; SI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
853; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
854; SI-NEXT:    s_waitcnt vmcnt(0)
855; SI-NEXT:    v_rcp_f32_e32 v2, v1
856; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
857; SI-NEXT:    v_trunc_f32_e32 v2, v2
858; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
859; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
860; SI-NEXT:    s_endpgm
861;
862; CI-LABEL: fast_frem_f32:
863; CI:       ; %bb.0:
864; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
865; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
866; CI-NEXT:    s_mov_b32 s11, 0xf000
867; CI-NEXT:    s_mov_b32 s10, -1
868; CI-NEXT:    s_mov_b32 s6, s10
869; CI-NEXT:    s_waitcnt lgkmcnt(0)
870; CI-NEXT:    s_mov_b32 s8, s0
871; CI-NEXT:    s_mov_b32 s9, s1
872; CI-NEXT:    s_mov_b32 s0, s2
873; CI-NEXT:    s_mov_b32 s1, s3
874; CI-NEXT:    s_mov_b32 s2, s10
875; CI-NEXT:    s_mov_b32 s3, s11
876; CI-NEXT:    s_mov_b32 s7, s11
877; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
878; CI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
879; CI-NEXT:    s_waitcnt vmcnt(0)
880; CI-NEXT:    v_rcp_f32_e32 v2, v1
881; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
882; CI-NEXT:    v_trunc_f32_e32 v2, v2
883; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
884; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
885; CI-NEXT:    s_endpgm
886;
887; VI-LABEL: fast_frem_f32:
888; VI:       ; %bb.0:
889; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
890; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
891; VI-NEXT:    s_waitcnt lgkmcnt(0)
892; VI-NEXT:    v_mov_b32_e32 v0, s0
893; VI-NEXT:    s_add_u32 s0, s4, 16
894; VI-NEXT:    v_mov_b32_e32 v1, s1
895; VI-NEXT:    v_mov_b32_e32 v2, s2
896; VI-NEXT:    v_mov_b32_e32 v3, s3
897; VI-NEXT:    s_addc_u32 s1, s5, 0
898; VI-NEXT:    flat_load_dword v4, v[2:3]
899; VI-NEXT:    v_mov_b32_e32 v3, s1
900; VI-NEXT:    v_mov_b32_e32 v2, s0
901; VI-NEXT:    flat_load_dword v2, v[2:3]
902; VI-NEXT:    s_waitcnt vmcnt(0)
903; VI-NEXT:    v_rcp_f32_e32 v3, v2
904; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
905; VI-NEXT:    v_trunc_f32_e32 v3, v3
906; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
907; VI-NEXT:    flat_store_dword v[0:1], v2
908; VI-NEXT:    s_endpgm
909;
910; GFX9-LABEL: fast_frem_f32:
911; GFX9:       ; %bb.0:
912; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
913; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
914; GFX9-NEXT:    v_mov_b32_e32 v0, 0
915; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
917; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
918; GFX9-NEXT:    s_waitcnt vmcnt(0)
919; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
920; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
921; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
922; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
923; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
924; GFX9-NEXT:    s_endpgm
925;
926; GFX10-LABEL: fast_frem_f32:
927; GFX10:       ; %bb.0:
928; GFX10-NEXT:    s_clause 0x1
929; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
930; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
931; GFX10-NEXT:    v_mov_b32_e32 v0, 0
932; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
933; GFX10-NEXT:    s_clause 0x1
934; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
935; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
936; GFX10-NEXT:    s_waitcnt vmcnt(0)
937; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
938; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
939; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
940; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
941; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
942; GFX10-NEXT:    s_endpgm
943;
944; GFX11-LABEL: fast_frem_f32:
945; GFX11:       ; %bb.0:
946; GFX11-NEXT:    s_clause 0x1
947; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
948; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
949; GFX11-NEXT:    v_mov_b32_e32 v0, 0
950; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX11-NEXT:    s_clause 0x1
952; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
953; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
954; GFX11-NEXT:    s_waitcnt vmcnt(0)
955; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
956; GFX11-NEXT:    s_waitcnt_depctr 0xfff
957; GFX11-NEXT:    v_mul_f32_e32 v3, v1, v3
958; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
959; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
960; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
961; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
962; GFX11-NEXT:    s_endpgm
963;
964; GFX1150-LABEL: fast_frem_f32:
965; GFX1150:       ; %bb.0:
966; GFX1150-NEXT:    s_clause 0x1
967; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
968; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
969; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
970; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
971; GFX1150-NEXT:    s_clause 0x1
972; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
973; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
974; GFX1150-NEXT:    s_waitcnt vmcnt(0)
975; GFX1150-NEXT:    v_rcp_f32_e32 v3, v2
976; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
977; GFX1150-NEXT:    v_mul_f32_e32 v3, v1, v3
978; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
979; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
980; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
981; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
982; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
983; GFX1150-NEXT:    s_endpgm
984                      ptr addrspace(1) %in2) #0 {
  ; The divisor lives 4 floats (16 bytes) past %in2; the checked code shows this
  ; as a 16-byte displacement on the second load (or an s_add_u32 of 16).
985   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
986   %r0 = load float, ptr addrspace(1) %in1, align 4
987   %r1 = load float, ptr addrspace(1) %gep2, align 4
  ; `fast` is what permits the reciprocal-based expansion expected above.
988   %r2 = frem fast float %r0, %r1
989   store float %r2, ptr addrspace(1) %out, align 4
990   ret void
991}
992
; unsafe_frem_f32: f32 frem carrying only the `afn` flag, in a kernel that uses
; attribute group #1 (defined outside this part of the file; NOTE(review):
; confirm what it enables). The dividend is loaded from the base of %in1, the
; divisor from float element 4 of %in2, and the remainder is stored to %out.
; The expected code for every check prefix is the short v_rcp_f32 ->
; v_mul_f32 -> v_trunc_f32 -> v_fma_f32 form (v_xor_b32 + v_fmac_f32 at the end
; for the GFX1150 checks); no v_div_scale / v_div_fmas / v_div_fixup appears.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them rather than editing them by hand.
993define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
994; SI-LABEL: unsafe_frem_f32:
995; SI:       ; %bb.0:
996; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
997; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
998; SI-NEXT:    s_mov_b32 s11, 0xf000
999; SI-NEXT:    s_mov_b32 s10, -1
1000; SI-NEXT:    s_waitcnt lgkmcnt(0)
1001; SI-NEXT:    s_mov_b32 s8, s0
1002; SI-NEXT:    s_mov_b32 s9, s1
1003; SI-NEXT:    s_mov_b32 s0, s2
1004; SI-NEXT:    s_mov_b32 s1, s3
1005; SI-NEXT:    s_mov_b32 s2, s10
1006; SI-NEXT:    s_mov_b32 s3, s11
1007; SI-NEXT:    s_mov_b32 s6, s10
1008; SI-NEXT:    s_mov_b32 s7, s11
1009; SI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
1010; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
1011; SI-NEXT:    s_waitcnt vmcnt(0)
1012; SI-NEXT:    v_rcp_f32_e32 v2, v1
1013; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
1014; SI-NEXT:    v_trunc_f32_e32 v2, v2
1015; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
1016; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1017; SI-NEXT:    s_endpgm
1018;
1019; CI-LABEL: unsafe_frem_f32:
1020; CI:       ; %bb.0:
1021; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1022; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1023; CI-NEXT:    s_mov_b32 s11, 0xf000
1024; CI-NEXT:    s_mov_b32 s10, -1
1025; CI-NEXT:    s_mov_b32 s6, s10
1026; CI-NEXT:    s_waitcnt lgkmcnt(0)
1027; CI-NEXT:    s_mov_b32 s8, s0
1028; CI-NEXT:    s_mov_b32 s9, s1
1029; CI-NEXT:    s_mov_b32 s0, s2
1030; CI-NEXT:    s_mov_b32 s1, s3
1031; CI-NEXT:    s_mov_b32 s2, s10
1032; CI-NEXT:    s_mov_b32 s3, s11
1033; CI-NEXT:    s_mov_b32 s7, s11
1034; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
1035; CI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
1036; CI-NEXT:    s_waitcnt vmcnt(0)
1037; CI-NEXT:    v_rcp_f32_e32 v2, v1
1038; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
1039; CI-NEXT:    v_trunc_f32_e32 v2, v2
1040; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
1041; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1042; CI-NEXT:    s_endpgm
1043;
1044; VI-LABEL: unsafe_frem_f32:
1045; VI:       ; %bb.0:
1046; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1047; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1048; VI-NEXT:    s_waitcnt lgkmcnt(0)
1049; VI-NEXT:    v_mov_b32_e32 v0, s0
1050; VI-NEXT:    s_add_u32 s0, s4, 16
1051; VI-NEXT:    v_mov_b32_e32 v1, s1
1052; VI-NEXT:    v_mov_b32_e32 v2, s2
1053; VI-NEXT:    v_mov_b32_e32 v3, s3
1054; VI-NEXT:    s_addc_u32 s1, s5, 0
1055; VI-NEXT:    flat_load_dword v4, v[2:3]
1056; VI-NEXT:    v_mov_b32_e32 v3, s1
1057; VI-NEXT:    v_mov_b32_e32 v2, s0
1058; VI-NEXT:    flat_load_dword v2, v[2:3]
1059; VI-NEXT:    s_waitcnt vmcnt(0)
1060; VI-NEXT:    v_rcp_f32_e32 v3, v2
1061; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
1062; VI-NEXT:    v_trunc_f32_e32 v3, v3
1063; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
1064; VI-NEXT:    flat_store_dword v[0:1], v2
1065; VI-NEXT:    s_endpgm
1066;
1067; GFX9-LABEL: unsafe_frem_f32:
1068; GFX9:       ; %bb.0:
1069; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1070; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1071; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1072; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1073; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1074; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
1075; GFX9-NEXT:    s_waitcnt vmcnt(0)
1076; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
1077; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
1078; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
1079; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
1080; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1081; GFX9-NEXT:    s_endpgm
1082;
1083; GFX10-LABEL: unsafe_frem_f32:
1084; GFX10:       ; %bb.0:
1085; GFX10-NEXT:    s_clause 0x1
1086; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1087; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1088; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1089; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1090; GFX10-NEXT:    s_clause 0x1
1091; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1092; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
1093; GFX10-NEXT:    s_waitcnt vmcnt(0)
1094; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
1095; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
1096; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
1097; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
1098; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1099; GFX10-NEXT:    s_endpgm
1100;
1101; GFX11-LABEL: unsafe_frem_f32:
1102; GFX11:       ; %bb.0:
1103; GFX11-NEXT:    s_clause 0x1
1104; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1105; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1106; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1107; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1108; GFX11-NEXT:    s_clause 0x1
1109; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1110; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
1111; GFX11-NEXT:    s_waitcnt vmcnt(0)
1112; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
1113; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1114; GFX11-NEXT:    v_mul_f32_e32 v3, v1, v3
1115; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1116; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
1117; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
1118; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1119; GFX11-NEXT:    s_endpgm
1120;
1121; GFX1150-LABEL: unsafe_frem_f32:
1122; GFX1150:       ; %bb.0:
1123; GFX1150-NEXT:    s_clause 0x1
1124; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1125; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1126; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
1127; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
1128; GFX1150-NEXT:    s_clause 0x1
1129; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
1130; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
1131; GFX1150-NEXT:    s_waitcnt vmcnt(0)
1132; GFX1150-NEXT:    v_rcp_f32_e32 v3, v2
1133; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1134; GFX1150-NEXT:    v_mul_f32_e32 v3, v1, v3
1135; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
1136; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1137; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
1138; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
1139; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
1140; GFX1150-NEXT:    s_endpgm
1141                             ptr addrspace(1) %in2) #1 {
  ; The divisor lives 4 floats (16 bytes) past %in2; the checked code shows this
  ; as a 16-byte displacement on the second load (or an s_add_u32 of 16).
1142   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
1143   %r0 = load float, ptr addrspace(1) %in1, align 4
1144   %r1 = load float, ptr addrspace(1) %gep2, align 4
  ; Only `afn` is set on the instruction itself (no other fast-math flags).
1145   %r2 = frem afn float %r0, %r1
1146   store float %r2, ptr addrspace(1) %out, align 4
1147   ret void
1148}
1149
; frem_f64: f64 frem with no fast-math flags. Both operands are loaded from the
; base of %in1 and %in2 (no element offset), and the remainder is stored to %out.
; Every check prefix expects a full division sequence: v_div_scale_f64,
; v_rcp_f64 with v_fma_f64 refinement steps, v_div_fmas_f64 and v_div_fixup_f64,
; then a truncation of the quotient and a final v_fma_f64 with the negated
; truncated quotient.
; The SI checks differ from the others in two visible ways:
;   * vcc for v_div_fmas_f64 is built from two v_cmp_eq_u32 results combined
;     with s_xor_b64, instead of being written by the second v_div_scale_f64;
;   * there is no v_trunc_f64; between v_div_fixup_f64 and the final v_fma_f64
;     they show an integer sequence (s_bfe_u32, s_lshr_b64, v_not/v_and,
;     v_cndmask) - presumably the expanded f64 truncation; confirm against the
;     backend before relying on this.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them rather than editing them by hand.
1150define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1151; SI-LABEL: frem_f64:
1152; SI:       ; %bb.0:
1153; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1154; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1155; SI-NEXT:    s_mov_b32 s7, 0xf000
1156; SI-NEXT:    s_mov_b32 s6, -1
1157; SI-NEXT:    s_waitcnt lgkmcnt(0)
1158; SI-NEXT:    s_mov_b32 s4, s0
1159; SI-NEXT:    s_mov_b32 s5, s1
1160; SI-NEXT:    s_mov_b32 s0, s2
1161; SI-NEXT:    s_mov_b32 s1, s3
1162; SI-NEXT:    s_mov_b32 s2, s6
1163; SI-NEXT:    s_mov_b32 s3, s7
1164; SI-NEXT:    s_mov_b32 s10, s6
1165; SI-NEXT:    s_mov_b32 s11, s7
1166; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
1167; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0
1168; SI-NEXT:    s_waitcnt vmcnt(0)
1169; SI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
1170; SI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1171; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1172; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1173; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1174; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1175; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
1176; SI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1177; SI-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
1178; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1179; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v9
1180; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
1181; SI-NEXT:    s_nop 1
1182; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
1183; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1184; SI-NEXT:    v_readfirstlane_b32 s2, v5
1185; SI-NEXT:    s_bfe_u32 s0, s2, 0xb0014
1186; SI-NEXT:    s_add_i32 s3, s0, 0xfffffc01
1187; SI-NEXT:    s_mov_b32 s1, 0xfffff
1188; SI-NEXT:    s_mov_b32 s0, s6
1189; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s3
1190; SI-NEXT:    v_not_b32_e32 v6, s0
1191; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1192; SI-NEXT:    v_not_b32_e32 v7, s1
1193; SI-NEXT:    v_and_b32_e32 v5, v5, v7
1194; SI-NEXT:    s_and_b32 s0, s2, 0x80000000
1195; SI-NEXT:    s_cmp_lt_i32 s3, 0
1196; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1197; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1198; SI-NEXT:    v_mov_b32_e32 v7, s0
1199; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1200; SI-NEXT:    s_cmp_gt_i32 s3, 51
1201; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1202; SI-NEXT:    v_mov_b32_e32 v7, s2
1203; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1204; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
1205; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1206; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1207; SI-NEXT:    s_endpgm
1208;
1209; CI-LABEL: frem_f64:
1210; CI:       ; %bb.0:
1211; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1212; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1213; CI-NEXT:    s_mov_b32 s11, 0xf000
1214; CI-NEXT:    s_mov_b32 s10, -1
1215; CI-NEXT:    s_mov_b32 s6, s10
1216; CI-NEXT:    s_waitcnt lgkmcnt(0)
1217; CI-NEXT:    s_mov_b32 s8, s0
1218; CI-NEXT:    s_mov_b32 s9, s1
1219; CI-NEXT:    s_mov_b32 s0, s2
1220; CI-NEXT:    s_mov_b32 s1, s3
1221; CI-NEXT:    s_mov_b32 s2, s10
1222; CI-NEXT:    s_mov_b32 s3, s11
1223; CI-NEXT:    s_mov_b32 s7, s11
1224; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
1225; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
1226; CI-NEXT:    s_waitcnt vmcnt(0)
1227; CI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
1228; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1229; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1230; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1231; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1232; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1233; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
1234; CI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1235; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1236; CI-NEXT:    s_nop 1
1237; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1238; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1239; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1240; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1241; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1242; CI-NEXT:    s_endpgm
1243;
1244; VI-LABEL: frem_f64:
1245; VI:       ; %bb.0:
1246; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1247; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1248; VI-NEXT:    s_waitcnt lgkmcnt(0)
1249; VI-NEXT:    v_mov_b32_e32 v2, s2
1250; VI-NEXT:    v_mov_b32_e32 v3, s3
1251; VI-NEXT:    v_mov_b32_e32 v4, s4
1252; VI-NEXT:    v_mov_b32_e32 v5, s5
1253; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1254; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1255; VI-NEXT:    v_mov_b32_e32 v0, s0
1256; VI-NEXT:    v_mov_b32_e32 v1, s1
1257; VI-NEXT:    s_waitcnt vmcnt(0)
1258; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
1259; VI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
1260; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1261; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1262; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1263; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1264; VI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
1265; VI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
1266; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
1267; VI-NEXT:    s_nop 1
1268; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
1269; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
1270; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1271; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1272; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1273; VI-NEXT:    s_endpgm
1274;
1275; GFX9-LABEL: frem_f64:
1276; GFX9:       ; %bb.0:
1277; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1278; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1279; GFX9-NEXT:    v_mov_b32_e32 v12, 0
1280; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1281; GFX9-NEXT:    global_load_dwordx2 v[0:1], v12, s[2:3]
1282; GFX9-NEXT:    global_load_dwordx2 v[2:3], v12, s[6:7]
1283; GFX9-NEXT:    s_waitcnt vmcnt(0)
1284; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], v[0:1]
1285; GFX9-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1286; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1287; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1288; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1289; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1290; GFX9-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
1291; GFX9-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1292; GFX9-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1293; GFX9-NEXT:    s_nop 1
1294; GFX9-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1295; GFX9-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1296; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1297; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1298; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1]
1299; GFX9-NEXT:    s_endpgm
1300;
1301; GFX10-LABEL: frem_f64:
1302; GFX10:       ; %bb.0:
1303; GFX10-NEXT:    s_clause 0x1
1304; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1305; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1306; GFX10-NEXT:    v_mov_b32_e32 v12, 0
1307; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1308; GFX10-NEXT:    s_clause 0x1
1309; GFX10-NEXT:    global_load_dwordx2 v[0:1], v12, s[2:3]
1310; GFX10-NEXT:    global_load_dwordx2 v[2:3], v12, s[6:7]
1311; GFX10-NEXT:    s_waitcnt vmcnt(0)
1312; GFX10-NEXT:    v_div_scale_f64 v[4:5], s2, v[2:3], v[2:3], v[0:1]
1313; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1314; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1315; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1316; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1317; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1318; GFX10-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1319; GFX10-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1320; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1321; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1322; GFX10-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1323; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1324; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1325; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1]
1326; GFX10-NEXT:    s_endpgm
1327;
1328; GFX11-LABEL: frem_f64:
1329; GFX11:       ; %bb.0:
1330; GFX11-NEXT:    s_clause 0x1
1331; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1332; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1333; GFX11-NEXT:    v_mov_b32_e32 v12, 0
1334; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1335; GFX11-NEXT:    s_clause 0x1
1336; GFX11-NEXT:    global_load_b64 v[0:1], v12, s[2:3]
1337; GFX11-NEXT:    global_load_b64 v[2:3], v12, s[4:5]
1338; GFX11-NEXT:    s_waitcnt vmcnt(0)
1339; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
1340; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1341; GFX11-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1342; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1343; GFX11-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1344; GFX11-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1345; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1346; GFX11-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1347; GFX11-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1348; GFX11-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1349; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1350; GFX11-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1351; GFX11-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1352; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1353; GFX11-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1354; GFX11-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1355; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1356; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1357; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1358; GFX11-NEXT:    global_store_b64 v12, v[0:1], s[0:1]
1359; GFX11-NEXT:    s_endpgm
1360;
1361; GFX1150-LABEL: frem_f64:
1362; GFX1150:       ; %bb.0:
1363; GFX1150-NEXT:    s_clause 0x1
1364; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1365; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1366; GFX1150-NEXT:    v_mov_b32_e32 v12, 0
1367; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
1368; GFX1150-NEXT:    s_clause 0x1
1369; GFX1150-NEXT:    global_load_b64 v[0:1], v12, s[2:3]
1370; GFX1150-NEXT:    global_load_b64 v[2:3], v12, s[4:5]
1371; GFX1150-NEXT:    s_waitcnt vmcnt(0)
1372; GFX1150-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
1373; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
1374; GFX1150-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1375; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1376; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1377; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1378; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1379; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1380; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1381; GFX1150-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1382; GFX1150-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1383; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1384; GFX1150-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1385; GFX1150-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1386; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1387; GFX1150-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1388; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1389; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1390; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1391; GFX1150-NEXT:    global_store_b64 v12, v[0:1], s[0:1]
1392; GFX1150-NEXT:    s_endpgm
1393                      ptr addrspace(1) %in2) #0 {
  ; Unlike the f32 kernels above, both doubles are read from the pointer bases.
1394   %r0 = load double, ptr addrspace(1) %in1, align 8
1395   %r1 = load double, ptr addrspace(1) %in2, align 8
  ; No fast-math flags, so the full-precision division expansion is expected.
1396   %r2 = frem double %r0, %r1
1397   store double %r2, ptr addrspace(1) %out, align 8
1398   ret void
1399}
1400
; Test @fast_frem_f64 - f64 frem carrying the `fast` flag.
; The kernel loads one double from %in1 and one from %in2, computes
; `frem fast` on them and stores the result to %out.
; For every check prefix below the expected code forms the quotient from a
; v_rcp_f64 reciprocal refined with v_fma_f64 steps, truncates it, and produces
; the remainder with a final v_fma_f64 (-trunc * divisor + dividend). None of
; the expected sequences contain v_div_scale / v_div_fmas / v_div_fixup.
; The SI expectations contain no v_trunc_f64; the truncation there is an
; exponent-based mask sequence (s_bfe_u32, s_lshr_b64, v_not_b32, v_and_b32,
; v_cndmask_b32). All other prefixes expect v_trunc_f64_e32.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1401define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1402; SI-LABEL: fast_frem_f64:
1403; SI:       ; %bb.0:
1404; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1405; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1406; SI-NEXT:    s_mov_b32 s3, 0xf000
1407; SI-NEXT:    s_mov_b32 s2, -1
1408; SI-NEXT:    s_waitcnt lgkmcnt(0)
1409; SI-NEXT:    s_mov_b32 s0, s8
1410; SI-NEXT:    s_mov_b32 s1, s9
1411; SI-NEXT:    s_mov_b32 s8, s10
1412; SI-NEXT:    s_mov_b32 s9, s11
1413; SI-NEXT:    s_mov_b32 s10, s2
1414; SI-NEXT:    s_mov_b32 s11, s3
1415; SI-NEXT:    s_mov_b32 s6, s2
1416; SI-NEXT:    s_mov_b32 s7, s3
1417; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1418; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
1419; SI-NEXT:    s_waitcnt vmcnt(0)
1420; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1421; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1422; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1423; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1424; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1425; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1426; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1427; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1428; SI-NEXT:    v_readfirstlane_b32 s6, v5
1429; SI-NEXT:    s_bfe_u32 s4, s6, 0xb0014
1430; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
1431; SI-NEXT:    s_mov_b32 s5, 0xfffff
1432; SI-NEXT:    s_mov_b32 s4, s2
1433; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
1434; SI-NEXT:    v_not_b32_e32 v6, s4
1435; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1436; SI-NEXT:    v_not_b32_e32 v7, s5
1437; SI-NEXT:    v_and_b32_e32 v5, v5, v7
1438; SI-NEXT:    s_and_b32 s4, s6, 0x80000000
1439; SI-NEXT:    s_cmp_lt_i32 s7, 0
1440; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1441; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1442; SI-NEXT:    v_mov_b32_e32 v7, s4
1443; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1444; SI-NEXT:    s_cmp_gt_i32 s7, 51
1445; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1446; SI-NEXT:    v_mov_b32_e32 v7, s6
1447; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1448; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
1449; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1450; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1451; SI-NEXT:    s_endpgm
1452;
1453; CI-LABEL: fast_frem_f64:
1454; CI:       ; %bb.0:
1455; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1456; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1457; CI-NEXT:    s_mov_b32 s11, 0xf000
1458; CI-NEXT:    s_mov_b32 s10, -1
1459; CI-NEXT:    s_mov_b32 s6, s10
1460; CI-NEXT:    s_waitcnt lgkmcnt(0)
1461; CI-NEXT:    s_mov_b32 s8, s0
1462; CI-NEXT:    s_mov_b32 s9, s1
1463; CI-NEXT:    s_mov_b32 s0, s2
1464; CI-NEXT:    s_mov_b32 s1, s3
1465; CI-NEXT:    s_mov_b32 s2, s10
1466; CI-NEXT:    s_mov_b32 s3, s11
1467; CI-NEXT:    s_mov_b32 s7, s11
1468; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
1469; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
1470; CI-NEXT:    s_waitcnt vmcnt(0)
1471; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1472; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1473; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1474; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1475; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1476; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1477; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1478; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1479; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1480; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1481; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1482; CI-NEXT:    s_endpgm
1483;
1484; VI-LABEL: fast_frem_f64:
1485; VI:       ; %bb.0:
1486; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1487; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1488; VI-NEXT:    s_waitcnt lgkmcnt(0)
1489; VI-NEXT:    v_mov_b32_e32 v2, s2
1490; VI-NEXT:    v_mov_b32_e32 v3, s3
1491; VI-NEXT:    v_mov_b32_e32 v4, s4
1492; VI-NEXT:    v_mov_b32_e32 v5, s5
1493; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1494; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1495; VI-NEXT:    v_mov_b32_e32 v0, s0
1496; VI-NEXT:    v_mov_b32_e32 v1, s1
1497; VI-NEXT:    s_waitcnt vmcnt(0)
1498; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1499; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1500; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1501; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1502; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1503; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
1504; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1505; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1506; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1507; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1508; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1509; VI-NEXT:    s_endpgm
1510;
1511; GFX9-LABEL: fast_frem_f64:
1512; GFX9:       ; %bb.0:
1513; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1514; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1515; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1516; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1517; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[2:3]
1518; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[6:7]
1519; GFX9-NEXT:    s_waitcnt vmcnt(0)
1520; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1521; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1522; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1523; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1524; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1525; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1526; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1527; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1528; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1529; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1530; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1]
1531; GFX9-NEXT:    s_endpgm
1532;
1533; GFX10-LABEL: fast_frem_f64:
1534; GFX10:       ; %bb.0:
1535; GFX10-NEXT:    s_clause 0x1
1536; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1537; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1538; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1539; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1540; GFX10-NEXT:    s_clause 0x1
1541; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[2:3]
1542; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[6:7]
1543; GFX10-NEXT:    s_waitcnt vmcnt(0)
1544; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1545; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1546; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1547; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1548; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1549; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1550; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1551; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1552; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1553; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1554; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1]
1555; GFX10-NEXT:    s_endpgm
1556;
1557; GFX11-LABEL: fast_frem_f64:
1558; GFX11:       ; %bb.0:
1559; GFX11-NEXT:    s_clause 0x1
1560; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1561; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1562; GFX11-NEXT:    v_mov_b32_e32 v10, 0
1563; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1564; GFX11-NEXT:    s_clause 0x1
1565; GFX11-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
1566; GFX11-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
1567; GFX11-NEXT:    s_waitcnt vmcnt(0)
1568; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1569; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1570; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1571; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1572; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1573; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1574; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1575; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1576; GFX11-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1577; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1578; GFX11-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1579; GFX11-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1580; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1581; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1582; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1583; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
1584; GFX11-NEXT:    s_endpgm
1585;
1586; GFX1150-LABEL: fast_frem_f64:
1587; GFX1150:       ; %bb.0:
1588; GFX1150-NEXT:    s_clause 0x1
1589; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1590; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1591; GFX1150-NEXT:    v_mov_b32_e32 v10, 0
1592; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
1593; GFX1150-NEXT:    s_clause 0x1
1594; GFX1150-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
1595; GFX1150-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
1596; GFX1150-NEXT:    s_waitcnt vmcnt(0)
1597; GFX1150-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1598; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1599; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1600; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1601; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1602; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1603; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1604; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1605; GFX1150-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1606; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1607; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1608; GFX1150-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1609; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1610; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1611; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1612; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
1613; GFX1150-NEXT:    s_endpgm
1614                      ptr addrspace(1) %in2) #0 {
  ; %r0 is the dividend and %r1 the divisor; both are 8-byte-aligned loads.
1615   %r0 = load double, ptr addrspace(1) %in1, align 8
1616   %r1 = load double, ptr addrspace(1) %in2, align 8
  ; The `fast` flag on this frem is what distinguishes the test from @frem_f64.
1617   %r2 = frem fast double %r0, %r1
1618   store double %r2, ptr addrspace(1) %out, align 8
1619   ret void
1620}
1621
; Test @unsafe_frem_f64 - f64 frem carrying only the `afn` flag, in a kernel
; that uses attribute group #1 (the group is defined outside this section).
; The kernel loads one double from %in1 and one from %in2, computes
; `frem afn` on them and stores the result to %out.
; The expected instruction sequences for each prefix are the same as for
; @fast_frem_f64, namely a v_rcp_f64 reciprocal refined with v_fma_f64 steps, a
; truncation of the quotient, and a final v_fma_f64 producing the remainder,
; with no v_div_scale / v_div_fmas / v_div_fixup. The SI expectations again use
; an integer mask sequence in place of v_trunc_f64.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1622define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1623; SI-LABEL: unsafe_frem_f64:
1624; SI:       ; %bb.0:
1625; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1626; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1627; SI-NEXT:    s_mov_b32 s3, 0xf000
1628; SI-NEXT:    s_mov_b32 s2, -1
1629; SI-NEXT:    s_waitcnt lgkmcnt(0)
1630; SI-NEXT:    s_mov_b32 s0, s8
1631; SI-NEXT:    s_mov_b32 s1, s9
1632; SI-NEXT:    s_mov_b32 s8, s10
1633; SI-NEXT:    s_mov_b32 s9, s11
1634; SI-NEXT:    s_mov_b32 s10, s2
1635; SI-NEXT:    s_mov_b32 s11, s3
1636; SI-NEXT:    s_mov_b32 s6, s2
1637; SI-NEXT:    s_mov_b32 s7, s3
1638; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1639; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
1640; SI-NEXT:    s_waitcnt vmcnt(0)
1641; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1642; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1643; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1644; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1645; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1646; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1647; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1648; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1649; SI-NEXT:    v_readfirstlane_b32 s6, v5
1650; SI-NEXT:    s_bfe_u32 s4, s6, 0xb0014
1651; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
1652; SI-NEXT:    s_mov_b32 s5, 0xfffff
1653; SI-NEXT:    s_mov_b32 s4, s2
1654; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
1655; SI-NEXT:    v_not_b32_e32 v6, s4
1656; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1657; SI-NEXT:    v_not_b32_e32 v7, s5
1658; SI-NEXT:    v_and_b32_e32 v5, v5, v7
1659; SI-NEXT:    s_and_b32 s4, s6, 0x80000000
1660; SI-NEXT:    s_cmp_lt_i32 s7, 0
1661; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1662; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1663; SI-NEXT:    v_mov_b32_e32 v7, s4
1664; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1665; SI-NEXT:    s_cmp_gt_i32 s7, 51
1666; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1667; SI-NEXT:    v_mov_b32_e32 v7, s6
1668; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1669; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
1670; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1671; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1672; SI-NEXT:    s_endpgm
1673;
1674; CI-LABEL: unsafe_frem_f64:
1675; CI:       ; %bb.0:
1676; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1677; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1678; CI-NEXT:    s_mov_b32 s11, 0xf000
1679; CI-NEXT:    s_mov_b32 s10, -1
1680; CI-NEXT:    s_mov_b32 s6, s10
1681; CI-NEXT:    s_waitcnt lgkmcnt(0)
1682; CI-NEXT:    s_mov_b32 s8, s0
1683; CI-NEXT:    s_mov_b32 s9, s1
1684; CI-NEXT:    s_mov_b32 s0, s2
1685; CI-NEXT:    s_mov_b32 s1, s3
1686; CI-NEXT:    s_mov_b32 s2, s10
1687; CI-NEXT:    s_mov_b32 s3, s11
1688; CI-NEXT:    s_mov_b32 s7, s11
1689; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
1690; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
1691; CI-NEXT:    s_waitcnt vmcnt(0)
1692; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1693; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1694; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1695; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1696; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1697; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1698; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1699; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1700; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1701; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1702; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1703; CI-NEXT:    s_endpgm
1704;
1705; VI-LABEL: unsafe_frem_f64:
1706; VI:       ; %bb.0:
1707; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1708; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1709; VI-NEXT:    s_waitcnt lgkmcnt(0)
1710; VI-NEXT:    v_mov_b32_e32 v2, s2
1711; VI-NEXT:    v_mov_b32_e32 v3, s3
1712; VI-NEXT:    v_mov_b32_e32 v4, s4
1713; VI-NEXT:    v_mov_b32_e32 v5, s5
1714; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1715; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1716; VI-NEXT:    v_mov_b32_e32 v0, s0
1717; VI-NEXT:    v_mov_b32_e32 v1, s1
1718; VI-NEXT:    s_waitcnt vmcnt(0)
1719; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1720; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1721; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1722; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1723; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1724; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
1725; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1726; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1727; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1728; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1729; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1730; VI-NEXT:    s_endpgm
1731;
1732; GFX9-LABEL: unsafe_frem_f64:
1733; GFX9:       ; %bb.0:
1734; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1735; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1736; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1737; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1738; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[2:3]
1739; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[6:7]
1740; GFX9-NEXT:    s_waitcnt vmcnt(0)
1741; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1742; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1743; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1744; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1745; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1746; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1747; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1748; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1749; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1750; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1751; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1]
1752; GFX9-NEXT:    s_endpgm
1753;
1754; GFX10-LABEL: unsafe_frem_f64:
1755; GFX10:       ; %bb.0:
1756; GFX10-NEXT:    s_clause 0x1
1757; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1758; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1759; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1760; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1761; GFX10-NEXT:    s_clause 0x1
1762; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[2:3]
1763; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[6:7]
1764; GFX10-NEXT:    s_waitcnt vmcnt(0)
1765; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1766; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1767; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1768; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1769; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1770; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1771; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1772; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1773; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1774; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1775; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1]
1776; GFX10-NEXT:    s_endpgm
1777;
1778; GFX11-LABEL: unsafe_frem_f64:
1779; GFX11:       ; %bb.0:
1780; GFX11-NEXT:    s_clause 0x1
1781; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1782; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1783; GFX11-NEXT:    v_mov_b32_e32 v10, 0
1784; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1785; GFX11-NEXT:    s_clause 0x1
1786; GFX11-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
1787; GFX11-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
1788; GFX11-NEXT:    s_waitcnt vmcnt(0)
1789; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1790; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1791; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1792; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1793; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1794; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1795; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1796; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1797; GFX11-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1798; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1799; GFX11-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1800; GFX11-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1801; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1802; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1803; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1804; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
1805; GFX11-NEXT:    s_endpgm
1806;
1807; GFX1150-LABEL: unsafe_frem_f64:
1808; GFX1150:       ; %bb.0:
1809; GFX1150-NEXT:    s_clause 0x1
1810; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1811; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1812; GFX1150-NEXT:    v_mov_b32_e32 v10, 0
1813; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
1814; GFX1150-NEXT:    s_clause 0x1
1815; GFX1150-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
1816; GFX1150-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
1817; GFX1150-NEXT:    s_waitcnt vmcnt(0)
1818; GFX1150-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1819; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1820; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1821; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1822; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1823; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1824; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1825; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1826; GFX1150-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1827; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1828; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1829; GFX1150-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1830; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1831; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1832; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1833; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
1834; GFX1150-NEXT:    s_endpgm
1835                             ptr addrspace(1) %in2) #1 {
  ; %r0 is the dividend and %r1 the divisor; both are 8-byte-aligned loads.
1836   %r0 = load double, ptr addrspace(1) %in1, align 8
1837   %r1 = load double, ptr addrspace(1) %in2, align 8
  ; Only `afn` is set here, whereas @fast_frem_f64 uses the full `fast` flag.
1838   %r2 = frem afn double %r0, %r1
1839   store double %r2, ptr addrspace(1) %out, align 8
1840   ret void
1841}
1842
1843define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1844; SI-LABEL: frem_v2f16:
1845; SI:       ; %bb.0:
1846; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1847; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1848; SI-NEXT:    s_mov_b32 s3, 0xf000
1849; SI-NEXT:    s_mov_b32 s2, -1
1850; SI-NEXT:    s_waitcnt lgkmcnt(0)
1851; SI-NEXT:    s_mov_b32 s0, s8
1852; SI-NEXT:    s_mov_b32 s1, s9
1853; SI-NEXT:    s_mov_b32 s8, s10
1854; SI-NEXT:    s_mov_b32 s9, s11
1855; SI-NEXT:    s_mov_b32 s10, s2
1856; SI-NEXT:    s_mov_b32 s11, s3
1857; SI-NEXT:    s_mov_b32 s6, s2
1858; SI-NEXT:    s_mov_b32 s7, s3
1859; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1860; SI-NEXT:    s_waitcnt vmcnt(0)
1861; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1862; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1863; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1864; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
1865; SI-NEXT:    s_waitcnt vmcnt(0)
1866; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1867; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1868; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1869; SI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
1870; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
1871; SI-NEXT:    v_rcp_f32_e32 v6, v5
1872; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1873; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1874; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
1875; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
1876; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1877; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
1878; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1879; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1880; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1881; SI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
1882; SI-NEXT:    v_trunc_f32_e32 v4, v4
1883; SI-NEXT:    v_fma_f32 v0, -v4, v2, v0
1884; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1885; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1886; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1887; SI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
1888; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
1889; SI-NEXT:    v_rcp_f32_e32 v5, v4
1890; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1891; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1892; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
1893; SI-NEXT:    v_mul_f32_e32 v6, v2, v5
1894; SI-NEXT:    v_fma_f32 v7, -v4, v6, v2
1895; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
1896; SI-NEXT:    v_fma_f32 v2, -v4, v6, v2
1897; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1898; SI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
1899; SI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
1900; SI-NEXT:    v_trunc_f32_e32 v2, v2
1901; SI-NEXT:    v_fma_f32 v1, -v2, v3, v1
1902; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1903; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1904; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1905; SI-NEXT:    s_endpgm
1906;
1907; CI-LABEL: frem_v2f16:
1908; CI:       ; %bb.0:
1909; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1910; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1911; CI-NEXT:    s_mov_b32 s3, 0xf000
1912; CI-NEXT:    s_mov_b32 s2, -1
1913; CI-NEXT:    s_mov_b32 s6, s2
1914; CI-NEXT:    s_waitcnt lgkmcnt(0)
1915; CI-NEXT:    s_mov_b32 s0, s8
1916; CI-NEXT:    s_mov_b32 s1, s9
1917; CI-NEXT:    s_mov_b32 s8, s10
1918; CI-NEXT:    s_mov_b32 s9, s11
1919; CI-NEXT:    s_mov_b32 s10, s2
1920; CI-NEXT:    s_mov_b32 s11, s3
1921; CI-NEXT:    s_mov_b32 s7, s3
1922; CI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1923; CI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
1924; CI-NEXT:    s_waitcnt vmcnt(1)
1925; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1926; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1927; CI-NEXT:    s_waitcnt vmcnt(0)
1928; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1929; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1930; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1931; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1932; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
1933; CI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
1934; CI-NEXT:    v_rcp_f32_e32 v6, v5
1935; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1936; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1937; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
1938; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
1939; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1940; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
1941; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1942; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1943; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1944; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
1945; CI-NEXT:    v_trunc_f32_e32 v4, v4
1946; CI-NEXT:    v_fma_f32 v0, -v4, v2, v0
1947; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
1948; CI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
1949; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1950; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1951; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1952; CI-NEXT:    v_rcp_f32_e32 v5, v4
1953; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1954; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1955; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
1956; CI-NEXT:    v_mul_f32_e32 v6, v2, v5
1957; CI-NEXT:    v_fma_f32 v7, -v4, v6, v2
1958; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
1959; CI-NEXT:    v_fma_f32 v2, -v4, v6, v2
1960; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1961; CI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
1962; CI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
1963; CI-NEXT:    v_trunc_f32_e32 v2, v2
1964; CI-NEXT:    v_fma_f32 v1, -v2, v3, v1
1965; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1966; CI-NEXT:    v_or_b32_e32 v0, v1, v0
1967; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1968; CI-NEXT:    s_endpgm
1969;
1970; VI-LABEL: frem_v2f16:
1971; VI:       ; %bb.0:
1972; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1973; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1974; VI-NEXT:    s_waitcnt lgkmcnt(0)
1975; VI-NEXT:    v_mov_b32_e32 v0, s0
1976; VI-NEXT:    s_add_u32 s0, s4, 16
1977; VI-NEXT:    v_mov_b32_e32 v1, s1
1978; VI-NEXT:    v_mov_b32_e32 v2, s2
1979; VI-NEXT:    v_mov_b32_e32 v3, s3
1980; VI-NEXT:    s_addc_u32 s1, s5, 0
1981; VI-NEXT:    flat_load_dword v4, v[2:3]
1982; VI-NEXT:    v_mov_b32_e32 v3, s1
1983; VI-NEXT:    v_mov_b32_e32 v2, s0
1984; VI-NEXT:    flat_load_dword v2, v[2:3]
1985; VI-NEXT:    s_waitcnt vmcnt(1)
1986; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
1987; VI-NEXT:    v_cvt_f32_f16_e32 v5, v3
1988; VI-NEXT:    s_waitcnt vmcnt(0)
1989; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1990; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1991; VI-NEXT:    v_rcp_f32_e32 v8, v7
1992; VI-NEXT:    v_mul_f32_e32 v9, v5, v8
1993; VI-NEXT:    v_mad_f32 v10, -v7, v9, v5
1994; VI-NEXT:    v_mac_f32_e32 v9, v10, v8
1995; VI-NEXT:    v_mad_f32 v5, -v7, v9, v5
1996; VI-NEXT:    v_mul_f32_e32 v5, v5, v8
1997; VI-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
1998; VI-NEXT:    v_add_f32_e32 v5, v5, v9
1999; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2000; VI-NEXT:    v_div_fixup_f16 v5, v5, v6, v3
2001; VI-NEXT:    v_trunc_f16_e32 v5, v5
2002; VI-NEXT:    v_fma_f16 v3, -v5, v6, v3
2003; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
2004; VI-NEXT:    v_cvt_f32_f16_e32 v5, v4
2005; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2006; VI-NEXT:    v_rcp_f32_e32 v7, v6
2007; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
2008; VI-NEXT:    v_mad_f32 v9, -v6, v8, v5
2009; VI-NEXT:    v_mac_f32_e32 v8, v9, v7
2010; VI-NEXT:    v_mad_f32 v5, -v6, v8, v5
2011; VI-NEXT:    v_mul_f32_e32 v5, v5, v7
2012; VI-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
2013; VI-NEXT:    v_add_f32_e32 v5, v5, v8
2014; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2015; VI-NEXT:    v_div_fixup_f16 v5, v5, v2, v4
2016; VI-NEXT:    v_trunc_f16_e32 v5, v5
2017; VI-NEXT:    v_fma_f16 v2, -v5, v2, v4
2018; VI-NEXT:    v_or_b32_e32 v2, v2, v3
2019; VI-NEXT:    flat_store_dword v[0:1], v2
2020; VI-NEXT:    s_endpgm
2021;
2022; GFX9-LABEL: frem_v2f16:
2023; GFX9:       ; %bb.0:
2024; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2025; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2026; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2027; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2028; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2029; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
2030; GFX9-NEXT:    s_waitcnt vmcnt(1)
2031; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
2032; GFX9-NEXT:    s_waitcnt vmcnt(0)
2033; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
2034; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2035; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
2036; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
2037; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
2038; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
2039; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
2040; GFX9-NEXT:    v_mac_f32_e32 v3, v5, v4
2041; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
2042; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
2043; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
2044; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
2045; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
2046; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v4
2047; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
2048; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v7
2049; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
2050; GFX9-NEXT:    v_mad_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2051; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
2052; GFX9-NEXT:    v_mac_f32_e32 v5, v8, v7
2053; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v1
2054; GFX9-NEXT:    v_mad_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2055; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v7
2056; GFX9-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
2057; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
2058; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
2059; GFX9-NEXT:    v_div_fixup_f16 v1, v1, v6, v4
2060; GFX9-NEXT:    v_trunc_f16_e32 v1, v1
2061; GFX9-NEXT:    v_fma_f16 v1, -v1, v6, v4
2062; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
2063; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2064; GFX9-NEXT:    s_endpgm
2065;
2066; GFX10-LABEL: frem_v2f16:
2067; GFX10:       ; %bb.0:
2068; GFX10-NEXT:    s_clause 0x1
2069; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2070; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2071; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2072; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2073; GFX10-NEXT:    s_clause 0x1
2074; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
2075; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
2076; GFX10-NEXT:    s_waitcnt vmcnt(1)
2077; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
2078; GFX10-NEXT:    s_waitcnt vmcnt(0)
2079; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
2080; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
2081; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
2082; GFX10-NEXT:    v_mad_f32 v7, -v4, v6, v3
2083; GFX10-NEXT:    v_mac_f32_e32 v6, v7, v5
2084; GFX10-NEXT:    v_mad_f32 v3, -v4, v6, v3
2085; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v5
2086; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
2087; GFX10-NEXT:    v_add_f32_e32 v3, v3, v6
2088; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
2089; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
2090; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
2091; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v1
2092; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2093; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2094; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
2095; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v1
2096; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
2097; GFX10-NEXT:    v_mul_f32_e32 v7, v4, v6
2098; GFX10-NEXT:    v_mad_f32 v8, -v5, v7, v4
2099; GFX10-NEXT:    v_mac_f32_e32 v7, v8, v6
2100; GFX10-NEXT:    v_mad_f32 v4, -v5, v7, v4
2101; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v6
2102; GFX10-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
2103; GFX10-NEXT:    v_add_f32_e32 v4, v4, v7
2104; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
2105; GFX10-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
2106; GFX10-NEXT:    v_trunc_f16_e32 v4, v4
2107; GFX10-NEXT:    v_fma_f16 v1, -v4, v2, v1
2108; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
2109; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2110; GFX10-NEXT:    s_endpgm
2111;
2112; GFX11-LABEL: frem_v2f16:
2113; GFX11:       ; %bb.0:
2114; GFX11-NEXT:    s_clause 0x1
2115; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2116; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2117; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2118; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2119; GFX11-NEXT:    s_clause 0x1
2120; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2121; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
2122; GFX11-NEXT:    s_waitcnt vmcnt(1)
2123; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
2124; GFX11-NEXT:    s_waitcnt vmcnt(0)
2125; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
2126; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2127; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2128; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
2129; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v6
2130; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2131; GFX11-NEXT:    v_rcp_f32_e32 v7, v7
2132; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2133; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
2134; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
2135; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2136; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v4
2137; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
2138; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2139; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
2140; GFX11-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
2141; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2142; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
2143; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
2144; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
2145; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2146; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v4
2147; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
2148; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2149; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v7
2150; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
2151; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2152; GFX11-NEXT:    v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2153; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v1
2154; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2155; GFX11-NEXT:    v_fmac_f32_e32 v5, v8, v7
2156; GFX11-NEXT:    v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2157; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2158; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v7
2159; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
2160; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2161; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
2162; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
2163; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2164; GFX11-NEXT:    v_div_fixup_f16 v1, v1, v6, v4
2165; GFX11-NEXT:    v_trunc_f16_e32 v1, v1
2166; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2167; GFX11-NEXT:    v_fma_f16 v1, -v1, v6, v4
2168; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
2169; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2170; GFX11-NEXT:    s_endpgm
2171;
2172; GFX1150-LABEL: frem_v2f16:
2173; GFX1150:       ; %bb.0:
2174; GFX1150-NEXT:    s_clause 0x1
2175; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2176; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2177; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
2178; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
2179; GFX1150-NEXT:    s_clause 0x1
2180; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
2181; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
2182; GFX1150-NEXT:    s_waitcnt vmcnt(1)
2183; GFX1150-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2184; GFX1150-NEXT:    s_waitcnt vmcnt(0)
2185; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
2186; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2187; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v3
2188; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
2189; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
2190; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
2191; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v6
2192; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2193; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2194; GFX1150-NEXT:    v_fmac_f32_e32 v4, v7, v6
2195; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2196; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2197; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
2198; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2199; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
2200; GFX1150-NEXT:    v_add_f32_e32 v4, v6, v4
2201; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2202; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
2203; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v5, v3
2204; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2205; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
2206; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
2207; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2208; GFX1150-NEXT:    v_fmac_f16_e32 v3, v4, v5
2209; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
2210; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v1
2211; GFX1150-NEXT:    v_rcp_f32_e32 v5, v5
2212; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2213; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v5
2214; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
2215; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2216; GFX1150-NEXT:    v_fmac_f32_e32 v4, v6, v5
2217; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
2218; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2219; GFX1150-NEXT:    v_mul_f32_e32 v5, v6, v5
2220; GFX1150-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
2221; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2222; GFX1150-NEXT:    v_add_f32_e32 v4, v5, v4
2223; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
2224; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2225; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
2226; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
2227; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2228; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
2229; GFX1150-NEXT:    v_fmac_f16_e32 v1, v4, v2
2230; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2231; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v3
2232; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
2233; GFX1150-NEXT:    s_endpgm
2234                        ptr addrspace(1) %in2) #0 {
2235   %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
2236   %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
2237   %r1 = load <2 x half>, ptr addrspace(1) %gep2, align 8
2238   %r2 = frem <2 x half> %r0, %r1
2239   store <2 x half> %r2, ptr addrspace(1) %out, align 8
2240   ret void
2241}
2242
2243define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
2244; SI-LABEL: frem_v4f16:
2245; SI:       ; %bb.0:
2246; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2247; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2248; SI-NEXT:    s_mov_b32 s3, 0xf000
2249; SI-NEXT:    s_mov_b32 s2, -1
2250; SI-NEXT:    s_waitcnt lgkmcnt(0)
2251; SI-NEXT:    s_mov_b32 s0, s8
2252; SI-NEXT:    s_mov_b32 s1, s9
2253; SI-NEXT:    s_mov_b32 s8, s10
2254; SI-NEXT:    s_mov_b32 s9, s11
2255; SI-NEXT:    s_mov_b32 s10, s2
2256; SI-NEXT:    s_mov_b32 s11, s3
2257; SI-NEXT:    s_mov_b32 s6, s2
2258; SI-NEXT:    s_mov_b32 s7, s3
2259; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2260; SI-NEXT:    s_waitcnt vmcnt(0)
2261; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
2262; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2263; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
2264; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
2265; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
2266; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
2267; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
2268; SI-NEXT:    s_waitcnt vmcnt(0)
2269; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
2270; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2271; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2272; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
2273; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2274; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2275; SI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
2276; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
2277; SI-NEXT:    v_rcp_f32_e32 v10, v9
2278; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2279; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2280; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
2281; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
2282; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
2283; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
2284; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
2285; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2286; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
2287; SI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
2288; SI-NEXT:    v_trunc_f32_e32 v8, v8
2289; SI-NEXT:    v_fma_f32 v1, -v8, v1, v5
2290; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2291; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2292; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2293; SI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
2294; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
2295; SI-NEXT:    v_rcp_f32_e32 v9, v8
2296; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2297; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
2298; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
2299; SI-NEXT:    v_mul_f32_e32 v10, v5, v9
2300; SI-NEXT:    v_fma_f32 v11, -v8, v10, v5
2301; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
2302; SI-NEXT:    v_fma_f32 v5, -v8, v10, v5
2303; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2304; SI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
2305; SI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
2306; SI-NEXT:    v_trunc_f32_e32 v5, v5
2307; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
2308; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2309; SI-NEXT:    v_or_b32_e32 v1, v4, v1
2310; SI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
2311; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
2312; SI-NEXT:    v_rcp_f32_e32 v7, v5
2313; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2314; SI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
2315; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
2316; SI-NEXT:    v_mul_f32_e32 v8, v4, v7
2317; SI-NEXT:    v_fma_f32 v9, -v5, v8, v4
2318; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
2319; SI-NEXT:    v_fma_f32 v4, -v5, v8, v4
2320; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2321; SI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
2322; SI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
2323; SI-NEXT:    v_trunc_f32_e32 v4, v4
2324; SI-NEXT:    v_fma_f32 v0, -v4, v0, v3
2325; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2326; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2327; SI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
2328; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
2329; SI-NEXT:    v_rcp_f32_e32 v5, v4
2330; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2331; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
2332; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
2333; SI-NEXT:    v_mul_f32_e32 v7, v3, v5
2334; SI-NEXT:    v_fma_f32 v8, -v4, v7, v3
2335; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
2336; SI-NEXT:    v_fma_f32 v3, -v4, v7, v3
2337; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2338; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
2339; SI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
2340; SI-NEXT:    v_trunc_f32_e32 v3, v3
2341; SI-NEXT:    v_fma_f32 v2, -v3, v6, v2
2342; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2343; SI-NEXT:    v_or_b32_e32 v0, v2, v0
2344; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2345; SI-NEXT:    s_endpgm
2346;
2347; CI-LABEL: frem_v4f16:
2348; CI:       ; %bb.0:
2349; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2350; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2351; CI-NEXT:    s_mov_b32 s3, 0xf000
2352; CI-NEXT:    s_mov_b32 s2, -1
2353; CI-NEXT:    s_mov_b32 s6, s2
2354; CI-NEXT:    s_waitcnt lgkmcnt(0)
2355; CI-NEXT:    s_mov_b32 s0, s8
2356; CI-NEXT:    s_mov_b32 s1, s9
2357; CI-NEXT:    s_mov_b32 s8, s10
2358; CI-NEXT:    s_mov_b32 s9, s11
2359; CI-NEXT:    s_mov_b32 s10, s2
2360; CI-NEXT:    s_mov_b32 s11, s3
2361; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2362; CI-NEXT:    s_mov_b32 s7, s3
2363; CI-NEXT:    s_waitcnt vmcnt(0)
2364; CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
2365; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2366; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
2367; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
2368; CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
2369; CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
2370; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
2371; CI-NEXT:    s_waitcnt vmcnt(0)
2372; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
2373; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2374; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2375; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
2376; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2377; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2378; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
2379; CI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
2380; CI-NEXT:    v_rcp_f32_e32 v10, v9
2381; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2382; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2383; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
2384; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
2385; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
2386; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
2387; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
2388; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2389; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
2390; CI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
2391; CI-NEXT:    v_trunc_f32_e32 v8, v8
2392; CI-NEXT:    v_fma_f32 v1, -v8, v1, v5
2393; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
2394; CI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
2395; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2396; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2397; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2398; CI-NEXT:    v_rcp_f32_e32 v9, v8
2399; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2400; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
2401; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
2402; CI-NEXT:    v_mul_f32_e32 v10, v5, v9
2403; CI-NEXT:    v_fma_f32 v11, -v8, v10, v5
2404; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
2405; CI-NEXT:    v_fma_f32 v5, -v8, v10, v5
2406; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2407; CI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
2408; CI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
2409; CI-NEXT:    v_trunc_f32_e32 v5, v5
2410; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
2411; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
2412; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2413; CI-NEXT:    v_or_b32_e32 v1, v4, v1
2414; CI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
2415; CI-NEXT:    v_rcp_f32_e32 v7, v5
2416; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2417; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
2418; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
2419; CI-NEXT:    v_mul_f32_e32 v8, v4, v7
2420; CI-NEXT:    v_fma_f32 v9, -v5, v8, v4
2421; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
2422; CI-NEXT:    v_fma_f32 v4, -v5, v8, v4
2423; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2424; CI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
2425; CI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
2426; CI-NEXT:    v_trunc_f32_e32 v4, v4
2427; CI-NEXT:    v_fma_f32 v0, -v4, v0, v3
2428; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
2429; CI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
2430; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2431; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2432; CI-NEXT:    v_rcp_f32_e32 v5, v4
2433; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2434; CI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
2435; CI-NEXT:    v_fma_f32 v5, v7, v5, v5
2436; CI-NEXT:    v_mul_f32_e32 v7, v3, v5
2437; CI-NEXT:    v_fma_f32 v8, -v4, v7, v3
2438; CI-NEXT:    v_fma_f32 v7, v8, v5, v7
2439; CI-NEXT:    v_fma_f32 v3, -v4, v7, v3
2440; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2441; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
2442; CI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
2443; CI-NEXT:    v_trunc_f32_e32 v3, v3
2444; CI-NEXT:    v_fma_f32 v2, -v3, v6, v2
2445; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2446; CI-NEXT:    v_or_b32_e32 v0, v2, v0
2447; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2448; CI-NEXT:    s_endpgm
2449;
2450; VI-LABEL: frem_v4f16:
2451; VI:       ; %bb.0:
2452; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2453; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
2454; VI-NEXT:    s_waitcnt lgkmcnt(0)
2455; VI-NEXT:    v_mov_b32_e32 v0, s0
2456; VI-NEXT:    s_add_u32 s0, s4, 32
2457; VI-NEXT:    v_mov_b32_e32 v1, s1
2458; VI-NEXT:    s_addc_u32 s1, s5, 0
2459; VI-NEXT:    v_mov_b32_e32 v5, s1
2460; VI-NEXT:    v_mov_b32_e32 v4, s0
2461; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
2462; VI-NEXT:    v_mov_b32_e32 v2, s2
2463; VI-NEXT:    v_mov_b32_e32 v3, s3
2464; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
2465; VI-NEXT:    s_waitcnt vmcnt(1)
2466; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
2467; VI-NEXT:    v_cvt_f32_f16_e32 v9, v8
2468; VI-NEXT:    s_waitcnt vmcnt(0)
2469; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
2470; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
2471; VI-NEXT:    v_rcp_f32_e32 v10, v9
2472; VI-NEXT:    v_mul_f32_e32 v11, v7, v10
2473; VI-NEXT:    v_mad_f32 v12, -v9, v11, v7
2474; VI-NEXT:    v_mac_f32_e32 v11, v12, v10
2475; VI-NEXT:    v_mad_f32 v7, -v9, v11, v7
2476; VI-NEXT:    v_mul_f32_e32 v7, v7, v10
2477; VI-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
2478; VI-NEXT:    v_add_f32_e32 v7, v7, v11
2479; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2480; VI-NEXT:    v_div_fixup_f16 v7, v7, v8, v6
2481; VI-NEXT:    v_trunc_f16_e32 v7, v7
2482; VI-NEXT:    v_fma_f16 v6, -v7, v8, v6
2483; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
2484; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
2485; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2486; VI-NEXT:    v_rcp_f32_e32 v9, v8
2487; VI-NEXT:    v_mul_f32_e32 v10, v7, v9
2488; VI-NEXT:    v_mad_f32 v11, -v8, v10, v7
2489; VI-NEXT:    v_mac_f32_e32 v10, v11, v9
2490; VI-NEXT:    v_mad_f32 v7, -v8, v10, v7
2491; VI-NEXT:    v_mul_f32_e32 v7, v7, v9
2492; VI-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
2493; VI-NEXT:    v_add_f32_e32 v7, v7, v10
2494; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2495; VI-NEXT:    v_div_fixup_f16 v7, v7, v5, v3
2496; VI-NEXT:    v_trunc_f16_e32 v7, v7
2497; VI-NEXT:    v_fma_f16 v3, -v7, v5, v3
2498; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
2499; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
2500; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
2501; VI-NEXT:    v_or_b32_e32 v3, v3, v6
2502; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
2503; VI-NEXT:    v_rcp_f32_e32 v9, v8
2504; VI-NEXT:    v_mul_f32_e32 v10, v6, v9
2505; VI-NEXT:    v_mad_f32 v11, -v8, v10, v6
2506; VI-NEXT:    v_mac_f32_e32 v10, v11, v9
2507; VI-NEXT:    v_mad_f32 v6, -v8, v10, v6
2508; VI-NEXT:    v_mul_f32_e32 v6, v6, v9
2509; VI-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
2510; VI-NEXT:    v_add_f32_e32 v6, v6, v10
2511; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2512; VI-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
2513; VI-NEXT:    v_trunc_f16_e32 v6, v6
2514; VI-NEXT:    v_fma_f16 v5, -v6, v7, v5
2515; VI-NEXT:    v_cvt_f32_f16_e32 v7, v4
2516; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
2517; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2518; VI-NEXT:    v_rcp_f32_e32 v8, v7
2519; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
2520; VI-NEXT:    v_mad_f32 v10, -v7, v9, v6
2521; VI-NEXT:    v_mac_f32_e32 v9, v10, v8
2522; VI-NEXT:    v_mad_f32 v6, -v7, v9, v6
2523; VI-NEXT:    v_mul_f32_e32 v6, v6, v8
2524; VI-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
2525; VI-NEXT:    v_add_f32_e32 v6, v6, v9
2526; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2527; VI-NEXT:    v_div_fixup_f16 v6, v6, v4, v2
2528; VI-NEXT:    v_trunc_f16_e32 v6, v6
2529; VI-NEXT:    v_fma_f16 v2, -v6, v4, v2
2530; VI-NEXT:    v_or_b32_e32 v2, v2, v5
2531; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2532; VI-NEXT:    s_endpgm
2533;
2534; GFX9-LABEL: frem_v4f16:
2535; GFX9:       ; %bb.0:
2536; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2537; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2538; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2539; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2540; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
2541; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7] offset:32
2542; GFX9-NEXT:    s_waitcnt vmcnt(1)
2543; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v1
2544; GFX9-NEXT:    s_waitcnt vmcnt(0)
2545; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v3
2546; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
2547; GFX9-NEXT:    v_cvt_f32_f16_e32 v9, v8
2548; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
2549; GFX9-NEXT:    v_rcp_f32_e32 v9, v9
2550; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v6
2551; GFX9-NEXT:    v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
2552; GFX9-NEXT:    v_mac_f32_e32 v5, v7, v6
2553; GFX9-NEXT:    v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
2554; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
2555; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
2556; GFX9-NEXT:    v_add_f32_e32 v5, v6, v5
2557; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
2558; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
2559; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
2560; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v9
2561; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
2562; GFX9-NEXT:    v_mad_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2563; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
2564; GFX9-NEXT:    v_mac_f32_e32 v7, v10, v9
2565; GFX9-NEXT:    v_fma_f16 v5, -v5, v3, v1
2566; GFX9-NEXT:    v_mad_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2567; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
2568; GFX9-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
2569; GFX9-NEXT:    v_add_f32_e32 v1, v1, v7
2570; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
2571; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
2572; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
2573; GFX9-NEXT:    v_div_fixup_f16 v1, v1, v8, v6
2574; GFX9-NEXT:    v_trunc_f16_e32 v1, v1
2575; GFX9-NEXT:    v_fma_f16 v1, -v1, v8, v6
2576; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v1
2577; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
2578; GFX9-NEXT:    v_cvt_f32_f16_e32 v8, v7
2579; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
2580; GFX9-NEXT:    v_rcp_f32_e32 v8, v8
2581; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v5
2582; GFX9-NEXT:    v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
2583; GFX9-NEXT:    v_mac_f32_e32 v3, v6, v5
2584; GFX9-NEXT:    v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
2585; GFX9-NEXT:    v_mul_f32_e32 v5, v6, v5
2586; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
2587; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
2588; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
2589; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v5
2590; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
2591; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v8
2592; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
2593; GFX9-NEXT:    v_mad_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2594; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
2595; GFX9-NEXT:    v_mac_f32_e32 v6, v9, v8
2596; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v0
2597; GFX9-NEXT:    v_mad_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2598; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
2599; GFX9-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
2600; GFX9-NEXT:    v_add_f32_e32 v0, v0, v6
2601; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
2602; GFX9-NEXT:    v_div_fixup_f16 v0, v0, v7, v5
2603; GFX9-NEXT:    v_trunc_f16_e32 v0, v0
2604; GFX9-NEXT:    v_fma_f16 v0, -v0, v7, v5
2605; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
2606; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
2607; GFX9-NEXT:    s_endpgm
2608;
2609; GFX10-LABEL: frem_v4f16:
2610; GFX10:       ; %bb.0:
2611; GFX10-NEXT:    s_clause 0x1
2612; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2613; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
2614; GFX10-NEXT:    v_mov_b32_e32 v4, 0
2615; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2616; GFX10-NEXT:    s_clause 0x1
2617; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
2618; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7] offset:32
2619; GFX10-NEXT:    s_waitcnt vmcnt(1)
2620; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
2621; GFX10-NEXT:    s_waitcnt vmcnt(0)
2622; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v3
2623; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
2624; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
2625; GFX10-NEXT:    v_mad_f32 v9, -v6, v8, v5
2626; GFX10-NEXT:    v_mac_f32_e32 v8, v9, v7
2627; GFX10-NEXT:    v_mad_f32 v5, -v6, v8, v5
2628; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v7
2629; GFX10-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
2630; GFX10-NEXT:    v_add_f32_e32 v5, v5, v8
2631; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
2632; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
2633; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
2634; GFX10-NEXT:    v_fma_f16 v5, -v5, v3, v1
2635; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2636; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2637; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v3
2638; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v1
2639; GFX10-NEXT:    v_rcp_f32_e32 v8, v7
2640; GFX10-NEXT:    v_mul_f32_e32 v9, v6, v8
2641; GFX10-NEXT:    v_mad_f32 v10, -v7, v9, v6
2642; GFX10-NEXT:    v_mac_f32_e32 v9, v10, v8
2643; GFX10-NEXT:    v_mad_f32 v6, -v7, v9, v6
2644; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v8
2645; GFX10-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
2646; GFX10-NEXT:    v_add_f32_e32 v6, v6, v9
2647; GFX10-NEXT:    v_cvt_f16_f32_e32 v6, v6
2648; GFX10-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
2649; GFX10-NEXT:    v_trunc_f16_e32 v6, v6
2650; GFX10-NEXT:    v_fma_f16 v1, -v6, v3, v1
2651; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
2652; GFX10-NEXT:    v_pack_b32_f16 v1, v5, v1
2653; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
2654; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
2655; GFX10-NEXT:    v_mul_f32_e32 v7, v3, v6
2656; GFX10-NEXT:    v_mad_f32 v8, -v5, v7, v3
2657; GFX10-NEXT:    v_mac_f32_e32 v7, v8, v6
2658; GFX10-NEXT:    v_mad_f32 v3, -v5, v7, v3
2659; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v6
2660; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
2661; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
2662; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
2663; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
2664; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
2665; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v0
2666; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2667; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2668; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v2
2669; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v0
2670; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
2671; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
2672; GFX10-NEXT:    v_mad_f32 v9, -v6, v8, v5
2673; GFX10-NEXT:    v_mac_f32_e32 v8, v9, v7
2674; GFX10-NEXT:    v_mad_f32 v5, -v6, v8, v5
2675; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v7
2676; GFX10-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
2677; GFX10-NEXT:    v_add_f32_e32 v5, v5, v8
2678; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
2679; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
2680; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
2681; GFX10-NEXT:    v_fma_f16 v0, -v5, v2, v0
2682; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
2683; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
2684; GFX10-NEXT:    s_endpgm
2685;
2686; GFX11-LABEL: frem_v4f16:
2687; GFX11:       ; %bb.0:
2688; GFX11-NEXT:    s_clause 0x1
2689; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2690; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2691; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2692; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2693; GFX11-NEXT:    s_clause 0x1
2694; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
2695; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
2696; GFX11-NEXT:    s_waitcnt vmcnt(1)
2697; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v1
2698; GFX11-NEXT:    s_waitcnt vmcnt(0)
2699; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v3
2700; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
2701; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2702; GFX11-NEXT:    v_rcp_f32_e32 v6, v6
2703; GFX11-NEXT:    v_cvt_f32_f16_e32 v9, v8
2704; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2705; GFX11-NEXT:    v_rcp_f32_e32 v9, v9
2706; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2707; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v6
2708; GFX11-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
2709; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2710; GFX11-NEXT:    v_fmac_f32_e32 v5, v7, v6
2711; GFX11-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
2712; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2713; GFX11-NEXT:    v_mul_f32_e32 v6, v7, v6
2714; GFX11-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
2715; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2716; GFX11-NEXT:    v_add_f32_e32 v5, v6, v5
2717; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
2718; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v5
2719; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2720; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v6
2721; GFX11-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
2722; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2723; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v9
2724; GFX11-NEXT:    v_trunc_f16_e32 v5, v5
2725; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2726; GFX11-NEXT:    v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2727; GFX11-NEXT:    v_fma_f16 v5, -v5, v3, v1
2728; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2729; GFX11-NEXT:    v_fmac_f32_e32 v7, v10, v9
2730; GFX11-NEXT:    v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2731; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v0
2732; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2733; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v9
2734; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
2735; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2736; GFX11-NEXT:    v_add_f32_e32 v1, v1, v7
2737; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
2738; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
2739; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2740; GFX11-NEXT:    v_div_fixup_f16 v1, v1, v8, v6
2741; GFX11-NEXT:    v_trunc_f16_e32 v1, v1
2742; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2743; GFX11-NEXT:    v_fma_f16 v1, -v1, v8, v6
2744; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v7
2745; GFX11-NEXT:    v_pack_b32_f16 v1, v5, v1
2746; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v2
2747; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2748; GFX11-NEXT:    v_rcp_f32_e32 v8, v8
2749; GFX11-NEXT:    v_rcp_f32_e32 v5, v5
2750; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2751; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v5
2752; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2753; GFX11-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
2754; GFX11-NEXT:    v_fmac_f32_e32 v3, v6, v5
2755; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2756; GFX11-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
2757; GFX11-NEXT:    v_mul_f32_e32 v5, v6, v5
2758; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2759; GFX11-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
2760; GFX11-NEXT:    v_add_f32_e32 v3, v5, v3
2761; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
2762; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2763; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
2764; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v5
2765; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2766; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
2767; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v8
2768; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2769; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
2770; GFX11-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2771; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2772; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v0
2773; GFX11-NEXT:    v_fmac_f32_e32 v6, v9, v8
2774; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2775; GFX11-NEXT:    v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2776; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v8
2777; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2778; GFX11-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
2779; GFX11-NEXT:    v_add_f32_e32 v0, v0, v6
2780; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2781; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
2782; GFX11-NEXT:    v_div_fixup_f16 v0, v0, v7, v5
2783; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2784; GFX11-NEXT:    v_trunc_f16_e32 v0, v0
2785; GFX11-NEXT:    v_fma_f16 v0, -v0, v7, v5
2786; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2787; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v0
2788; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
2789; GFX11-NEXT:    s_endpgm
2790;
2791; GFX1150-LABEL: frem_v4f16:
2792; GFX1150:       ; %bb.0:
2793; GFX1150-NEXT:    s_clause 0x1
2794; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2795; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2796; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
2797; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
2798; GFX1150-NEXT:    s_clause 0x1
2799; GFX1150-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
2800; GFX1150-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
2801; GFX1150-NEXT:    s_waitcnt vmcnt(1)
2802; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
2803; GFX1150-NEXT:    s_waitcnt vmcnt(0)
2804; GFX1150-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
2805; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2806; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
2807; GFX1150-NEXT:    v_cvt_f32_f16_e32 v8, v7
2808; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
2809; GFX1150-NEXT:    v_rcp_f32_e32 v8, v8
2810; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v8
2811; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2812; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2813; GFX1150-NEXT:    v_fmac_f32_e32 v6, v9, v8
2814; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2815; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2816; GFX1150-NEXT:    v_mul_f32_e32 v8, v9, v8
2817; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2818; GFX1150-NEXT:    v_and_b32_e32 v8, 0xff800000, v8
2819; GFX1150-NEXT:    v_add_f32_e32 v6, v8, v6
2820; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2821; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
2822; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
2823; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2824; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
2825; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
2826; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2827; GFX1150-NEXT:    v_fmac_f16_e32 v5, v6, v7
2828; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v2
2829; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v0
2830; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
2831; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2832; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v7
2833; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
2834; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2835; GFX1150-NEXT:    v_fmac_f32_e32 v6, v8, v7
2836; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
2837; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2838; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
2839; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
2840; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2841; GFX1150-NEXT:    v_add_f32_e32 v6, v7, v6
2842; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
2843; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2844; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v2, v0
2845; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
2846; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2847; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
2848; GFX1150-NEXT:    v_fma_f16 v0, v6, v2, v0
2849; GFX1150-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
2850; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
2851; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2852; GFX1150-NEXT:    v_pack_b32_f16 v0, v0, v5
2853; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v6
2854; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2855; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
2856; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
2857; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2858; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v7
2859; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2860; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2861; GFX1150-NEXT:    v_fmac_f32_e32 v5, v8, v7
2862; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2863; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2864; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
2865; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
2866; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2867; GFX1150-NEXT:    v_add_f32_e32 v5, v7, v5
2868; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
2869; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2870; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v6, v2
2871; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
2872; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2873; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
2874; GFX1150-NEXT:    v_fmac_f16_e32 v2, v5, v6
2875; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v3
2876; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v1
2877; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
2878; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
2879; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v6
2880; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2881; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
2882; GFX1150-NEXT:    v_fmac_f32_e32 v5, v7, v6
2883; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2884; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
2885; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
2886; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2887; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
2888; GFX1150-NEXT:    v_add_f32_e32 v5, v6, v5
2889; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2890; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
2891; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
2892; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2893; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
2894; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
2895; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2896; GFX1150-NEXT:    v_fmac_f16_e32 v1, v5, v3
2897; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v2
2898; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
2899; GFX1150-NEXT:    s_endpgm
2900                        ptr addrspace(1) %in2) #0 {
2901   %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
2902   %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
2903   %r1 = load <4 x half>, ptr addrspace(1) %gep2, align 16
2904   %r2 = frem <4 x half> %r0, %r1
2905   store <4 x half> %r2, ptr addrspace(1) %out, align 16
2906   ret void
2907}
2908
; frem_v2f32 - element-wise frem of two <2 x float> vectors in an amdgpu_kernel.
;
; The kernel loads one <2 x float> from %in1 and one from %in2 advanced by
; four <2 x float> elements, computes frem on them, and stores the result to
; %out. All three memory accesses are marked align 8.
;
; The check lines that sit between the first line of the signature and its
; continuation are autogenerated (see the NOTE on the first line of this file,
; which names utils/update_llc_test_checks.py); regenerate them with that
; script instead of editing them by hand.
;
; Shape of the expected code, per vector lane, as recorded in the checks below:
;   * a quotient built from v_div_scale_f32, v_rcp_f32, an fma refinement
;     chain, v_div_fmas_f32 and v_div_fixup_f32
;   * v_trunc_f32 of that quotient
;   * a final fma/fmac combining the negated truncated quotient, the divisor
;     and the dividend
; The fma refinement chain is bracketed by s_setreg_imm32_b32 writes to
; HW_REG_MODE in the SI, CI, VI and GFX9 checks, and by s_denorm_mode in the
; GFX10, GFX11 and GFX1150 checks. The GFX1150 checks negate the truncated
; quotient with an explicit v_xor_b32 against 0x80000000 before the final
; fma/fmac, where the other targets use a negated source operand.
;
; NOTE(review): attribute group #0 is defined outside this part of the file.
2909define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
2910; SI-LABEL: frem_v2f32:
2911; SI:       ; %bb.0:
2912; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2913; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2914; SI-NEXT:    s_mov_b32 s3, 0xf000
2915; SI-NEXT:    s_mov_b32 s2, -1
2916; SI-NEXT:    s_waitcnt lgkmcnt(0)
2917; SI-NEXT:    s_mov_b32 s0, s8
2918; SI-NEXT:    s_mov_b32 s1, s9
2919; SI-NEXT:    s_mov_b32 s8, s10
2920; SI-NEXT:    s_mov_b32 s9, s11
2921; SI-NEXT:    s_mov_b32 s10, s2
2922; SI-NEXT:    s_mov_b32 s11, s3
2923; SI-NEXT:    s_mov_b32 s6, s2
2924; SI-NEXT:    s_mov_b32 s7, s3
2925; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2926; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
2927; SI-NEXT:    s_waitcnt vmcnt(0)
2928; SI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
2929; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
2930; SI-NEXT:    v_rcp_f32_e32 v6, v5
2931; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2932; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2933; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
2934; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
2935; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
2936; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
2937; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
2938; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2939; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
2940; SI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
2941; SI-NEXT:    v_trunc_f32_e32 v4, v4
2942; SI-NEXT:    v_fma_f32 v1, -v4, v3, v1
2943; SI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
2944; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
2945; SI-NEXT:    v_rcp_f32_e32 v5, v4
2946; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2947; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
2948; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
2949; SI-NEXT:    v_mul_f32_e32 v6, v3, v5
2950; SI-NEXT:    v_fma_f32 v7, -v4, v6, v3
2951; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
2952; SI-NEXT:    v_fma_f32 v3, -v4, v6, v3
2953; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2954; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
2955; SI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2956; SI-NEXT:    v_trunc_f32_e32 v3, v3
2957; SI-NEXT:    v_fma_f32 v0, -v3, v2, v0
2958; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2959; SI-NEXT:    s_endpgm
2960;
2961; CI-LABEL: frem_v2f32:
2962; CI:       ; %bb.0:
2963; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
2964; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
2965; CI-NEXT:    s_mov_b32 s3, 0xf000
2966; CI-NEXT:    s_mov_b32 s2, -1
2967; CI-NEXT:    s_mov_b32 s6, s2
2968; CI-NEXT:    s_waitcnt lgkmcnt(0)
2969; CI-NEXT:    s_mov_b32 s0, s8
2970; CI-NEXT:    s_mov_b32 s1, s9
2971; CI-NEXT:    s_mov_b32 s8, s10
2972; CI-NEXT:    s_mov_b32 s9, s11
2973; CI-NEXT:    s_mov_b32 s10, s2
2974; CI-NEXT:    s_mov_b32 s11, s3
2975; CI-NEXT:    s_mov_b32 s7, s3
2976; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2977; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
2978; CI-NEXT:    s_waitcnt vmcnt(0)
2979; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
2980; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
2981; CI-NEXT:    v_rcp_f32_e32 v6, v5
2982; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2983; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2984; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
2985; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
2986; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
2987; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
2988; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
2989; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2990; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
2991; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
2992; CI-NEXT:    v_trunc_f32_e32 v4, v4
2993; CI-NEXT:    v_fma_f32 v1, -v4, v3, v1
2994; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
2995; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
2996; CI-NEXT:    v_rcp_f32_e32 v5, v4
2997; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2998; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
2999; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
3000; CI-NEXT:    v_mul_f32_e32 v6, v3, v5
3001; CI-NEXT:    v_fma_f32 v7, -v4, v6, v3
3002; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
3003; CI-NEXT:    v_fma_f32 v3, -v4, v6, v3
3004; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3005; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
3006; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
3007; CI-NEXT:    v_trunc_f32_e32 v3, v3
3008; CI-NEXT:    v_fma_f32 v0, -v3, v2, v0
3009; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3010; CI-NEXT:    s_endpgm
3011;
3012; VI-LABEL: frem_v2f32:
3013; VI:       ; %bb.0:
3014; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3015; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
3016; VI-NEXT:    s_waitcnt lgkmcnt(0)
3017; VI-NEXT:    v_mov_b32_e32 v0, s0
3018; VI-NEXT:    s_add_u32 s0, s4, 32
3019; VI-NEXT:    v_mov_b32_e32 v1, s1
3020; VI-NEXT:    s_addc_u32 s1, s5, 0
3021; VI-NEXT:    v_mov_b32_e32 v5, s1
3022; VI-NEXT:    v_mov_b32_e32 v2, s2
3023; VI-NEXT:    v_mov_b32_e32 v3, s3
3024; VI-NEXT:    v_mov_b32_e32 v4, s0
3025; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
3026; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
3027; VI-NEXT:    s_waitcnt vmcnt(0)
3028; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v3
3029; VI-NEXT:    v_div_scale_f32 v6, vcc, v3, v5, v3
3030; VI-NEXT:    v_rcp_f32_e32 v8, v7
3031; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3032; VI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
3033; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
3034; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
3035; VI-NEXT:    v_fma_f32 v10, -v7, v9, v6
3036; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
3037; VI-NEXT:    v_fma_f32 v6, -v7, v9, v6
3038; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3039; VI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
3040; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v3
3041; VI-NEXT:    v_trunc_f32_e32 v6, v6
3042; VI-NEXT:    v_fma_f32 v3, -v6, v5, v3
3043; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v2
3044; VI-NEXT:    v_div_scale_f32 v5, vcc, v2, v4, v2
3045; VI-NEXT:    v_rcp_f32_e32 v7, v6
3046; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3047; VI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
3048; VI-NEXT:    v_fma_f32 v7, v8, v7, v7
3049; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
3050; VI-NEXT:    v_fma_f32 v9, -v6, v8, v5
3051; VI-NEXT:    v_fma_f32 v8, v9, v7, v8
3052; VI-NEXT:    v_fma_f32 v5, -v6, v8, v5
3053; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3054; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
3055; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v2
3056; VI-NEXT:    v_trunc_f32_e32 v5, v5
3057; VI-NEXT:    v_fma_f32 v2, -v5, v4, v2
3058; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
3059; VI-NEXT:    s_endpgm
3060;
3061; GFX9-LABEL: frem_v2f32:
3062; GFX9:       ; %bb.0:
3063; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3064; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3065; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3066; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3067; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
3068; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7] offset:32
3069; GFX9-NEXT:    s_waitcnt vmcnt(0)
3070; GFX9-NEXT:    v_div_scale_f32 v6, s[2:3], v3, v3, v1
3071; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
3072; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
3073; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3074; GFX9-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
3075; GFX9-NEXT:    v_fma_f32 v7, v8, v7, v7
3076; GFX9-NEXT:    v_mul_f32_e32 v8, v5, v7
3077; GFX9-NEXT:    v_fma_f32 v9, -v6, v8, v5
3078; GFX9-NEXT:    v_fma_f32 v8, v9, v7, v8
3079; GFX9-NEXT:    v_fma_f32 v5, -v6, v8, v5
3080; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3081; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
3082; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
3083; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3084; GFX9-NEXT:    v_fma_f32 v1, -v5, v3, v1
3085; GFX9-NEXT:    v_div_scale_f32 v5, s[2:3], v2, v2, v0
3086; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
3087; GFX9-NEXT:    v_rcp_f32_e32 v6, v5
3088; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3089; GFX9-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
3090; GFX9-NEXT:    v_fma_f32 v6, v7, v6, v6
3091; GFX9-NEXT:    v_mul_f32_e32 v7, v3, v6
3092; GFX9-NEXT:    v_fma_f32 v8, -v5, v7, v3
3093; GFX9-NEXT:    v_fma_f32 v7, v8, v6, v7
3094; GFX9-NEXT:    v_fma_f32 v3, -v5, v7, v3
3095; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3096; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
3097; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
3098; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3099; GFX9-NEXT:    v_fma_f32 v0, -v3, v2, v0
3100; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
3101; GFX9-NEXT:    s_endpgm
3102;
3103; GFX10-LABEL: frem_v2f32:
3104; GFX10:       ; %bb.0:
3105; GFX10-NEXT:    s_clause 0x1
3106; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3107; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3108; GFX10-NEXT:    v_mov_b32_e32 v4, 0
3109; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3110; GFX10-NEXT:    s_clause 0x1
3111; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
3112; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7] offset:32
3113; GFX10-NEXT:    s_waitcnt vmcnt(0)
3114; GFX10-NEXT:    v_div_scale_f32 v6, s2, v3, v3, v1
3115; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
3116; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
3117; GFX10-NEXT:    s_denorm_mode 15
3118; GFX10-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
3119; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v7
3120; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
3121; GFX10-NEXT:    v_fma_f32 v9, -v6, v8, v5
3122; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v7
3123; GFX10-NEXT:    v_fma_f32 v5, -v6, v8, v5
3124; GFX10-NEXT:    s_denorm_mode 12
3125; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
3126; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
3127; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
3128; GFX10-NEXT:    v_fma_f32 v1, -v5, v3, v1
3129; GFX10-NEXT:    v_div_scale_f32 v5, s2, v2, v2, v0
3130; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
3131; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
3132; GFX10-NEXT:    s_denorm_mode 15
3133; GFX10-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
3134; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v6
3135; GFX10-NEXT:    v_mul_f32_e32 v7, v3, v6
3136; GFX10-NEXT:    v_fma_f32 v8, -v5, v7, v3
3137; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v6
3138; GFX10-NEXT:    v_fma_f32 v3, -v5, v7, v3
3139; GFX10-NEXT:    s_denorm_mode 12
3140; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
3141; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
3142; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
3143; GFX10-NEXT:    v_fma_f32 v0, -v3, v2, v0
3144; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
3145; GFX10-NEXT:    s_endpgm
3146;
3147; GFX11-LABEL: frem_v2f32:
3148; GFX11:       ; %bb.0:
3149; GFX11-NEXT:    s_clause 0x1
3150; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3151; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3152; GFX11-NEXT:    v_mov_b32_e32 v4, 0
3153; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3154; GFX11-NEXT:    s_clause 0x1
3155; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
3156; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
3157; GFX11-NEXT:    s_waitcnt vmcnt(0)
3158; GFX11-NEXT:    v_div_scale_f32 v6, null, v3, v3, v1
3159; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
3160; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3161; GFX11-NEXT:    v_rcp_f32_e32 v7, v6
3162; GFX11-NEXT:    s_denorm_mode 15
3163; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3164; GFX11-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
3165; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v7
3166; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3167; GFX11-NEXT:    v_mul_f32_e32 v8, v5, v7
3168; GFX11-NEXT:    v_fma_f32 v9, -v6, v8, v5
3169; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3170; GFX11-NEXT:    v_fmac_f32_e32 v8, v9, v7
3171; GFX11-NEXT:    v_fma_f32 v5, -v6, v8, v5
3172; GFX11-NEXT:    s_denorm_mode 12
3173; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3174; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
3175; GFX11-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
3176; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3177; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
3178; GFX11-NEXT:    v_fma_f32 v1, -v5, v3, v1
3179; GFX11-NEXT:    v_div_scale_f32 v5, null, v2, v2, v0
3180; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
3181; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3182; GFX11-NEXT:    v_rcp_f32_e32 v6, v5
3183; GFX11-NEXT:    s_denorm_mode 15
3184; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3185; GFX11-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
3186; GFX11-NEXT:    v_fmac_f32_e32 v6, v7, v6
3187; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3188; GFX11-NEXT:    v_mul_f32_e32 v7, v3, v6
3189; GFX11-NEXT:    v_fma_f32 v8, -v5, v7, v3
3190; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3191; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v6
3192; GFX11-NEXT:    v_fma_f32 v3, -v5, v7, v3
3193; GFX11-NEXT:    s_denorm_mode 12
3194; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3195; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
3196; GFX11-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
3197; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3198; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
3199; GFX11-NEXT:    v_fma_f32 v0, -v3, v2, v0
3200; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
3201; GFX11-NEXT:    s_endpgm
3202;
3203; GFX1150-LABEL: frem_v2f32:
3204; GFX1150:       ; %bb.0:
3205; GFX1150-NEXT:    s_clause 0x1
3206; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3207; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3208; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
3209; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
3210; GFX1150-NEXT:    s_clause 0x1
3211; GFX1150-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
3212; GFX1150-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
3213; GFX1150-NEXT:    s_waitcnt vmcnt(0)
3214; GFX1150-NEXT:    v_div_scale_f32 v6, null, v3, v3, v1
3215; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
3216; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3217; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
3218; GFX1150-NEXT:    s_denorm_mode 15
3219; GFX1150-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
3220; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3221; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v7
3222; GFX1150-NEXT:    v_mul_f32_e32 v8, v5, v7
3223; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3224; GFX1150-NEXT:    v_fma_f32 v9, -v6, v8, v5
3225; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v7
3226; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3227; GFX1150-NEXT:    v_fma_f32 v5, -v6, v8, v5
3228; GFX1150-NEXT:    s_denorm_mode 12
3229; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
3230; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3231; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
3232; GFX1150-NEXT:    v_trunc_f32_e32 v5, v5
3233; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3234; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
3235; GFX1150-NEXT:    v_fma_f32 v1, v5, v3, v1
3236; GFX1150-NEXT:    v_div_scale_f32 v5, null, v2, v2, v0
3237; GFX1150-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
3238; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3239; GFX1150-NEXT:    v_rcp_f32_e32 v6, v5
3240; GFX1150-NEXT:    s_denorm_mode 15
3241; GFX1150-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
3242; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3243; GFX1150-NEXT:    v_fmac_f32_e32 v6, v7, v6
3244; GFX1150-NEXT:    v_mul_f32_e32 v7, v3, v6
3245; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3246; GFX1150-NEXT:    v_fma_f32 v8, -v5, v7, v3
3247; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v6
3248; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3249; GFX1150-NEXT:    v_fma_f32 v3, -v5, v7, v3
3250; GFX1150-NEXT:    s_denorm_mode 12
3251; GFX1150-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
3252; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3253; GFX1150-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
3254; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
3255; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3256; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
3257; GFX1150-NEXT:    v_fmac_f32_e32 v0, v3, v2
3258; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
3259; GFX1150-NEXT:    s_endpgm
3260                        ptr addrspace(1) %in2) #0 {
   ; Step %in2 forward by four <2 x float> elements. The checks above expect
   ; this as a 32-byte displacement: "offset:32" on the second load, or an
   ; s_add_u32/s_addc_u32 of 32 on the flat-load target.
3261   %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
   ; Dividend comes from %in1, divisor from the advanced %in2 pointer.
3262   %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
3263   %r1 = load <2 x float>, ptr addrspace(1) %gep2, align 8
   ; The operation under test: vector frem with no fast-math flags.
3264   %r2 = frem <2 x float> %r0, %r1
3265   store <2 x float> %r2, ptr addrspace(1) %out, align 8
3266   ret void
3267}
3268
3269define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
3270; SI-LABEL: frem_v4f32:
3271; SI:       ; %bb.0:
3272; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
3273; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
3274; SI-NEXT:    s_mov_b32 s3, 0xf000
3275; SI-NEXT:    s_mov_b32 s2, -1
3276; SI-NEXT:    s_waitcnt lgkmcnt(0)
3277; SI-NEXT:    s_mov_b32 s0, s8
3278; SI-NEXT:    s_mov_b32 s1, s9
3279; SI-NEXT:    s_mov_b32 s8, s10
3280; SI-NEXT:    s_mov_b32 s9, s11
3281; SI-NEXT:    s_mov_b32 s10, s2
3282; SI-NEXT:    s_mov_b32 s11, s3
3283; SI-NEXT:    s_mov_b32 s6, s2
3284; SI-NEXT:    s_mov_b32 s7, s3
3285; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
3286; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64
3287; SI-NEXT:    s_waitcnt vmcnt(0)
3288; SI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
3289; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
3290; SI-NEXT:    v_rcp_f32_e32 v10, v9
3291; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3292; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
3293; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
3294; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
3295; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
3296; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
3297; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
3298; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3299; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
3300; SI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
3301; SI-NEXT:    v_trunc_f32_e32 v8, v8
3302; SI-NEXT:    v_fma_f32 v3, -v8, v7, v3
3303; SI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
3304; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
3305; SI-NEXT:    v_rcp_f32_e32 v9, v8
3306; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3307; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
3308; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
3309; SI-NEXT:    v_mul_f32_e32 v10, v7, v9
3310; SI-NEXT:    v_fma_f32 v11, -v8, v10, v7
3311; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
3312; SI-NEXT:    v_fma_f32 v7, -v8, v10, v7
3313; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3314; SI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
3315; SI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
3316; SI-NEXT:    v_trunc_f32_e32 v7, v7
3317; SI-NEXT:    v_fma_f32 v2, -v7, v6, v2
3318; SI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
3319; SI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
3320; SI-NEXT:    v_rcp_f32_e32 v8, v7
3321; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3322; SI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
3323; SI-NEXT:    v_fma_f32 v8, v9, v8, v8
3324; SI-NEXT:    v_mul_f32_e32 v9, v6, v8
3325; SI-NEXT:    v_fma_f32 v10, -v7, v9, v6
3326; SI-NEXT:    v_fma_f32 v9, v10, v8, v9
3327; SI-NEXT:    v_fma_f32 v6, -v7, v9, v6
3328; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3329; SI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
3330; SI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
3331; SI-NEXT:    v_trunc_f32_e32 v6, v6
3332; SI-NEXT:    v_fma_f32 v1, -v6, v5, v1
3333; SI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
3334; SI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
3335; SI-NEXT:    v_rcp_f32_e32 v7, v6
3336; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3337; SI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
3338; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
3339; SI-NEXT:    v_mul_f32_e32 v8, v5, v7
3340; SI-NEXT:    v_fma_f32 v9, -v6, v8, v5
3341; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
3342; SI-NEXT:    v_fma_f32 v5, -v6, v8, v5
3343; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3344; SI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
3345; SI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
3346; SI-NEXT:    v_trunc_f32_e32 v5, v5
3347; SI-NEXT:    v_fma_f32 v0, -v5, v4, v0
3348; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3349; SI-NEXT:    s_endpgm
3350;
3351; CI-LABEL: frem_v4f32:
3352; CI:       ; %bb.0:
3353; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
3354; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
3355; CI-NEXT:    s_mov_b32 s3, 0xf000
3356; CI-NEXT:    s_mov_b32 s2, -1
3357; CI-NEXT:    s_mov_b32 s6, s2
3358; CI-NEXT:    s_waitcnt lgkmcnt(0)
3359; CI-NEXT:    s_mov_b32 s0, s8
3360; CI-NEXT:    s_mov_b32 s1, s9
3361; CI-NEXT:    s_mov_b32 s8, s10
3362; CI-NEXT:    s_mov_b32 s9, s11
3363; CI-NEXT:    s_mov_b32 s10, s2
3364; CI-NEXT:    s_mov_b32 s11, s3
3365; CI-NEXT:    s_mov_b32 s7, s3
3366; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
3367; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64
3368; CI-NEXT:    s_waitcnt vmcnt(0)
3369; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
3370; CI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
3371; CI-NEXT:    v_rcp_f32_e32 v10, v9
3372; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3373; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
3374; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
3375; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
3376; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
3377; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
3378; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
3379; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3380; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
3381; CI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
3382; CI-NEXT:    v_trunc_f32_e32 v8, v8
3383; CI-NEXT:    v_fma_f32 v3, -v8, v7, v3
3384; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
3385; CI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
3386; CI-NEXT:    v_rcp_f32_e32 v9, v8
3387; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3388; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
3389; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
3390; CI-NEXT:    v_mul_f32_e32 v10, v7, v9
3391; CI-NEXT:    v_fma_f32 v11, -v8, v10, v7
3392; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
3393; CI-NEXT:    v_fma_f32 v7, -v8, v10, v7
3394; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3395; CI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
3396; CI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
3397; CI-NEXT:    v_trunc_f32_e32 v7, v7
3398; CI-NEXT:    v_fma_f32 v2, -v7, v6, v2
3399; CI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
3400; CI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
3401; CI-NEXT:    v_rcp_f32_e32 v8, v7
3402; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3403; CI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
3404; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
3405; CI-NEXT:    v_mul_f32_e32 v9, v6, v8
3406; CI-NEXT:    v_fma_f32 v10, -v7, v9, v6
3407; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
3408; CI-NEXT:    v_fma_f32 v6, -v7, v9, v6
3409; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3410; CI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
3411; CI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
3412; CI-NEXT:    v_trunc_f32_e32 v6, v6
3413; CI-NEXT:    v_fma_f32 v1, -v6, v5, v1
3414; CI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
3415; CI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
3416; CI-NEXT:    v_rcp_f32_e32 v7, v6
3417; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3418; CI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
3419; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
3420; CI-NEXT:    v_mul_f32_e32 v8, v5, v7
3421; CI-NEXT:    v_fma_f32 v9, -v6, v8, v5
3422; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
3423; CI-NEXT:    v_fma_f32 v5, -v6, v8, v5
3424; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3425; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
3426; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
3427; CI-NEXT:    v_trunc_f32_e32 v5, v5
3428; CI-NEXT:    v_fma_f32 v0, -v5, v4, v0
3429; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3430; CI-NEXT:    s_endpgm
3431;
3432; VI-LABEL: frem_v4f32:
3433; VI:       ; %bb.0:
3434; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3435; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
3436; VI-NEXT:    s_waitcnt lgkmcnt(0)
3437; VI-NEXT:    v_mov_b32_e32 v8, s0
3438; VI-NEXT:    s_add_u32 s0, s4, 64
3439; VI-NEXT:    v_mov_b32_e32 v9, s1
3440; VI-NEXT:    s_addc_u32 s1, s5, 0
3441; VI-NEXT:    v_mov_b32_e32 v5, s1
3442; VI-NEXT:    v_mov_b32_e32 v0, s2
3443; VI-NEXT:    v_mov_b32_e32 v1, s3
3444; VI-NEXT:    v_mov_b32_e32 v4, s0
3445; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
3446; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
3447; VI-NEXT:    s_waitcnt vmcnt(0)
3448; VI-NEXT:    v_div_scale_f32 v11, s[0:1], v7, v7, v3
3449; VI-NEXT:    v_div_scale_f32 v10, vcc, v3, v7, v3
3450; VI-NEXT:    v_rcp_f32_e32 v12, v11
3451; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3452; VI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
3453; VI-NEXT:    v_fma_f32 v12, v13, v12, v12
3454; VI-NEXT:    v_mul_f32_e32 v13, v10, v12
3455; VI-NEXT:    v_fma_f32 v14, -v11, v13, v10
3456; VI-NEXT:    v_fma_f32 v13, v14, v12, v13
3457; VI-NEXT:    v_fma_f32 v10, -v11, v13, v10
3458; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3459; VI-NEXT:    v_div_fmas_f32 v10, v10, v12, v13
3460; VI-NEXT:    v_div_fixup_f32 v10, v10, v7, v3
3461; VI-NEXT:    v_trunc_f32_e32 v10, v10
3462; VI-NEXT:    v_fma_f32 v3, -v10, v7, v3
3463; VI-NEXT:    v_div_scale_f32 v10, s[0:1], v6, v6, v2
3464; VI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
3465; VI-NEXT:    v_rcp_f32_e32 v11, v10
3466; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3467; VI-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
3468; VI-NEXT:    v_fma_f32 v11, v12, v11, v11
3469; VI-NEXT:    v_mul_f32_e32 v12, v7, v11
3470; VI-NEXT:    v_fma_f32 v13, -v10, v12, v7
3471; VI-NEXT:    v_fma_f32 v12, v13, v11, v12
3472; VI-NEXT:    v_fma_f32 v7, -v10, v12, v7
3473; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3474; VI-NEXT:    v_div_fmas_f32 v7, v7, v11, v12
3475; VI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
3476; VI-NEXT:    v_trunc_f32_e32 v7, v7
3477; VI-NEXT:    v_fma_f32 v2, -v7, v6, v2
3478; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
3479; VI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
3480; VI-NEXT:    v_rcp_f32_e32 v10, v7
3481; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3482; VI-NEXT:    v_fma_f32 v11, -v7, v10, 1.0
3483; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
3484; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
3485; VI-NEXT:    v_fma_f32 v12, -v7, v11, v6
3486; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
3487; VI-NEXT:    v_fma_f32 v6, -v7, v11, v6
3488; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3489; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
3490; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
3491; VI-NEXT:    v_trunc_f32_e32 v6, v6
3492; VI-NEXT:    v_fma_f32 v1, -v6, v5, v1
3493; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
3494; VI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
3495; VI-NEXT:    v_rcp_f32_e32 v7, v6
3496; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3497; VI-NEXT:    v_fma_f32 v10, -v6, v7, 1.0
3498; VI-NEXT:    v_fma_f32 v7, v10, v7, v7
3499; VI-NEXT:    v_mul_f32_e32 v10, v5, v7
3500; VI-NEXT:    v_fma_f32 v11, -v6, v10, v5
3501; VI-NEXT:    v_fma_f32 v10, v11, v7, v10
3502; VI-NEXT:    v_fma_f32 v5, -v6, v10, v5
3503; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3504; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v10
3505; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
3506; VI-NEXT:    v_trunc_f32_e32 v5, v5
3507; VI-NEXT:    v_fma_f32 v0, -v5, v4, v0
3508; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3509; VI-NEXT:    s_endpgm
3510;
3511; GFX9-LABEL: frem_v4f32:
3512; GFX9:       ; %bb.0:
3513; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3514; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3515; GFX9-NEXT:    v_mov_b32_e32 v8, 0
3516; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3517; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
3518; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7] offset:64
3519; GFX9-NEXT:    s_waitcnt vmcnt(0)
3520; GFX9-NEXT:    v_div_scale_f32 v10, s[2:3], v7, v7, v3
3521; GFX9-NEXT:    v_div_scale_f32 v9, vcc, v3, v7, v3
3522; GFX9-NEXT:    v_rcp_f32_e32 v11, v10
3523; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3524; GFX9-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
3525; GFX9-NEXT:    v_fma_f32 v11, v12, v11, v11
3526; GFX9-NEXT:    v_mul_f32_e32 v12, v9, v11
3527; GFX9-NEXT:    v_fma_f32 v13, -v10, v12, v9
3528; GFX9-NEXT:    v_fma_f32 v12, v13, v11, v12
3529; GFX9-NEXT:    v_fma_f32 v9, -v10, v12, v9
3530; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3531; GFX9-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
3532; GFX9-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
3533; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
3534; GFX9-NEXT:    v_fma_f32 v3, -v9, v7, v3
3535; GFX9-NEXT:    v_div_scale_f32 v9, s[2:3], v6, v6, v2
3536; GFX9-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
3537; GFX9-NEXT:    v_rcp_f32_e32 v10, v9
3538; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3539; GFX9-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
3540; GFX9-NEXT:    v_fma_f32 v10, v11, v10, v10
3541; GFX9-NEXT:    v_mul_f32_e32 v11, v7, v10
3542; GFX9-NEXT:    v_fma_f32 v12, -v9, v11, v7
3543; GFX9-NEXT:    v_fma_f32 v11, v12, v10, v11
3544; GFX9-NEXT:    v_fma_f32 v7, -v9, v11, v7
3545; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3546; GFX9-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
3547; GFX9-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
3548; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
3549; GFX9-NEXT:    v_fma_f32 v2, -v7, v6, v2
3550; GFX9-NEXT:    v_div_scale_f32 v7, s[2:3], v5, v5, v1
3551; GFX9-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
3552; GFX9-NEXT:    v_rcp_f32_e32 v9, v7
3553; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3554; GFX9-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
3555; GFX9-NEXT:    v_fma_f32 v9, v10, v9, v9
3556; GFX9-NEXT:    v_mul_f32_e32 v10, v6, v9
3557; GFX9-NEXT:    v_fma_f32 v11, -v7, v10, v6
3558; GFX9-NEXT:    v_fma_f32 v10, v11, v9, v10
3559; GFX9-NEXT:    v_fma_f32 v6, -v7, v10, v6
3560; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3561; GFX9-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
3562; GFX9-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
3563; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3564; GFX9-NEXT:    v_fma_f32 v1, -v6, v5, v1
3565; GFX9-NEXT:    v_div_scale_f32 v6, s[2:3], v4, v4, v0
3566; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
3567; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
3568; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3569; GFX9-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
3570; GFX9-NEXT:    v_fma_f32 v7, v9, v7, v7
3571; GFX9-NEXT:    v_mul_f32_e32 v9, v5, v7
3572; GFX9-NEXT:    v_fma_f32 v10, -v6, v9, v5
3573; GFX9-NEXT:    v_fma_f32 v9, v10, v7, v9
3574; GFX9-NEXT:    v_fma_f32 v5, -v6, v9, v5
3575; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3576; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
3577; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
3578; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3579; GFX9-NEXT:    v_fma_f32 v0, -v5, v4, v0
3580; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
3581; GFX9-NEXT:    s_endpgm
3582;
3583; GFX10-LABEL: frem_v4f32:
3584; GFX10:       ; %bb.0:
3585; GFX10-NEXT:    s_clause 0x1
3586; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
3587; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
3588; GFX10-NEXT:    v_mov_b32_e32 v8, 0
3589; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3590; GFX10-NEXT:    s_clause 0x1
3591; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
3592; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7] offset:64
3593; GFX10-NEXT:    s_waitcnt vmcnt(0)
3594; GFX10-NEXT:    v_div_scale_f32 v10, s2, v7, v7, v3
3595; GFX10-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
3596; GFX10-NEXT:    v_rcp_f32_e32 v11, v10
3597; GFX10-NEXT:    s_denorm_mode 15
3598; GFX10-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
3599; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v11
3600; GFX10-NEXT:    v_mul_f32_e32 v12, v9, v11
3601; GFX10-NEXT:    v_fma_f32 v13, -v10, v12, v9
3602; GFX10-NEXT:    v_fmac_f32_e32 v12, v13, v11
3603; GFX10-NEXT:    v_fma_f32 v9, -v10, v12, v9
3604; GFX10-NEXT:    s_denorm_mode 12
3605; GFX10-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
3606; GFX10-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
3607; GFX10-NEXT:    v_trunc_f32_e32 v9, v9
3608; GFX10-NEXT:    v_fma_f32 v3, -v9, v7, v3
3609; GFX10-NEXT:    v_div_scale_f32 v9, s2, v6, v6, v2
3610; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
3611; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
3612; GFX10-NEXT:    s_denorm_mode 15
3613; GFX10-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
3614; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v10
3615; GFX10-NEXT:    v_mul_f32_e32 v11, v7, v10
3616; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v7
3617; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v10
3618; GFX10-NEXT:    v_fma_f32 v7, -v9, v11, v7
3619; GFX10-NEXT:    s_denorm_mode 12
3620; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
3621; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
3622; GFX10-NEXT:    v_trunc_f32_e32 v7, v7
3623; GFX10-NEXT:    v_fma_f32 v2, -v7, v6, v2
3624; GFX10-NEXT:    v_div_scale_f32 v7, s2, v5, v5, v1
3625; GFX10-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
3626; GFX10-NEXT:    v_rcp_f32_e32 v9, v7
3627; GFX10-NEXT:    s_denorm_mode 15
3628; GFX10-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
3629; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v9
3630; GFX10-NEXT:    v_mul_f32_e32 v10, v6, v9
3631; GFX10-NEXT:    v_fma_f32 v11, -v7, v10, v6
3632; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v9
3633; GFX10-NEXT:    v_fma_f32 v6, -v7, v10, v6
3634; GFX10-NEXT:    s_denorm_mode 12
3635; GFX10-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
3636; GFX10-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
3637; GFX10-NEXT:    v_trunc_f32_e32 v6, v6
3638; GFX10-NEXT:    v_fma_f32 v1, -v6, v5, v1
3639; GFX10-NEXT:    v_div_scale_f32 v6, s2, v4, v4, v0
3640; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
3641; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
3642; GFX10-NEXT:    s_denorm_mode 15
3643; GFX10-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
3644; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v7
3645; GFX10-NEXT:    v_mul_f32_e32 v9, v5, v7
3646; GFX10-NEXT:    v_fma_f32 v10, -v6, v9, v5
3647; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v7
3648; GFX10-NEXT:    v_fma_f32 v5, -v6, v9, v5
3649; GFX10-NEXT:    s_denorm_mode 12
3650; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
3651; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
3652; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
3653; GFX10-NEXT:    v_fma_f32 v0, -v5, v4, v0
3654; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
3655; GFX10-NEXT:    s_endpgm
3656;
3657; GFX11-LABEL: frem_v4f32:
3658; GFX11:       ; %bb.0:
3659; GFX11-NEXT:    s_clause 0x1
3660; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3661; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3662; GFX11-NEXT:    v_mov_b32_e32 v8, 0
3663; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3664; GFX11-NEXT:    s_clause 0x1
3665; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
3666; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[4:5] offset:64
3667; GFX11-NEXT:    s_waitcnt vmcnt(0)
3668; GFX11-NEXT:    v_div_scale_f32 v10, null, v7, v7, v3
3669; GFX11-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
3670; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3671; GFX11-NEXT:    v_rcp_f32_e32 v11, v10
3672; GFX11-NEXT:    s_denorm_mode 15
3673; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3674; GFX11-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
3675; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v11
3676; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3677; GFX11-NEXT:    v_mul_f32_e32 v12, v9, v11
3678; GFX11-NEXT:    v_fma_f32 v13, -v10, v12, v9
3679; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3680; GFX11-NEXT:    v_fmac_f32_e32 v12, v13, v11
3681; GFX11-NEXT:    v_fma_f32 v9, -v10, v12, v9
3682; GFX11-NEXT:    s_denorm_mode 12
3683; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3684; GFX11-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
3685; GFX11-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
3686; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3687; GFX11-NEXT:    v_trunc_f32_e32 v9, v9
3688; GFX11-NEXT:    v_fma_f32 v3, -v9, v7, v3
3689; GFX11-NEXT:    v_div_scale_f32 v9, null, v6, v6, v2
3690; GFX11-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
3691; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3692; GFX11-NEXT:    v_rcp_f32_e32 v10, v9
3693; GFX11-NEXT:    s_denorm_mode 15
3694; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3695; GFX11-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
3696; GFX11-NEXT:    v_fmac_f32_e32 v10, v11, v10
3697; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3698; GFX11-NEXT:    v_mul_f32_e32 v11, v7, v10
3699; GFX11-NEXT:    v_fma_f32 v12, -v9, v11, v7
3700; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3701; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v10
3702; GFX11-NEXT:    v_fma_f32 v7, -v9, v11, v7
3703; GFX11-NEXT:    s_denorm_mode 12
3704; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3705; GFX11-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
3706; GFX11-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
3707; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3708; GFX11-NEXT:    v_trunc_f32_e32 v7, v7
3709; GFX11-NEXT:    v_fma_f32 v2, -v7, v6, v2
3710; GFX11-NEXT:    v_div_scale_f32 v7, null, v5, v5, v1
3711; GFX11-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
3712; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3713; GFX11-NEXT:    v_rcp_f32_e32 v9, v7
3714; GFX11-NEXT:    s_denorm_mode 15
3715; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3716; GFX11-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
3717; GFX11-NEXT:    v_fmac_f32_e32 v9, v10, v9
3718; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3719; GFX11-NEXT:    v_mul_f32_e32 v10, v6, v9
3720; GFX11-NEXT:    v_fma_f32 v11, -v7, v10, v6
3721; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3722; GFX11-NEXT:    v_fmac_f32_e32 v10, v11, v9
3723; GFX11-NEXT:    v_fma_f32 v6, -v7, v10, v6
3724; GFX11-NEXT:    s_denorm_mode 12
3725; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3726; GFX11-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
3727; GFX11-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
3728; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3729; GFX11-NEXT:    v_trunc_f32_e32 v6, v6
3730; GFX11-NEXT:    v_fma_f32 v1, -v6, v5, v1
3731; GFX11-NEXT:    v_div_scale_f32 v6, null, v4, v4, v0
3732; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
3733; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3734; GFX11-NEXT:    v_rcp_f32_e32 v7, v6
3735; GFX11-NEXT:    s_denorm_mode 15
3736; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3737; GFX11-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
3738; GFX11-NEXT:    v_fmac_f32_e32 v7, v9, v7
3739; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3740; GFX11-NEXT:    v_mul_f32_e32 v9, v5, v7
3741; GFX11-NEXT:    v_fma_f32 v10, -v6, v9, v5
3742; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3743; GFX11-NEXT:    v_fmac_f32_e32 v9, v10, v7
3744; GFX11-NEXT:    v_fma_f32 v5, -v6, v9, v5
3745; GFX11-NEXT:    s_denorm_mode 12
3746; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3747; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
3748; GFX11-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
3749; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3750; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
3751; GFX11-NEXT:    v_fma_f32 v0, -v5, v4, v0
3752; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
3753; GFX11-NEXT:    s_endpgm
3754;
3755; GFX1150-LABEL: frem_v4f32:
3756; GFX1150:       ; %bb.0:
3757; GFX1150-NEXT:    s_clause 0x1
3758; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
3759; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
3760; GFX1150-NEXT:    v_mov_b32_e32 v8, 0
3761; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
3762; GFX1150-NEXT:    s_clause 0x1
3763; GFX1150-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
3764; GFX1150-NEXT:    global_load_b128 v[4:7], v8, s[4:5] offset:64
3765; GFX1150-NEXT:    s_waitcnt vmcnt(0)
3766; GFX1150-NEXT:    v_div_scale_f32 v10, null, v7, v7, v3
3767; GFX1150-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
3768; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3769; GFX1150-NEXT:    v_rcp_f32_e32 v11, v10
3770; GFX1150-NEXT:    s_denorm_mode 15
3771; GFX1150-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
3772; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3773; GFX1150-NEXT:    v_fmac_f32_e32 v11, v12, v11
3774; GFX1150-NEXT:    v_mul_f32_e32 v12, v9, v11
3775; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3776; GFX1150-NEXT:    v_fma_f32 v13, -v10, v12, v9
3777; GFX1150-NEXT:    v_fmac_f32_e32 v12, v13, v11
3778; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3779; GFX1150-NEXT:    v_fma_f32 v9, -v10, v12, v9
3780; GFX1150-NEXT:    s_denorm_mode 12
3781; GFX1150-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
3782; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3783; GFX1150-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
3784; GFX1150-NEXT:    v_trunc_f32_e32 v9, v9
3785; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3786; GFX1150-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
3787; GFX1150-NEXT:    v_fma_f32 v3, v9, v7, v3
3788; GFX1150-NEXT:    v_div_scale_f32 v9, null, v6, v6, v2
3789; GFX1150-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
3790; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3791; GFX1150-NEXT:    v_rcp_f32_e32 v10, v9
3792; GFX1150-NEXT:    s_denorm_mode 15
3793; GFX1150-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
3794; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3795; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v10
3796; GFX1150-NEXT:    v_mul_f32_e32 v11, v7, v10
3797; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3798; GFX1150-NEXT:    v_fma_f32 v12, -v9, v11, v7
3799; GFX1150-NEXT:    v_fmac_f32_e32 v11, v12, v10
3800; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3801; GFX1150-NEXT:    v_fma_f32 v7, -v9, v11, v7
3802; GFX1150-NEXT:    s_denorm_mode 12
3803; GFX1150-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
3804; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3805; GFX1150-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
3806; GFX1150-NEXT:    v_trunc_f32_e32 v7, v7
3807; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3808; GFX1150-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
3809; GFX1150-NEXT:    v_fma_f32 v2, v7, v6, v2
3810; GFX1150-NEXT:    v_div_scale_f32 v7, null, v5, v5, v1
3811; GFX1150-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
3812; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3813; GFX1150-NEXT:    v_rcp_f32_e32 v9, v7
3814; GFX1150-NEXT:    s_denorm_mode 15
3815; GFX1150-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
3816; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3817; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v9
3818; GFX1150-NEXT:    v_mul_f32_e32 v10, v6, v9
3819; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3820; GFX1150-NEXT:    v_fma_f32 v11, -v7, v10, v6
3821; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v9
3822; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3823; GFX1150-NEXT:    v_fma_f32 v6, -v7, v10, v6
3824; GFX1150-NEXT:    s_denorm_mode 12
3825; GFX1150-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
3826; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3827; GFX1150-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
3828; GFX1150-NEXT:    v_trunc_f32_e32 v6, v6
3829; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3830; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
3831; GFX1150-NEXT:    v_fma_f32 v1, v6, v5, v1
3832; GFX1150-NEXT:    v_div_scale_f32 v6, null, v4, v4, v0
3833; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
3834; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3835; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
3836; GFX1150-NEXT:    s_denorm_mode 15
3837; GFX1150-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
3838; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3839; GFX1150-NEXT:    v_fmac_f32_e32 v7, v9, v7
3840; GFX1150-NEXT:    v_mul_f32_e32 v9, v5, v7
3841; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3842; GFX1150-NEXT:    v_fma_f32 v10, -v6, v9, v5
3843; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v7
3844; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3845; GFX1150-NEXT:    v_fma_f32 v5, -v6, v9, v5
3846; GFX1150-NEXT:    s_denorm_mode 12
3847; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
3848; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3849; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
3850; GFX1150-NEXT:    v_trunc_f32_e32 v5, v5
3851; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3852; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
3853; GFX1150-NEXT:    v_fmac_f32_e32 v0, v5, v4
3854; GFX1150-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
3855; GFX1150-NEXT:    s_endpgm
3856                        ptr addrspace(1) %in2) #0 {
3857   %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
3858   %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
3859   %r1 = load <4 x float>, ptr addrspace(1) %gep2, align 16
3860   %r2 = frem <4 x float> %r0, %r1
3861   store <4 x float> %r2, ptr addrspace(1) %out, align 16
3862   ret void
3863}
3864
3865define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
3866; SI-LABEL: frem_v2f64:
3867; SI:       ; %bb.0:
3868; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
3869; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
3870; SI-NEXT:    s_mov_b32 s7, 0xf000
3871; SI-NEXT:    s_mov_b32 s6, -1
3872; SI-NEXT:    s_waitcnt lgkmcnt(0)
3873; SI-NEXT:    s_mov_b32 s4, s0
3874; SI-NEXT:    s_mov_b32 s5, s1
3875; SI-NEXT:    s_mov_b32 s0, s2
3876; SI-NEXT:    s_mov_b32 s1, s3
3877; SI-NEXT:    s_mov_b32 s2, s6
3878; SI-NEXT:    s_mov_b32 s3, s7
3879; SI-NEXT:    s_mov_b32 s10, s6
3880; SI-NEXT:    s_mov_b32 s11, s7
3881; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
3882; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
3883; SI-NEXT:    s_waitcnt vmcnt(0)
3884; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
3885; SI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
3886; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3887; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3888; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3889; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3890; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
3891; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
3892; SI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
3893; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
3894; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v3, v13
3895; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
3896; SI-NEXT:    s_nop 1
3897; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
3898; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3899; SI-NEXT:    v_readfirstlane_b32 s8, v9
3900; SI-NEXT:    s_bfe_u32 s0, s8, 0xb0014
3901; SI-NEXT:    s_add_i32 s9, s0, 0xfffffc01
3902; SI-NEXT:    s_mov_b32 s3, 0xfffff
3903; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s9
3904; SI-NEXT:    v_not_b32_e32 v10, s0
3905; SI-NEXT:    v_and_b32_e32 v10, v8, v10
3906; SI-NEXT:    v_not_b32_e32 v11, s1
3907; SI-NEXT:    v_and_b32_e32 v9, v9, v11
3908; SI-NEXT:    s_and_b32 s0, s8, 0x80000000
3909; SI-NEXT:    s_cmp_lt_i32 s9, 0
3910; SI-NEXT:    s_cselect_b64 vcc, -1, 0
3911; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
3912; SI-NEXT:    v_mov_b32_e32 v11, s0
3913; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
3914; SI-NEXT:    s_cmp_gt_i32 s9, 51
3915; SI-NEXT:    s_cselect_b64 vcc, -1, 0
3916; SI-NEXT:    v_mov_b32_e32 v11, s8
3917; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
3918; SI-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
3919; SI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3920; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3921; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
3922; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3923; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3924; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3925; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3926; SI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
3927; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
3928; SI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
3929; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
3930; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v11
3931; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
3932; SI-NEXT:    s_nop 1
3933; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
3934; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3935; SI-NEXT:    v_readfirstlane_b32 s8, v7
3936; SI-NEXT:    s_bfe_u32 s0, s8, 0xb0014
3937; SI-NEXT:    s_add_i32 s9, s0, 0xfffffc01
3938; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s9
3939; SI-NEXT:    v_not_b32_e32 v8, s0
3940; SI-NEXT:    v_and_b32_e32 v8, v6, v8
3941; SI-NEXT:    v_not_b32_e32 v9, s1
3942; SI-NEXT:    v_and_b32_e32 v7, v7, v9
3943; SI-NEXT:    s_and_b32 s0, s8, 0x80000000
3944; SI-NEXT:    s_cmp_lt_i32 s9, 0
3945; SI-NEXT:    s_cselect_b64 vcc, -1, 0
3946; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
3947; SI-NEXT:    v_mov_b32_e32 v9, s0
3948; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
3949; SI-NEXT:    s_cmp_gt_i32 s9, 51
3950; SI-NEXT:    s_cselect_b64 vcc, -1, 0
3951; SI-NEXT:    v_mov_b32_e32 v9, s8
3952; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
3953; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
3954; SI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3955; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
3956; SI-NEXT:    s_endpgm
3957;
3958; CI-LABEL: frem_v2f64:
3959; CI:       ; %bb.0:
3960; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
3961; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
3962; CI-NEXT:    s_mov_b32 s3, 0xf000
3963; CI-NEXT:    s_mov_b32 s2, -1
3964; CI-NEXT:    s_mov_b32 s6, s2
3965; CI-NEXT:    s_waitcnt lgkmcnt(0)
3966; CI-NEXT:    s_mov_b32 s0, s8
3967; CI-NEXT:    s_mov_b32 s1, s9
3968; CI-NEXT:    s_mov_b32 s8, s10
3969; CI-NEXT:    s_mov_b32 s9, s11
3970; CI-NEXT:    s_mov_b32 s10, s2
3971; CI-NEXT:    s_mov_b32 s11, s3
3972; CI-NEXT:    s_mov_b32 s7, s3
3973; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
3974; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64
3975; CI-NEXT:    s_waitcnt vmcnt(0)
3976; CI-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
3977; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
3978; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3979; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3980; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3981; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3982; CI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
3983; CI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
3984; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3985; CI-NEXT:    s_nop 1
3986; CI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3987; CI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3988; CI-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
3989; CI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3990; CI-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
3991; CI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
3992; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3993; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3994; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3995; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3996; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
3997; CI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
3998; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3999; CI-NEXT:    s_nop 1
4000; CI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
4001; CI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
4002; CI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
4003; CI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
4004; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4005; CI-NEXT:    s_endpgm
4006;
4007; VI-LABEL: frem_v2f64:
4008; VI:       ; %bb.0:
4009; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4010; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
4011; VI-NEXT:    s_waitcnt lgkmcnt(0)
4012; VI-NEXT:    v_mov_b32_e32 v8, s0
4013; VI-NEXT:    s_add_u32 s0, s4, 64
4014; VI-NEXT:    v_mov_b32_e32 v9, s1
4015; VI-NEXT:    s_addc_u32 s1, s5, 0
4016; VI-NEXT:    v_mov_b32_e32 v5, s1
4017; VI-NEXT:    v_mov_b32_e32 v0, s2
4018; VI-NEXT:    v_mov_b32_e32 v1, s3
4019; VI-NEXT:    v_mov_b32_e32 v4, s0
4020; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
4021; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
4022; VI-NEXT:    s_waitcnt vmcnt(0)
4023; VI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
4024; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
4025; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
4026; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
4027; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
4028; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
4029; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
4030; VI-NEXT:    v_mul_f64 v[16:17], v[14:15], v[12:13]
4031; VI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
4032; VI-NEXT:    s_nop 1
4033; VI-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
4034; VI-NEXT:    v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
4035; VI-NEXT:    v_trunc_f64_e32 v[10:11], v[10:11]
4036; VI-NEXT:    v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
4037; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
4038; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
4039; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
4040; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4041; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
4042; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4043; VI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
4044; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
4045; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
4046; VI-NEXT:    s_nop 1
4047; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
4048; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
4049; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
4050; VI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
4051; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
4052; VI-NEXT:    s_endpgm
4053;
4054; GFX9-LABEL: frem_v2f64:
4055; GFX9:       ; %bb.0:
4056; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4057; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4058; GFX9-NEXT:    v_mov_b32_e32 v16, 0
4059; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4060; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3]
4061; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:64
4062; GFX9-NEXT:    s_waitcnt vmcnt(0)
4063; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[2:3], v[6:7], v[6:7], v[2:3]
4064; GFX9-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
4065; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
4066; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4067; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
4068; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4069; GFX9-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
4070; GFX9-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
4071; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
4072; GFX9-NEXT:    s_nop 1
4073; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
4074; GFX9-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
4075; GFX9-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
4076; GFX9-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
4077; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[2:3], v[4:5], v[4:5], v[0:1]
4078; GFX9-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
4079; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
4080; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
4081; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
4082; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
4083; GFX9-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
4084; GFX9-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
4085; GFX9-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
4086; GFX9-NEXT:    s_nop 1
4087; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
4088; GFX9-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
4089; GFX9-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
4090; GFX9-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
4091; GFX9-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
4092; GFX9-NEXT:    s_endpgm
4093;
4094; GFX10-LABEL: frem_v2f64:
4095; GFX10:       ; %bb.0:
4096; GFX10-NEXT:    s_clause 0x1
4097; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4098; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4099; GFX10-NEXT:    v_mov_b32_e32 v16, 0
4100; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4101; GFX10-NEXT:    s_clause 0x1
4102; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3]
4103; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:64
4104; GFX10-NEXT:    s_waitcnt vmcnt(0)
4105; GFX10-NEXT:    v_div_scale_f64 v[8:9], s2, v[6:7], v[6:7], v[2:3]
4106; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
4107; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
4108; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4109; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
4110; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4111; GFX10-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
4112; GFX10-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
4113; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
4114; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
4115; GFX10-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
4116; GFX10-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
4117; GFX10-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
4118; GFX10-NEXT:    v_div_scale_f64 v[6:7], s2, v[4:5], v[4:5], v[0:1]
4119; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
4120; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
4121; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
4122; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
4123; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
4124; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
4125; GFX10-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
4126; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
4127; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
4128; GFX10-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
4129; GFX10-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
4130; GFX10-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
4131; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
4132; GFX10-NEXT:    s_endpgm
4133;
4134; GFX11-LABEL: frem_v2f64:
4135; GFX11:       ; %bb.0:
4136; GFX11-NEXT:    s_clause 0x1
4137; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4138; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
4139; GFX11-NEXT:    v_mov_b32_e32 v16, 0
4140; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
4141; GFX11-NEXT:    s_clause 0x1
4142; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
4143; GFX11-NEXT:    global_load_b128 v[4:7], v16, s[4:5] offset:64
4144; GFX11-NEXT:    s_waitcnt vmcnt(0)
4145; GFX11-NEXT:    v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
4146; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
4147; GFX11-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
4148; GFX11-NEXT:    s_waitcnt_depctr 0xfff
4149; GFX11-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
4150; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4151; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4152; GFX11-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
4153; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4154; GFX11-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
4155; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4156; GFX11-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
4157; GFX11-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
4158; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4159; GFX11-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
4160; GFX11-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
4161; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4162; GFX11-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
4163; GFX11-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
4164; GFX11-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
4165; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
4166; GFX11-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
4167; GFX11-NEXT:    s_waitcnt_depctr 0xfff
4168; GFX11-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
4169; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
4170; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4171; GFX11-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
4172; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
4173; GFX11-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
4174; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4175; GFX11-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
4176; GFX11-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
4177; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4178; GFX11-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
4179; GFX11-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
4180; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4181; GFX11-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
4182; GFX11-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
4183; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
4184; GFX11-NEXT:    s_endpgm
4185;
4186; GFX1150-LABEL: frem_v2f64:
4187; GFX1150:       ; %bb.0:
4188; GFX1150-NEXT:    s_clause 0x1
4189; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
4190; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
4191; GFX1150-NEXT:    v_mov_b32_e32 v16, 0
4192; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
4193; GFX1150-NEXT:    s_clause 0x1
4194; GFX1150-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
4195; GFX1150-NEXT:    global_load_b128 v[4:7], v16, s[4:5] offset:64
4196; GFX1150-NEXT:    s_waitcnt vmcnt(0)
4197; GFX1150-NEXT:    v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
4198; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
4199; GFX1150-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
4200; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
4201; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4202; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4203; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
4204; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
4205; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
4206; GFX1150-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
4207; GFX1150-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
4208; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4209; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
4210; GFX1150-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
4211; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4212; GFX1150-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
4213; GFX1150-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
4214; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
4215; GFX1150-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
4216; GFX1150-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
4217; GFX1150-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
4218; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4219; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
4220; GFX1150-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
4221; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4222; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
4223; GFX1150-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
4224; GFX1150-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
4225; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4226; GFX1150-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
4227; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
4228; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4229; GFX1150-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
4230; GFX1150-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
4231; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4232; GFX1150-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
4233; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
4234; GFX1150-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
4235; GFX1150-NEXT:    s_endpgm
4236                        ptr addrspace(1) %in2) #0 {
4237   %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
4238   %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
4239   %r1 = load <2 x double>, ptr addrspace(1) %gep2, align 16
4240   %r2 = frem <2 x double> %r0, %r1
4241   store <2 x double> %r2, ptr addrspace(1) %out, align 16
4242   ret void
4243}
4244
; Attribute group #0: nounwind, "unsafe-fp-math" disabled, and
; "denormal-fp-math-f32" set to preserve-sign for both entries.
; Referenced by @frem_v2f64 (its signature ends in `#0`).
4245attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; Attribute group #1: identical to #0 except "unsafe-fp-math"="true".
; NOTE(review): no user of #1 is visible in this part of the file; presumably
; it is attached to unsafe-math test variants elsewhere -- confirm.
4246attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
4247