xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
3; RUN:  llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
4
; frem_f16: scalar half-precision remainder with no fast-math flags on the
; instruction. Loads the dividend from %in1 and the divisor from %in2, computes
; `frem half`, and stores the result to %out.
; The assertion lines inside the body are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; NOTE(review): attribute group #0 is defined outside this view - confirm what
; it enables before relying on it.
5define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
6; CI-LABEL: frem_f16:
7; CI:       ; %bb.0:
8; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
9; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
10; CI-NEXT:    s_waitcnt lgkmcnt(0)
11; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
12; CI-NEXT:    s_load_dword s3, s[4:5], 0x2
13; CI-NEXT:    s_waitcnt lgkmcnt(0)
14; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
15; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
16; CI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
17; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
18; CI-NEXT:    v_rcp_f32_e32 v4, v2
19; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
20; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
21; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
22; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
23; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
24; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
25; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
26; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
27; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
28; CI-NEXT:    s_mov_b32 s2, -1
29; CI-NEXT:    s_mov_b32 s3, 0xf000
30; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
31; CI-NEXT:    v_trunc_f32_e32 v2, v2
32; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
33; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
34; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
35; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
36; CI-NEXT:    s_endpgm
37;
38; VI-LABEL: frem_f16:
39; VI:       ; %bb.0:
40; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
41; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
42; VI-NEXT:    s_waitcnt lgkmcnt(0)
43; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
44; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
45; VI-NEXT:    s_waitcnt lgkmcnt(0)
46; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
47; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
48; VI-NEXT:    v_mov_b32_e32 v1, s3
49; VI-NEXT:    v_rcp_f32_e32 v3, v2
50; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
51; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
52; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
53; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
54; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
55; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
56; VI-NEXT:    v_add_f32_e32 v0, v0, v4
57; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
58; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
59; VI-NEXT:    v_trunc_f16_e32 v0, v0
60; VI-NEXT:    v_fma_f16 v2, -v0, v1, s2
61; VI-NEXT:    v_mov_b32_e32 v0, s0
62; VI-NEXT:    v_mov_b32_e32 v1, s1
63; VI-NEXT:    flat_store_short v[0:1], v2
64; VI-NEXT:    s_endpgm
; The divisor is read from half element 4 of %in2 (getelementptr index 4),
; while the dividend is read from the start of %in1.
65   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
66   %r0 = load half, ptr addrspace(1) %in1, align 4
67   %r1 = load half, ptr addrspace(1) %gep2, align 4
68   %r2 = frem half %r0, %r1
69   store half %r2, ptr addrspace(1) %out, align 4
70   ret void
71}
72
; fast_frem_f16: same load/frem/store shape as the plain half-precision test,
; but the remainder is written as `frem fast half`, i.e. the fast-math flags
; are attached to the instruction itself.
; The assertion lines inside the body are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
73define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
74; CI-LABEL: fast_frem_f16:
75; CI:       ; %bb.0:
76; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
77; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
78; CI-NEXT:    s_waitcnt lgkmcnt(0)
79; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
80; CI-NEXT:    s_load_dword s3, s[4:5], 0x2
81; CI-NEXT:    s_waitcnt lgkmcnt(0)
82; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
83; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
84; CI-NEXT:    s_mov_b32 s2, -1
85; CI-NEXT:    s_mov_b32 s3, 0xf000
86; CI-NEXT:    v_rcp_f32_e32 v2, v1
87; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
88; CI-NEXT:    v_trunc_f32_e32 v2, v2
89; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
90; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
91; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
92; CI-NEXT:    s_endpgm
93;
94; VI-LABEL: fast_frem_f16:
95; VI:       ; %bb.0:
96; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
97; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
98; VI-NEXT:    s_waitcnt lgkmcnt(0)
99; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
100; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
101; VI-NEXT:    s_waitcnt lgkmcnt(0)
102; VI-NEXT:    v_mov_b32_e32 v1, s2
103; VI-NEXT:    v_rcp_f16_e32 v0, s3
104; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
105; VI-NEXT:    v_trunc_f16_e32 v0, v0
106; VI-NEXT:    v_fma_f16 v2, -v0, s3, v1
107; VI-NEXT:    v_mov_b32_e32 v0, s0
108; VI-NEXT:    v_mov_b32_e32 v1, s1
109; VI-NEXT:    flat_store_short v[0:1], v2
110; VI-NEXT:    s_endpgm
; The divisor is read from half element 4 of %in2 (getelementptr index 4),
; while the dividend is read from the start of %in1.
111   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
112   %r0 = load half, ptr addrspace(1) %in1, align 4
113   %r1 = load half, ptr addrspace(1) %gep2, align 4
114   %r2 = frem fast half %r0, %r1
115   store half %r2, ptr addrspace(1) %out, align 4
116   ret void
117}
118
; unsafe_frem_f16: half-precision remainder where the `frem` instruction has
; no fast-math flags; the only difference from the plain half test visible
; here is that the function uses attribute group #1 instead of #0.
; The expected instruction sequences recorded below are the same as those
; recorded for @fast_frem_f16.
; NOTE(review): attribute group #1 is defined outside this view; presumably it
; enables unsafe FP math given the test name - confirm against the attribute
; definitions at the end of the file.
; The assertion lines inside the body are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
119define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
120; CI-LABEL: unsafe_frem_f16:
121; CI:       ; %bb.0:
122; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
123; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
124; CI-NEXT:    s_waitcnt lgkmcnt(0)
125; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
126; CI-NEXT:    s_load_dword s3, s[4:5], 0x2
127; CI-NEXT:    s_waitcnt lgkmcnt(0)
128; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
129; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
130; CI-NEXT:    s_mov_b32 s2, -1
131; CI-NEXT:    s_mov_b32 s3, 0xf000
132; CI-NEXT:    v_rcp_f32_e32 v2, v1
133; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
134; CI-NEXT:    v_trunc_f32_e32 v2, v2
135; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
136; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
137; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
138; CI-NEXT:    s_endpgm
139;
140; VI-LABEL: unsafe_frem_f16:
141; VI:       ; %bb.0:
142; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
143; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
144; VI-NEXT:    s_waitcnt lgkmcnt(0)
145; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
146; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
147; VI-NEXT:    s_waitcnt lgkmcnt(0)
148; VI-NEXT:    v_mov_b32_e32 v1, s2
149; VI-NEXT:    v_rcp_f16_e32 v0, s3
150; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
151; VI-NEXT:    v_trunc_f16_e32 v0, v0
152; VI-NEXT:    v_fma_f16 v2, -v0, s3, v1
153; VI-NEXT:    v_mov_b32_e32 v0, s0
154; VI-NEXT:    v_mov_b32_e32 v1, s1
155; VI-NEXT:    flat_store_short v[0:1], v2
156; VI-NEXT:    s_endpgm
; The divisor is read from half element 4 of %in2 (getelementptr index 4),
; while the dividend is read from the start of %in1.
157   %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
158   %r0 = load half, ptr addrspace(1) %in1, align 4
159   %r1 = load half, ptr addrspace(1) %gep2, align 4
160   %r2 = frem half %r0, %r1
161   store half %r2, ptr addrspace(1) %out, align 4
162   ret void
163}
164
; frem_f32: scalar single-precision remainder with no fast-math flags on the
; instruction. Loads the dividend from %in1 and the divisor from %in2, computes
; `frem float`, and stores the result to %out.
; The assertion lines inside the body are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; NOTE(review): attribute group #0 is defined outside this view - confirm what
; it enables before relying on it.
165define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
166; CI-LABEL: frem_f32:
167; CI:       ; %bb.0:
168; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
169; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
170; CI-NEXT:    s_waitcnt lgkmcnt(0)
171; CI-NEXT:    s_load_dword s6, s[2:3], 0x0
172; CI-NEXT:    s_load_dword s2, s[4:5], 0x4
173; CI-NEXT:    s_waitcnt lgkmcnt(0)
174; CI-NEXT:    v_mov_b32_e32 v0, s2
175; CI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s6
176; CI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
177; CI-NEXT:    v_rcp_f32_e32 v3, v1
178; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
179; CI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
180; CI-NEXT:    v_fma_f32 v3, v4, v3, v3
181; CI-NEXT:    v_mul_f32_e32 v4, v2, v3
182; CI-NEXT:    v_fma_f32 v5, -v1, v4, v2
183; CI-NEXT:    v_fma_f32 v4, v5, v3, v4
184; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
185; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
186; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
187; CI-NEXT:    s_mov_b32 s2, -1
188; CI-NEXT:    s_mov_b32 s3, 0xf000
189; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s6
190; CI-NEXT:    v_trunc_f32_e32 v1, v1
191; CI-NEXT:    v_fma_f32 v0, -v1, v0, s6
192; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
193; CI-NEXT:    s_endpgm
194;
195; VI-LABEL: frem_f32:
196; VI:       ; %bb.0:
197; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
198; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
199; VI-NEXT:    s_waitcnt lgkmcnt(0)
200; VI-NEXT:    s_load_dword s6, s[2:3], 0x0
201; VI-NEXT:    s_load_dword s2, s[4:5], 0x10
202; VI-NEXT:    s_waitcnt lgkmcnt(0)
203; VI-NEXT:    v_mov_b32_e32 v0, s2
204; VI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s6
205; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
206; VI-NEXT:    v_rcp_f32_e32 v3, v1
207; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
208; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
209; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
210; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
211; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
212; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
213; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
214; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
215; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
216; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s6
217; VI-NEXT:    v_trunc_f32_e32 v1, v1
218; VI-NEXT:    v_fma_f32 v2, -v1, v0, s6
219; VI-NEXT:    v_mov_b32_e32 v0, s0
220; VI-NEXT:    v_mov_b32_e32 v1, s1
221; VI-NEXT:    flat_store_dword v[0:1], v2
222; VI-NEXT:    s_endpgm
; The divisor is read from float element 4 of %in2 (getelementptr index 4),
; while the dividend is read from the start of %in1.
223   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
224   %r0 = load float, ptr addrspace(1) %in1, align 4
225   %r1 = load float, ptr addrspace(1) %gep2, align 4
226   %r2 = frem float %r0, %r1
227   store float %r2, ptr addrspace(1) %out, align 4
228   ret void
229}
230
; fast_frem_f32: same load/frem/store shape as the plain single-precision test,
; but the remainder is written as `frem fast float`, i.e. the fast-math flags
; are attached to the instruction itself.
; The assertion lines inside the body are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
231define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
232; CI-LABEL: fast_frem_f32:
233; CI:       ; %bb.0:
234; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
235; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
236; CI-NEXT:    s_waitcnt lgkmcnt(0)
237; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
238; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
239; CI-NEXT:    s_waitcnt lgkmcnt(0)
240; CI-NEXT:    v_mov_b32_e32 v1, s2
241; CI-NEXT:    v_rcp_f32_e32 v0, s3
242; CI-NEXT:    v_mul_f32_e32 v0, s2, v0
243; CI-NEXT:    v_trunc_f32_e32 v0, v0
244; CI-NEXT:    v_fma_f32 v0, -v0, s3, v1
245; CI-NEXT:    s_mov_b32 s2, -1
246; CI-NEXT:    s_mov_b32 s3, 0xf000
247; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
248; CI-NEXT:    s_endpgm
249;
250; VI-LABEL: fast_frem_f32:
251; VI:       ; %bb.0:
252; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
253; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
254; VI-NEXT:    s_waitcnt lgkmcnt(0)
255; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
256; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
257; VI-NEXT:    s_waitcnt lgkmcnt(0)
258; VI-NEXT:    v_mov_b32_e32 v1, s2
259; VI-NEXT:    v_rcp_f32_e32 v0, s3
260; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
261; VI-NEXT:    v_trunc_f32_e32 v0, v0
262; VI-NEXT:    v_fma_f32 v2, -v0, s3, v1
263; VI-NEXT:    v_mov_b32_e32 v0, s0
264; VI-NEXT:    v_mov_b32_e32 v1, s1
265; VI-NEXT:    flat_store_dword v[0:1], v2
266; VI-NEXT:    s_endpgm
; The divisor is read from float element 4 of %in2 (getelementptr index 4),
; while the dividend is read from the start of %in1.
267   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
268   %r0 = load float, ptr addrspace(1) %in1, align 4
269   %r1 = load float, ptr addrspace(1) %gep2, align 4
270   %r2 = frem fast float %r0, %r1
271   store float %r2, ptr addrspace(1) %out, align 4
272   ret void
273}
274
; unsafe_frem_f32: single-precision remainder where the `frem` instruction has
; no fast-math flags; the only difference from the plain float test visible
; here is that the function uses attribute group #1 instead of #0.
; The expected instruction sequences recorded below are the same as those
; recorded for @fast_frem_f32.
; NOTE(review): attribute group #1 is defined outside this view; presumably it
; enables unsafe FP math given the test name - confirm against the attribute
; definitions at the end of the file.
; The assertion lines inside the body are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
275define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
276; CI-LABEL: unsafe_frem_f32:
277; CI:       ; %bb.0:
278; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
279; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
280; CI-NEXT:    s_waitcnt lgkmcnt(0)
281; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
282; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
283; CI-NEXT:    s_waitcnt lgkmcnt(0)
284; CI-NEXT:    v_mov_b32_e32 v1, s2
285; CI-NEXT:    v_rcp_f32_e32 v0, s3
286; CI-NEXT:    v_mul_f32_e32 v0, s2, v0
287; CI-NEXT:    v_trunc_f32_e32 v0, v0
288; CI-NEXT:    v_fma_f32 v0, -v0, s3, v1
289; CI-NEXT:    s_mov_b32 s2, -1
290; CI-NEXT:    s_mov_b32 s3, 0xf000
291; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
292; CI-NEXT:    s_endpgm
293;
294; VI-LABEL: unsafe_frem_f32:
295; VI:       ; %bb.0:
296; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
297; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
298; VI-NEXT:    s_waitcnt lgkmcnt(0)
299; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
300; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
301; VI-NEXT:    s_waitcnt lgkmcnt(0)
302; VI-NEXT:    v_mov_b32_e32 v1, s2
303; VI-NEXT:    v_rcp_f32_e32 v0, s3
304; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
305; VI-NEXT:    v_trunc_f32_e32 v0, v0
306; VI-NEXT:    v_fma_f32 v2, -v0, s3, v1
307; VI-NEXT:    v_mov_b32_e32 v0, s0
308; VI-NEXT:    v_mov_b32_e32 v1, s1
309; VI-NEXT:    flat_store_dword v[0:1], v2
310; VI-NEXT:    s_endpgm
; The divisor is read from float element 4 of %in2 (getelementptr index 4),
; while the dividend is read from the start of %in1.
311   %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
312   %r0 = load float, ptr addrspace(1) %in1, align 4
313   %r1 = load float, ptr addrspace(1) %gep2, align 4
314   %r2 = frem float %r0, %r1
315   store float %r2, ptr addrspace(1) %out, align 4
316   ret void
317}
318
; frem_f64: scalar double-precision remainder with no fast-math flags on the
; instruction. Both operands are loaded from the start of their buffers (%in1
; and %in2, no getelementptr offset), `frem double` is computed, and the result
; is stored to %out.
; The assertion lines inside the body are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; NOTE(review): attribute group #0 is defined outside this view - confirm what
; it enables before relying on it.
319define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
320; CI-LABEL: frem_f64:
321; CI:       ; %bb.0:
322; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
323; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
324; CI-NEXT:    s_waitcnt lgkmcnt(0)
325; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
326; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
327; CI-NEXT:    s_waitcnt lgkmcnt(0)
328; CI-NEXT:    v_mov_b32_e32 v0, s4
329; CI-NEXT:    v_mov_b32_e32 v1, s5
330; CI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
331; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
332; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
333; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
334; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
335; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
336; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
337; CI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
338; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
339; CI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
340; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
341; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
342; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
343; CI-NEXT:    s_mov_b32 s2, -1
344; CI-NEXT:    s_mov_b32 s3, 0xf000
345; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
346; CI-NEXT:    s_endpgm
347;
348; VI-LABEL: frem_f64:
349; VI:       ; %bb.0:
350; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
351; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
352; VI-NEXT:    s_waitcnt lgkmcnt(0)
353; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
354; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
355; VI-NEXT:    s_waitcnt lgkmcnt(0)
356; VI-NEXT:    v_mov_b32_e32 v0, s4
357; VI-NEXT:    v_mov_b32_e32 v1, s5
358; VI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
359; VI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
360; VI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
361; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
362; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
363; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
364; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
365; VI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
366; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
367; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
368; VI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
369; VI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
370; VI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
371; VI-NEXT:    v_mov_b32_e32 v3, s1
372; VI-NEXT:    v_mov_b32_e32 v2, s0
373; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
374; VI-NEXT:    s_endpgm
375   %r0 = load double, ptr addrspace(1) %in1, align 8
376   %r1 = load double, ptr addrspace(1) %in2, align 8
377   %r2 = frem double %r0, %r1
378   store double %r2, ptr addrspace(1) %out, align 8
379   ret void
380}
381
; fast_frem_f64: same load/frem/store shape as the plain double-precision test,
; but the remainder is written as `frem fast double`, i.e. the fast-math flags
; are attached to the instruction itself. Both operands are loaded from the
; start of their buffers (no getelementptr offset).
; The assertion lines inside the body are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
382define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
383; CI-LABEL: fast_frem_f64:
384; CI:       ; %bb.0:
385; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
386; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
387; CI-NEXT:    s_waitcnt lgkmcnt(0)
388; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
389; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
390; CI-NEXT:    s_waitcnt lgkmcnt(0)
391; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
392; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
393; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
394; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
395; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
396; CI-NEXT:    v_mov_b32_e32 v2, s2
397; CI-NEXT:    v_mov_b32_e32 v3, s3
398; CI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
399; CI-NEXT:    s_mov_b32 s2, -1
400; CI-NEXT:    s_mov_b32 s3, 0xf000
401; CI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
402; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
403; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
404; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
405; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
406; CI-NEXT:    s_endpgm
407;
408; VI-LABEL: fast_frem_f64:
409; VI:       ; %bb.0:
410; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
411; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
412; VI-NEXT:    s_waitcnt lgkmcnt(0)
413; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
414; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
415; VI-NEXT:    s_waitcnt lgkmcnt(0)
416; VI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
417; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
418; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
419; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
420; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
421; VI-NEXT:    v_mov_b32_e32 v2, s2
422; VI-NEXT:    v_mov_b32_e32 v3, s3
423; VI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
424; VI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
425; VI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
426; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
427; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
428; VI-NEXT:    v_mov_b32_e32 v3, s1
429; VI-NEXT:    v_mov_b32_e32 v2, s0
430; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
431; VI-NEXT:    s_endpgm
432   %r0 = load double, ptr addrspace(1) %in1, align 8
433   %r1 = load double, ptr addrspace(1) %in2, align 8
434   %r2 = frem fast double %r0, %r1
435   store double %r2, ptr addrspace(1) %out, align 8
436   ret void
437}
438
; unsafe_frem_f64: double-precision remainder where the `frem` instruction has
; no fast-math flags; the function uses attribute group #1 instead of #0.
; The expected instruction sequences recorded below are the same as those
; recorded for @fast_frem_f64.
; The parameter list is split over two source lines: the autogenerated
; assertion lines sit between the first line of the signature and its
; continuation (`ptr addrspace(1) %in2) #1 {`), so keep both halves together
; when editing.
; NOTE(review): attribute group #1 is defined outside this view; presumably it
; enables unsafe FP math given the test name - confirm against the attribute
; definitions at the end of the file.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing them
; by hand.
439define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
440; CI-LABEL: unsafe_frem_f64:
441; CI:       ; %bb.0:
442; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
443; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
444; CI-NEXT:    s_waitcnt lgkmcnt(0)
445; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
446; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
447; CI-NEXT:    s_waitcnt lgkmcnt(0)
448; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
449; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
450; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
451; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
452; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
453; CI-NEXT:    v_mov_b32_e32 v2, s2
454; CI-NEXT:    v_mov_b32_e32 v3, s3
455; CI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
456; CI-NEXT:    s_mov_b32 s2, -1
457; CI-NEXT:    s_mov_b32 s3, 0xf000
458; CI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
459; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
460; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
461; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
462; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
463; CI-NEXT:    s_endpgm
464;
465; VI-LABEL: unsafe_frem_f64:
466; VI:       ; %bb.0:
467; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
468; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
469; VI-NEXT:    s_waitcnt lgkmcnt(0)
470; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
471; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
472; VI-NEXT:    s_waitcnt lgkmcnt(0)
473; VI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
474; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
475; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
476; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
477; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
478; VI-NEXT:    v_mov_b32_e32 v2, s2
479; VI-NEXT:    v_mov_b32_e32 v3, s3
480; VI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
481; VI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
482; VI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
483; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
484; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
485; VI-NEXT:    v_mov_b32_e32 v3, s1
486; VI-NEXT:    v_mov_b32_e32 v2, s0
487; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
488; VI-NEXT:    s_endpgm
489                             ptr addrspace(1) %in2) #1 {
490   %r0 = load double, ptr addrspace(1) %in1, align 8
491   %r1 = load double, ptr addrspace(1) %in2, align 8
492   %r2 = frem double %r0, %r1
493   store double %r2, ptr addrspace(1) %out, align 8
494   ret void
495}
496
; frem_v2f16: two-element half vector remainder with no fast-math flags on the
; instruction. Loads a <2 x half> dividend from %in1 and a <2 x half> divisor
; from %in2, computes `frem <2 x half>`, and stores the result to %out.
; In the expected output recorded below, the per-element division sequence
; appears twice and the two 16-bit results are combined with a shift-left by
; 16 and an OR before a single dword store.
; The assertion lines inside the body are autogenerated (see the NOTE at the
; top of the file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
; NOTE(review): attribute group #0 is defined outside this view - confirm what
; it enables before relying on it.
497define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
498; CI-LABEL: frem_v2f16:
499; CI:       ; %bb.0:
500; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
501; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
502; CI-NEXT:    s_waitcnt lgkmcnt(0)
503; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
504; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
505; CI-NEXT:    s_waitcnt lgkmcnt(0)
506; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
507; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
508; CI-NEXT:    s_lshr_b32 s4, s2, 16
509; CI-NEXT:    s_lshr_b32 s5, s3, 16
510; CI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
511; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
512; CI-NEXT:    v_rcp_f32_e32 v4, v2
513; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
514; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
515; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
516; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
517; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
518; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
519; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
520; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
521; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
522; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
523; CI-NEXT:    v_trunc_f32_e32 v2, v2
524; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
525; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
526; CI-NEXT:    v_cvt_f32_f16_e32 v2, s5
527; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
528; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
529; CI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, v1
530; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v2, v1
531; CI-NEXT:    v_rcp_f32_e32 v5, v3
532; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
533; CI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
534; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
535; CI-NEXT:    v_mul_f32_e32 v6, v4, v5
536; CI-NEXT:    v_fma_f32 v7, -v3, v6, v4
537; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
538; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
539; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
540; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
541; CI-NEXT:    s_mov_b32 s2, -1
542; CI-NEXT:    s_mov_b32 s3, 0xf000
543; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
544; CI-NEXT:    v_trunc_f32_e32 v3, v3
545; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
546; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
547; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
548; CI-NEXT:    v_or_b32_e32 v0, v0, v1
549; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
550; CI-NEXT:    s_endpgm
551;
552; VI-LABEL: frem_v2f16:
553; VI:       ; %bb.0:
554; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
555; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
556; VI-NEXT:    s_waitcnt lgkmcnt(0)
557; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
558; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
559; VI-NEXT:    s_waitcnt lgkmcnt(0)
560; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
561; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
562; VI-NEXT:    s_lshr_b32 s5, s3, 16
563; VI-NEXT:    v_mov_b32_e32 v1, s3
564; VI-NEXT:    s_lshr_b32 s4, s2, 16
565; VI-NEXT:    v_rcp_f32_e32 v3, v2
566; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
567; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
568; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
569; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
570; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
571; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
572; VI-NEXT:    v_add_f32_e32 v0, v0, v4
573; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
574; VI-NEXT:    v_cvt_f32_f16_e32 v3, s5
575; VI-NEXT:    v_mov_b32_e32 v2, s5
576; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
577; VI-NEXT:    v_trunc_f16_e32 v0, v0
578; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
579; VI-NEXT:    v_cvt_f32_f16_e32 v1, s4
580; VI-NEXT:    v_rcp_f32_e32 v4, v3
581; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
582; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
583; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
584; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
585; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
586; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
587; VI-NEXT:    v_add_f32_e32 v1, v1, v5
588; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
589; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s4
590; VI-NEXT:    v_trunc_f16_e32 v1, v1
591; VI-NEXT:    v_fma_f16 v1, -v1, v2, s4
592; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
593; VI-NEXT:    v_or_b32_e32 v2, v0, v1
594; VI-NEXT:    v_mov_b32_e32 v0, s0
595; VI-NEXT:    v_mov_b32_e32 v1, s1
596; VI-NEXT:    flat_store_dword v[0:1], v2
597; VI-NEXT:    s_endpgm
; The divisor is read from <2 x half> element 4 of %in2 (getelementptr index
; 4), while the dividend is read from the start of %in1.
598   %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
599   %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
600   %r1 = load <2 x half>, ptr addrspace(1) %gep2, align 8
601   %r2 = frem <2 x half> %r0, %r1
602   store <2 x half> %r2, ptr addrspace(1) %out, align 8
603   ret void
604}
605
606define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
607; CI-LABEL: frem_v4f16:
608; CI:       ; %bb.0:
609; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
610; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
611; CI-NEXT:    s_waitcnt lgkmcnt(0)
612; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
613; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
614; CI-NEXT:    s_waitcnt lgkmcnt(0)
615; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
616; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
617; CI-NEXT:    s_lshr_b32 s8, s2, 16
618; CI-NEXT:    s_lshr_b32 s9, s3, 16
619; CI-NEXT:    s_lshr_b32 s10, s4, 16
620; CI-NEXT:    v_div_scale_f32 v2, s[6:7], v1, v1, v0
621; CI-NEXT:    s_lshr_b32 s11, s5, 16
622; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
623; CI-NEXT:    v_rcp_f32_e32 v4, v2
624; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
625; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
626; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
627; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
628; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
629; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
630; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
631; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
632; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
633; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
634; CI-NEXT:    v_trunc_f32_e32 v2, v2
635; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
636; CI-NEXT:    v_cvt_f32_f16_e32 v1, s8
637; CI-NEXT:    v_cvt_f32_f16_e32 v2, s10
638; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
639; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
640; CI-NEXT:    v_div_scale_f32 v3, s[6:7], v2, v2, v1
641; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v2, v1
642; CI-NEXT:    v_rcp_f32_e32 v5, v3
643; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
644; CI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
645; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
646; CI-NEXT:    v_mul_f32_e32 v6, v4, v5
647; CI-NEXT:    v_fma_f32 v7, -v3, v6, v4
648; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
649; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
650; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
651; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
652; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
653; CI-NEXT:    v_trunc_f32_e32 v3, v3
654; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
655; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
656; CI-NEXT:    v_cvt_f32_f16_e32 v3, s5
657; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
658; CI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, v2
659; CI-NEXT:    v_div_scale_f32 v5, vcc, v2, v3, v2
660; CI-NEXT:    v_rcp_f32_e32 v6, v4
661; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
662; CI-NEXT:    v_fma_f32 v7, -v4, v6, 1.0
663; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
664; CI-NEXT:    v_mul_f32_e32 v7, v5, v6
665; CI-NEXT:    v_fma_f32 v8, -v4, v7, v5
666; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
667; CI-NEXT:    v_fma_f32 v4, -v4, v7, v5
668; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
669; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
670; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v2
671; CI-NEXT:    v_trunc_f32_e32 v4, v4
672; CI-NEXT:    v_fma_f32 v2, -v4, v3, v2
673; CI-NEXT:    v_cvt_f32_f16_e32 v3, s9
674; CI-NEXT:    v_cvt_f32_f16_e32 v4, s11
675; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
676; CI-NEXT:    v_div_scale_f32 v5, s[2:3], v4, v4, v3
677; CI-NEXT:    v_div_scale_f32 v6, vcc, v3, v4, v3
678; CI-NEXT:    v_rcp_f32_e32 v7, v5
679; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
680; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
681; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
682; CI-NEXT:    v_mul_f32_e32 v8, v6, v7
683; CI-NEXT:    v_fma_f32 v9, -v5, v8, v6
684; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
685; CI-NEXT:    v_fma_f32 v5, -v5, v8, v6
686; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
687; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
688; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
689; CI-NEXT:    v_or_b32_e32 v0, v0, v1
690; CI-NEXT:    s_mov_b32 s2, -1
691; CI-NEXT:    s_mov_b32 s3, 0xf000
692; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v3
693; CI-NEXT:    v_trunc_f32_e32 v5, v5
694; CI-NEXT:    v_fma_f32 v3, -v5, v4, v3
695; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
696; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
697; CI-NEXT:    v_or_b32_e32 v1, v2, v1
698; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
699; CI-NEXT:    s_endpgm
700;
701; VI-LABEL: frem_v4f16:
702; VI:       ; %bb.0:
703; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
704; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
705; VI-NEXT:    s_waitcnt lgkmcnt(0)
706; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
707; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
708; VI-NEXT:    s_waitcnt lgkmcnt(0)
709; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
710; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
711; VI-NEXT:    s_lshr_b32 s8, s4, 16
712; VI-NEXT:    v_mov_b32_e32 v1, s4
713; VI-NEXT:    s_lshr_b32 s6, s2, 16
714; VI-NEXT:    v_rcp_f32_e32 v3, v2
715; VI-NEXT:    s_lshr_b32 s9, s5, 16
716; VI-NEXT:    s_lshr_b32 s7, s3, 16
717; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
718; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
719; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
720; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
721; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
722; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
723; VI-NEXT:    v_add_f32_e32 v0, v0, v4
724; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
725; VI-NEXT:    v_cvt_f32_f16_e32 v3, s8
726; VI-NEXT:    v_mov_b32_e32 v2, s8
727; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
728; VI-NEXT:    v_trunc_f16_e32 v0, v0
729; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
730; VI-NEXT:    v_cvt_f32_f16_e32 v1, s6
731; VI-NEXT:    v_rcp_f32_e32 v4, v3
732; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
733; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
734; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
735; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
736; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
737; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
738; VI-NEXT:    v_add_f32_e32 v1, v1, v5
739; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
740; VI-NEXT:    v_cvt_f32_f16_e32 v4, s5
741; VI-NEXT:    v_mov_b32_e32 v3, s5
742; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s6
743; VI-NEXT:    v_trunc_f16_e32 v1, v1
744; VI-NEXT:    v_fma_f16 v1, -v1, v2, s6
745; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
746; VI-NEXT:    v_rcp_f32_e32 v5, v4
747; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
748; VI-NEXT:    v_or_b32_e32 v0, v0, v1
749; VI-NEXT:    v_mul_f32_e32 v6, v2, v5
750; VI-NEXT:    v_mad_f32 v7, -v4, v6, v2
751; VI-NEXT:    v_mac_f32_e32 v6, v7, v5
752; VI-NEXT:    v_mad_f32 v2, -v4, v6, v2
753; VI-NEXT:    v_mul_f32_e32 v2, v2, v5
754; VI-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
755; VI-NEXT:    v_add_f32_e32 v2, v2, v6
756; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
757; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
758; VI-NEXT:    v_mov_b32_e32 v4, s9
759; VI-NEXT:    v_div_fixup_f16 v2, v2, v3, s3
760; VI-NEXT:    v_trunc_f16_e32 v2, v2
761; VI-NEXT:    v_fma_f16 v2, -v2, v3, s3
762; VI-NEXT:    v_cvt_f32_f16_e32 v3, s7
763; VI-NEXT:    v_rcp_f32_e32 v6, v5
764; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
765; VI-NEXT:    v_mad_f32 v8, -v5, v7, v3
766; VI-NEXT:    v_mac_f32_e32 v7, v8, v6
767; VI-NEXT:    v_mad_f32 v3, -v5, v7, v3
768; VI-NEXT:    v_mul_f32_e32 v3, v3, v6
769; VI-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
770; VI-NEXT:    v_add_f32_e32 v3, v3, v7
771; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
772; VI-NEXT:    v_div_fixup_f16 v3, v3, v4, s7
773; VI-NEXT:    v_trunc_f16_e32 v3, v3
774; VI-NEXT:    v_fma_f16 v3, -v3, v4, s7
775; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
776; VI-NEXT:    v_or_b32_e32 v1, v2, v1
777; VI-NEXT:    v_mov_b32_e32 v3, s1
778; VI-NEXT:    v_mov_b32_e32 v2, s0
779; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
780; VI-NEXT:    s_endpgm
781   %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
782   %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
783   %r1 = load <4 x half>, ptr addrspace(1) %gep2, align 16
784   %r2 = frem <4 x half> %r0, %r1
785   store <4 x half> %r2, ptr addrspace(1) %out, align 16
786   ret void
787}
788
; frem_v2f32: loads two <2 x float> vectors, computes their frem, and stores
; the result to %out. The check lines below are autogenerated (see the NOTE on
; the first line of the file) for the two RUN lines: prefix CI is
; -mcpu=bonaire and prefix VI is -mcpu=tonga, both with -global-isel.
; In both check bodies each of the two lanes gets its own copy of the same
; sequence: two v_div_scale_f32, v_rcp_f32, an fma/mul refinement chain
; bracketed by two s_setreg_imm32_b32 writes to HW_REG_MODE, v_div_fmas_f32,
; v_div_fixup_f32, v_trunc_f32, and a final v_fma_f32 that uses the negated
; truncated quotient.
; NOTE(review): the HW_REG_MODE writes presumably toggle f32 denormal handling
; because attribute #0 requests "preserve-sign" -- confirm against the backend.
; The two check bodies differ only in the s_load immediates and in the store
; (buffer_store_dwordx2 with s2/s3 set by s_mov_b32 vs flat_store_dwordx2).
789define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
790; CI-LABEL: frem_v2f32:
791; CI:       ; %bb.0:
792; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
793; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
794; CI-NEXT:    s_waitcnt lgkmcnt(0)
795; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
796; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
797; CI-NEXT:    s_waitcnt lgkmcnt(0)
798; CI-NEXT:    v_mov_b32_e32 v0, s4
799; CI-NEXT:    v_div_scale_f32 v1, s[6:7], v0, v0, s2
800; CI-NEXT:    v_div_scale_f32 v2, vcc, s2, v0, s2
801; CI-NEXT:    v_rcp_f32_e32 v3, v1
802; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
803; CI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
804; CI-NEXT:    v_fma_f32 v3, v4, v3, v3
805; CI-NEXT:    v_mul_f32_e32 v4, v2, v3
806; CI-NEXT:    v_fma_f32 v5, -v1, v4, v2
807; CI-NEXT:    v_fma_f32 v4, v5, v3, v4
808; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
809; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
810; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
811; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
812; CI-NEXT:    v_trunc_f32_e32 v1, v1
813; CI-NEXT:    v_fma_f32 v0, -v1, v0, s2
814; CI-NEXT:    v_mov_b32_e32 v1, s5
815; CI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, s3
816; CI-NEXT:    v_div_scale_f32 v3, vcc, s3, v1, s3
817; CI-NEXT:    v_rcp_f32_e32 v4, v2
818; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
819; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
820; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
821; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
822; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
823; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
824; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
825; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
826; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
827; CI-NEXT:    s_mov_b32 s2, -1
828; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, s3
829; CI-NEXT:    v_trunc_f32_e32 v2, v2
830; CI-NEXT:    v_fma_f32 v1, -v2, v1, s3
831; CI-NEXT:    s_mov_b32 s3, 0xf000
832; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
833; CI-NEXT:    s_endpgm
834;
835; VI-LABEL: frem_v2f32:
836; VI:       ; %bb.0:
837; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
838; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
839; VI-NEXT:    s_waitcnt lgkmcnt(0)
840; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
841; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
842; VI-NEXT:    s_waitcnt lgkmcnt(0)
843; VI-NEXT:    v_mov_b32_e32 v0, s4
844; VI-NEXT:    v_div_scale_f32 v1, s[6:7], v0, v0, s2
845; VI-NEXT:    v_div_scale_f32 v2, vcc, s2, v0, s2
846; VI-NEXT:    v_rcp_f32_e32 v3, v1
847; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
848; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
849; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
850; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
851; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
852; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
853; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
854; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
855; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
856; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
857; VI-NEXT:    v_trunc_f32_e32 v1, v1
858; VI-NEXT:    v_fma_f32 v0, -v1, v0, s2
859; VI-NEXT:    v_mov_b32_e32 v1, s5
860; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, s3
861; VI-NEXT:    v_div_scale_f32 v3, vcc, s3, v1, s3
862; VI-NEXT:    v_rcp_f32_e32 v4, v2
863; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
864; VI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
865; VI-NEXT:    v_fma_f32 v4, v5, v4, v4
866; VI-NEXT:    v_mul_f32_e32 v5, v3, v4
867; VI-NEXT:    v_fma_f32 v6, -v2, v5, v3
868; VI-NEXT:    v_fma_f32 v5, v6, v4, v5
869; VI-NEXT:    v_fma_f32 v2, -v2, v5, v3
870; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
871; VI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
872; VI-NEXT:    v_div_fixup_f32 v2, v2, v1, s3
873; VI-NEXT:    v_trunc_f32_e32 v2, v2
874; VI-NEXT:    v_fma_f32 v1, -v2, v1, s3
875; VI-NEXT:    v_mov_b32_e32 v3, s1
876; VI-NEXT:    v_mov_b32_e32 v2, s0
877; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
878; VI-NEXT:    s_endpgm
; The gep index is in units of <2 x float>, so the divisor %r1 is read four
; vectors (32 bytes) past %in2; the second s_load_dwordx2 in each check body
; carries the matching immediate (0x8 for bonaire, 0x20 for tonga --
; presumably dword vs byte units, confirm against the ISA docs).
879   %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
880   %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
881   %r1 = load <2 x float>, ptr addrspace(1) %gep2, align 8
882   %r2 = frem <2 x float> %r0, %r1
883   store <2 x float> %r2, ptr addrspace(1) %out, align 8
884   ret void
885}
886
; frem_v4f32: loads two <4 x float> vectors, computes their frem, and stores
; the result to %out. The check lines below are autogenerated (see the NOTE on
; the first line of the file) for the two RUN lines: prefix CI is
; -mcpu=bonaire and prefix VI is -mcpu=tonga, both with -global-isel.
; Each of the four lanes (dividend in s4..s7, divisor in s8..s11) gets its own
; copy of the same sequence: two v_div_scale_f32, v_rcp_f32, an fma/mul
; refinement chain bracketed by two s_setreg_imm32_b32 writes to HW_REG_MODE,
; v_div_fmas_f32, v_div_fixup_f32, v_trunc_f32, and a final v_fma_f32 that
; uses the negated truncated quotient. Lane results land in v0..v3.
; NOTE(review): the HW_REG_MODE writes presumably toggle f32 denormal handling
; because attribute #0 requests "preserve-sign" -- confirm against the backend.
; The two check bodies differ only in the s_load immediates and in the store
; (buffer_store_dwordx4 with s2/s3 set by s_mov_b32 vs flat_store_dwordx4).
887define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
888; CI-LABEL: frem_v4f32:
889; CI:       ; %bb.0:
890; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
891; CI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
892; CI-NEXT:    s_waitcnt lgkmcnt(0)
893; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
894; CI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
895; CI-NEXT:    s_waitcnt lgkmcnt(0)
896; CI-NEXT:    v_mov_b32_e32 v0, s8
897; CI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s4
898; CI-NEXT:    v_div_scale_f32 v2, vcc, s4, v0, s4
899; CI-NEXT:    v_rcp_f32_e32 v3, v1
900; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
901; CI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
902; CI-NEXT:    v_fma_f32 v3, v4, v3, v3
903; CI-NEXT:    v_mul_f32_e32 v4, v2, v3
904; CI-NEXT:    v_fma_f32 v5, -v1, v4, v2
905; CI-NEXT:    v_fma_f32 v4, v5, v3, v4
906; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
907; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
908; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
909; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s4
910; CI-NEXT:    v_trunc_f32_e32 v1, v1
911; CI-NEXT:    v_fma_f32 v0, -v1, v0, s4
912; CI-NEXT:    v_mov_b32_e32 v1, s9
913; CI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, s5
914; CI-NEXT:    v_div_scale_f32 v3, vcc, s5, v1, s5
915; CI-NEXT:    v_rcp_f32_e32 v4, v2
916; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
917; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
918; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
919; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
920; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
921; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
922; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
923; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
924; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
925; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, s5
926; CI-NEXT:    v_trunc_f32_e32 v2, v2
927; CI-NEXT:    v_fma_f32 v1, -v2, v1, s5
928; CI-NEXT:    v_mov_b32_e32 v2, s10
929; CI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, s6
930; CI-NEXT:    v_div_scale_f32 v4, vcc, s6, v2, s6
931; CI-NEXT:    v_rcp_f32_e32 v5, v3
932; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
933; CI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
934; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
935; CI-NEXT:    v_mul_f32_e32 v6, v4, v5
936; CI-NEXT:    v_fma_f32 v7, -v3, v6, v4
937; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
938; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
939; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
940; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
941; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, s6
942; CI-NEXT:    v_trunc_f32_e32 v3, v3
943; CI-NEXT:    v_fma_f32 v2, -v3, v2, s6
944; CI-NEXT:    v_mov_b32_e32 v3, s11
945; CI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, s7
946; CI-NEXT:    v_div_scale_f32 v5, vcc, s7, v3, s7
947; CI-NEXT:    v_rcp_f32_e32 v6, v4
948; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
949; CI-NEXT:    v_fma_f32 v7, -v4, v6, 1.0
950; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
951; CI-NEXT:    v_mul_f32_e32 v7, v5, v6
952; CI-NEXT:    v_fma_f32 v8, -v4, v7, v5
953; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
954; CI-NEXT:    v_fma_f32 v4, -v4, v7, v5
955; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
956; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
957; CI-NEXT:    s_mov_b32 s2, -1
958; CI-NEXT:    s_mov_b32 s3, 0xf000
959; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, s7
960; CI-NEXT:    v_trunc_f32_e32 v4, v4
961; CI-NEXT:    v_fma_f32 v3, -v4, v3, s7
962; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
963; CI-NEXT:    s_endpgm
964;
965; VI-LABEL: frem_v4f32:
966; VI:       ; %bb.0:
967; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
968; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
969; VI-NEXT:    s_waitcnt lgkmcnt(0)
970; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
971; VI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x40
972; VI-NEXT:    s_waitcnt lgkmcnt(0)
973; VI-NEXT:    v_mov_b32_e32 v0, s8
974; VI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s4
975; VI-NEXT:    v_div_scale_f32 v2, vcc, s4, v0, s4
976; VI-NEXT:    v_rcp_f32_e32 v3, v1
977; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
978; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
979; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
980; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
981; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
982; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
983; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
984; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
985; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
986; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s4
987; VI-NEXT:    v_trunc_f32_e32 v1, v1
988; VI-NEXT:    v_fma_f32 v0, -v1, v0, s4
989; VI-NEXT:    v_mov_b32_e32 v1, s9
990; VI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, s5
991; VI-NEXT:    v_div_scale_f32 v3, vcc, s5, v1, s5
992; VI-NEXT:    v_rcp_f32_e32 v4, v2
993; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
994; VI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
995; VI-NEXT:    v_fma_f32 v4, v5, v4, v4
996; VI-NEXT:    v_mul_f32_e32 v5, v3, v4
997; VI-NEXT:    v_fma_f32 v6, -v2, v5, v3
998; VI-NEXT:    v_fma_f32 v5, v6, v4, v5
999; VI-NEXT:    v_fma_f32 v2, -v2, v5, v3
1000; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1001; VI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
1002; VI-NEXT:    v_div_fixup_f32 v2, v2, v1, s5
1003; VI-NEXT:    v_trunc_f32_e32 v2, v2
1004; VI-NEXT:    v_fma_f32 v1, -v2, v1, s5
1005; VI-NEXT:    v_mov_b32_e32 v2, s10
1006; VI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, s6
1007; VI-NEXT:    v_div_scale_f32 v4, vcc, s6, v2, s6
1008; VI-NEXT:    v_rcp_f32_e32 v5, v3
1009; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1010; VI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
1011; VI-NEXT:    v_fma_f32 v5, v6, v5, v5
1012; VI-NEXT:    v_mul_f32_e32 v6, v4, v5
1013; VI-NEXT:    v_fma_f32 v7, -v3, v6, v4
1014; VI-NEXT:    v_fma_f32 v6, v7, v5, v6
1015; VI-NEXT:    v_fma_f32 v3, -v3, v6, v4
1016; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1017; VI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
1018; VI-NEXT:    v_div_fixup_f32 v3, v3, v2, s6
1019; VI-NEXT:    v_trunc_f32_e32 v3, v3
1020; VI-NEXT:    v_fma_f32 v2, -v3, v2, s6
1021; VI-NEXT:    v_mov_b32_e32 v3, s11
1022; VI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, s7
1023; VI-NEXT:    v_div_scale_f32 v5, vcc, s7, v3, s7
1024; VI-NEXT:    v_rcp_f32_e32 v6, v4
1025; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1026; VI-NEXT:    v_fma_f32 v7, -v4, v6, 1.0
1027; VI-NEXT:    v_fma_f32 v6, v7, v6, v6
1028; VI-NEXT:    v_mul_f32_e32 v7, v5, v6
1029; VI-NEXT:    v_fma_f32 v8, -v4, v7, v5
1030; VI-NEXT:    v_fma_f32 v7, v8, v6, v7
1031; VI-NEXT:    v_fma_f32 v4, -v4, v7, v5
1032; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1033; VI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1034; VI-NEXT:    v_div_fixup_f32 v4, v4, v3, s7
1035; VI-NEXT:    v_trunc_f32_e32 v4, v4
1036; VI-NEXT:    v_fma_f32 v3, -v4, v3, s7
1037; VI-NEXT:    v_mov_b32_e32 v5, s1
1038; VI-NEXT:    v_mov_b32_e32 v4, s0
1039; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1040; VI-NEXT:    s_endpgm
; The gep index is in units of <4 x float>, so the divisor %r1 is read four
; vectors (64 bytes) past %in2; the second s_load_dwordx4 in each check body
; carries the matching immediate (0x10 for bonaire, 0x40 for tonga --
; presumably dword vs byte units, confirm against the ISA docs).
1041   %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
1042   %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
1043   %r1 = load <4 x float>, ptr addrspace(1) %gep2, align 16
1044   %r2 = frem <4 x float> %r0, %r1
1045   store <4 x float> %r2, ptr addrspace(1) %out, align 16
1046   ret void
1047}
1048
; frem_v2f64: loads two <2 x double> vectors, computes their frem, and stores
; the result to %out. The check lines below are autogenerated (see the NOTE on
; the first line of the file) for the two RUN lines: prefix CI is
; -mcpu=bonaire and prefix VI is -mcpu=tonga, both with -global-isel.
; Each of the two lanes (dividend in s[4:5] / s[6:7], divisor in s[8:9] /
; s[10:11]) gets its own copy of the same sequence: two v_div_scale_f64,
; v_rcp_f64, four v_fma_f64 refinement steps, v_mul_f64, one more v_fma_f64,
; v_div_fmas_f64, v_div_fixup_f64, v_trunc_f64, and a final v_fma_f64 that
; uses the negated truncated quotient. Unlike the f32 kernels in this file,
; no s_setreg_imm32_b32 writes appear in these check bodies.
; The two check bodies differ only in the s_load immediates and in the store
; (buffer_store_dwordx4 with s2/s3 set by s_mov_b32 vs flat_store_dwordx4).
1049define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
1050; CI-LABEL: frem_v2f64:
1051; CI:       ; %bb.0:
1052; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1053; CI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1054; CI-NEXT:    s_waitcnt lgkmcnt(0)
1055; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
1056; CI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
1057; CI-NEXT:    s_waitcnt lgkmcnt(0)
1058; CI-NEXT:    v_mov_b32_e32 v0, s8
1059; CI-NEXT:    v_mov_b32_e32 v1, s9
1060; CI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5]
1061; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5]
1062; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1063; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1064; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1065; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1066; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1067; CI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
1068; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
1069; CI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
1070; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5]
1071; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
1072; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5]
1073; CI-NEXT:    v_mov_b32_e32 v2, s10
1074; CI-NEXT:    v_mov_b32_e32 v3, s11
1075; CI-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7]
1076; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7]
1077; CI-NEXT:    s_mov_b32 s2, -1
1078; CI-NEXT:    s_mov_b32 s3, 0xf000
1079; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1080; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1081; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1082; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1083; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1084; CI-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
1085; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
1086; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
1087; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7]
1088; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1089; CI-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7]
1090; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1091; CI-NEXT:    s_endpgm
1092;
1093; VI-LABEL: frem_v2f64:
1094; VI:       ; %bb.0:
1095; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1096; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1097; VI-NEXT:    s_waitcnt lgkmcnt(0)
1098; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
1099; VI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x40
1100; VI-NEXT:    s_waitcnt lgkmcnt(0)
1101; VI-NEXT:    v_mov_b32_e32 v0, s8
1102; VI-NEXT:    v_mov_b32_e32 v1, s9
1103; VI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5]
1104; VI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5]
1105; VI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1106; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1107; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1108; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1109; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1110; VI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
1111; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
1112; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
1113; VI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5]
1114; VI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
1115; VI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5]
1116; VI-NEXT:    v_mov_b32_e32 v2, s10
1117; VI-NEXT:    v_mov_b32_e32 v3, s11
1118; VI-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7]
1119; VI-NEXT:    v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7]
1120; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1121; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1122; VI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1123; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1124; VI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1125; VI-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
1126; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
1127; VI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
1128; VI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7]
1129; VI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1130; VI-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7]
1131; VI-NEXT:    v_mov_b32_e32 v5, s1
1132; VI-NEXT:    v_mov_b32_e32 v4, s0
1133; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1134; VI-NEXT:    s_endpgm
; The gep index is in units of <2 x double>, so the divisor %r1 is read four
; vectors (64 bytes) past %in2; the second s_load_dwordx4 in each check body
; carries the matching immediate (0x10 for bonaire, 0x40 for tonga --
; presumably dword vs byte units, confirm against the ISA docs).
1135   %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
1136   %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
1137   %r1 = load <2 x double>, ptr addrspace(1) %gep2, align 16
1138   %r2 = frem <2 x double> %r0, %r1
1139   store <2 x double> %r2, ptr addrspace(1) %out, align 16
1140   ret void
1141}
1142
; Attribute groups referenced by the kernels in this file. Both set nounwind
; and the "preserve-sign,preserve-sign" f32 denormal mode; they differ only in
; "unsafe-fp-math" (#0 = "false", #1 = "true"). The frem_v2f32, frem_v4f32 and
; frem_v2f64 kernels above all use #0.
; NOTE(review): #1 is presumably used by fast-math variants earlier in the
; file -- confirm before removing or changing it.
1143attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1144attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1145