xref: /llvm-project/llvm/test/CodeGen/AMDGPU/udiv.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GCN
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
7
; udiv_i32: loads two consecutive i32 values starting at %in (the first is the
; dividend, the second the divisor), computes their unsigned quotient with
; `udiv` and stores it to %out. The SI/VI/GCN/GFX1030/EG blocks below are the
; expected llc output for the correspondingly-prefixed RUN lines.
8define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
9; SI-LABEL: udiv_i32:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_cvt_f32_u32_e32 v2, v1
24; SI-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
25; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
26; SI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
27; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
28; SI-NEXT:    v_mul_lo_u32 v3, v3, v2
29; SI-NEXT:    v_mul_hi_u32 v3, v2, v3
30; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
31; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
32; SI-NEXT:    v_mul_lo_u32 v3, v2, v1
33; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
34; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
35; SI-NEXT:    v_sub_i32_e32 v3, vcc, v0, v1
36; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
37; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
38; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
39; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
40; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
41; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
42; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
43; SI-NEXT:    s_endpgm
44;
45; VI-LABEL: udiv_i32:
46; VI:       ; %bb.0:
47; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
48; VI-NEXT:    s_mov_b32 s7, 0xf000
49; VI-NEXT:    s_mov_b32 s6, -1
50; VI-NEXT:    s_mov_b32 s10, s6
51; VI-NEXT:    s_mov_b32 s11, s7
52; VI-NEXT:    s_waitcnt lgkmcnt(0)
53; VI-NEXT:    s_mov_b32 s8, s2
54; VI-NEXT:    s_mov_b32 s9, s3
55; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
56; VI-NEXT:    s_mov_b32 s4, s0
57; VI-NEXT:    s_mov_b32 s5, s1
58; VI-NEXT:    s_waitcnt vmcnt(0)
59; VI-NEXT:    v_cvt_f32_u32_e32 v2, v1
60; VI-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
61; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v2
62; VI-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
63; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
64; VI-NEXT:    v_mul_lo_u32 v3, v3, v2
65; VI-NEXT:    v_mul_hi_u32 v3, v2, v3
66; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
67; VI-NEXT:    v_mul_hi_u32 v2, v0, v2
68; VI-NEXT:    v_mul_lo_u32 v3, v2, v1
69; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
70; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v3
71; VI-NEXT:    v_sub_u32_e32 v3, vcc, v0, v1
72; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
73; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
74; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
75; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v2
76; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
77; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
78; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
79; VI-NEXT:    s_endpgm
80;
81; GCN-LABEL: udiv_i32:
82; GCN:       ; %bb.0:
83; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
84; GCN-NEXT:    s_waitcnt lgkmcnt(0)
85; GCN-NEXT:    v_mov_b32_e32 v0, s2
86; GCN-NEXT:    v_mov_b32_e32 v1, s3
87; GCN-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
88; GCN-NEXT:    s_waitcnt vmcnt(0)
89; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
90; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
91; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
92; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
93; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
94; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
95; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
96; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
97; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
98; GCN-NEXT:    v_mov_b32_e32 v2, s0
99; GCN-NEXT:    v_mov_b32_e32 v3, s1
100; GCN-NEXT:    v_mul_lo_u32 v5, v4, v1
101; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
102; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v5
103; GCN-NEXT:    v_sub_u32_e32 v5, vcc, v0, v1
104; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
105; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
106; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
107; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
108; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
109; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
110; GCN-NEXT:    flat_store_dword v[2:3], v0
111; GCN-NEXT:    s_endpgm
112;
113; GFX1030-LABEL: udiv_i32:
114; GFX1030:       ; %bb.0:
115; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
116; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
117; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX1030-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
119; GFX1030-NEXT:    s_waitcnt vmcnt(0)
120; GFX1030-NEXT:    v_readfirstlane_b32 s2, v1
121; GFX1030-NEXT:    v_readfirstlane_b32 s5, v0
122; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, s2
123; GFX1030-NEXT:    s_sub_i32 s4, 0, s2
124; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v1, v1
125; GFX1030-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
126; GFX1030-NEXT:    v_cvt_u32_f32_e32 v1, v1
127; GFX1030-NEXT:    v_readfirstlane_b32 s3, v1
128; GFX1030-NEXT:    s_mul_i32 s4, s4, s3
129; GFX1030-NEXT:    s_mul_hi_u32 s4, s3, s4
130; GFX1030-NEXT:    s_add_i32 s3, s3, s4
131; GFX1030-NEXT:    s_mul_hi_u32 s3, s5, s3
132; GFX1030-NEXT:    s_mul_i32 s4, s3, s2
133; GFX1030-NEXT:    s_sub_i32 s4, s5, s4
134; GFX1030-NEXT:    s_add_i32 s5, s3, 1
135; GFX1030-NEXT:    s_sub_i32 s6, s4, s2
136; GFX1030-NEXT:    s_cmp_ge_u32 s4, s2
137; GFX1030-NEXT:    s_cselect_b32 s3, s5, s3
138; GFX1030-NEXT:    s_cselect_b32 s4, s6, s4
139; GFX1030-NEXT:    s_add_i32 s5, s3, 1
140; GFX1030-NEXT:    s_cmp_ge_u32 s4, s2
141; GFX1030-NEXT:    s_cselect_b32 s2, s5, s3
142; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
143; GFX1030-NEXT:    global_store_dword v2, v0, s[0:1]
144; GFX1030-NEXT:    s_endpgm
145;
146; EG-LABEL: udiv_i32:
147; EG:       ; %bb.0:
148; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
149; EG-NEXT:    TEX 0 @6
150; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
151; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
152; EG-NEXT:    CF_END
153; EG-NEXT:    PAD
154; EG-NEXT:    Fetch clause starting at 6:
155; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
156; EG-NEXT:    ALU clause starting at 8:
157; EG-NEXT:     MOV * T0.X, KC0[2].Z,
158; EG-NEXT:    ALU clause starting at 9:
159; EG-NEXT:     SUB_INT T0.W, 0.0, T0.Y,
160; EG-NEXT:     RECIP_UINT * T0.Z, T0.Y,
161; EG-NEXT:     MULLO_INT * T0.W, PV.W, PS,
162; EG-NEXT:     MULHI * T0.W, T0.Z, PS,
163; EG-NEXT:     ADD_INT * T0.W, T0.Z, PS,
164; EG-NEXT:     MULHI * T0.Z, T0.X, PV.W,
165; EG-NEXT:     MULLO_INT * T0.W, PS, T0.Y,
166; EG-NEXT:     SUB_INT * T0.W, T0.X, PS,
167; EG-NEXT:     ADD_INT T1.Z, T0.Z, 1,
168; EG-NEXT:     SETGE_UINT T1.W, PV.W, T0.Y,
169; EG-NEXT:     SUB_INT * T2.W, PV.W, T0.Y,
170; EG-NEXT:     CNDE_INT T0.W, PV.W, T0.W, PS,
171; EG-NEXT:     CNDE_INT * T1.W, PV.W, T0.Z, PV.Z,
172; EG-NEXT:     ADD_INT T2.W, PS, 1,
173; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.Y,
174; EG-NEXT:     CNDE_INT T0.X, PS, T1.W, PV.W,
175; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
176; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
177  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 ; address of the i32 immediately after %in
178  %a = load i32, ptr addrspace(1) %in ; dividend
179  %b = load i32, ptr addrspace(1) %b_ptr ; divisor
180  %result = udiv i32 %a, %b
181  store i32 %result, ptr addrspace(1) %out
182  ret void
183}
184
; s_udiv_i32: unsigned i32 division whose dividend (%a) and divisor (%b) are
; passed directly as kernel arguments instead of being loaded through a
; pointer; the quotient is stored to %out. The SI/VI/GCN/GFX1030/EG blocks
; below are the expected llc output for the correspondingly-prefixed RUN lines.
185define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
186; SI-LABEL: s_udiv_i32:
187; SI:       ; %bb.0:
188; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
189; SI-NEXT:    s_mov_b32 s7, 0xf000
190; SI-NEXT:    s_mov_b32 s6, -1
191; SI-NEXT:    s_waitcnt lgkmcnt(0)
192; SI-NEXT:    v_cvt_f32_u32_e32 v0, s3
193; SI-NEXT:    s_sub_i32 s4, 0, s3
194; SI-NEXT:    s_mov_b32 s5, s1
195; SI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
196; SI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
197; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
198; SI-NEXT:    v_mul_lo_u32 v1, s4, v0
199; SI-NEXT:    s_mov_b32 s4, s0
200; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
201; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
202; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
203; SI-NEXT:    v_readfirstlane_b32 s0, v0
204; SI-NEXT:    s_mul_i32 s0, s0, s3
205; SI-NEXT:    s_sub_i32 s0, s2, s0
206; SI-NEXT:    s_sub_i32 s1, s0, s3
207; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
208; SI-NEXT:    s_cmp_ge_u32 s0, s3
209; SI-NEXT:    s_cselect_b64 vcc, -1, 0
210; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
211; SI-NEXT:    s_cselect_b32 s0, s1, s0
212; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
213; SI-NEXT:    s_cmp_ge_u32 s0, s3
214; SI-NEXT:    s_cselect_b64 vcc, -1, 0
215; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
216; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
217; SI-NEXT:    s_endpgm
218;
219; VI-LABEL: s_udiv_i32:
220; VI:       ; %bb.0:
221; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
222; VI-NEXT:    s_mov_b32 s7, 0xf000
223; VI-NEXT:    s_mov_b32 s6, -1
224; VI-NEXT:    s_waitcnt lgkmcnt(0)
225; VI-NEXT:    v_cvt_f32_u32_e32 v0, s3
226; VI-NEXT:    s_sub_i32 s4, 0, s3
227; VI-NEXT:    s_mov_b32 s5, s1
228; VI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
229; VI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
230; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
231; VI-NEXT:    v_mul_lo_u32 v1, s4, v0
232; VI-NEXT:    s_mov_b32 s4, s0
233; VI-NEXT:    v_mul_hi_u32 v1, v0, v1
234; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
235; VI-NEXT:    v_mul_hi_u32 v0, s2, v0
236; VI-NEXT:    v_readfirstlane_b32 s0, v0
237; VI-NEXT:    s_mul_i32 s0, s0, s3
238; VI-NEXT:    s_sub_i32 s0, s2, s0
239; VI-NEXT:    s_sub_i32 s1, s0, s3
240; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
241; VI-NEXT:    s_cmp_ge_u32 s0, s3
242; VI-NEXT:    s_cselect_b64 vcc, -1, 0
243; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
244; VI-NEXT:    s_cselect_b32 s0, s1, s0
245; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
246; VI-NEXT:    s_cmp_ge_u32 s0, s3
247; VI-NEXT:    s_cselect_b64 vcc, -1, 0
248; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
249; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
250; VI-NEXT:    s_endpgm
251;
252; GCN-LABEL: s_udiv_i32:
253; GCN:       ; %bb.0:
254; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
255; GCN-NEXT:    s_waitcnt lgkmcnt(0)
256; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
257; GCN-NEXT:    s_sub_i32 s4, 0, s3
258; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
259; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
260; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
261; GCN-NEXT:    v_mul_lo_u32 v1, s4, v0
262; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
263; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
264; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
265; GCN-NEXT:    v_readfirstlane_b32 s4, v0
266; GCN-NEXT:    s_mul_i32 s4, s4, s3
267; GCN-NEXT:    s_sub_i32 s2, s2, s4
268; GCN-NEXT:    s_sub_i32 s4, s2, s3
269; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
270; GCN-NEXT:    s_cmp_ge_u32 s2, s3
271; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
272; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
273; GCN-NEXT:    s_cselect_b32 s2, s4, s2
274; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
275; GCN-NEXT:    s_cmp_ge_u32 s2, s3
276; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
277; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
278; GCN-NEXT:    v_mov_b32_e32 v0, s0
279; GCN-NEXT:    v_mov_b32_e32 v1, s1
280; GCN-NEXT:    flat_store_dword v[0:1], v2
281; GCN-NEXT:    s_endpgm
282;
283; GFX1030-LABEL: s_udiv_i32:
284; GFX1030:       ; %bb.0:
285; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
286; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
287; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s3
288; GFX1030-NEXT:    s_sub_i32 s5, 0, s3
289; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
290; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
291; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
292; GFX1030-NEXT:    v_readfirstlane_b32 s4, v0
293; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
294; GFX1030-NEXT:    s_mul_i32 s5, s5, s4
295; GFX1030-NEXT:    s_mul_hi_u32 s5, s4, s5
296; GFX1030-NEXT:    s_add_i32 s4, s4, s5
297; GFX1030-NEXT:    s_mul_hi_u32 s4, s2, s4
298; GFX1030-NEXT:    s_mul_i32 s5, s4, s3
299; GFX1030-NEXT:    s_sub_i32 s2, s2, s5
300; GFX1030-NEXT:    s_add_i32 s5, s4, 1
301; GFX1030-NEXT:    s_sub_i32 s6, s2, s3
302; GFX1030-NEXT:    s_cmp_ge_u32 s2, s3
303; GFX1030-NEXT:    s_cselect_b32 s4, s5, s4
304; GFX1030-NEXT:    s_cselect_b32 s2, s6, s2
305; GFX1030-NEXT:    s_add_i32 s5, s4, 1
306; GFX1030-NEXT:    s_cmp_ge_u32 s2, s3
307; GFX1030-NEXT:    s_cselect_b32 s2, s5, s4
308; GFX1030-NEXT:    v_mov_b32_e32 v1, s2
309; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
310; GFX1030-NEXT:    s_endpgm
311;
312; EG-LABEL: s_udiv_i32:
313; EG:       ; %bb.0:
314; EG-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
315; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
316; EG-NEXT:    CF_END
317; EG-NEXT:    PAD
318; EG-NEXT:    ALU clause starting at 4:
319; EG-NEXT:     SUB_INT T0.W, 0.0, KC0[2].W,
320; EG-NEXT:     RECIP_UINT * T0.X, KC0[2].W,
321; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
322; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
323; EG-NEXT:     ADD_INT * T0.W, T0.X, PS,
324; EG-NEXT:     MULHI * T0.X, KC0[2].Z, PV.W,
325; EG-NEXT:     MULLO_INT * T0.Y, PS, KC0[2].W,
326; EG-NEXT:     SUB_INT * T0.W, KC0[2].Z, PS,
327; EG-NEXT:     SUB_INT T0.Z, PV.W, KC0[2].W,
328; EG-NEXT:     SETGE_UINT T1.W, PV.W, KC0[2].W,
329; EG-NEXT:     ADD_INT * T2.W, T0.X, 1,
330; EG-NEXT:     CNDE_INT T2.W, PV.W, T0.X, PS,
331; EG-NEXT:     CNDE_INT * T0.W, PV.W, T0.W, PV.Z,
332; EG-NEXT:     SETGE_UINT T0.W, PS, KC0[2].W,
333; EG-NEXT:     ADD_INT * T1.W, PV.W, 1,
334; EG-NEXT:     CNDE_INT T0.X, PV.W, T2.W, PS,
335; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
336; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
337  %result = udiv i32 %a, %b
338  store i32 %result, ptr addrspace(1) %out
339  ret void
340}
341
342
343; The code generated by udiv is long and complex and may frequently
344; change. The goal of the vector tests (udiv_v2i32 here, udiv_v4i32 below)
345; is to make sure the ISel doesn't fail when it gets a v2i32 or v4i32 udiv.
; udiv_v2i32: element-wise unsigned division of two <2 x i32> vectors. The
; dividend is loaded from %in and the divisor from the <2 x i32> slot that
; immediately follows it; the quotient vector is stored to %out. The
; SI/VI/GCN/GFX1030/EG blocks below are the expected llc output for the
; correspondingly-prefixed RUN lines.
346define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
347; SI-LABEL: udiv_v2i32:
348; SI:       ; %bb.0:
349; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
350; SI-NEXT:    s_mov_b32 s7, 0xf000
351; SI-NEXT:    s_mov_b32 s6, -1
352; SI-NEXT:    s_mov_b32 s10, s6
353; SI-NEXT:    s_mov_b32 s11, s7
354; SI-NEXT:    s_waitcnt lgkmcnt(0)
355; SI-NEXT:    s_mov_b32 s8, s2
356; SI-NEXT:    s_mov_b32 s9, s3
357; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
358; SI-NEXT:    s_mov_b32 s4, s0
359; SI-NEXT:    s_mov_b32 s5, s1
360; SI-NEXT:    s_waitcnt vmcnt(0)
361; SI-NEXT:    v_cvt_f32_u32_e32 v4, v2
362; SI-NEXT:    v_cvt_f32_u32_e32 v5, v3
363; SI-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
364; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
365; SI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
366; SI-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
367; SI-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
368; SI-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
369; SI-NEXT:    v_cvt_u32_f32_e32 v4, v4
370; SI-NEXT:    v_cvt_u32_f32_e32 v5, v5
371; SI-NEXT:    v_mul_lo_u32 v6, v6, v4
372; SI-NEXT:    v_mul_lo_u32 v7, v7, v5
373; SI-NEXT:    v_mul_hi_u32 v6, v4, v6
374; SI-NEXT:    v_mul_hi_u32 v7, v5, v7
375; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
376; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
377; SI-NEXT:    v_mul_hi_u32 v4, v0, v4
378; SI-NEXT:    v_mul_hi_u32 v5, v1, v5
379; SI-NEXT:    v_mul_lo_u32 v6, v4, v2
380; SI-NEXT:    v_mul_lo_u32 v8, v5, v3
381; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
382; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
383; SI-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
384; SI-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
385; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
386; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
387; SI-NEXT:    v_sub_i32_e32 v6, vcc, v0, v2
388; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
389; SI-NEXT:    v_sub_i32_e32 v7, vcc, v1, v3
390; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
391; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
392; SI-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
393; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
394; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
395; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
396; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
397; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
398; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
399; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
400; SI-NEXT:    s_endpgm
401;
402; VI-LABEL: udiv_v2i32:
403; VI:       ; %bb.0:
404; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
405; VI-NEXT:    s_mov_b32 s7, 0xf000
406; VI-NEXT:    s_mov_b32 s6, -1
407; VI-NEXT:    s_mov_b32 s10, s6
408; VI-NEXT:    s_mov_b32 s11, s7
409; VI-NEXT:    s_waitcnt lgkmcnt(0)
410; VI-NEXT:    s_mov_b32 s8, s2
411; VI-NEXT:    s_mov_b32 s9, s3
412; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
413; VI-NEXT:    s_mov_b32 s4, s0
414; VI-NEXT:    s_mov_b32 s5, s1
415; VI-NEXT:    s_waitcnt vmcnt(0)
416; VI-NEXT:    v_cvt_f32_u32_e32 v4, v2
417; VI-NEXT:    v_cvt_f32_u32_e32 v5, v3
418; VI-NEXT:    v_sub_u32_e32 v6, vcc, 0, v2
419; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v4
420; VI-NEXT:    v_rcp_iflag_f32_e32 v5, v5
421; VI-NEXT:    v_sub_u32_e32 v7, vcc, 0, v3
422; VI-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
423; VI-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
424; VI-NEXT:    v_cvt_u32_f32_e32 v4, v4
425; VI-NEXT:    v_cvt_u32_f32_e32 v5, v5
426; VI-NEXT:    v_mul_lo_u32 v6, v6, v4
427; VI-NEXT:    v_mul_lo_u32 v7, v7, v5
428; VI-NEXT:    v_mul_hi_u32 v6, v4, v6
429; VI-NEXT:    v_mul_hi_u32 v7, v5, v7
430; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
431; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
432; VI-NEXT:    v_mul_hi_u32 v4, v0, v4
433; VI-NEXT:    v_mul_hi_u32 v5, v1, v5
434; VI-NEXT:    v_mul_lo_u32 v6, v4, v2
435; VI-NEXT:    v_mul_lo_u32 v8, v5, v3
436; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
437; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v6
438; VI-NEXT:    v_sub_u32_e32 v1, vcc, v1, v8
439; VI-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
440; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
441; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
442; VI-NEXT:    v_sub_u32_e32 v6, vcc, v0, v2
443; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
444; VI-NEXT:    v_sub_u32_e32 v7, vcc, v1, v3
445; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
446; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
447; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
448; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
449; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v5
450; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
451; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
452; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
453; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
454; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
455; VI-NEXT:    s_endpgm
456;
457; GCN-LABEL: udiv_v2i32:
458; GCN:       ; %bb.0:
459; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
460; GCN-NEXT:    s_waitcnt lgkmcnt(0)
461; GCN-NEXT:    v_mov_b32_e32 v0, s2
462; GCN-NEXT:    v_mov_b32_e32 v1, s3
463; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
464; GCN-NEXT:    s_waitcnt vmcnt(0)
465; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
466; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
467; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
468; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
469; GCN-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
470; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v4
471; GCN-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
472; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v5
473; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
474; GCN-NEXT:    v_mul_lo_u32 v5, v4, v6
475; GCN-NEXT:    v_sub_u32_e32 v4, vcc, 0, v3
476; GCN-NEXT:    v_mul_lo_u32 v8, v4, v7
477; GCN-NEXT:    v_mul_hi_u32 v9, v6, v5
478; GCN-NEXT:    v_mov_b32_e32 v4, s0
479; GCN-NEXT:    v_mov_b32_e32 v5, s1
480; GCN-NEXT:    v_mul_hi_u32 v8, v7, v8
481; GCN-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
482; GCN-NEXT:    v_mul_hi_u32 v6, v0, v6
483; GCN-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
484; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
485; GCN-NEXT:    v_mul_lo_u32 v8, v6, v2
486; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v6
487; GCN-NEXT:    v_mul_lo_u32 v10, v7, v3
488; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
489; GCN-NEXT:    v_add_u32_e32 v11, vcc, 1, v7
490; GCN-NEXT:    v_sub_u32_e32 v1, vcc, v1, v10
491; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
492; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
493; GCN-NEXT:    v_sub_u32_e32 v8, vcc, v0, v2
494; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
495; GCN-NEXT:    v_sub_u32_e32 v9, vcc, v1, v3
496; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[2:3]
497; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
498; GCN-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
499; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
500; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v7
501; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
502; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
503; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
504; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
505; GCN-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
506; GCN-NEXT:    s_endpgm
507;
508; GFX1030-LABEL: udiv_v2i32:
509; GFX1030:       ; %bb.0:
510; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
511; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
512; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
513; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
514; GFX1030-NEXT:    s_waitcnt vmcnt(0)
515; GFX1030-NEXT:    v_readfirstlane_b32 s2, v2
516; GFX1030-NEXT:    v_readfirstlane_b32 s3, v3
517; GFX1030-NEXT:    v_readfirstlane_b32 s6, v0
518; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, s2
519; GFX1030-NEXT:    v_cvt_f32_u32_e32 v3, s3
520; GFX1030-NEXT:    s_sub_i32 s5, 0, s2
521; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v2, v2
522; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v3
523; GFX1030-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
524; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v3
525; GFX1030-NEXT:    v_cvt_u32_f32_e32 v2, v2
526; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
527; GFX1030-NEXT:    v_readfirstlane_b32 s4, v2
528; GFX1030-NEXT:    v_readfirstlane_b32 s8, v0
529; GFX1030-NEXT:    s_mul_i32 s5, s5, s4
530; GFX1030-NEXT:    s_mul_hi_u32 s5, s4, s5
531; GFX1030-NEXT:    s_add_i32 s4, s4, s5
532; GFX1030-NEXT:    s_mul_hi_u32 s4, s6, s4
533; GFX1030-NEXT:    s_mul_i32 s5, s4, s2
534; GFX1030-NEXT:    s_sub_i32 s5, s6, s5
535; GFX1030-NEXT:    s_add_i32 s6, s4, 1
536; GFX1030-NEXT:    s_sub_i32 s7, s5, s2
537; GFX1030-NEXT:    s_cmp_ge_u32 s5, s2
538; GFX1030-NEXT:    s_cselect_b32 s4, s6, s4
539; GFX1030-NEXT:    s_cselect_b32 s5, s7, s5
540; GFX1030-NEXT:    s_add_i32 s6, s4, 1
541; GFX1030-NEXT:    s_cmp_ge_u32 s5, s2
542; GFX1030-NEXT:    v_readfirstlane_b32 s5, v1
543; GFX1030-NEXT:    s_cselect_b32 s2, s6, s4
544; GFX1030-NEXT:    s_sub_i32 s4, 0, s3
545; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
546; GFX1030-NEXT:    s_mul_i32 s4, s4, s8
547; GFX1030-NEXT:    s_mul_hi_u32 s4, s8, s4
548; GFX1030-NEXT:    s_add_i32 s8, s8, s4
549; GFX1030-NEXT:    s_mul_hi_u32 s4, s5, s8
550; GFX1030-NEXT:    s_mul_i32 s6, s4, s3
551; GFX1030-NEXT:    s_sub_i32 s5, s5, s6
552; GFX1030-NEXT:    s_add_i32 s6, s4, 1
553; GFX1030-NEXT:    s_sub_i32 s7, s5, s3
554; GFX1030-NEXT:    s_cmp_ge_u32 s5, s3
555; GFX1030-NEXT:    s_cselect_b32 s4, s6, s4
556; GFX1030-NEXT:    s_cselect_b32 s5, s7, s5
557; GFX1030-NEXT:    s_add_i32 s6, s4, 1
558; GFX1030-NEXT:    s_cmp_ge_u32 s5, s3
559; GFX1030-NEXT:    s_cselect_b32 s3, s6, s4
560; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
561; GFX1030-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
562; GFX1030-NEXT:    s_endpgm
563;
564; EG-LABEL: udiv_v2i32:
565; EG:       ; %bb.0:
566; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
567; EG-NEXT:    TEX 0 @6
568; EG-NEXT:    ALU 33, @9, KC0[CB0:0-32], KC1[]
569; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
570; EG-NEXT:    CF_END
571; EG-NEXT:    PAD
572; EG-NEXT:    Fetch clause starting at 6:
573; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
574; EG-NEXT:    ALU clause starting at 8:
575; EG-NEXT:     MOV * T0.X, KC0[2].Z,
576; EG-NEXT:    ALU clause starting at 9:
577; EG-NEXT:     SUB_INT T1.W, 0.0, T0.W,
578; EG-NEXT:     RECIP_UINT * T1.X, T0.W,
579; EG-NEXT:     MULLO_INT * T1.Y, PV.W, PS,
580; EG-NEXT:     SUB_INT T1.W, 0.0, T0.Z,
581; EG-NEXT:     RECIP_UINT * T1.Z, T0.Z,
582; EG-NEXT:     MULLO_INT * T1.W, PV.W, PS,
583; EG-NEXT:     MULHI * T1.W, T1.Z, PS,
584; EG-NEXT:     ADD_INT T1.W, T1.Z, PS,
585; EG-NEXT:     MULHI * T1.Y, T1.X, T1.Y,
586; EG-NEXT:     ADD_INT T2.W, T1.X, PS,
587; EG-NEXT:     MULHI * T1.X, T0.X, PV.W,
588; EG-NEXT:     MULHI * T1.Y, T0.Y, PV.W,
589; EG-NEXT:     MULLO_INT * T1.Z, PS, T0.W,
590; EG-NEXT:     SUB_INT T1.W, T0.Y, PS,
591; EG-NEXT:     MULLO_INT * T0.Y, T1.X, T0.Z,
592; EG-NEXT:     SUB_INT T0.Y, T0.X, PS,
593; EG-NEXT:     ADD_INT T1.Z, T1.Y, 1,
594; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
595; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
596; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.W, PS,
597; EG-NEXT:     CNDE_INT T1.Y, PV.W, T1.Y, PV.Z,
598; EG-NEXT:     ADD_INT T1.Z, T1.X, 1,
599; EG-NEXT:     SETGE_UINT T1.W, PV.Y, T0.Z,
600; EG-NEXT:     SUB_INT * T2.W, PV.Y, T0.Z,
601; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PS,
602; EG-NEXT:     CNDE_INT T1.Z, PV.W, T1.X, PV.Z,
603; EG-NEXT:     ADD_INT T1.W, PV.Y, 1,
604; EG-NEXT:     SETGE_UINT * T0.W, PV.X, T0.W,
605; EG-NEXT:     CNDE_INT T1.Y, PS, T1.Y, PV.W,
606; EG-NEXT:     ADD_INT T0.W, PV.Z, 1,
607; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T0.Z,
608; EG-NEXT:     CNDE_INT T1.X, PS, T1.Z, PV.W,
609; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
610; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
611  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 ; address of the <2 x i32> immediately after %in
612  %a = load <2 x i32>, ptr addrspace(1) %in
613  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
614  %result = udiv <2 x i32> %a, %b
615  store <2 x i32> %result, ptr addrspace(1) %out
616  ret void
617}
618
619define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
620; SI-LABEL: udiv_v4i32:
621; SI:       ; %bb.0:
622; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
623; SI-NEXT:    s_mov_b32 s11, 0xf000
624; SI-NEXT:    s_mov_b32 s10, -1
625; SI-NEXT:    s_mov_b32 s6, s10
626; SI-NEXT:    s_mov_b32 s7, s11
627; SI-NEXT:    s_waitcnt lgkmcnt(0)
628; SI-NEXT:    s_mov_b32 s4, s2
629; SI-NEXT:    s_mov_b32 s5, s3
630; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
631; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
632; SI-NEXT:    s_mov_b32 s8, s0
633; SI-NEXT:    s_mov_b32 s9, s1
634; SI-NEXT:    s_waitcnt vmcnt(1)
635; SI-NEXT:    v_cvt_f32_u32_e32 v8, v0
636; SI-NEXT:    v_cvt_f32_u32_e32 v10, v1
637; SI-NEXT:    v_cvt_f32_u32_e32 v12, v2
638; SI-NEXT:    v_cvt_f32_u32_e32 v14, v3
639; SI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
640; SI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
641; SI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
642; SI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
643; SI-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
644; SI-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
645; SI-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
646; SI-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
647; SI-NEXT:    v_cvt_u32_f32_e32 v8, v8
648; SI-NEXT:    v_cvt_u32_f32_e32 v10, v10
649; SI-NEXT:    v_cvt_u32_f32_e32 v12, v12
650; SI-NEXT:    v_cvt_u32_f32_e32 v14, v14
651; SI-NEXT:    v_sub_i32_e32 v9, vcc, 0, v0
652; SI-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
653; SI-NEXT:    v_sub_i32_e32 v13, vcc, 0, v2
654; SI-NEXT:    v_sub_i32_e32 v15, vcc, 0, v3
655; SI-NEXT:    v_mul_lo_u32 v9, v9, v8
656; SI-NEXT:    v_mul_lo_u32 v11, v11, v10
657; SI-NEXT:    v_mul_lo_u32 v13, v13, v12
658; SI-NEXT:    v_mul_lo_u32 v15, v15, v14
659; SI-NEXT:    v_mul_hi_u32 v9, v8, v9
660; SI-NEXT:    v_mul_hi_u32 v11, v10, v11
661; SI-NEXT:    v_mul_hi_u32 v13, v12, v13
662; SI-NEXT:    v_mul_hi_u32 v15, v14, v15
663; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
664; SI-NEXT:    v_add_i32_e32 v9, vcc, v10, v11
665; SI-NEXT:    v_add_i32_e32 v10, vcc, v12, v13
666; SI-NEXT:    v_add_i32_e32 v11, vcc, v14, v15
667; SI-NEXT:    s_waitcnt vmcnt(0)
668; SI-NEXT:    v_mul_hi_u32 v8, v4, v8
669; SI-NEXT:    v_mul_hi_u32 v9, v5, v9
670; SI-NEXT:    v_mul_hi_u32 v10, v6, v10
671; SI-NEXT:    v_mul_hi_u32 v11, v7, v11
672; SI-NEXT:    v_mul_lo_u32 v12, v8, v0
673; SI-NEXT:    v_mul_lo_u32 v14, v9, v1
674; SI-NEXT:    v_mul_lo_u32 v16, v10, v2
675; SI-NEXT:    v_mul_lo_u32 v18, v11, v3
676; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
677; SI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v14
678; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v16
679; SI-NEXT:    v_sub_i32_e32 v7, vcc, v7, v18
680; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v8
681; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v9
682; SI-NEXT:    v_add_i32_e32 v17, vcc, 1, v10
683; SI-NEXT:    v_add_i32_e32 v19, vcc, 1, v11
684; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
685; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
686; SI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
687; SI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
688; SI-NEXT:    v_sub_i32_e32 v12, vcc, v4, v0
689; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
690; SI-NEXT:    v_sub_i32_e32 v13, vcc, v5, v1
691; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
692; SI-NEXT:    v_sub_i32_e32 v14, vcc, v6, v2
693; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
694; SI-NEXT:    v_sub_i32_e32 v15, vcc, v7, v3
695; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
696; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
697; SI-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
698; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
699; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v9
700; SI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
701; SI-NEXT:    v_add_i32_e32 v14, vcc, 1, v10
702; SI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
703; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v11
704; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
705; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
706; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
707; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
708; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
709; SI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
710; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
711; SI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
712; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
713; SI-NEXT:    s_endpgm
714;
715; VI-LABEL: udiv_v4i32:
716; VI:       ; %bb.0:
717; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
718; VI-NEXT:    s_mov_b32 s11, 0xf000
719; VI-NEXT:    s_mov_b32 s10, -1
720; VI-NEXT:    s_mov_b32 s6, s10
721; VI-NEXT:    s_mov_b32 s7, s11
722; VI-NEXT:    s_waitcnt lgkmcnt(0)
723; VI-NEXT:    s_mov_b32 s4, s2
724; VI-NEXT:    s_mov_b32 s5, s3
725; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
726; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
727; VI-NEXT:    s_mov_b32 s8, s0
728; VI-NEXT:    s_mov_b32 s9, s1
729; VI-NEXT:    s_waitcnt vmcnt(1)
730; VI-NEXT:    v_cvt_f32_u32_e32 v8, v0
731; VI-NEXT:    v_cvt_f32_u32_e32 v10, v1
732; VI-NEXT:    v_cvt_f32_u32_e32 v12, v2
733; VI-NEXT:    v_cvt_f32_u32_e32 v14, v3
734; VI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
735; VI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
736; VI-NEXT:    v_rcp_iflag_f32_e32 v12, v12
737; VI-NEXT:    v_rcp_iflag_f32_e32 v14, v14
738; VI-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
739; VI-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
740; VI-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
741; VI-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
742; VI-NEXT:    v_cvt_u32_f32_e32 v8, v8
743; VI-NEXT:    v_cvt_u32_f32_e32 v10, v10
744; VI-NEXT:    v_cvt_u32_f32_e32 v12, v12
745; VI-NEXT:    v_cvt_u32_f32_e32 v14, v14
746; VI-NEXT:    v_sub_u32_e32 v9, vcc, 0, v0
747; VI-NEXT:    v_sub_u32_e32 v11, vcc, 0, v1
748; VI-NEXT:    v_sub_u32_e32 v13, vcc, 0, v2
749; VI-NEXT:    v_sub_u32_e32 v15, vcc, 0, v3
750; VI-NEXT:    v_mul_lo_u32 v9, v9, v8
751; VI-NEXT:    v_mul_lo_u32 v11, v11, v10
752; VI-NEXT:    v_mul_lo_u32 v13, v13, v12
753; VI-NEXT:    v_mul_lo_u32 v15, v15, v14
754; VI-NEXT:    v_mul_hi_u32 v9, v8, v9
755; VI-NEXT:    v_mul_hi_u32 v11, v10, v11
756; VI-NEXT:    v_mul_hi_u32 v13, v12, v13
757; VI-NEXT:    v_mul_hi_u32 v15, v14, v15
758; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
759; VI-NEXT:    v_add_u32_e32 v9, vcc, v10, v11
760; VI-NEXT:    v_add_u32_e32 v10, vcc, v12, v13
761; VI-NEXT:    v_add_u32_e32 v11, vcc, v14, v15
762; VI-NEXT:    s_waitcnt vmcnt(0)
763; VI-NEXT:    v_mul_hi_u32 v8, v4, v8
764; VI-NEXT:    v_mul_hi_u32 v9, v5, v9
765; VI-NEXT:    v_mul_hi_u32 v10, v6, v10
766; VI-NEXT:    v_mul_hi_u32 v11, v7, v11
767; VI-NEXT:    v_mul_lo_u32 v12, v8, v0
768; VI-NEXT:    v_mul_lo_u32 v14, v9, v1
769; VI-NEXT:    v_mul_lo_u32 v16, v10, v2
770; VI-NEXT:    v_mul_lo_u32 v18, v11, v3
771; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v12
772; VI-NEXT:    v_sub_u32_e32 v5, vcc, v5, v14
773; VI-NEXT:    v_sub_u32_e32 v6, vcc, v6, v16
774; VI-NEXT:    v_sub_u32_e32 v7, vcc, v7, v18
775; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v8
776; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v9
777; VI-NEXT:    v_add_u32_e32 v17, vcc, 1, v10
778; VI-NEXT:    v_add_u32_e32 v19, vcc, 1, v11
779; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
780; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
781; VI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
782; VI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
783; VI-NEXT:    v_sub_u32_e32 v12, vcc, v4, v0
784; VI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
785; VI-NEXT:    v_sub_u32_e32 v13, vcc, v5, v1
786; VI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
787; VI-NEXT:    v_sub_u32_e32 v14, vcc, v6, v2
788; VI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
789; VI-NEXT:    v_sub_u32_e32 v15, vcc, v7, v3
790; VI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
791; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
792; VI-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
793; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
794; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v9
795; VI-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
796; VI-NEXT:    v_add_u32_e32 v14, vcc, 1, v10
797; VI-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[6:7]
798; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v11
799; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
800; VI-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc
801; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
802; VI-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
803; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
804; VI-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc
805; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
806; VI-NEXT:    v_cndmask_b32_e32 v3, v11, v15, vcc
807; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
808; VI-NEXT:    s_endpgm
809;
810; GCN-LABEL: udiv_v4i32:
811; GCN:       ; %bb.0:
812; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
813; GCN-NEXT:    s_waitcnt lgkmcnt(0)
814; GCN-NEXT:    s_add_u32 s4, s2, 16
815; GCN-NEXT:    s_addc_u32 s5, s3, 0
816; GCN-NEXT:    v_mov_b32_e32 v0, s4
817; GCN-NEXT:    v_mov_b32_e32 v1, s5
818; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
819; GCN-NEXT:    v_mov_b32_e32 v5, s3
820; GCN-NEXT:    v_mov_b32_e32 v4, s2
821; GCN-NEXT:    flat_load_dwordx4 v[6:9], v[4:5]
822; GCN-NEXT:    v_mov_b32_e32 v4, s0
823; GCN-NEXT:    v_mov_b32_e32 v5, s1
824; GCN-NEXT:    s_waitcnt vmcnt(1)
825; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v0
826; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v1
827; GCN-NEXT:    v_cvt_f32_u32_e32 v14, v2
828; GCN-NEXT:    v_cvt_f32_u32_e32 v16, v3
829; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
830; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
831; GCN-NEXT:    v_rcp_iflag_f32_e32 v14, v14
832; GCN-NEXT:    v_rcp_iflag_f32_e32 v16, v16
833; GCN-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
834; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
835; GCN-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
836; GCN-NEXT:    v_mul_f32_e32 v16, 0x4f7ffffe, v16
837; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
838; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
839; GCN-NEXT:    v_cvt_u32_f32_e32 v14, v14
840; GCN-NEXT:    v_cvt_u32_f32_e32 v16, v16
841; GCN-NEXT:    v_sub_u32_e32 v11, vcc, 0, v0
842; GCN-NEXT:    v_sub_u32_e32 v13, vcc, 0, v1
843; GCN-NEXT:    v_sub_u32_e32 v15, vcc, 0, v2
844; GCN-NEXT:    v_sub_u32_e32 v17, vcc, 0, v3
845; GCN-NEXT:    v_mul_lo_u32 v11, v11, v10
846; GCN-NEXT:    v_mul_lo_u32 v13, v13, v12
847; GCN-NEXT:    v_mul_lo_u32 v15, v15, v14
848; GCN-NEXT:    v_mul_lo_u32 v17, v17, v16
849; GCN-NEXT:    v_mul_hi_u32 v11, v10, v11
850; GCN-NEXT:    v_mul_hi_u32 v13, v12, v13
851; GCN-NEXT:    v_mul_hi_u32 v15, v14, v15
852; GCN-NEXT:    v_mul_hi_u32 v17, v16, v17
853; GCN-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
854; GCN-NEXT:    v_add_u32_e32 v11, vcc, v12, v13
855; GCN-NEXT:    v_add_u32_e32 v12, vcc, v14, v15
856; GCN-NEXT:    v_add_u32_e32 v13, vcc, v16, v17
857; GCN-NEXT:    s_waitcnt vmcnt(0)
858; GCN-NEXT:    v_mul_hi_u32 v10, v6, v10
859; GCN-NEXT:    v_mul_hi_u32 v11, v7, v11
860; GCN-NEXT:    v_mul_hi_u32 v12, v8, v12
861; GCN-NEXT:    v_mul_hi_u32 v13, v9, v13
862; GCN-NEXT:    v_mul_lo_u32 v14, v10, v0
863; GCN-NEXT:    v_mul_lo_u32 v16, v11, v1
864; GCN-NEXT:    v_mul_lo_u32 v18, v12, v2
865; GCN-NEXT:    v_mul_lo_u32 v19, v13, v3
866; GCN-NEXT:    v_sub_u32_e32 v6, vcc, v6, v14
867; GCN-NEXT:    v_sub_u32_e32 v7, vcc, v7, v16
868; GCN-NEXT:    v_sub_u32_e32 v8, vcc, v8, v18
869; GCN-NEXT:    v_sub_u32_e32 v9, vcc, v9, v19
870; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
871; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
872; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
873; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
874; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v6, v0
875; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v7, v1
876; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
877; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v3
878; GCN-NEXT:    v_sub_u32_e32 v18, vcc, v6, v0
879; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[0:1]
880; GCN-NEXT:    v_sub_u32_e32 v15, vcc, v7, v1
881; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v17, s[2:3]
882; GCN-NEXT:    v_sub_u32_e32 v17, vcc, v8, v2
883; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
884; GCN-NEXT:    v_sub_u32_e32 v14, vcc, v9, v3
885; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v16, s[6:7]
886; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v18, s[0:1]
887; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v10
888; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[2:3]
889; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v11
890; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v17, s[4:5]
891; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v12
892; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v14, s[6:7]
893; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v13
894; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v0
895; GCN-NEXT:    v_cndmask_b32_e32 v0, v10, v16, vcc
896; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v1
897; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v15, vcc
898; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v2
899; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v17, vcc
900; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v3
901; GCN-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc
902; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
903; GCN-NEXT:    s_endpgm
904;
905; GFX1030-LABEL: udiv_v4i32:
906; GFX1030:       ; %bb.0:
907; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
908; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
909; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
910; GFX1030-NEXT:    s_clause 0x1
911; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
912; GFX1030-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3]
913; GFX1030-NEXT:    s_waitcnt vmcnt(1)
914; GFX1030-NEXT:    v_readfirstlane_b32 s2, v0
915; GFX1030-NEXT:    v_readfirstlane_b32 s3, v1
916; GFX1030-NEXT:    s_waitcnt vmcnt(0)
917; GFX1030-NEXT:    v_readfirstlane_b32 s7, v4
918; GFX1030-NEXT:    v_readfirstlane_b32 s5, v2
919; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s2
920; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, s3
921; GFX1030-NEXT:    s_sub_i32 s6, 0, s2
922; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
923; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v1, v1
924; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
925; GFX1030-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
926; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
927; GFX1030-NEXT:    v_cvt_u32_f32_e32 v1, v1
928; GFX1030-NEXT:    v_readfirstlane_b32 s4, v0
929; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s5
930; GFX1030-NEXT:    v_readfirstlane_b32 s9, v1
931; GFX1030-NEXT:    s_mul_i32 s6, s6, s4
932; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
933; GFX1030-NEXT:    s_mul_hi_u32 s6, s4, s6
934; GFX1030-NEXT:    s_add_i32 s4, s4, s6
935; GFX1030-NEXT:    s_mul_hi_u32 s4, s7, s4
936; GFX1030-NEXT:    s_mul_i32 s6, s4, s2
937; GFX1030-NEXT:    s_sub_i32 s6, s7, s6
938; GFX1030-NEXT:    s_add_i32 s7, s4, 1
939; GFX1030-NEXT:    s_sub_i32 s8, s6, s2
940; GFX1030-NEXT:    s_cmp_ge_u32 s6, s2
941; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
942; GFX1030-NEXT:    s_cselect_b32 s4, s7, s4
943; GFX1030-NEXT:    s_cselect_b32 s6, s8, s6
944; GFX1030-NEXT:    s_add_i32 s7, s4, 1
945; GFX1030-NEXT:    s_cmp_ge_u32 s6, s2
946; GFX1030-NEXT:    v_readfirstlane_b32 s2, v3
947; GFX1030-NEXT:    s_cselect_b32 s4, s7, s4
948; GFX1030-NEXT:    s_sub_i32 s6, 0, s3
949; GFX1030-NEXT:    v_readfirstlane_b32 s7, v5
950; GFX1030-NEXT:    s_mul_i32 s6, s6, s9
951; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
952; GFX1030-NEXT:    s_mul_hi_u32 s6, s9, s6
953; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, s2
954; GFX1030-NEXT:    s_add_i32 s9, s9, s6
955; GFX1030-NEXT:    s_mul_hi_u32 s6, s7, s9
956; GFX1030-NEXT:    v_readfirstlane_b32 s10, v0
957; GFX1030-NEXT:    s_mul_i32 s8, s6, s3
958; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v1, v1
959; GFX1030-NEXT:    s_sub_i32 s7, s7, s8
960; GFX1030-NEXT:    s_add_i32 s8, s6, 1
961; GFX1030-NEXT:    s_sub_i32 s9, s7, s3
962; GFX1030-NEXT:    s_cmp_ge_u32 s7, s3
963; GFX1030-NEXT:    s_cselect_b32 s6, s8, s6
964; GFX1030-NEXT:    s_cselect_b32 s7, s9, s7
965; GFX1030-NEXT:    s_add_i32 s8, s6, 1
966; GFX1030-NEXT:    s_cmp_ge_u32 s7, s3
967; GFX1030-NEXT:    v_readfirstlane_b32 s7, v6
968; GFX1030-NEXT:    s_cselect_b32 s3, s8, s6
969; GFX1030-NEXT:    s_sub_i32 s6, 0, s5
970; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
971; GFX1030-NEXT:    s_mul_i32 s6, s6, s10
972; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
973; GFX1030-NEXT:    s_mul_hi_u32 s6, s10, s6
974; GFX1030-NEXT:    s_add_i32 s10, s10, s6
975; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
976; GFX1030-NEXT:    s_mul_hi_u32 s6, s7, s10
977; GFX1030-NEXT:    s_mul_i32 s8, s6, s5
978; GFX1030-NEXT:    s_sub_i32 s7, s7, s8
979; GFX1030-NEXT:    s_add_i32 s8, s6, 1
980; GFX1030-NEXT:    s_sub_i32 s9, s7, s5
981; GFX1030-NEXT:    s_cmp_ge_u32 s7, s5
982; GFX1030-NEXT:    v_readfirstlane_b32 s10, v0
983; GFX1030-NEXT:    s_cselect_b32 s6, s8, s6
984; GFX1030-NEXT:    s_cselect_b32 s7, s9, s7
985; GFX1030-NEXT:    s_add_i32 s8, s6, 1
986; GFX1030-NEXT:    s_cmp_ge_u32 s7, s5
987; GFX1030-NEXT:    v_readfirstlane_b32 s7, v7
988; GFX1030-NEXT:    s_cselect_b32 s5, s8, s6
989; GFX1030-NEXT:    s_sub_i32 s6, 0, s2
990; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
991; GFX1030-NEXT:    s_mul_i32 s6, s6, s10
992; GFX1030-NEXT:    v_mov_b32_e32 v2, s5
993; GFX1030-NEXT:    s_mul_hi_u32 s6, s10, s6
994; GFX1030-NEXT:    s_add_i32 s10, s10, s6
995; GFX1030-NEXT:    s_mul_hi_u32 s6, s7, s10
996; GFX1030-NEXT:    s_mul_i32 s8, s6, s2
997; GFX1030-NEXT:    s_sub_i32 s7, s7, s8
998; GFX1030-NEXT:    s_add_i32 s8, s6, 1
999; GFX1030-NEXT:    s_sub_i32 s9, s7, s2
1000; GFX1030-NEXT:    s_cmp_ge_u32 s7, s2
1001; GFX1030-NEXT:    s_cselect_b32 s6, s8, s6
1002; GFX1030-NEXT:    s_cselect_b32 s7, s9, s7
1003; GFX1030-NEXT:    s_add_i32 s8, s6, 1
1004; GFX1030-NEXT:    s_cmp_ge_u32 s7, s2
1005; GFX1030-NEXT:    s_cselect_b32 s2, s8, s6
1006; GFX1030-NEXT:    v_mov_b32_e32 v3, s2
1007; GFX1030-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
1008; GFX1030-NEXT:    s_endpgm
1009;
1010; EG-LABEL: udiv_v4i32:
1011; EG:       ; %bb.0:
1012; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1013; EG-NEXT:    TEX 1 @6
1014; EG-NEXT:    ALU 65, @11, KC0[CB0:0-32], KC1[]
1015; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
1016; EG-NEXT:    CF_END
1017; EG-NEXT:    PAD
1018; EG-NEXT:    Fetch clause starting at 6:
1019; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
1020; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
1021; EG-NEXT:    ALU clause starting at 10:
1022; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1023; EG-NEXT:    ALU clause starting at 11:
1024; EG-NEXT:     SUB_INT T2.W, 0.0, T1.W,
1025; EG-NEXT:     RECIP_UINT * T2.X, T1.W,
1026; EG-NEXT:     MULLO_INT * T2.Y, PV.W, PS,
1027; EG-NEXT:     MULHI * T2.Y, T2.X, PS,
1028; EG-NEXT:     ADD_INT * T2.W, T2.X, PS,
1029; EG-NEXT:     MULHI * T2.X, T0.W, PV.W,
1030; EG-NEXT:     MULLO_INT * T2.Y, PS, T1.W,
1031; EG-NEXT:     SUB_INT T2.W, 0.0, T1.X,
1032; EG-NEXT:     RECIP_UINT * T2.Z, T1.X,
1033; EG-NEXT:     MULLO_INT * T2.W, PV.W, PS,
1034; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Y,
1035; EG-NEXT:     RECIP_UINT * T3.X, T1.Y,
1036; EG-NEXT:     MULLO_INT * T3.Y, PV.W, PS,
1037; EG-NEXT:     SUB_INT T3.W, 0.0, T1.Z,
1038; EG-NEXT:     RECIP_UINT * T3.Z, T1.Z,
1039; EG-NEXT:     MULLO_INT * T3.W, PV.W, PS,
1040; EG-NEXT:     MULHI * T3.W, T3.Z, PS,
1041; EG-NEXT:     ADD_INT T3.W, T3.Z, PS,
1042; EG-NEXT:     MULHI * T3.Y, T3.X, T3.Y,
1043; EG-NEXT:     ADD_INT T4.W, T3.X, PS,
1044; EG-NEXT:     MULHI * T3.X, T0.Z, PV.W,
1045; EG-NEXT:     MULHI * T3.Y, T0.Y, PV.W,
1046; EG-NEXT:     MULLO_INT * T3.Z, PS, T1.Y,
1047; EG-NEXT:     SUB_INT T3.W, T0.Y, PS,
1048; EG-NEXT:     MULLO_INT * T0.Y, T3.X, T1.Z,
1049; EG-NEXT:     SUB_INT T4.X, T0.Z, PS,
1050; EG-NEXT:     ADD_INT T0.Y, T3.Y, 1,
1051; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.Y,
1052; EG-NEXT:     SUB_INT T4.W, PV.W, T1.Y,
1053; EG-NEXT:     MULHI * T2.W, T2.Z, T2.W,
1054; EG-NEXT:     CNDE_INT T5.X, PV.Z, T3.W, PV.W,
1055; EG-NEXT:     CNDE_INT T0.Y, PV.Z, T3.Y, PV.Y, BS:VEC_021/SCL_122
1056; EG-NEXT:     SETGE_UINT T0.Z, PV.X, T1.Z,
1057; EG-NEXT:     ADD_INT T2.W, T2.Z, PS,
1058; EG-NEXT:     SUB_INT * T0.W, T0.W, T2.Y,
1059; EG-NEXT:     ADD_INT T6.X, T3.X, 1,
1060; EG-NEXT:     ADD_INT T2.Y, T2.X, 1, BS:VEC_120/SCL_212
1061; EG-NEXT:     SETGE_UINT T2.Z, PS, T1.W,
1062; EG-NEXT:     SUB_INT T3.W, PS, T1.W,
1063; EG-NEXT:     MULHI * T2.W, T0.X, PV.W,
1064; EG-NEXT:     SUB_INT T7.X, T4.X, T1.Z,
1065; EG-NEXT:     CNDE_INT T3.Y, PV.Z, T0.W, PV.W,
1066; EG-NEXT:     CNDE_INT T2.Z, PV.Z, T2.X, PV.Y,
1067; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T3.X, PV.X, BS:VEC_021/SCL_122
1068; EG-NEXT:     MULLO_INT * T2.X, T2.W, T1.X,
1069; EG-NEXT:     ADD_INT T3.X, T0.W, 1,
1070; EG-NEXT:     ADD_INT T2.Y, T2.Z, 1,
1071; EG-NEXT:     SETGE_UINT T3.Z, T3.Y, T1.W,
1072; EG-NEXT:     SUB_INT T1.W, T0.X, PS, BS:VEC_201
1073; EG-NEXT:     CNDE_INT * T3.W, T0.Z, T4.X, T7.X,
1074; EG-NEXT:     SETGE_UINT T0.X, PS, T1.Z, BS:VEC_021/SCL_122
1075; EG-NEXT:     ADD_INT T3.Y, T2.W, 1,
1076; EG-NEXT:     SETGE_UINT T0.Z, PV.W, T1.X,
1077; EG-NEXT:     SUB_INT T3.W, PV.W, T1.X,
1078; EG-NEXT:     CNDE_INT * T4.W, PV.Z, T2.Z, PV.Y,
1079; EG-NEXT:     CNDE_INT T2.X, PV.Z, T1.W, PV.W,
1080; EG-NEXT:     CNDE_INT T2.Y, PV.Z, T2.W, PV.Y, BS:VEC_021/SCL_122
1081; EG-NEXT:     CNDE_INT T4.Z, PV.X, T0.W, T3.X, BS:VEC_201
1082; EG-NEXT:     ADD_INT T0.W, T0.Y, 1,
1083; EG-NEXT:     SETGE_UINT * T1.W, T5.X, T1.Y,
1084; EG-NEXT:     CNDE_INT T4.Y, PS, T0.Y, PV.W,
1085; EG-NEXT:     ADD_INT T0.W, PV.Y, 1,
1086; EG-NEXT:     SETGE_UINT * T1.W, PV.X, T1.X,
1087; EG-NEXT:     CNDE_INT T4.X, PS, T2.Y, PV.W,
1088; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1089; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1090  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
1091  %a = load <4 x i32>, ptr addrspace(1) %in
1092  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
1093  %result = udiv <4 x i32> %a, %b
1094  store <4 x i32> %result, ptr addrspace(1) %out
1095  ret void
1096}
1097
; Unsigned i32 division by the power-of-two constant 16.
; The dividend is loaded from %in and the quotient is stored to %out.
; Every check block below expects a single logical shift right by 4
; (v_lshrrev_b32 / LSHR with literal 4) and no division expansion.
; The assertion lines are autogenerated (see the NOTE on the first line of
; the file); regenerate them instead of editing them by hand.
1098define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1099; SI-LABEL: udiv_i32_div_pow2:
1100; SI:       ; %bb.0:
1101; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1102; SI-NEXT:    s_mov_b32 s7, 0xf000
1103; SI-NEXT:    s_mov_b32 s6, -1
1104; SI-NEXT:    s_mov_b32 s10, s6
1105; SI-NEXT:    s_mov_b32 s11, s7
1106; SI-NEXT:    s_waitcnt lgkmcnt(0)
1107; SI-NEXT:    s_mov_b32 s8, s2
1108; SI-NEXT:    s_mov_b32 s9, s3
1109; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1110; SI-NEXT:    s_mov_b32 s4, s0
1111; SI-NEXT:    s_mov_b32 s5, s1
1112; SI-NEXT:    s_waitcnt vmcnt(0)
1113; SI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
1114; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1115; SI-NEXT:    s_endpgm
1116;
1117; VI-LABEL: udiv_i32_div_pow2:
1118; VI:       ; %bb.0:
1119; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1120; VI-NEXT:    s_mov_b32 s7, 0xf000
1121; VI-NEXT:    s_mov_b32 s6, -1
1122; VI-NEXT:    s_mov_b32 s10, s6
1123; VI-NEXT:    s_mov_b32 s11, s7
1124; VI-NEXT:    s_waitcnt lgkmcnt(0)
1125; VI-NEXT:    s_mov_b32 s8, s2
1126; VI-NEXT:    s_mov_b32 s9, s3
1127; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1128; VI-NEXT:    s_mov_b32 s4, s0
1129; VI-NEXT:    s_mov_b32 s5, s1
1130; VI-NEXT:    s_waitcnt vmcnt(0)
1131; VI-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
1132; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1133; VI-NEXT:    s_endpgm
1134;
1135; GCN-LABEL: udiv_i32_div_pow2:
1136; GCN:       ; %bb.0:
1137; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1138; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1139; GCN-NEXT:    v_mov_b32_e32 v0, s2
1140; GCN-NEXT:    v_mov_b32_e32 v1, s3
1141; GCN-NEXT:    flat_load_dword v2, v[0:1]
1142; GCN-NEXT:    v_mov_b32_e32 v0, s0
1143; GCN-NEXT:    v_mov_b32_e32 v1, s1
1144; GCN-NEXT:    s_waitcnt vmcnt(0)
1145; GCN-NEXT:    v_lshrrev_b32_e32 v2, 4, v2
1146; GCN-NEXT:    flat_store_dword v[0:1], v2
1147; GCN-NEXT:    s_endpgm
1148;
1149; GFX1030-LABEL: udiv_i32_div_pow2:
1150; GFX1030:       ; %bb.0:
1151; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1152; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1153; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1154; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
1155; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1156; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
1157; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1158; GFX1030-NEXT:    s_endpgm
1159;
1160; EG-LABEL: udiv_i32_div_pow2:
1161; EG:       ; %bb.0:
1162; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1163; EG-NEXT:    TEX 0 @6
1164; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1165; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1166; EG-NEXT:    CF_END
1167; EG-NEXT:    PAD
1168; EG-NEXT:    Fetch clause starting at 6:
1169; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1170; EG-NEXT:    ALU clause starting at 8:
1171; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1172; EG-NEXT:    ALU clause starting at 9:
1173; EG-NEXT:     LSHR T0.X, T0.X, literal.x,
1174; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1175; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
; NOTE(review): %b_ptr is not referenced by any later instruction in this
; function; the divisor is the immediate 16, so only %in itself is loaded.
1176  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1177  %a = load i32, ptr addrspace(1) %in
1178  %result = udiv i32 %a, 16
1179  store i32 %result, ptr addrspace(1) %out
1180  ret void
1181}
1182
; Unsigned i32 division by the even, non-power-of-two constant 34259182.
; The dividend is loaded from %in and the quotient is stored to %out.
; Every check block below expects a multiply-high by the constant 0xfabbd9c1
; followed by a logical shift right by 25, with no division expansion.
; The EG block prints the same 32-bit multiplier as the signed literal
; -88352319.
; The assertion lines are autogenerated (see the NOTE on the first line of
; the file); regenerate them instead of editing them by hand.
1183define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1184; SI-LABEL: udiv_i32_div_k_even:
1185; SI:       ; %bb.0:
1186; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1187; SI-NEXT:    s_mov_b32 s7, 0xf000
1188; SI-NEXT:    s_mov_b32 s6, -1
1189; SI-NEXT:    s_mov_b32 s10, s6
1190; SI-NEXT:    s_mov_b32 s11, s7
1191; SI-NEXT:    s_waitcnt lgkmcnt(0)
1192; SI-NEXT:    s_mov_b32 s8, s2
1193; SI-NEXT:    s_mov_b32 s9, s3
1194; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1195; SI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
1196; SI-NEXT:    s_mov_b32 s4, s0
1197; SI-NEXT:    s_mov_b32 s5, s1
1198; SI-NEXT:    s_waitcnt vmcnt(0)
1199; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
1200; SI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
1201; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1202; SI-NEXT:    s_endpgm
1203;
1204; VI-LABEL: udiv_i32_div_k_even:
1205; VI:       ; %bb.0:
1206; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1207; VI-NEXT:    s_mov_b32 s7, 0xf000
1208; VI-NEXT:    s_mov_b32 s6, -1
1209; VI-NEXT:    s_mov_b32 s10, s6
1210; VI-NEXT:    s_mov_b32 s11, s7
1211; VI-NEXT:    s_waitcnt lgkmcnt(0)
1212; VI-NEXT:    s_mov_b32 s8, s2
1213; VI-NEXT:    s_mov_b32 s9, s3
1214; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1215; VI-NEXT:    s_mov_b32 s2, 0xfabbd9c1
1216; VI-NEXT:    s_mov_b32 s4, s0
1217; VI-NEXT:    s_mov_b32 s5, s1
1218; VI-NEXT:    s_waitcnt vmcnt(0)
1219; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
1220; VI-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
1221; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1222; VI-NEXT:    s_endpgm
1223;
1224; GCN-LABEL: udiv_i32_div_k_even:
1225; GCN:       ; %bb.0:
1226; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1227; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1228; GCN-NEXT:    v_mov_b32_e32 v0, s2
1229; GCN-NEXT:    v_mov_b32_e32 v1, s3
1230; GCN-NEXT:    flat_load_dword v0, v[0:1]
1231; GCN-NEXT:    s_mov_b32 s2, 0xfabbd9c1
1232; GCN-NEXT:    v_mov_b32_e32 v1, s1
1233; GCN-NEXT:    s_waitcnt vmcnt(0)
1234; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
1235; GCN-NEXT:    v_mov_b32_e32 v0, s0
1236; GCN-NEXT:    v_lshrrev_b32_e32 v2, 25, v2
1237; GCN-NEXT:    flat_store_dword v[0:1], v2
1238; GCN-NEXT:    s_endpgm
1239;
1240; GFX1030-LABEL: udiv_i32_div_k_even:
1241; GFX1030:       ; %bb.0:
1242; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1243; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1244; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1245; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
1246; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1247; GFX1030-NEXT:    v_mul_hi_u32 v1, 0xfabbd9c1, v1
1248; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
1249; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1250; GFX1030-NEXT:    s_endpgm
1251;
1252; EG-LABEL: udiv_i32_div_k_even:
1253; EG:       ; %bb.0:
1254; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1255; EG-NEXT:    TEX 0 @6
1256; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
1257; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1258; EG-NEXT:    CF_END
1259; EG-NEXT:    PAD
1260; EG-NEXT:    Fetch clause starting at 6:
1261; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1262; EG-NEXT:    ALU clause starting at 8:
1263; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1264; EG-NEXT:    ALU clause starting at 9:
1265; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
1266; EG-NEXT:    -88352319(-4.876880e+35), 0(0.000000e+00)
1267; EG-NEXT:     LSHR T0.X, PS, literal.x,
1268; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1269; EG-NEXT:    25(3.503246e-44), 2(2.802597e-45)
; NOTE(review): %b_ptr is not referenced by any later instruction in this
; function; the divisor is an immediate, so only %in itself is loaded.
1270  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1271  %a = load i32, ptr addrspace(1) %in
1272  %result = udiv i32 %a, 34259182
1273  store i32 %result, ptr addrspace(1) %out
1274  ret void
1275}
1276
; Unsigned i32 division by the odd constant 34259183.
; The dividend is loaded from %in and the quotient is stored to %out.
; Every check block below expects a multiply-high by the constant 0x7d5deca3
; (printed in decimal as 2103307427 in the EG block) followed by a logical
; shift right by 24, with no division expansion.
; The assertion lines are autogenerated (see the NOTE on the first line of
; the file); regenerate them instead of editing them by hand.
1277define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1278; SI-LABEL: udiv_i32_div_k_odd:
1279; SI:       ; %bb.0:
1280; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1281; SI-NEXT:    s_mov_b32 s7, 0xf000
1282; SI-NEXT:    s_mov_b32 s6, -1
1283; SI-NEXT:    s_mov_b32 s10, s6
1284; SI-NEXT:    s_mov_b32 s11, s7
1285; SI-NEXT:    s_waitcnt lgkmcnt(0)
1286; SI-NEXT:    s_mov_b32 s8, s2
1287; SI-NEXT:    s_mov_b32 s9, s3
1288; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1289; SI-NEXT:    s_mov_b32 s2, 0x7d5deca3
1290; SI-NEXT:    s_mov_b32 s4, s0
1291; SI-NEXT:    s_mov_b32 s5, s1
1292; SI-NEXT:    s_waitcnt vmcnt(0)
1293; SI-NEXT:    v_mul_hi_u32 v0, v0, s2
1294; SI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1295; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1296; SI-NEXT:    s_endpgm
1297;
1298; VI-LABEL: udiv_i32_div_k_odd:
1299; VI:       ; %bb.0:
1300; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1301; VI-NEXT:    s_mov_b32 s7, 0xf000
1302; VI-NEXT:    s_mov_b32 s6, -1
1303; VI-NEXT:    s_mov_b32 s10, s6
1304; VI-NEXT:    s_mov_b32 s11, s7
1305; VI-NEXT:    s_waitcnt lgkmcnt(0)
1306; VI-NEXT:    s_mov_b32 s8, s2
1307; VI-NEXT:    s_mov_b32 s9, s3
1308; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1309; VI-NEXT:    s_mov_b32 s2, 0x7d5deca3
1310; VI-NEXT:    s_mov_b32 s4, s0
1311; VI-NEXT:    s_mov_b32 s5, s1
1312; VI-NEXT:    s_waitcnt vmcnt(0)
1313; VI-NEXT:    v_mul_hi_u32 v0, v0, s2
1314; VI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1315; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1316; VI-NEXT:    s_endpgm
1317;
1318; GCN-LABEL: udiv_i32_div_k_odd:
1319; GCN:       ; %bb.0:
1320; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1321; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1322; GCN-NEXT:    v_mov_b32_e32 v0, s2
1323; GCN-NEXT:    v_mov_b32_e32 v1, s3
1324; GCN-NEXT:    flat_load_dword v0, v[0:1]
1325; GCN-NEXT:    s_mov_b32 s2, 0x7d5deca3
1326; GCN-NEXT:    v_mov_b32_e32 v1, s1
1327; GCN-NEXT:    s_waitcnt vmcnt(0)
1328; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
1329; GCN-NEXT:    v_mov_b32_e32 v0, s0
1330; GCN-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1331; GCN-NEXT:    flat_store_dword v[0:1], v2
1332; GCN-NEXT:    s_endpgm
1333;
1334; GFX1030-LABEL: udiv_i32_div_k_odd:
1335; GFX1030:       ; %bb.0:
1336; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1337; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1338; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1339; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
1340; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1341; GFX1030-NEXT:    v_mul_hi_u32 v1, 0x7d5deca3, v1
1342; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1343; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1344; GFX1030-NEXT:    s_endpgm
1345;
1346; EG-LABEL: udiv_i32_div_k_odd:
1347; EG:       ; %bb.0:
1348; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1349; EG-NEXT:    TEX 0 @6
1350; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
1351; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1352; EG-NEXT:    CF_END
1353; EG-NEXT:    PAD
1354; EG-NEXT:    Fetch clause starting at 6:
1355; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1356; EG-NEXT:    ALU clause starting at 8:
1357; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1358; EG-NEXT:    ALU clause starting at 9:
1359; EG-NEXT:     MULHI * T0.X, T0.X, literal.x,
1360; EG-NEXT:    2103307427(1.843675e+37), 0(0.000000e+00)
1361; EG-NEXT:     LSHR T0.X, PS, literal.x,
1362; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1363; EG-NEXT:    24(3.363116e-44), 2(2.802597e-45)
; NOTE(review): %b_ptr is not referenced by any later instruction in this
; function; the divisor is an immediate, so only %in itself is loaded.
1364  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1365  %a = load i32, ptr addrspace(1) %in
1366  %result = udiv i32 %a, 34259183
1367  store i32 %result, ptr addrspace(1) %out
1368  ret void
1369}
1370
; Unsigned i8 division with both operands loaded from memory.
; The numerator is the byte at %in and the denominator is the byte at
; %in + 1; the i8 quotient is zero-extended to i32 and stored to %out.
; The check blocks below expect a floating-point sequence instead of an
; integer division expansion: convert both bytes to f32, take the reciprocal
; of the denominator, multiply, truncate, form the remainder with a
; multiply-add, compare it against the denominator, add the compare result
; to the converted quotient, and finally mask the result with 0xff.
; In the GCN and GFX1030 check blocks both bytes come from one 16-bit load
; and are split with v_cvt_f32_ubyte1 / v_cvt_f32_ubyte0; the other blocks
; use two separate byte loads.
; The assertion lines are autogenerated (see the NOTE on the first line of
; the file); regenerate them instead of editing them by hand.
1371define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1372; SI-LABEL: v_udiv_i8:
1373; SI:       ; %bb.0:
1374; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1375; SI-NEXT:    s_mov_b32 s7, 0xf000
1376; SI-NEXT:    s_mov_b32 s6, -1
1377; SI-NEXT:    s_mov_b32 s10, s6
1378; SI-NEXT:    s_mov_b32 s11, s7
1379; SI-NEXT:    s_waitcnt lgkmcnt(0)
1380; SI-NEXT:    s_mov_b32 s8, s2
1381; SI-NEXT:    s_mov_b32 s9, s3
1382; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1383; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
1384; SI-NEXT:    s_mov_b32 s4, s0
1385; SI-NEXT:    s_mov_b32 s5, s1
1386; SI-NEXT:    s_waitcnt vmcnt(1)
1387; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1388; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1389; SI-NEXT:    s_waitcnt vmcnt(0)
1390; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1391; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1392; SI-NEXT:    v_trunc_f32_e32 v2, v2
1393; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1394; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1395; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1396; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1397; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
1398; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1399; SI-NEXT:    s_endpgm
1400;
1401; VI-LABEL: v_udiv_i8:
1402; VI:       ; %bb.0:
1403; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1404; VI-NEXT:    s_mov_b32 s7, 0xf000
1405; VI-NEXT:    s_mov_b32 s6, -1
1406; VI-NEXT:    s_mov_b32 s10, s6
1407; VI-NEXT:    s_mov_b32 s11, s7
1408; VI-NEXT:    s_waitcnt lgkmcnt(0)
1409; VI-NEXT:    s_mov_b32 s8, s2
1410; VI-NEXT:    s_mov_b32 s9, s3
1411; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1412; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
1413; VI-NEXT:    s_mov_b32 s4, s0
1414; VI-NEXT:    s_mov_b32 s5, s1
1415; VI-NEXT:    s_waitcnt vmcnt(1)
1416; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1417; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1418; VI-NEXT:    s_waitcnt vmcnt(0)
1419; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1420; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1421; VI-NEXT:    v_trunc_f32_e32 v2, v2
1422; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1423; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1424; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1425; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1426; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
1427; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1428; VI-NEXT:    s_endpgm
1429;
1430; GCN-LABEL: v_udiv_i8:
1431; GCN:       ; %bb.0:
1432; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1433; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1434; GCN-NEXT:    v_mov_b32_e32 v0, s2
1435; GCN-NEXT:    v_mov_b32_e32 v1, s3
1436; GCN-NEXT:    flat_load_ushort v2, v[0:1]
1437; GCN-NEXT:    v_mov_b32_e32 v0, s0
1438; GCN-NEXT:    v_mov_b32_e32 v1, s1
1439; GCN-NEXT:    s_waitcnt vmcnt(0)
1440; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
1441; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v3
1442; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
1443; GCN-NEXT:    v_mul_f32_e32 v4, v2, v4
1444; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1445; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1446; GCN-NEXT:    v_mad_f32 v2, -v4, v3, v2
1447; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
1448; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1449; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
1450; GCN-NEXT:    flat_store_dword v[0:1], v2
1451; GCN-NEXT:    s_endpgm
1452;
1453; GFX1030-LABEL: v_udiv_i8:
1454; GFX1030:       ; %bb.0:
1455; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1456; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1457; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1458; GFX1030-NEXT:    global_load_ushort v1, v0, s[2:3]
1459; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1460; GFX1030-NEXT:    v_cvt_f32_ubyte1_e32 v2, v1
1461; GFX1030-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1462; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v2
1463; GFX1030-NEXT:    v_mul_f32_e32 v3, v1, v3
1464; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1465; GFX1030-NEXT:    v_fma_f32 v1, -v3, v2, v1
1466; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1467; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
1468; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1469; GFX1030-NEXT:    v_and_b32_e32 v1, 0xff, v1
1470; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1471; GFX1030-NEXT:    s_endpgm
1472;
1473; EG-LABEL: v_udiv_i8:
1474; EG:       ; %bb.0:
1475; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1476; EG-NEXT:    TEX 1 @6
1477; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
1478; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1479; EG-NEXT:    CF_END
1480; EG-NEXT:    PAD
1481; EG-NEXT:    Fetch clause starting at 6:
1482; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 1, #1
1483; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1484; EG-NEXT:    ALU clause starting at 10:
1485; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1486; EG-NEXT:    ALU clause starting at 11:
1487; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
1488; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
1489; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
1490; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
1491; EG-NEXT:     TRUNC * T0.W, PV.W,
1492; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
1493; EG-NEXT:     TRUNC * T0.W, PV.W,
1494; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
1495; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1496; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
1497; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1498; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1499; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1500; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1501; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
; Denominator address is one byte past %in; numerator is read from %in itself.
1502  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
1503  %num = load i8, ptr addrspace(1) %in
1504  %den = load i8, ptr addrspace(1) %den_ptr
1505  %result = udiv i8 %num, %den
; Widen the 8-bit quotient so a full dword is stored to %out.
1506  %result.ext = zext i8 %result to i32
1507  store i32 %result.ext, ptr addrspace(1) %out
1508  ret void
1509}
1510
; Unsigned i16 division with both operands loaded from memory.
; %in[0] is the numerator and %in[1] the denominator; the quotient is
; zero-extended to i32 and stored to %out.
; Every run line expects a float-based expansion rather than an integer divide:
; both operands are converted to f32, the numerator is multiplied by the
; reciprocal of the denominator and truncated, one is added when
; |num - q * den| >= den, and the result is masked to 16 bits (0xffff / 65535).
; The fiji and gfx1030 checks fetch both operands with a single dword load and
; pick the halves with SDWA WORD_0 / WORD_1 selects.
1511define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1512; SI-LABEL: v_udiv_i16:
1513; SI:       ; %bb.0:
1514; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1515; SI-NEXT:    s_mov_b32 s7, 0xf000
1516; SI-NEXT:    s_mov_b32 s6, -1
1517; SI-NEXT:    s_mov_b32 s10, s6
1518; SI-NEXT:    s_mov_b32 s11, s7
1519; SI-NEXT:    s_waitcnt lgkmcnt(0)
1520; SI-NEXT:    s_mov_b32 s8, s2
1521; SI-NEXT:    s_mov_b32 s9, s3
1522; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1523; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
1524; SI-NEXT:    s_mov_b32 s4, s0
1525; SI-NEXT:    s_mov_b32 s5, s1
1526; SI-NEXT:    s_waitcnt vmcnt(1)
1527; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1528; SI-NEXT:    s_waitcnt vmcnt(0)
1529; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1530; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1531; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1532; SI-NEXT:    v_trunc_f32_e32 v2, v2
1533; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1534; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1535; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1536; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1537; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1538; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1539; SI-NEXT:    s_endpgm
1540;
1541; VI-LABEL: v_udiv_i16:
1542; VI:       ; %bb.0:
1543; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1544; VI-NEXT:    s_mov_b32 s7, 0xf000
1545; VI-NEXT:    s_mov_b32 s6, -1
1546; VI-NEXT:    s_mov_b32 s10, s6
1547; VI-NEXT:    s_mov_b32 s11, s7
1548; VI-NEXT:    s_waitcnt lgkmcnt(0)
1549; VI-NEXT:    s_mov_b32 s8, s2
1550; VI-NEXT:    s_mov_b32 s9, s3
1551; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1552; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
1553; VI-NEXT:    s_mov_b32 s4, s0
1554; VI-NEXT:    s_mov_b32 s5, s1
1555; VI-NEXT:    s_waitcnt vmcnt(1)
1556; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1557; VI-NEXT:    s_waitcnt vmcnt(0)
1558; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1559; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1560; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1561; VI-NEXT:    v_trunc_f32_e32 v2, v2
1562; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1563; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1564; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1565; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1566; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1567; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1568; VI-NEXT:    s_endpgm
1569;
1570; GCN-LABEL: v_udiv_i16:
1571; GCN:       ; %bb.0:
1572; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1573; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1574; GCN-NEXT:    v_mov_b32_e32 v0, s2
1575; GCN-NEXT:    v_mov_b32_e32 v1, s3
1576; GCN-NEXT:    flat_load_dword v0, v[0:1]
1577; GCN-NEXT:    v_mov_b32_e32 v1, s1
1578; GCN-NEXT:    s_waitcnt vmcnt(0)
1579; GCN-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1580; GCN-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1581; GCN-NEXT:    v_mov_b32_e32 v0, s0
1582; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1583; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1584; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1585; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1586; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
1587; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
1588; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1589; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1590; GCN-NEXT:    flat_store_dword v[0:1], v2
1591; GCN-NEXT:    s_endpgm
1592;
1593; GFX1030-LABEL: v_udiv_i16:
1594; GFX1030:       ; %bb.0:
1595; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1596; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1597; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1598; GFX1030-NEXT:    global_load_dword v1, v0, s[2:3]
1599; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1600; GFX1030-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1601; GFX1030-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1602; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v2
1603; GFX1030-NEXT:    v_mul_f32_e32 v3, v1, v3
1604; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1605; GFX1030-NEXT:    v_fma_f32 v1, -v3, v2, v1
1606; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1607; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
1608; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1609; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1610; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1611; GFX1030-NEXT:    s_endpgm
1612;
1613; EG-LABEL: v_udiv_i16:
1614; EG:       ; %bb.0:
1615; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1616; EG-NEXT:    TEX 1 @6
1617; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
1618; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1619; EG-NEXT:    CF_END
1620; EG-NEXT:    PAD
1621; EG-NEXT:    Fetch clause starting at 6:
1622; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1623; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1624; EG-NEXT:    ALU clause starting at 10:
1625; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1626; EG-NEXT:    ALU clause starting at 11:
1627; EG-NEXT:     UINT_TO_FLT * T0.Y, T1.X,
1628; EG-NEXT:     RECIP_IEEE * T0.Z, PS,
1629; EG-NEXT:     UINT_TO_FLT * T0.X, T0.X,
1630; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Z,
1631; EG-NEXT:     TRUNC * T0.W, PV.W,
1632; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
1633; EG-NEXT:     TRUNC * T0.W, PV.W,
1634; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.Y|,
1635; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1636; EG-NEXT:     FLT_TO_UINT * T0.X, T0.W,
1637; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1638; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1639; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1640; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1641; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
  ; The denominator is the i16 element immediately after the numerator.
1642  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
1643  %num = load i16, ptr addrspace(1) %in
1644  %den = load i16, ptr addrspace(1) %den_ptr
1645  %result = udiv i16 %num, %den
  ; Widen to i32 so the expected store is a single dword.
1646  %result.ext = zext i16 %result to i32
1647  store i32 %result.ext, ptr addrspace(1) %out
1648  ret void
1649}
1650
; Unsigned i23 division with both operands loaded from memory; the quotient is
; zero-extended to i32 and stored to %out.
; The expected loads assemble each operand from a ushort plus a ubyte: byte
; offsets 0 and 2 for the numerator, 4 and 6 for the denominator, so element 1
; of the i23 array sits 4 bytes after element 0.
; Every run line expects the float-based expansion (convert to f32, multiply by
; the reciprocal, truncate, add one when |num - q * den| >= den) followed by a
; mask to 23 bits (0x7fffff / 8388607).
1651define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1652; SI-LABEL: v_udiv_i23:
1653; SI:       ; %bb.0:
1654; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1655; SI-NEXT:    s_mov_b32 s7, 0xf000
1656; SI-NEXT:    s_mov_b32 s6, -1
1657; SI-NEXT:    s_mov_b32 s10, s6
1658; SI-NEXT:    s_mov_b32 s11, s7
1659; SI-NEXT:    s_waitcnt lgkmcnt(0)
1660; SI-NEXT:    s_mov_b32 s8, s2
1661; SI-NEXT:    s_mov_b32 s9, s3
1662; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1663; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1664; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1665; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1666; SI-NEXT:    s_mov_b32 s4, s0
1667; SI-NEXT:    s_mov_b32 s5, s1
1668; SI-NEXT:    s_waitcnt vmcnt(3)
1669; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1670; SI-NEXT:    s_waitcnt vmcnt(2)
1671; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1672; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1673; SI-NEXT:    s_waitcnt vmcnt(1)
1674; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1675; SI-NEXT:    s_waitcnt vmcnt(0)
1676; SI-NEXT:    v_or_b32_e32 v1, v3, v1
1677; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1678; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1679; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1680; SI-NEXT:    v_trunc_f32_e32 v2, v2
1681; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1682; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1683; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1684; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1685; SI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
1686; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1687; SI-NEXT:    s_endpgm
1688;
1689; VI-LABEL: v_udiv_i23:
1690; VI:       ; %bb.0:
1691; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1692; VI-NEXT:    s_mov_b32 s7, 0xf000
1693; VI-NEXT:    s_mov_b32 s6, -1
1694; VI-NEXT:    s_mov_b32 s10, s6
1695; VI-NEXT:    s_mov_b32 s11, s7
1696; VI-NEXT:    s_waitcnt lgkmcnt(0)
1697; VI-NEXT:    s_mov_b32 s8, s2
1698; VI-NEXT:    s_mov_b32 s9, s3
1699; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1700; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1701; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1702; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1703; VI-NEXT:    s_mov_b32 s4, s0
1704; VI-NEXT:    s_mov_b32 s5, s1
1705; VI-NEXT:    s_waitcnt vmcnt(3)
1706; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1707; VI-NEXT:    s_waitcnt vmcnt(2)
1708; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1709; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1710; VI-NEXT:    s_waitcnt vmcnt(1)
1711; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1712; VI-NEXT:    s_waitcnt vmcnt(0)
1713; VI-NEXT:    v_or_b32_e32 v1, v3, v1
1714; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1715; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1716; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1717; VI-NEXT:    v_trunc_f32_e32 v2, v2
1718; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1719; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1720; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1721; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1722; VI-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
1723; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1724; VI-NEXT:    s_endpgm
1725;
1726; GCN-LABEL: v_udiv_i23:
1727; GCN:       ; %bb.0:
1728; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1729; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1730; GCN-NEXT:    s_add_u32 s4, s2, 4
1731; GCN-NEXT:    s_addc_u32 s5, s3, 0
1732; GCN-NEXT:    s_add_u32 s6, s2, 2
1733; GCN-NEXT:    s_addc_u32 s7, s3, 0
1734; GCN-NEXT:    v_mov_b32_e32 v0, s6
1735; GCN-NEXT:    v_mov_b32_e32 v1, s7
1736; GCN-NEXT:    s_add_u32 s6, s2, 6
1737; GCN-NEXT:    s_addc_u32 s7, s3, 0
1738; GCN-NEXT:    v_mov_b32_e32 v2, s6
1739; GCN-NEXT:    v_mov_b32_e32 v3, s7
1740; GCN-NEXT:    v_mov_b32_e32 v4, s4
1741; GCN-NEXT:    v_mov_b32_e32 v5, s5
1742; GCN-NEXT:    flat_load_ubyte v6, v[2:3]
1743; GCN-NEXT:    flat_load_ushort v4, v[4:5]
1744; GCN-NEXT:    v_mov_b32_e32 v2, s2
1745; GCN-NEXT:    v_mov_b32_e32 v3, s3
1746; GCN-NEXT:    flat_load_ubyte v0, v[0:1]
1747; GCN-NEXT:    flat_load_ushort v1, v[2:3]
1748; GCN-NEXT:    s_waitcnt vmcnt(3)
1749; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
1750; GCN-NEXT:    s_waitcnt vmcnt(2)
1751; GCN-NEXT:    v_or_b32_e32 v2, v4, v2
1752; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
1753; GCN-NEXT:    s_waitcnt vmcnt(1)
1754; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1755; GCN-NEXT:    s_waitcnt vmcnt(0)
1756; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
1757; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
1758; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1759; GCN-NEXT:    v_mov_b32_e32 v0, s0
1760; GCN-NEXT:    v_mov_b32_e32 v1, s1
1761; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1762; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1763; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1764; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
1765; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
1766; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1767; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffff, v2
1768; GCN-NEXT:    flat_store_dword v[0:1], v2
1769; GCN-NEXT:    s_endpgm
1770;
1771; GFX1030-LABEL: v_udiv_i23:
1772; GFX1030:       ; %bb.0:
1773; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1774; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1775; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1776; GFX1030-NEXT:    s_clause 0x3
1777; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
1778; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
1779; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
1780; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
1781; GFX1030-NEXT:    s_waitcnt vmcnt(3)
1782; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1783; GFX1030-NEXT:    s_waitcnt vmcnt(2)
1784; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
1785; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1786; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
1787; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
1788; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1789; GFX1030-NEXT:    v_or_b32_e32 v2, v4, v2
1790; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1791; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
1792; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
1793; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1794; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
1795; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1796; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
1797; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1798; GFX1030-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
1799; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1800; GFX1030-NEXT:    s_endpgm
1801;
1802; EG-LABEL: v_udiv_i23:
1803; EG:       ; %bb.0:
1804; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1805; EG-NEXT:    TEX 3 @6
1806; EG-NEXT:    ALU 20, @15, KC0[CB0:0-32], KC1[]
1807; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1808; EG-NEXT:    CF_END
1809; EG-NEXT:    PAD
1810; EG-NEXT:    Fetch clause starting at 6:
1811; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
1812; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1813; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
1814; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1815; EG-NEXT:    ALU clause starting at 14:
1816; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1817; EG-NEXT:    ALU clause starting at 15:
1818; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1819; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1820; EG-NEXT:     OR_INT T0.W, T0.X, PV.W,
1821; EG-NEXT:     LSHL * T1.W, T3.X, literal.x,
1822; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1823; EG-NEXT:     UINT_TO_FLT * T0.X, PV.W,
1824; EG-NEXT:     OR_INT T0.W, T2.X, T1.W,
1825; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
1826; EG-NEXT:     UINT_TO_FLT * T0.Z, PV.W,
1827; EG-NEXT:     MUL_IEEE * T0.W, PS, T0.Y,
1828; EG-NEXT:     TRUNC * T0.W, PV.W,
1829; EG-NEXT:     MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z,
1830; EG-NEXT:     TRUNC * T0.W, PV.W,
1831; EG-NEXT:     SETGE * T1.W, |PV.W|, |T0.X|,
1832; EG-NEXT:     CNDE T1.W, PV.W, 0.0, literal.x,
1833; EG-NEXT:    FLT_TO_UINT * T0.X, T0.W,
1834; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1835; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1836; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
1837; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1838; EG-NEXT:    8388607(1.175494e-38), 2(2.802597e-45)
  ; The denominator is element 1 of the i23 array that starts at %in.
1839  %den_ptr = getelementptr i23, ptr addrspace(1) %in, i23 1
1840  %num = load i23, ptr addrspace(1) %in
1841  %den = load i23, ptr addrspace(1) %den_ptr
1842  %result = udiv i23 %num, %den
  ; Widen to i32 so the expected store is a single dword.
1843  %result.ext = zext i23 %result to i32
1844  store i32 %result.ext, ptr addrspace(1) %out
1845  ret void
1846}
1847
; Unsigned i24 division with both operands loaded from memory; the quotient is
; zero-extended to i32 and stored to %out.
; The expected loads match the i23 layout: a ushort plus a ubyte per operand at
; byte offsets 0/2 (numerator) and 4/6 (denominator).
; The four amdgcn run lines expect the float-based expansion (convert to f32,
; multiply by the reciprocal, truncate, add one when |num - q * den| >= den)
; followed by a mask to 24 bits (0xffffff).
; The r600 run line instead expects an integer expansion: RECIP_UINT with a
; MULLO_INT / MULHI refinement, then two SETGE_UINT / CNDE_INT correction
; steps, and no trailing AND mask.
1848define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1849; SI-LABEL: v_udiv_i24:
1850; SI:       ; %bb.0:
1851; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1852; SI-NEXT:    s_mov_b32 s7, 0xf000
1853; SI-NEXT:    s_mov_b32 s6, -1
1854; SI-NEXT:    s_mov_b32 s10, s6
1855; SI-NEXT:    s_mov_b32 s11, s7
1856; SI-NEXT:    s_waitcnt lgkmcnt(0)
1857; SI-NEXT:    s_mov_b32 s8, s2
1858; SI-NEXT:    s_mov_b32 s9, s3
1859; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1860; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1861; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1862; SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1863; SI-NEXT:    s_mov_b32 s4, s0
1864; SI-NEXT:    s_mov_b32 s5, s1
1865; SI-NEXT:    s_waitcnt vmcnt(3)
1866; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1867; SI-NEXT:    s_waitcnt vmcnt(2)
1868; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1869; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1870; SI-NEXT:    s_waitcnt vmcnt(1)
1871; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1872; SI-NEXT:    s_waitcnt vmcnt(0)
1873; SI-NEXT:    v_or_b32_e32 v1, v3, v1
1874; SI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1875; SI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1876; SI-NEXT:    v_mul_f32_e32 v2, v1, v2
1877; SI-NEXT:    v_trunc_f32_e32 v2, v2
1878; SI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1879; SI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1880; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1881; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1882; SI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1883; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1884; SI-NEXT:    s_endpgm
1885;
1886; VI-LABEL: v_udiv_i24:
1887; VI:       ; %bb.0:
1888; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1889; VI-NEXT:    s_mov_b32 s7, 0xf000
1890; VI-NEXT:    s_mov_b32 s6, -1
1891; VI-NEXT:    s_mov_b32 s10, s6
1892; VI-NEXT:    s_mov_b32 s11, s7
1893; VI-NEXT:    s_waitcnt lgkmcnt(0)
1894; VI-NEXT:    s_mov_b32 s8, s2
1895; VI-NEXT:    s_mov_b32 s9, s3
1896; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1897; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1898; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1899; VI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1900; VI-NEXT:    s_mov_b32 s4, s0
1901; VI-NEXT:    s_mov_b32 s5, s1
1902; VI-NEXT:    s_waitcnt vmcnt(3)
1903; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1904; VI-NEXT:    s_waitcnt vmcnt(2)
1905; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1906; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
1907; VI-NEXT:    s_waitcnt vmcnt(1)
1908; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
1909; VI-NEXT:    s_waitcnt vmcnt(0)
1910; VI-NEXT:    v_or_b32_e32 v1, v3, v1
1911; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
1912; VI-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1913; VI-NEXT:    v_mul_f32_e32 v2, v1, v2
1914; VI-NEXT:    v_trunc_f32_e32 v2, v2
1915; VI-NEXT:    v_cvt_u32_f32_e32 v3, v2
1916; VI-NEXT:    v_mad_f32 v1, -v2, v0, v1
1917; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1918; VI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1919; VI-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
1920; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1921; VI-NEXT:    s_endpgm
1922;
1923; GCN-LABEL: v_udiv_i24:
1924; GCN:       ; %bb.0:
1925; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1926; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1927; GCN-NEXT:    s_add_u32 s4, s2, 4
1928; GCN-NEXT:    s_addc_u32 s5, s3, 0
1929; GCN-NEXT:    s_add_u32 s6, s2, 2
1930; GCN-NEXT:    s_addc_u32 s7, s3, 0
1931; GCN-NEXT:    v_mov_b32_e32 v0, s6
1932; GCN-NEXT:    v_mov_b32_e32 v1, s7
1933; GCN-NEXT:    s_add_u32 s6, s2, 6
1934; GCN-NEXT:    s_addc_u32 s7, s3, 0
1935; GCN-NEXT:    v_mov_b32_e32 v2, s6
1936; GCN-NEXT:    v_mov_b32_e32 v3, s7
1937; GCN-NEXT:    v_mov_b32_e32 v4, s4
1938; GCN-NEXT:    v_mov_b32_e32 v5, s5
1939; GCN-NEXT:    flat_load_ubyte v6, v[2:3]
1940; GCN-NEXT:    flat_load_ushort v4, v[4:5]
1941; GCN-NEXT:    v_mov_b32_e32 v2, s2
1942; GCN-NEXT:    v_mov_b32_e32 v3, s3
1943; GCN-NEXT:    flat_load_ubyte v0, v[0:1]
1944; GCN-NEXT:    flat_load_ushort v1, v[2:3]
1945; GCN-NEXT:    s_waitcnt vmcnt(3)
1946; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
1947; GCN-NEXT:    s_waitcnt vmcnt(2)
1948; GCN-NEXT:    v_or_b32_e32 v2, v4, v2
1949; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
1950; GCN-NEXT:    s_waitcnt vmcnt(1)
1951; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1952; GCN-NEXT:    s_waitcnt vmcnt(0)
1953; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
1954; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v0
1955; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1956; GCN-NEXT:    v_mov_b32_e32 v0, s0
1957; GCN-NEXT:    v_mov_b32_e32 v1, s1
1958; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
1959; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1960; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1961; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
1962; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
1963; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1964; GCN-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
1965; GCN-NEXT:    flat_store_dword v[0:1], v2
1966; GCN-NEXT:    s_endpgm
1967;
1968; GFX1030-LABEL: v_udiv_i24:
1969; GFX1030:       ; %bb.0:
1970; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1971; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
1972; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
1973; GFX1030-NEXT:    s_clause 0x3
1974; GFX1030-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
1975; GFX1030-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
1976; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
1977; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
1978; GFX1030-NEXT:    s_waitcnt vmcnt(3)
1979; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1980; GFX1030-NEXT:    s_waitcnt vmcnt(2)
1981; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
1982; GFX1030-NEXT:    s_waitcnt vmcnt(1)
1983; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
1984; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, v1
1985; GFX1030-NEXT:    s_waitcnt vmcnt(0)
1986; GFX1030-NEXT:    v_or_b32_e32 v2, v4, v2
1987; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1988; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v2
1989; GFX1030-NEXT:    v_mul_f32_e32 v3, v2, v3
1990; GFX1030-NEXT:    v_trunc_f32_e32 v3, v3
1991; GFX1030-NEXT:    v_fma_f32 v2, -v3, v1, v2
1992; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
1993; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
1994; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1995; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
1996; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
1997; GFX1030-NEXT:    s_endpgm
1998;
1999; EG-LABEL: v_udiv_i24:
2000; EG:       ; %bb.0:
2001; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
2002; EG-NEXT:    TEX 3 @6
2003; EG-NEXT:    ALU 23, @15, KC0[CB0:0-32], KC1[]
2004; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2005; EG-NEXT:    CF_END
2006; EG-NEXT:    PAD
2007; EG-NEXT:    Fetch clause starting at 6:
2008; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
2009; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
2010; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
2011; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
2012; EG-NEXT:    ALU clause starting at 14:
2013; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2014; EG-NEXT:    ALU clause starting at 15:
2015; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
2016; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2017; EG-NEXT:     OR_INT * T0.W, T0.X, PV.W,
2018; EG-NEXT:     SUB_INT T1.W, 0.0, PV.W,
2019; EG-NEXT:     RECIP_UINT * T0.X, PV.W,
2020; EG-NEXT:     MULLO_INT * T0.Y, PV.W, PS,
2021; EG-NEXT:     LSHL T1.W, T3.X, literal.x,
2022; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
2023; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2024; EG-NEXT:     ADD_INT T2.W, T0.X, PS,
2025; EG-NEXT:     OR_INT * T1.W, T2.X, PV.W,
2026; EG-NEXT:     MULHI * T0.X, PS, PV.W,
2027; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2028; EG-NEXT:     SUB_INT * T1.W, T1.W, PS,
2029; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
2030; EG-NEXT:     SETGE_UINT T2.W, PV.W, T0.W,
2031; EG-NEXT:     SUB_INT * T3.W, PV.W, T0.W,
2032; EG-NEXT:     CNDE_INT T1.W, PV.W, T1.W, PS,
2033; EG-NEXT:     CNDE_INT * T2.W, PV.W, T0.X, PV.Z,
2034; EG-NEXT:     ADD_INT T3.W, PS, 1,
2035; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.W,
2036; EG-NEXT:     CNDE_INT T0.X, PS, T2.W, PV.W,
2037; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2038; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The denominator is element 1 of the i24 array that starts at %in.
2039  %den_ptr = getelementptr i24, ptr addrspace(1) %in, i24 1
2040  %num = load i24, ptr addrspace(1) %in
2041  %den = load i24, ptr addrspace(1) %den_ptr
2042  %result = udiv i24 %num, %den
  ; Widen to i32 so the expected store is a single dword.
2043  %result.ext = zext i24 %result to i32
2044  store i32 %result.ext, ptr addrspace(1) %out
2045  ret void
2046}
2047
; Vector udiv by a constant: every lane of a <4 x i32> loaded from %in is
; divided by 53668 and the result vector is stored to %out.
; Note the parameter order here is (%in, %out).
; Every run line expects one 128-bit load and one 128-bit store, and for each
; lane the sequence: shift right by 2, multiply-high by 0x1389c755
; (327796565 decimal in the r600 checks), shift right by 10.
; No reciprocal or divide instruction appears in the expected output.
2048define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) {
2049; SI-LABEL: scalarize_mulhu_4xi32:
2050; SI:       ; %bb.0:
2051; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2052; SI-NEXT:    s_mov_b32 s7, 0xf000
2053; SI-NEXT:    s_mov_b32 s6, -1
2054; SI-NEXT:    s_waitcnt lgkmcnt(0)
2055; SI-NEXT:    s_mov_b32 s4, s0
2056; SI-NEXT:    s_mov_b32 s5, s1
2057; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2058; SI-NEXT:    s_mov_b32 s0, 0x1389c755
2059; SI-NEXT:    s_mov_b32 s4, s2
2060; SI-NEXT:    s_mov_b32 s5, s3
2061; SI-NEXT:    s_waitcnt vmcnt(0)
2062; SI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2063; SI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2064; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2065; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2066; SI-NEXT:    v_mul_hi_u32 v0, v0, s0
2067; SI-NEXT:    v_mul_hi_u32 v1, v1, s0
2068; SI-NEXT:    v_mul_hi_u32 v2, v2, s0
2069; SI-NEXT:    v_mul_hi_u32 v3, v3, s0
2070; SI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2071; SI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2072; SI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2073; SI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2074; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2075; SI-NEXT:    s_endpgm
2076;
2077; VI-LABEL: scalarize_mulhu_4xi32:
2078; VI:       ; %bb.0:
2079; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2080; VI-NEXT:    s_mov_b32 s7, 0xf000
2081; VI-NEXT:    s_mov_b32 s6, -1
2082; VI-NEXT:    s_waitcnt lgkmcnt(0)
2083; VI-NEXT:    s_mov_b32 s4, s0
2084; VI-NEXT:    s_mov_b32 s5, s1
2085; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2086; VI-NEXT:    s_mov_b32 s0, 0x1389c755
2087; VI-NEXT:    s_mov_b32 s4, s2
2088; VI-NEXT:    s_mov_b32 s5, s3
2089; VI-NEXT:    s_waitcnt vmcnt(0)
2090; VI-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2091; VI-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2092; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2093; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2094; VI-NEXT:    v_mul_hi_u32 v0, v0, s0
2095; VI-NEXT:    v_mul_hi_u32 v1, v1, s0
2096; VI-NEXT:    v_mul_hi_u32 v2, v2, s0
2097; VI-NEXT:    v_mul_hi_u32 v3, v3, s0
2098; VI-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2099; VI-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2100; VI-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2101; VI-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2102; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2103; VI-NEXT:    s_endpgm
2104;
2105; GCN-LABEL: scalarize_mulhu_4xi32:
2106; GCN:       ; %bb.0:
2107; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2108; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2109; GCN-NEXT:    v_mov_b32_e32 v0, s0
2110; GCN-NEXT:    v_mov_b32_e32 v1, s1
2111; GCN-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2112; GCN-NEXT:    s_mov_b32 s0, 0x1389c755
2113; GCN-NEXT:    v_mov_b32_e32 v4, s2
2114; GCN-NEXT:    v_mov_b32_e32 v5, s3
2115; GCN-NEXT:    s_waitcnt vmcnt(0)
2116; GCN-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2117; GCN-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2118; GCN-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2119; GCN-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2120; GCN-NEXT:    v_mul_hi_u32 v0, v0, s0
2121; GCN-NEXT:    v_mul_hi_u32 v1, v1, s0
2122; GCN-NEXT:    v_mul_hi_u32 v2, v2, s0
2123; GCN-NEXT:    v_mul_hi_u32 v3, v3, s0
2124; GCN-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2125; GCN-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2126; GCN-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2127; GCN-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2128; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2129; GCN-NEXT:    s_endpgm
2130;
2131; GFX1030-LABEL: scalarize_mulhu_4xi32:
2132; GFX1030:       ; %bb.0:
2133; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2134; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
2135; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2136; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[0:1]
2137; GFX1030-NEXT:    s_waitcnt vmcnt(0)
2138; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
2139; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 2, v1
2140; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
2141; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
2142; GFX1030-NEXT:    v_mul_hi_u32 v0, 0x1389c755, v0
2143; GFX1030-NEXT:    v_mul_hi_u32 v1, 0x1389c755, v1
2144; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x1389c755, v2
2145; GFX1030-NEXT:    v_mul_hi_u32 v3, 0x1389c755, v3
2146; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 10, v0
2147; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
2148; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 10, v2
2149; GFX1030-NEXT:    v_lshrrev_b32_e32 v3, 10, v3
2150; GFX1030-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
2151; GFX1030-NEXT:    s_endpgm
2152;
2153; EG-LABEL: scalarize_mulhu_4xi32:
2154; EG:       ; %bb.0:
2155; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
2156; EG-NEXT:    TEX 0 @6
2157; EG-NEXT:    ALU 20, @9, KC0[CB0:0-32], KC1[]
2158; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2159; EG-NEXT:    CF_END
2160; EG-NEXT:    PAD
2161; EG-NEXT:    Fetch clause starting at 6:
2162; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2163; EG-NEXT:    ALU clause starting at 8:
2164; EG-NEXT:     MOV * T0.X, KC0[2].Y,
2165; EG-NEXT:    ALU clause starting at 9:
2166; EG-NEXT:     LSHR T0.W, T0.W, literal.x,
2167; EG-NEXT:     LSHR * T1.W, T0.Z, literal.x,
2168; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2169; EG-NEXT:     MULHI * T0.Z, PV.W, literal.x,
2170; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2171; EG-NEXT:     LSHR T1.Z, T0.Y, literal.x,
2172; EG-NEXT:     LSHR T0.W, PS, literal.y,
2173; EG-NEXT:     MULHI * T0.Y, T1.W, literal.z,
2174; EG-NEXT:    2(2.802597e-45), 10(1.401298e-44)
2175; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2176; EG-NEXT:     LSHR T0.Z, PS, literal.x,
2177; EG-NEXT:     LSHR T1.W, T0.X, literal.y,
2178; EG-NEXT:     MULHI * T0.X, PV.Z, literal.z,
2179; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
2180; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2181; EG-NEXT:     LSHR T0.Y, PS, literal.x,
2182; EG-NEXT:     MULHI * T0.X, PV.W, literal.y,
2183; EG-NEXT:    10(1.401298e-44), 327796565(3.478022e-27)
2184; EG-NEXT:     LSHR T0.X, PS, literal.x,
2185; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.y,
2186; EG-NEXT:    10(1.401298e-44), 2(2.802597e-45)
2187  %1 = load <4 x i32>, ptr addrspace(1) %in, align 16
  ; The same divisor is used in all four lanes.
2188  %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2189  store <4 x i32> %2, ptr addrspace(1) %out, align 16
2190  ret void
2191}
2192
; Scalar udiv of the kernel argument %p by the constant 2.
; Every run line expects the division to become a single logical shift right
; by one (s_lshr_b32 on amdgcn, LSHR on r600) followed by the store.
2193define amdgpu_kernel void @test_udiv2(i32 %p) {
2194; SI-LABEL: test_udiv2:
2195; SI:       ; %bb.0:
2196; SI-NEXT:    s_load_dword s0, s[4:5], 0x9
2197; SI-NEXT:    s_mov_b32 s3, 0xf000
2198; SI-NEXT:    s_mov_b32 s2, -1
2199; SI-NEXT:    s_waitcnt lgkmcnt(0)
2200; SI-NEXT:    s_lshr_b32 s0, s0, 1
2201; SI-NEXT:    v_mov_b32_e32 v0, s0
2202; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2203; SI-NEXT:    s_waitcnt vmcnt(0)
2204; SI-NEXT:    s_endpgm
2205;
2206; VI-LABEL: test_udiv2:
2207; VI:       ; %bb.0:
2208; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
2209; VI-NEXT:    s_mov_b32 s3, 0xf000
2210; VI-NEXT:    s_mov_b32 s2, -1
2211; VI-NEXT:    s_waitcnt lgkmcnt(0)
2212; VI-NEXT:    s_lshr_b32 s0, s0, 1
2213; VI-NEXT:    v_mov_b32_e32 v0, s0
2214; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2215; VI-NEXT:    s_waitcnt vmcnt(0)
2216; VI-NEXT:    s_endpgm
2217;
2218; GCN-LABEL: test_udiv2:
2219; GCN:       ; %bb.0:
2220; GCN-NEXT:    s_load_dword s0, s[8:9], 0x0
2221; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2222; GCN-NEXT:    s_lshr_b32 s0, s0, 1
2223; GCN-NEXT:    v_mov_b32_e32 v0, s0
2224; GCN-NEXT:    flat_store_dword v[0:1], v0
2225; GCN-NEXT:    s_waitcnt vmcnt(0)
2226; GCN-NEXT:    s_endpgm
2227;
2228; GFX1030-LABEL: test_udiv2:
2229; GFX1030:       ; %bb.0:
2230; GFX1030-NEXT:    s_load_dword s0, s[8:9], 0x0
2231; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2232; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2233; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2234; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2235; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2236; GFX1030-NEXT:    s_endpgm
2237;
2238; EG-LABEL: test_udiv2:
2239; EG:       ; %bb.0:
2240; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2241; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2242; EG-NEXT:    CF_END
2243; EG-NEXT:    PAD
2244; EG-NEXT:    ALU clause starting at 4:
2245; EG-NEXT:     MOV T0.X, literal.x,
2246; EG-NEXT:     LSHR * T1.X, KC0[2].Y, 1,
2247; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2248  %i = udiv i32 %p, 2
  ; The store is volatile and its address is undef; presumably it exists only
  ; to keep the quotient live, so the expected store address is not meaningful.
  ; NOTE(review): newer LLVM tests tend to use poison instead of undef here;
  ; confirm the checks are unchanged before switching.
2249  store volatile i32 %i, ptr addrspace(1) undef
2250  ret void
2251}
2252
; udiv by 3: the divisor is not a power of two, so the checks below expect the
; multiply-high form  mulhi(%p, 0xaaaaaaab) >> 1,  where
; 0xaaaaaaab == ceil(2^33 / 3). The r600 checks print the same 32-bit pattern
; as the signed literal -1431655765.
; In the gfx1030 checks the multiply stays on the scalar unit (s_mul_hi_u32 with
; an inline literal); the verde, tonga and fiji checks move the constant into a
; VGPR and use v_mul_hi_u32.
; NOTE(review): the volatile store to an undef address presumably exists only to
; keep the quotient live; confirm before changing it.
2253define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
2254; SI-LABEL: test_udiv_3_mulhu:
2255; SI:       ; %bb.0:
2256; SI-NEXT:    s_load_dword s0, s[4:5], 0x9
2257; SI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2258; SI-NEXT:    s_mov_b32 s3, 0xf000
2259; SI-NEXT:    s_mov_b32 s2, -1
2260; SI-NEXT:    s_waitcnt lgkmcnt(0)
2261; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
2262; SI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2263; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2264; SI-NEXT:    s_waitcnt vmcnt(0)
2265; SI-NEXT:    s_endpgm
2266;
2267; VI-LABEL: test_udiv_3_mulhu:
2268; VI:       ; %bb.0:
2269; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
2270; VI-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2271; VI-NEXT:    s_mov_b32 s3, 0xf000
2272; VI-NEXT:    s_mov_b32 s2, -1
2273; VI-NEXT:    s_waitcnt lgkmcnt(0)
2274; VI-NEXT:    v_mul_hi_u32 v0, s0, v0
2275; VI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2276; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2277; VI-NEXT:    s_waitcnt vmcnt(0)
2278; VI-NEXT:    s_endpgm
2279;
2280; GCN-LABEL: test_udiv_3_mulhu:
2281; GCN:       ; %bb.0:
2282; GCN-NEXT:    s_load_dword s0, s[8:9], 0x0
2283; GCN-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
2284; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2285; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
2286; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
2287; GCN-NEXT:    flat_store_dword v[0:1], v0
2288; GCN-NEXT:    s_waitcnt vmcnt(0)
2289; GCN-NEXT:    s_endpgm
2290;
2291; GFX1030-LABEL: test_udiv_3_mulhu:
2292; GFX1030:       ; %bb.0:
2293; GFX1030-NEXT:    s_load_dword s0, s[8:9], 0x0
2294; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
2295; GFX1030-NEXT:    s_mul_hi_u32 s0, s0, 0xaaaaaaab
2296; GFX1030-NEXT:    s_lshr_b32 s0, s0, 1
2297; GFX1030-NEXT:    v_mov_b32_e32 v0, s0
2298; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
2299; GFX1030-NEXT:    s_waitcnt_vscnt null, 0x0
2300; GFX1030-NEXT:    s_endpgm
2301;
2302; EG-LABEL: test_udiv_3_mulhu:
2303; EG:       ; %bb.0:
2304; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
2305; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2306; EG-NEXT:    CF_END
2307; EG-NEXT:    PAD
2308; EG-NEXT:    ALU clause starting at 4:
2309; EG-NEXT:     MULHI * T0.X, KC0[2].Y, literal.x,
2310; EG-NEXT:    -1431655765(-3.031649e-13), 0(0.000000e+00)
2311; EG-NEXT:     LSHR T0.X, PS, 1,
2312; EG-NEXT:     MOV * T1.X, literal.x,
2313; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2314   %i = udiv i32 %p, 3
2315   store volatile i32 %i, ptr addrspace(1) undef
2316   ret void
2317}
2318
; Despite the name, the IR below is an 8-bit signed integer division: two i8
; loads are sign-extended to i32, divided with sdiv (dividend loaded from the
; null pointer, divisor loaded through %arg), and the quotient is truncated
; back to i8 and stored to the null pointer.
; The checks show the division being carried out in f32: both operands are
; converted to float, the divisor's reciprocal is multiplied by the dividend,
; the product is truncated, and a sign-dependent +/-1 correction (built from
; xor / ashr 30 / or 1) is added when |remainder| >= |divisor|.
; The remainder is formed with v_mad_f32 in the verde, tonga and fiji checks and
; with v_fma_f32 in the gfx1030 checks.
; NOTE(review): the RUN lines at the top of the file pass differing
; -denormal-fp-math-f32 modes; presumably this test guards the float-based
; expansion against denormal-mode differences. Confirm against the commit that
; introduced it.
2319define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) {
2320; SI-LABEL: fdiv_test_denormals:
2321; SI:       ; %bb.0: ; %bb
2322; SI-NEXT:    s_mov_b32 s0, 0
2323; SI-NEXT:    s_mov_b32 s3, 0xf000
2324; SI-NEXT:    s_mov_b32 s2, -1
2325; SI-NEXT:    s_mov_b32 s1, s0
2326; SI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2327; SI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2328; SI-NEXT:    s_waitcnt vmcnt(1)
2329; SI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2330; SI-NEXT:    s_waitcnt vmcnt(0)
2331; SI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2332; SI-NEXT:    v_xor_b32_e32 v0, v1, v0
2333; SI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2334; SI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2335; SI-NEXT:    v_or_b32_e32 v0, 1, v0
2336; SI-NEXT:    v_mul_f32_e32 v1, v3, v4
2337; SI-NEXT:    v_trunc_f32_e32 v1, v1
2338; SI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2339; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2340; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2341; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2342; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2343; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2344; SI-NEXT:    s_endpgm
2345;
2346; VI-LABEL: fdiv_test_denormals:
2347; VI:       ; %bb.0: ; %bb
2348; VI-NEXT:    s_mov_b32 s0, 0
2349; VI-NEXT:    s_mov_b32 s3, 0xf000
2350; VI-NEXT:    s_mov_b32 s2, -1
2351; VI-NEXT:    s_mov_b32 s1, s0
2352; VI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
2353; VI-NEXT:    buffer_load_sbyte v1, off, s[0:3], 0
2354; VI-NEXT:    s_waitcnt vmcnt(1)
2355; VI-NEXT:    v_cvt_f32_i32_e32 v2, v0
2356; VI-NEXT:    s_waitcnt vmcnt(0)
2357; VI-NEXT:    v_cvt_f32_i32_e32 v3, v1
2358; VI-NEXT:    v_xor_b32_e32 v0, v1, v0
2359; VI-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
2360; VI-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2361; VI-NEXT:    v_or_b32_e32 v0, 1, v0
2362; VI-NEXT:    v_mul_f32_e32 v1, v3, v4
2363; VI-NEXT:    v_trunc_f32_e32 v1, v1
2364; VI-NEXT:    v_mad_f32 v3, -v1, v2, v3
2365; VI-NEXT:    v_cvt_i32_f32_e32 v1, v1
2366; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2367; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
2368; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2369; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2370; VI-NEXT:    s_endpgm
2371;
2372; GCN-LABEL: fdiv_test_denormals:
2373; GCN:       ; %bb.0: ; %bb
2374; GCN-NEXT:    flat_load_sbyte v2, v[0:1]
2375; GCN-NEXT:    v_mov_b32_e32 v0, 0
2376; GCN-NEXT:    v_mov_b32_e32 v1, 0
2377; GCN-NEXT:    flat_load_sbyte v3, v[0:1]
2378; GCN-NEXT:    s_waitcnt vmcnt(1)
2379; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v2
2380; GCN-NEXT:    s_waitcnt vmcnt(0)
2381; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v3
2382; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2383; GCN-NEXT:    v_xor_b32_e32 v2, v3, v2
2384; GCN-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2385; GCN-NEXT:    v_or_b32_e32 v2, 1, v2
2386; GCN-NEXT:    v_mul_f32_e32 v3, v5, v6
2387; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2388; GCN-NEXT:    v_mad_f32 v5, -v3, v4, v5
2389; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2390; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
2391; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
2392; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
2393; GCN-NEXT:    flat_store_byte v[0:1], v2
2394; GCN-NEXT:    s_endpgm
2395;
2396; GFX1030-LABEL: fdiv_test_denormals:
2397; GFX1030:       ; %bb.0: ; %bb
2398; GFX1030-NEXT:    global_load_sbyte v2, v[0:1], off
2399; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
2400; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
2401; GFX1030-NEXT:    global_load_sbyte v3, v[0:1], off
2402; GFX1030-NEXT:    s_waitcnt vmcnt(1)
2403; GFX1030-NEXT:    v_cvt_f32_i32_e32 v4, v2
2404; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v4
2405; GFX1030-NEXT:    s_waitcnt vmcnt(0)
2406; GFX1030-NEXT:    v_cvt_f32_i32_e32 v6, v3
2407; GFX1030-NEXT:    v_xor_b32_e32 v2, v3, v2
2408; GFX1030-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
2409; GFX1030-NEXT:    v_mul_f32_e32 v5, v6, v5
2410; GFX1030-NEXT:    v_or_b32_e32 v2, 1, v2
2411; GFX1030-NEXT:    v_trunc_f32_e32 v3, v5
2412; GFX1030-NEXT:    v_fma_f32 v5, -v3, v4, v6
2413; GFX1030-NEXT:    v_cvt_i32_f32_e32 v3, v3
2414; GFX1030-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v5|, |v4|
2415; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
2416; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v3, v2
2417; GFX1030-NEXT:    global_store_byte v[0:1], v2, off
2418; GFX1030-NEXT:    s_endpgm
2419;
2420; EG-LABEL: fdiv_test_denormals:
2421; EG:       ; %bb.0: ; %bb
2422; EG-NEXT:    TEX 0 @6
2423; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
2424; EG-NEXT:    TEX 0 @8
2425; EG-NEXT:    ALU 25, @11, KC0[], KC1[]
2426; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
2427; EG-NEXT:    CF_END
2428; EG-NEXT:    Fetch clause starting at 6:
2429; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
2430; EG-NEXT:    Fetch clause starting at 8:
2431; EG-NEXT:     VTX_READ_8 T1.X, T1.X, 0, #1
2432; EG-NEXT:    ALU clause starting at 10:
2433; EG-NEXT:     MOV * T1.X, 0.0,
2434; EG-NEXT:    ALU clause starting at 11:
2435; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
2436; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2437; EG-NEXT:     INT_TO_FLT * T0.X, PV.W,
2438; EG-NEXT:     BFE_INT T1.W, T1.X, 0.0, literal.x,
2439; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
2440; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2441; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
2442; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.Y,
2443; EG-NEXT:     TRUNC T2.W, PV.W,
2444; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
2445; EG-NEXT:     ASHR T0.W, PS, literal.x,
2446; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
2447; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
2448; EG-NEXT:     TRUNC T0.Z, T2.W,
2449; EG-NEXT:     SETGE T1.W, |PS|, |T0.X|,
2450; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
2451; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
2452; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
2453; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
2454; EG-NEXT:     AND_INT T0.X, PV.W, literal.x,
2455; EG-NEXT:     MOV * T0.W, literal.x,
2456; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2457; EG-NEXT:     MOV T0.Y, 0.0,
2458; EG-NEXT:     MOV * T0.Z, 0.0,
2459; EG-NEXT:     MOV * T1.X, literal.x,
2460; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2461bb:
2462  %tmp = load i8, ptr addrspace(1) null, align 1
2463  %tmp1 = sext i8 %tmp to i32
2464  %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 undef
2465  %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1
2466  %tmp4 = sext i8 %tmp3 to i32
2467  %tmp5 = sdiv i32 %tmp1, %tmp4
2468  %tmp6 = trunc i32 %tmp5 to i8
2469  store i8 %tmp6, ptr addrspace(1) null, align 1
2470  ret void
2471}
2472
; 64-bit udiv by the constant 100000 (= 2^5 * 3125) in a non-kernel function
; that returns its i64 result.
; The amdgcn checks expect three steps: a 64-bit shift right by 5
; (v_alignbit_b32 / v_lshrrev_b32), a 64x64 multiply-high by the constant
; 0x0a7c5ac471b47843 (used as the two 32-bit halves 0xa7c5ac4 and 0x71b47843),
; and a final 64-bit shift right by 7. That constant is approximately
; 2^71 / 3125.
; The verde checks build the multiply-high from 32-bit v_mul_lo/v_mul_hi plus
; carry adds; the tonga, fiji and gfx1030 checks use v_mad_u64_u32.
; The r600 checks for this function contain only CF_END and PAD.
2473define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
2474; SI-LABEL: v_test_udiv64_mulhi_fold:
2475; SI:       ; %bb.0:
2476; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2477; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 5
2478; SI-NEXT:    s_mov_b32 s4, 0x71b47843
2479; SI-NEXT:    v_lshrrev_b32_e32 v1, 5, v1
2480; SI-NEXT:    v_mul_hi_u32 v3, v0, s4
2481; SI-NEXT:    v_mul_lo_u32 v4, v1, s4
2482; SI-NEXT:    s_mov_b32 s6, 0xa7c5ac4
2483; SI-NEXT:    v_mul_hi_u32 v5, v1, s4
2484; SI-NEXT:    v_mul_hi_u32 v2, v0, s6
2485; SI-NEXT:    v_mul_lo_u32 v0, v0, s6
2486; SI-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
2487; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
2488; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
2489; SI-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2490; SI-NEXT:    v_mul_lo_u32 v2, v1, s6
2491; SI-NEXT:    v_mul_hi_u32 v1, v1, s6
2492; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
2493; SI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
2494; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2495; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
2496; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 7
2497; SI-NEXT:    v_lshrrev_b32_e32 v1, 7, v1
2498; SI-NEXT:    s_setpc_b64 s[30:31]
2499;
2500; VI-LABEL: v_test_udiv64_mulhi_fold:
2501; VI:       ; %bb.0:
2502; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2503; VI-NEXT:    v_alignbit_b32 v4, v1, v0, 5
2504; VI-NEXT:    s_mov_b32 s4, 0x71b47843
2505; VI-NEXT:    v_mul_hi_u32 v2, v4, s4
2506; VI-NEXT:    v_mov_b32_e32 v3, 0
2507; VI-NEXT:    v_lshrrev_b32_e32 v5, 5, v1
2508; VI-NEXT:    s_mov_b32 s6, 0xa7c5ac4
2509; VI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s4, v[2:3]
2510; VI-NEXT:    v_mov_b32_e32 v2, v0
2511; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3]
2512; VI-NEXT:    v_mov_b32_e32 v0, v1
2513; VI-NEXT:    v_mov_b32_e32 v1, v3
2514; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2515; VI-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
2516; VI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1]
2517; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 7
2518; VI-NEXT:    v_lshrrev_b32_e32 v1, 7, v1
2519; VI-NEXT:    s_setpc_b64 s[30:31]
2520;
2521; GCN-LABEL: v_test_udiv64_mulhi_fold:
2522; GCN:       ; %bb.0:
2523; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2524; GCN-NEXT:    v_alignbit_b32 v4, v1, v0, 5
2525; GCN-NEXT:    s_mov_b32 s4, 0x71b47843
2526; GCN-NEXT:    v_mul_hi_u32 v2, v4, s4
2527; GCN-NEXT:    v_mov_b32_e32 v3, 0
2528; GCN-NEXT:    v_lshrrev_b32_e32 v5, 5, v1
2529; GCN-NEXT:    s_mov_b32 s6, 0xa7c5ac4
2530; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s4, v[2:3]
2531; GCN-NEXT:    v_mov_b32_e32 v2, v0
2532; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3]
2533; GCN-NEXT:    v_mov_b32_e32 v0, v1
2534; GCN-NEXT:    v_mov_b32_e32 v1, v3
2535; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2536; GCN-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
2537; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1]
2538; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 7
2539; GCN-NEXT:    v_lshrrev_b32_e32 v1, 7, v1
2540; GCN-NEXT:    s_setpc_b64 s[30:31]
2541;
2542; GFX1030-LABEL: v_test_udiv64_mulhi_fold:
2543; GFX1030:       ; %bb.0:
2544; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2545; GFX1030-NEXT:    v_alignbit_b32 v4, v1, v0, 5
2546; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
2547; GFX1030-NEXT:    v_lshrrev_b32_e32 v5, 5, v1
2548; GFX1030-NEXT:    v_mul_hi_u32 v2, 0x71b47843, v4
2549; GFX1030-NEXT:    v_mad_u64_u32 v[0:1], null, 0x71b47843, v5, v[2:3]
2550; GFX1030-NEXT:    v_mov_b32_e32 v2, v0
2551; GFX1030-NEXT:    v_mov_b32_e32 v0, v1
2552; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, 0xa7c5ac4, v4, v[2:3]
2553; GFX1030-NEXT:    v_mov_b32_e32 v1, v3
2554; GFX1030-NEXT:    v_add_co_u32 v0, s4, v0, v1
2555; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s4
2556; GFX1030-NEXT:    v_mad_u64_u32 v[0:1], null, 0xa7c5ac4, v5, v[0:1]
2557; GFX1030-NEXT:    v_alignbit_b32 v0, v1, v0, 7
2558; GFX1030-NEXT:    v_lshrrev_b32_e32 v1, 7, v1
2559; GFX1030-NEXT:    s_setpc_b64 s[30:31]
2560;
2561; EG-LABEL: v_test_udiv64_mulhi_fold:
2562; EG:       ; %bb.0:
2563; EG-NEXT:    CF_END
2564; EG-NEXT:    PAD
2565  %d = udiv i64 %arg, 100000
2566  ret i64 %d
2567}
2568