xref: /llvm-project/llvm/test/CodeGen/AMDGPU/sdiv.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx600 | FileCheck %s --check-prefix=GCN
3; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=TONGA
4; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s --check-prefix=GFX9
5; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG
6
7; The code generated by sdiv is long and complex and may frequently change.
8; The goal of this test is to make sure the ISel doesn't fail.
9;
10; This program was previously failing to compile when one of the selectcc
11; opcodes generated by the sdiv lowering was being legalized and optimized to:
12; selectcc Remainder -1, 0, -1, SETGT
13; This was fixed by adding an additional pattern in R600Instructions.td to
14; match this pattern with a CNDGE_INT.
15
; sdiv_i32: signed 32-bit division where both operands are loaded from memory.
; %in points at two consecutive i32 values (dividend at index 0, divisor at
; index 1); the quotient is stored to %out.
; The assertion lines inside the function were autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file), one
; group per RUN line check prefix. Regenerate them with that script rather than
; editing them by hand.
16define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
17; GCN-LABEL: sdiv_i32:
18; GCN:       ; %bb.0:
19; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
20; GCN-NEXT:    s_mov_b32 s3, 0xf000
21; GCN-NEXT:    s_mov_b32 s2, -1
22; GCN-NEXT:    s_mov_b32 s10, s2
23; GCN-NEXT:    s_mov_b32 s11, s3
24; GCN-NEXT:    s_waitcnt lgkmcnt(0)
25; GCN-NEXT:    s_mov_b32 s8, s6
26; GCN-NEXT:    s_mov_b32 s9, s7
27; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
28; GCN-NEXT:    s_mov_b32 s0, s4
29; GCN-NEXT:    s_mov_b32 s1, s5
30; GCN-NEXT:    s_waitcnt vmcnt(0)
31; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
32; GCN-NEXT:    v_max_i32_e32 v2, v1, v2
33; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v2
34; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
35; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
36; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
37; GCN-NEXT:    v_max_i32_e32 v5, v0, v5
38; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
39; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
40; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
41; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
42; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
43; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
44; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
45; GCN-NEXT:    v_mul_hi_u32 v3, v5, v3
46; GCN-NEXT:    v_mul_lo_u32 v1, v3, v2
47; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
48; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
49; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v1, v2
50; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
51; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
52; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
53; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
54; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
55; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
56; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
57; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
58; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
59; GCN-NEXT:    s_endpgm
60;
61; TONGA-LABEL: sdiv_i32:
62; TONGA:       ; %bb.0:
63; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
64; TONGA-NEXT:    s_mov_b32 s3, 0xf000
65; TONGA-NEXT:    s_mov_b32 s2, -1
66; TONGA-NEXT:    s_mov_b32 s10, s2
67; TONGA-NEXT:    s_mov_b32 s11, s3
68; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
69; TONGA-NEXT:    s_mov_b32 s8, s6
70; TONGA-NEXT:    s_mov_b32 s9, s7
71; TONGA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
72; TONGA-NEXT:    s_mov_b32 s0, s4
73; TONGA-NEXT:    s_mov_b32 s1, s5
74; TONGA-NEXT:    s_waitcnt vmcnt(0)
75; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, 0, v1
76; TONGA-NEXT:    v_max_i32_e32 v2, v1, v2
77; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v2
78; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
79; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
80; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
81; TONGA-NEXT:    v_max_i32_e32 v5, v0, v5
82; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
83; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
84; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
85; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
86; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
87; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
88; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
89; TONGA-NEXT:    v_mul_hi_u32 v3, v5, v3
90; TONGA-NEXT:    v_mul_lo_u32 v1, v3, v2
91; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
92; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v5, v1
93; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, v1, v2
94; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
95; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
96; TONGA-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
97; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
98; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
99; TONGA-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
100; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v0
101; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
102; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
103; TONGA-NEXT:    s_endpgm
104;
105; GFX9-LABEL: sdiv_i32:
106; GFX9:       ; %bb.0:
107; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
108; GFX9-NEXT:    s_mov_b32 s3, 0xf000
109; GFX9-NEXT:    s_mov_b32 s2, -1
110; GFX9-NEXT:    s_mov_b32 s6, s2
111; GFX9-NEXT:    s_mov_b32 s7, s3
112; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
113; GFX9-NEXT:    s_mov_b32 s4, s10
114; GFX9-NEXT:    s_mov_b32 s5, s11
115; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
116; GFX9-NEXT:    s_mov_b32 s0, s8
117; GFX9-NEXT:    s_mov_b32 s1, s9
118; GFX9-NEXT:    s_waitcnt vmcnt(0)
119; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
120; GFX9-NEXT:    s_abs_i32 s5, s4
121; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
122; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
123; GFX9-NEXT:    s_sub_i32 s7, 0, s5
124; GFX9-NEXT:    s_xor_b32 s4, s6, s4
125; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
126; GFX9-NEXT:    s_abs_i32 s6, s6
127; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
128; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
129; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
130; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
131; GFX9-NEXT:    s_mul_i32 s7, s7, s8
132; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
133; GFX9-NEXT:    s_add_i32 s8, s8, s7
134; GFX9-NEXT:    s_mul_hi_u32 s7, s6, s8
135; GFX9-NEXT:    s_mul_i32 s8, s7, s5
136; GFX9-NEXT:    s_sub_i32 s6, s6, s8
137; GFX9-NEXT:    s_add_i32 s9, s7, 1
138; GFX9-NEXT:    s_sub_i32 s8, s6, s5
139; GFX9-NEXT:    s_cmp_ge_u32 s6, s5
140; GFX9-NEXT:    s_cselect_b32 s7, s9, s7
141; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
142; GFX9-NEXT:    s_add_i32 s8, s7, 1
143; GFX9-NEXT:    s_cmp_ge_u32 s6, s5
144; GFX9-NEXT:    s_cselect_b32 s5, s8, s7
145; GFX9-NEXT:    s_xor_b32 s5, s5, s4
146; GFX9-NEXT:    s_sub_i32 s4, s5, s4
147; GFX9-NEXT:    v_mov_b32_e32 v0, s4
148; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
149; GFX9-NEXT:    s_endpgm
150;
151; EG-LABEL: sdiv_i32:
152; EG:       ; %bb.0:
153; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
154; EG-NEXT:    TEX 0 @6
155; EG-NEXT:    ALU 26, @9, KC0[CB0:0-32], KC1[]
156; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
157; EG-NEXT:    CF_END
158; EG-NEXT:    PAD
159; EG-NEXT:    Fetch clause starting at 6:
160; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
161; EG-NEXT:    ALU clause starting at 8:
162; EG-NEXT:     MOV * T0.X, KC0[2].Z,
163; EG-NEXT:    ALU clause starting at 9:
164; EG-NEXT:     SETGT_INT * T0.W, 0.0, T0.Y,
165; EG-NEXT:     ADD_INT * T1.W, T0.Y, PV.W,
166; EG-NEXT:     XOR_INT * T1.W, PV.W, T0.W,
167; EG-NEXT:     SUB_INT T2.W, 0.0, PV.W,
168; EG-NEXT:     RECIP_UINT * T0.Y, PV.W,
169; EG-NEXT:     SETGT_INT T3.W, 0.0, T0.X,
170; EG-NEXT:     MULLO_INT * T0.Z, PV.W, PS,
171; EG-NEXT:     ADD_INT T2.W, T0.X, PV.W,
172; EG-NEXT:     MULHI * T0.X, T0.Y, PS,
173; EG-NEXT:     ADD_INT T4.W, T0.Y, PS,
174; EG-NEXT:     XOR_INT * T2.W, PV.W, T3.W,
175; EG-NEXT:     MULHI * T0.X, PS, PV.W,
176; EG-NEXT:     MULLO_INT * T0.Y, PS, T1.W,
177; EG-NEXT:     SUB_INT * T2.W, T2.W, PS,
178; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
179; EG-NEXT:     SETGE_UINT T4.W, PV.W, T1.W,
180; EG-NEXT:     SUB_INT * T5.W, PV.W, T1.W,
181; EG-NEXT:     CNDE_INT T2.W, PV.W, T2.W, PS,
182; EG-NEXT:     CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
183; EG-NEXT:     ADD_INT T5.W, PS, 1,
184; EG-NEXT:     SETGE_UINT * T1.W, PV.W, T1.W,
185; EG-NEXT:     CNDE_INT T1.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
186; EG-NEXT:     XOR_INT * T0.W, T3.W, T0.W,
187; EG-NEXT:     XOR_INT * T1.W, PV.W, PS,
188; EG-NEXT:     SUB_INT T0.X, PV.W, T0.W,
189; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
190; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The divisor is the i32 element immediately following the dividend in %in.
191  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
192  %num = load i32, ptr addrspace(1) %in
193  %den = load i32, ptr addrspace(1) %den_ptr
  ; Division under test: neither operand is a compile-time constant.
194  %result = sdiv i32 %num, %den
195  store i32 %result, ptr addrspace(1) %out
196  ret void
197}
198
; sdiv_i32_4: signed 32-bit division of a loaded value by the constant 4.
; The dividend is loaded from %in and the quotient is stored to %out.
; For every check prefix the autogenerated assertions expect a short
; shift/add/shift sequence (arithmetic shift by 31, logical shift by 30, add,
; arithmetic shift by 2) instead of a general division expansion.
199define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
200; GCN-LABEL: sdiv_i32_4:
201; GCN:       ; %bb.0:
202; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
203; GCN-NEXT:    s_mov_b32 s7, 0xf000
204; GCN-NEXT:    s_mov_b32 s6, -1
205; GCN-NEXT:    s_mov_b32 s10, s6
206; GCN-NEXT:    s_mov_b32 s11, s7
207; GCN-NEXT:    s_waitcnt lgkmcnt(0)
208; GCN-NEXT:    s_mov_b32 s8, s2
209; GCN-NEXT:    s_mov_b32 s9, s3
210; GCN-NEXT:    buffer_load_dword v0, off, s[8:11], 0
211; GCN-NEXT:    s_mov_b32 s4, s0
212; GCN-NEXT:    s_mov_b32 s5, s1
213; GCN-NEXT:    s_waitcnt vmcnt(0)
214; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
215; GCN-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
216; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
217; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
218; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
219; GCN-NEXT:    s_endpgm
220;
221; TONGA-LABEL: sdiv_i32_4:
222; TONGA:       ; %bb.0:
223; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
224; TONGA-NEXT:    s_mov_b32 s7, 0xf000
225; TONGA-NEXT:    s_mov_b32 s6, -1
226; TONGA-NEXT:    s_mov_b32 s10, s6
227; TONGA-NEXT:    s_mov_b32 s11, s7
228; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
229; TONGA-NEXT:    s_mov_b32 s8, s2
230; TONGA-NEXT:    s_mov_b32 s9, s3
231; TONGA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
232; TONGA-NEXT:    s_mov_b32 s4, s0
233; TONGA-NEXT:    s_mov_b32 s5, s1
234; TONGA-NEXT:    s_waitcnt vmcnt(0)
235; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
236; TONGA-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
237; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
238; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
239; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
240; TONGA-NEXT:    s_endpgm
241;
242; GFX9-LABEL: sdiv_i32_4:
243; GFX9:       ; %bb.0:
244; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
245; GFX9-NEXT:    s_mov_b32 s7, 0xf000
246; GFX9-NEXT:    s_mov_b32 s6, -1
247; GFX9-NEXT:    s_mov_b32 s10, s6
248; GFX9-NEXT:    s_mov_b32 s11, s7
249; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX9-NEXT:    s_mov_b32 s8, s2
251; GFX9-NEXT:    s_mov_b32 s9, s3
252; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
253; GFX9-NEXT:    s_mov_b32 s4, s0
254; GFX9-NEXT:    s_mov_b32 s5, s1
255; GFX9-NEXT:    s_waitcnt vmcnt(0)
256; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
257; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
258; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
259; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
260; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
261; GFX9-NEXT:    s_endpgm
262;
263; EG-LABEL: sdiv_i32_4:
264; EG:       ; %bb.0:
265; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
266; EG-NEXT:    TEX 0 @6
267; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
268; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
269; EG-NEXT:    CF_END
270; EG-NEXT:    PAD
271; EG-NEXT:    Fetch clause starting at 6:
272; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
273; EG-NEXT:    ALU clause starting at 8:
274; EG-NEXT:     MOV * T0.X, KC0[2].Z,
275; EG-NEXT:    ALU clause starting at 9:
276; EG-NEXT:     ASHR * T0.W, T0.X, literal.x,
277; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
278; EG-NEXT:     LSHR * T0.W, PV.W, literal.x,
279; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
280; EG-NEXT:     ADD_INT * T0.W, T0.X, PV.W,
281; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
282; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
283; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
284  %num = load i32, ptr addrspace(1) %in
  ; Division under test: the divisor is the power-of-two constant 4.
285  %result = sdiv i32 %num, 4
286  store i32 %result, ptr addrspace(1) %out
287  ret void
288}
289
290; Divide by a weird (non-power-of-two) constant to make sure setIntDivIsCheap
291; is working; the expected code multiplies by a magic constant instead.
292
; slow_sdiv_i32_3435: signed 32-bit division of a loaded value by the constant
; 3435. The dividend is loaded from %in and the quotient is stored to %out.
; For every check prefix the autogenerated assertions expect a signed
; multiply-high by the literal 0x98a1930b (printed as -1734241525 for the EG
; prefix), followed by an add and shifts by 31 and 11, with no division
; expansion.
293define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspace(1) %in) {
294; GCN-LABEL: slow_sdiv_i32_3435:
295; GCN:       ; %bb.0:
296; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
297; GCN-NEXT:    s_mov_b32 s7, 0xf000
298; GCN-NEXT:    s_mov_b32 s6, -1
299; GCN-NEXT:    s_mov_b32 s10, s6
300; GCN-NEXT:    s_mov_b32 s11, s7
301; GCN-NEXT:    s_waitcnt lgkmcnt(0)
302; GCN-NEXT:    s_mov_b32 s8, s2
303; GCN-NEXT:    s_mov_b32 s9, s3
304; GCN-NEXT:    buffer_load_dword v0, off, s[8:11], 0
305; GCN-NEXT:    s_mov_b32 s2, 0x98a1930b
306; GCN-NEXT:    s_mov_b32 s4, s0
307; GCN-NEXT:    s_mov_b32 s5, s1
308; GCN-NEXT:    s_waitcnt vmcnt(0)
309; GCN-NEXT:    v_mul_hi_i32 v1, v0, s2
310; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
311; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
312; GCN-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
313; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
314; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
315; GCN-NEXT:    s_endpgm
316;
317; TONGA-LABEL: slow_sdiv_i32_3435:
318; TONGA:       ; %bb.0:
319; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
320; TONGA-NEXT:    s_mov_b32 s7, 0xf000
321; TONGA-NEXT:    s_mov_b32 s6, -1
322; TONGA-NEXT:    s_mov_b32 s10, s6
323; TONGA-NEXT:    s_mov_b32 s11, s7
324; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
325; TONGA-NEXT:    s_mov_b32 s8, s2
326; TONGA-NEXT:    s_mov_b32 s9, s3
327; TONGA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
328; TONGA-NEXT:    s_mov_b32 s2, 0x98a1930b
329; TONGA-NEXT:    s_mov_b32 s4, s0
330; TONGA-NEXT:    s_mov_b32 s5, s1
331; TONGA-NEXT:    s_waitcnt vmcnt(0)
332; TONGA-NEXT:    v_mul_hi_i32 v1, v0, s2
333; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
334; TONGA-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
335; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
336; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
337; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
338; TONGA-NEXT:    s_endpgm
339;
340; GFX9-LABEL: slow_sdiv_i32_3435:
341; GFX9:       ; %bb.0:
342; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
343; GFX9-NEXT:    s_mov_b32 s7, 0xf000
344; GFX9-NEXT:    s_mov_b32 s6, -1
345; GFX9-NEXT:    s_mov_b32 s10, s6
346; GFX9-NEXT:    s_mov_b32 s11, s7
347; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
348; GFX9-NEXT:    s_mov_b32 s8, s2
349; GFX9-NEXT:    s_mov_b32 s9, s3
350; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
351; GFX9-NEXT:    s_mov_b32 s2, 0x98a1930b
352; GFX9-NEXT:    s_mov_b32 s4, s0
353; GFX9-NEXT:    s_mov_b32 s5, s1
354; GFX9-NEXT:    s_waitcnt vmcnt(0)
355; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
356; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
357; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
358; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
359; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
360; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
361; GFX9-NEXT:    s_endpgm
362;
363; EG-LABEL: slow_sdiv_i32_3435:
364; EG:       ; %bb.0:
365; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
366; EG-NEXT:    TEX 0 @6
367; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
368; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
369; EG-NEXT:    CF_END
370; EG-NEXT:    PAD
371; EG-NEXT:    Fetch clause starting at 6:
372; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
373; EG-NEXT:    ALU clause starting at 8:
374; EG-NEXT:     MOV * T0.X, KC0[2].Z,
375; EG-NEXT:    ALU clause starting at 9:
376; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
377; EG-NEXT:    -1734241525(-4.176600e-24), 0(0.000000e+00)
378; EG-NEXT:     ADD_INT * T0.W, PS, T0.X,
379; EG-NEXT:     ASHR T1.W, PV.W, literal.x,
380; EG-NEXT:     LSHR * T0.W, PV.W, literal.y,
381; EG-NEXT:    11(1.541428e-44), 31(4.344025e-44)
382; EG-NEXT:     ADD_INT T0.X, PV.W, PS,
383; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
384; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
385  %num = load i32, ptr addrspace(1) %in
  ; Division under test: the divisor is the non-power-of-two constant 3435.
386  %result = sdiv i32 %num, 3435
387  store i32 %result, ptr addrspace(1) %out
388  ret void
389}
390
; sdiv_v2i32: element-wise signed division of two <2 x i32> vectors loaded from
; memory. %in points at two consecutive <2 x i32> values (dividend vector at
; index 0, divisor vector at index 1); the quotient vector is stored to %out.
; The assertion lines inside the function were autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file);
; regenerate them with that script rather than editing them by hand.
391define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
392; GCN-LABEL: sdiv_v2i32:
393; GCN:       ; %bb.0:
394; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
395; GCN-NEXT:    s_mov_b32 s7, 0xf000
396; GCN-NEXT:    s_mov_b32 s6, -1
397; GCN-NEXT:    s_mov_b32 s10, s6
398; GCN-NEXT:    s_mov_b32 s11, s7
399; GCN-NEXT:    s_waitcnt lgkmcnt(0)
400; GCN-NEXT:    s_mov_b32 s8, s2
401; GCN-NEXT:    s_mov_b32 s9, s3
402; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
403; GCN-NEXT:    s_mov_b32 s4, s0
404; GCN-NEXT:    s_mov_b32 s5, s1
405; GCN-NEXT:    s_waitcnt vmcnt(0)
406; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
407; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
408; GCN-NEXT:    v_xor_b32_e32 v4, v0, v2
409; GCN-NEXT:    v_xor_b32_e32 v7, v1, v3
410; GCN-NEXT:    v_max_i32_e32 v2, v2, v6
411; GCN-NEXT:    v_max_i32_e32 v3, v3, v9
412; GCN-NEXT:    v_cvt_f32_u32_e32 v6, v2
413; GCN-NEXT:    v_cvt_f32_u32_e32 v9, v3
414; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
415; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v6
416; GCN-NEXT:    v_max_i32_e32 v0, v0, v5
417; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v9
418; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v2
419; GCN-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
420; GCN-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
421; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v6
422; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
423; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
424; GCN-NEXT:    v_mul_lo_u32 v9, v9, v6
425; GCN-NEXT:    v_mul_lo_u32 v10, v10, v5
426; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 0, v1
427; GCN-NEXT:    v_mul_hi_u32 v9, v6, v9
428; GCN-NEXT:    v_max_i32_e32 v1, v1, v8
429; GCN-NEXT:    v_mul_hi_u32 v8, v5, v10
430; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
431; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
432; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
433; GCN-NEXT:    v_mul_hi_u32 v6, v0, v6
434; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
435; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
436; GCN-NEXT:    v_mul_lo_u32 v8, v6, v2
437; GCN-NEXT:    v_mul_lo_u32 v10, v5, v3
438; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v6
439; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
440; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
441; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
442; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
443; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
444; GCN-NEXT:    v_sub_i32_e32 v8, vcc, v0, v2
445; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
446; GCN-NEXT:    v_sub_i32_e32 v9, vcc, v1, v3
447; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
448; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
449; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v6
450; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
451; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
452; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
453; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
454; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
455; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
456; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
457; GCN-NEXT:    v_xor_b32_e32 v1, v1, v7
458; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
459; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
460; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
461; GCN-NEXT:    s_endpgm
462;
463; TONGA-LABEL: sdiv_v2i32:
464; TONGA:       ; %bb.0:
465; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
466; TONGA-NEXT:    s_mov_b32 s7, 0xf000
467; TONGA-NEXT:    s_mov_b32 s6, -1
468; TONGA-NEXT:    s_mov_b32 s10, s6
469; TONGA-NEXT:    s_mov_b32 s11, s7
470; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
471; TONGA-NEXT:    s_mov_b32 s8, s2
472; TONGA-NEXT:    s_mov_b32 s9, s3
473; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
474; TONGA-NEXT:    s_mov_b32 s4, s0
475; TONGA-NEXT:    s_mov_b32 s5, s1
476; TONGA-NEXT:    s_waitcnt vmcnt(0)
477; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, 0, v2
478; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v3
479; TONGA-NEXT:    v_xor_b32_e32 v4, v0, v2
480; TONGA-NEXT:    v_xor_b32_e32 v7, v1, v3
481; TONGA-NEXT:    v_max_i32_e32 v2, v2, v6
482; TONGA-NEXT:    v_max_i32_e32 v3, v3, v9
483; TONGA-NEXT:    v_cvt_f32_u32_e32 v6, v2
484; TONGA-NEXT:    v_cvt_f32_u32_e32 v9, v3
485; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
486; TONGA-NEXT:    v_rcp_iflag_f32_e32 v6, v6
487; TONGA-NEXT:    v_max_i32_e32 v0, v0, v5
488; TONGA-NEXT:    v_rcp_iflag_f32_e32 v5, v9
489; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v2
490; TONGA-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
491; TONGA-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
492; TONGA-NEXT:    v_cvt_u32_f32_e32 v6, v6
493; TONGA-NEXT:    v_cvt_u32_f32_e32 v5, v5
494; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v3
495; TONGA-NEXT:    v_mul_lo_u32 v9, v9, v6
496; TONGA-NEXT:    v_mul_lo_u32 v10, v10, v5
497; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, 0, v1
498; TONGA-NEXT:    v_mul_hi_u32 v9, v6, v9
499; TONGA-NEXT:    v_max_i32_e32 v1, v1, v8
500; TONGA-NEXT:    v_mul_hi_u32 v8, v5, v10
501; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
502; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
503; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v5, v8
504; TONGA-NEXT:    v_mul_hi_u32 v6, v0, v6
505; TONGA-NEXT:    v_mul_hi_u32 v5, v1, v5
506; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
507; TONGA-NEXT:    v_mul_lo_u32 v8, v6, v2
508; TONGA-NEXT:    v_mul_lo_u32 v10, v5, v3
509; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v6
510; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
511; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v10
512; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v5
513; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
514; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
515; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v0, v2
516; TONGA-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
517; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v1, v3
518; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
519; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
520; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
521; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
522; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
523; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
524; TONGA-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
525; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
526; TONGA-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
527; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v4
528; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v7
529; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
530; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v7
531; TONGA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
532; TONGA-NEXT:    s_endpgm
533;
534; GFX9-LABEL: sdiv_v2i32:
535; GFX9:       ; %bb.0:
536; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
537; GFX9-NEXT:    s_mov_b32 s3, 0xf000
538; GFX9-NEXT:    s_mov_b32 s2, -1
539; GFX9-NEXT:    s_mov_b32 s6, s2
540; GFX9-NEXT:    s_mov_b32 s7, s3
541; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX9-NEXT:    s_mov_b32 s4, s10
543; GFX9-NEXT:    s_mov_b32 s5, s11
544; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
545; GFX9-NEXT:    s_waitcnt vmcnt(0)
546; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
547; GFX9-NEXT:    s_abs_i32 s1, s0
548; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s1
549; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
550; GFX9-NEXT:    s_xor_b32 s0, s5, s0
551; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
552; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
553; GFX9-NEXT:    s_sub_i32 s0, 0, s1
554; GFX9-NEXT:    s_abs_i32 s5, s5
555; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
556; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v2
557; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
558; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
559; GFX9-NEXT:    s_mul_i32 s0, s0, s7
560; GFX9-NEXT:    s_mul_hi_u32 s0, s7, s0
561; GFX9-NEXT:    s_add_i32 s7, s7, s0
562; GFX9-NEXT:    s_mul_hi_u32 s0, s5, s7
563; GFX9-NEXT:    s_mul_i32 s7, s0, s1
564; GFX9-NEXT:    s_sub_i32 s5, s5, s7
565; GFX9-NEXT:    s_add_i32 s10, s0, 1
566; GFX9-NEXT:    s_sub_i32 s7, s5, s1
567; GFX9-NEXT:    s_cmp_ge_u32 s5, s1
568; GFX9-NEXT:    s_cselect_b32 s0, s10, s0
569; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
570; GFX9-NEXT:    s_add_i32 s7, s0, 1
571; GFX9-NEXT:    s_cmp_ge_u32 s5, s1
572; GFX9-NEXT:    s_cselect_b32 s5, s7, s0
573; GFX9-NEXT:    s_abs_i32 s7, s4
574; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
575; GFX9-NEXT:    s_xor_b32 s5, s5, s6
576; GFX9-NEXT:    s_mov_b32 s1, s9
577; GFX9-NEXT:    s_sub_i32 s9, 0, s7
578; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
579; GFX9-NEXT:    s_sub_i32 s5, s5, s6
580; GFX9-NEXT:    s_mov_b32 s0, s8
581; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
582; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
583; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
584; GFX9-NEXT:    s_xor_b32 s4, s8, s4
585; GFX9-NEXT:    s_abs_i32 s8, s8
586; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
587; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
588; GFX9-NEXT:    s_mul_i32 s9, s9, s6
589; GFX9-NEXT:    s_mul_hi_u32 s9, s6, s9
590; GFX9-NEXT:    s_add_i32 s6, s6, s9
591; GFX9-NEXT:    s_mul_hi_u32 s6, s8, s6
592; GFX9-NEXT:    s_mul_i32 s9, s6, s7
593; GFX9-NEXT:    s_sub_i32 s8, s8, s9
594; GFX9-NEXT:    s_add_i32 s10, s6, 1
595; GFX9-NEXT:    s_sub_i32 s9, s8, s7
596; GFX9-NEXT:    s_cmp_ge_u32 s8, s7
597; GFX9-NEXT:    s_cselect_b32 s6, s10, s6
598; GFX9-NEXT:    s_cselect_b32 s8, s9, s8
599; GFX9-NEXT:    s_add_i32 s9, s6, 1
600; GFX9-NEXT:    s_cmp_ge_u32 s8, s7
601; GFX9-NEXT:    s_cselect_b32 s6, s9, s6
602; GFX9-NEXT:    s_xor_b32 s6, s6, s4
603; GFX9-NEXT:    s_sub_i32 s4, s6, s4
604; GFX9-NEXT:    v_mov_b32_e32 v0, s5
605; GFX9-NEXT:    v_mov_b32_e32 v1, s4
606; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
607; GFX9-NEXT:    s_endpgm
608;
609; EG-LABEL: sdiv_v2i32:
610; EG:       ; %bb.0:
611; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
612; EG-NEXT:    TEX 0 @6
613; EG-NEXT:    ALU 51, @9, KC0[CB0:0-32], KC1[]
614; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
615; EG-NEXT:    CF_END
616; EG-NEXT:    PAD
617; EG-NEXT:    Fetch clause starting at 6:
618; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
619; EG-NEXT:    ALU clause starting at 8:
620; EG-NEXT:     MOV * T0.X, KC0[2].Z,
621; EG-NEXT:    ALU clause starting at 9:
622; EG-NEXT:     SETGT_INT * T1.W, 0.0, T0.W,
623; EG-NEXT:     ADD_INT T0.W, T0.W, PV.W,
624; EG-NEXT:     SETGT_INT * T2.W, 0.0, T0.Z,
625; EG-NEXT:     XOR_INT * T0.W, PV.W, T1.W,
626; EG-NEXT:     SUB_INT T1.Z, 0.0, PV.W,
627; EG-NEXT:     ADD_INT T3.W, T0.Z, T2.W,
628; EG-NEXT:     RECIP_UINT * T0.Z, PV.W,
629; EG-NEXT:     XOR_INT T3.W, PV.W, T2.W,
630; EG-NEXT:     MULLO_INT * T1.X, PV.Z, PS,
631; EG-NEXT:     SUB_INT T4.W, 0.0, PV.W,
632; EG-NEXT:     RECIP_UINT * T1.Y, PV.W,
633; EG-NEXT:     SETGT_INT T5.W, 0.0, T0.X,
634; EG-NEXT:     MULLO_INT * T1.Z, PV.W, PS,
635; EG-NEXT:     SETGT_INT T2.Z, 0.0, T0.Y,
636; EG-NEXT:     ADD_INT T4.W, T0.X, PV.W,
637; EG-NEXT:     MULHI * T0.X, T1.Y, PS,
638; EG-NEXT:     ADD_INT T1.Y, T1.Y, PS,
639; EG-NEXT:     XOR_INT T1.Z, PV.W, T5.W,
640; EG-NEXT:     ADD_INT T4.W, T0.Y, PV.Z, BS:VEC_120/SCL_212
641; EG-NEXT:     MULHI * T0.X, T0.Z, T1.X,
642; EG-NEXT:     ADD_INT T0.Z, T0.Z, PS,
643; EG-NEXT:     XOR_INT T4.W, PV.W, T2.Z,
644; EG-NEXT:     MULHI * T0.X, PV.Z, PV.Y,
645; EG-NEXT:     MULHI * T0.Y, PV.W, PV.Z,
646; EG-NEXT:     MULLO_INT * T0.Z, PS, T0.W,
647; EG-NEXT:     SUB_INT T4.W, T4.W, PS,
648; EG-NEXT:     MULLO_INT * T0.Z, T0.X, T3.W,
649; EG-NEXT:     SUB_INT T1.Y, T1.Z, PS,
650; EG-NEXT:     ADD_INT T0.Z, T0.Y, 1,
651; EG-NEXT:     SETGE_UINT T6.W, PV.W, T0.W,
652; EG-NEXT:     SUB_INT * T7.W, PV.W, T0.W,
653; EG-NEXT:     CNDE_INT T1.X, PV.W, T4.W, PS, BS:VEC_021/SCL_122
654; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PV.Z,
655; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
656; EG-NEXT:     SETGE_UINT T4.W, PV.Y, T3.W,
657; EG-NEXT:     SUB_INT * T6.W, PV.Y, T3.W,
658; EG-NEXT:     CNDE_INT T1.Y, PV.W, T1.Y, PS,
659; EG-NEXT:     CNDE_INT T0.Z, PV.W, T0.X, PV.Z,
660; EG-NEXT:     ADD_INT T4.W, PV.Y, 1,
661; EG-NEXT:     SETGE_UINT * T0.W, PV.X, T0.W,
662; EG-NEXT:     CNDE_INT T0.Y, PS, T0.Y, PV.W,
663; EG-NEXT:     XOR_INT T1.Z, T2.Z, T1.W, BS:VEC_021/SCL_122
664; EG-NEXT:     ADD_INT T0.W, PV.Z, 1,
665; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T3.W,
666; EG-NEXT:     CNDE_INT T0.Z, PS, T0.Z, PV.W,
667; EG-NEXT:     XOR_INT T0.W, T5.W, T2.W,
668; EG-NEXT:     XOR_INT * T1.W, PV.Y, PV.Z,
669; EG-NEXT:     SUB_INT T0.Y, PS, T1.Z,
670; EG-NEXT:     XOR_INT * T1.W, PV.Z, PV.W,
671; EG-NEXT:     SUB_INT T0.X, PV.W, T0.W,
672; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
673; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The GEP is indexed in units of <2 x i32>, so the divisor vector is the
  ; <2 x i32> immediately following the dividend vector in %in.
674  %den_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
675  %num = load <2 x i32>, ptr addrspace(1) %in
676  %den = load <2 x i32>, ptr addrspace(1) %den_ptr
  ; Division under test: element-wise, with no constant operands.
677  %result = sdiv <2 x i32> %num, %den
678  store <2 x i32> %result, ptr addrspace(1) %out
679  ret void
680}
681
; sdiv_v2i32_4: element-wise signed division of a loaded <2 x i32> vector by
; the constant vector <4, 4>. The dividend is loaded from %in and the quotient
; vector is stored to %out.
; For every check prefix the autogenerated assertions expect, per element, a
; shift/add/shift sequence (arithmetic shift by 31, logical shift by 30, add,
; arithmetic shift by 2) instead of a general division expansion.
682define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
683; GCN-LABEL: sdiv_v2i32_4:
684; GCN:       ; %bb.0:
685; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
686; GCN-NEXT:    s_mov_b32 s7, 0xf000
687; GCN-NEXT:    s_mov_b32 s6, -1
688; GCN-NEXT:    s_mov_b32 s10, s6
689; GCN-NEXT:    s_mov_b32 s11, s7
690; GCN-NEXT:    s_waitcnt lgkmcnt(0)
691; GCN-NEXT:    s_mov_b32 s8, s2
692; GCN-NEXT:    s_mov_b32 s9, s3
693; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
694; GCN-NEXT:    s_mov_b32 s4, s0
695; GCN-NEXT:    s_mov_b32 s5, s1
696; GCN-NEXT:    s_waitcnt vmcnt(0)
697; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
698; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
699; GCN-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
700; GCN-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
701; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
702; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
703; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
704; GCN-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
705; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
706; GCN-NEXT:    s_endpgm
707;
708; TONGA-LABEL: sdiv_v2i32_4:
709; TONGA:       ; %bb.0:
710; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
711; TONGA-NEXT:    s_mov_b32 s7, 0xf000
712; TONGA-NEXT:    s_mov_b32 s6, -1
713; TONGA-NEXT:    s_mov_b32 s10, s6
714; TONGA-NEXT:    s_mov_b32 s11, s7
715; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
716; TONGA-NEXT:    s_mov_b32 s8, s2
717; TONGA-NEXT:    s_mov_b32 s9, s3
718; TONGA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
719; TONGA-NEXT:    s_mov_b32 s4, s0
720; TONGA-NEXT:    s_mov_b32 s5, s1
721; TONGA-NEXT:    s_waitcnt vmcnt(0)
722; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
723; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
724; TONGA-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
725; TONGA-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
726; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
727; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
728; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
729; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
730; TONGA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
731; TONGA-NEXT:    s_endpgm
732;
733; GFX9-LABEL: sdiv_v2i32_4:
734; GFX9:       ; %bb.0:
735; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
736; GFX9-NEXT:    s_mov_b32 s7, 0xf000
737; GFX9-NEXT:    s_mov_b32 s6, -1
738; GFX9-NEXT:    s_mov_b32 s10, s6
739; GFX9-NEXT:    s_mov_b32 s11, s7
740; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX9-NEXT:    s_mov_b32 s8, s2
742; GFX9-NEXT:    s_mov_b32 s9, s3
743; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
744; GFX9-NEXT:    s_mov_b32 s4, s0
745; GFX9-NEXT:    s_mov_b32 s5, s1
746; GFX9-NEXT:    s_waitcnt vmcnt(0)
747; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
748; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
749; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
750; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
751; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
752; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
753; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
754; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
755; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
756; GFX9-NEXT:    s_endpgm
757;
758; EG-LABEL: sdiv_v2i32_4:
759; EG:       ; %bb.0:
760; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
761; EG-NEXT:    TEX 0 @6
762; EG-NEXT:    ALU 13, @9, KC0[CB0:0-32], KC1[]
763; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
764; EG-NEXT:    CF_END
765; EG-NEXT:    PAD
766; EG-NEXT:    Fetch clause starting at 6:
767; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
768; EG-NEXT:    ALU clause starting at 8:
769; EG-NEXT:     MOV * T0.X, KC0[2].Z,
770; EG-NEXT:    ALU clause starting at 9:
771; EG-NEXT:     ASHR * T0.W, T0.Y, literal.x,
772; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
773; EG-NEXT:     LSHR T0.W, PV.W, literal.x,
774; EG-NEXT:     ASHR * T1.W, T0.X, literal.y,
775; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
776; EG-NEXT:     LSHR T1.W, PS, literal.x,
777; EG-NEXT:     ADD_INT * T0.W, T0.Y, PV.W,
778; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
779; EG-NEXT:     ASHR T0.Y, PS, literal.x,
780; EG-NEXT:     ADD_INT * T0.W, T0.X, PV.W,
781; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
782; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
783; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
784; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
785  %num = load <2 x i32>, ptr addrspace(1) %in
  ; Division under test: both lanes are divided by the power-of-two constant 4.
786  %result = sdiv <2 x i32> %num, <i32 4, i32 4>
787  store <2 x i32> %result, ptr addrspace(1) %out
788  ret void
789}
790
; Test: lane-wise signed division of two <4 x i32> vectors with a non-constant
; divisor. The numerator is loaded from %in and the denominator from the next
; <4 x i32> slot (the second load at offset 16 in the expected output); the
; quotient is stored to %out.
; The expected output divides each lane through a float reciprocal
; (v_rcp_iflag_f32 or RECIP_UINT) followed by integer compare/select fix-up
; steps, and applies the sign of the result with an xor/sub pair.
; In the GFX9 output the loaded lanes are moved to SGPRs with
; v_readfirstlane_b32 and the division is carried out with scalar instructions.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
791define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
792; GCN-LABEL: sdiv_v4i32:
793; GCN:       ; %bb.0:
794; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
795; GCN-NEXT:    s_mov_b32 s7, 0xf000
796; GCN-NEXT:    s_mov_b32 s6, -1
797; GCN-NEXT:    s_mov_b32 s10, s6
798; GCN-NEXT:    s_mov_b32 s11, s7
799; GCN-NEXT:    s_waitcnt lgkmcnt(0)
800; GCN-NEXT:    s_mov_b32 s8, s2
801; GCN-NEXT:    s_mov_b32 s9, s3
802; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
803; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
804; GCN-NEXT:    s_mov_b32 s4, s0
805; GCN-NEXT:    s_mov_b32 s5, s1
806; GCN-NEXT:    s_waitcnt vmcnt(1)
807; GCN-NEXT:    v_sub_i32_e32 v13, vcc, 0, v1
808; GCN-NEXT:    s_waitcnt vmcnt(0)
809; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v5
810; GCN-NEXT:    v_xor_b32_e32 v11, v1, v5
811; GCN-NEXT:    v_max_i32_e32 v5, v5, v12
812; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v5
813; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
814; GCN-NEXT:    v_xor_b32_e32 v8, v0, v4
815; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
816; GCN-NEXT:    v_max_i32_e32 v4, v4, v10
817; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 0, v5
818; GCN-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v12
819; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
820; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v4
821; GCN-NEXT:    v_max_i32_e32 v1, v1, v13
822; GCN-NEXT:    v_sub_i32_e32 v15, vcc, 0, v6
823; GCN-NEXT:    v_mul_lo_u32 v16, v16, v10
824; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
825; GCN-NEXT:    v_xor_b32_e32 v14, v2, v6
826; GCN-NEXT:    v_max_i32_e32 v6, v6, v15
827; GCN-NEXT:    v_mul_hi_u32 v16, v10, v16
828; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
829; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
830; GCN-NEXT:    v_cvt_f32_u32_e32 v15, v6
831; GCN-NEXT:    v_add_i32_e32 v10, vcc, v10, v16
832; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 0, v4
833; GCN-NEXT:    v_mul_lo_u32 v16, v16, v12
834; GCN-NEXT:    v_mul_hi_u32 v10, v1, v10
835; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v0
836; GCN-NEXT:    v_mul_hi_u32 v13, v12, v16
837; GCN-NEXT:    v_max_i32_e32 v0, v0, v9
838; GCN-NEXT:    v_rcp_iflag_f32_e32 v9, v15
839; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
840; GCN-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
841; GCN-NEXT:    v_mul_lo_u32 v13, v10, v5
842; GCN-NEXT:    v_mul_hi_u32 v12, v0, v12
843; GCN-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
844; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v9
845; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v13
846; GCN-NEXT:    v_add_i32_e32 v13, vcc, 1, v10
847; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v5
848; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v13, s[0:1]
849; GCN-NEXT:    v_sub_i32_e32 v13, vcc, v1, v5
850; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[0:1]
851; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v5
852; GCN-NEXT:    v_mul_lo_u32 v1, v12, v4
853; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v6
854; GCN-NEXT:    v_mul_lo_u32 v5, v5, v9
855; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
856; GCN-NEXT:    v_add_i32_e32 v1, vcc, 1, v12
857; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v4
858; GCN-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[2:3]
859; GCN-NEXT:    v_sub_i32_e32 v12, vcc, v0, v4
860; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[2:3]
861; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v4
862; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 0, v7
863; GCN-NEXT:    v_mul_hi_u32 v4, v9, v5
864; GCN-NEXT:    v_max_i32_e32 v5, v7, v0
865; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v5
866; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v1
867; GCN-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
868; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
869; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v2
870; GCN-NEXT:    v_max_i32_e32 v2, v2, v9
871; GCN-NEXT:    v_mul_hi_u32 v4, v2, v4
872; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
873; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v0
874; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v12, s[2:3]
875; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
876; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
877; GCN-NEXT:    v_mul_lo_u32 v8, v4, v6
878; GCN-NEXT:    v_add_i32_e32 v13, vcc, 1, v10
879; GCN-NEXT:    v_cndmask_b32_e64 v1, v10, v13, s[0:1]
880; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v5
881; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
882; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
883; GCN-NEXT:    v_mul_lo_u32 v10, v10, v9
884; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
885; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v6
886; GCN-NEXT:    v_xor_b32_e32 v1, v1, v11
887; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
888; GCN-NEXT:    v_sub_i32_e32 v8, vcc, v2, v6
889; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v11
890; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
891; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
892; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
893; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v8, vcc
894; GCN-NEXT:    v_mul_hi_u32 v4, v9, v10
895; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v3
896; GCN-NEXT:    v_max_i32_e32 v6, v3, v6
897; GCN-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
898; GCN-NEXT:    v_mul_hi_u32 v4, v6, v4
899; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
900; GCN-NEXT:    v_xor_b32_e32 v2, v2, v14
901; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v14
902; GCN-NEXT:    v_mul_lo_u32 v8, v4, v5
903; GCN-NEXT:    v_xor_b32_e32 v3, v3, v7
904; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
905; GCN-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
906; GCN-NEXT:    v_sub_i32_e32 v8, vcc, v6, v5
907; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v5
908; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
909; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
910; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
911; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v5
912; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
913; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
914; GCN-NEXT:    v_xor_b32_e32 v4, v4, v3
915; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v4, v3
916; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
917; GCN-NEXT:    s_endpgm
918;
919; TONGA-LABEL: sdiv_v4i32:
920; TONGA:       ; %bb.0:
921; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
922; TONGA-NEXT:    s_mov_b32 s7, 0xf000
923; TONGA-NEXT:    s_mov_b32 s6, -1
924; TONGA-NEXT:    s_mov_b32 s10, s6
925; TONGA-NEXT:    s_mov_b32 s11, s7
926; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
927; TONGA-NEXT:    s_mov_b32 s8, s2
928; TONGA-NEXT:    s_mov_b32 s9, s3
929; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
930; TONGA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
931; TONGA-NEXT:    s_mov_b32 s4, s0
932; TONGA-NEXT:    s_mov_b32 s5, s1
933; TONGA-NEXT:    s_waitcnt vmcnt(1)
934; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, 0, v1
935; TONGA-NEXT:    s_waitcnt vmcnt(0)
936; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v5
937; TONGA-NEXT:    v_xor_b32_e32 v11, v1, v5
938; TONGA-NEXT:    v_max_i32_e32 v5, v5, v12
939; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v5
940; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v4
941; TONGA-NEXT:    v_xor_b32_e32 v8, v0, v4
942; TONGA-NEXT:    v_rcp_iflag_f32_e32 v12, v12
943; TONGA-NEXT:    v_max_i32_e32 v4, v4, v10
944; TONGA-NEXT:    v_sub_u32_e32 v16, vcc, 0, v5
945; TONGA-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v12
946; TONGA-NEXT:    v_cvt_u32_f32_e32 v10, v10
947; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v4
948; TONGA-NEXT:    v_max_i32_e32 v1, v1, v13
949; TONGA-NEXT:    v_sub_u32_e32 v15, vcc, 0, v6
950; TONGA-NEXT:    v_mul_lo_u32 v16, v16, v10
951; TONGA-NEXT:    v_rcp_iflag_f32_e32 v12, v12
952; TONGA-NEXT:    v_xor_b32_e32 v14, v2, v6
953; TONGA-NEXT:    v_max_i32_e32 v6, v6, v15
954; TONGA-NEXT:    v_mul_hi_u32 v16, v10, v16
955; TONGA-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
956; TONGA-NEXT:    v_cvt_u32_f32_e32 v12, v12
957; TONGA-NEXT:    v_cvt_f32_u32_e32 v15, v6
958; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v10, v16
959; TONGA-NEXT:    v_sub_u32_e32 v16, vcc, 0, v4
960; TONGA-NEXT:    v_mul_lo_u32 v16, v16, v12
961; TONGA-NEXT:    v_mul_hi_u32 v10, v1, v10
962; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v0
963; TONGA-NEXT:    v_mul_hi_u32 v13, v12, v16
964; TONGA-NEXT:    v_max_i32_e32 v0, v0, v9
965; TONGA-NEXT:    v_rcp_iflag_f32_e32 v9, v15
966; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
967; TONGA-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
968; TONGA-NEXT:    v_mul_lo_u32 v13, v10, v5
969; TONGA-NEXT:    v_mul_hi_u32 v12, v0, v12
970; TONGA-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
971; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v9
972; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v13
973; TONGA-NEXT:    v_add_u32_e32 v13, vcc, 1, v10
974; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v5
975; TONGA-NEXT:    v_cndmask_b32_e64 v10, v10, v13, s[0:1]
976; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, v1, v5
977; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v13, s[0:1]
978; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v5
979; TONGA-NEXT:    v_mul_lo_u32 v1, v12, v4
980; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v6
981; TONGA-NEXT:    v_mul_lo_u32 v5, v5, v9
982; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
983; TONGA-NEXT:    v_add_u32_e32 v1, vcc, 1, v12
984; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v4
985; TONGA-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[2:3]
986; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, v0, v4
987; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[2:3]
988; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v4
989; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, 0, v7
990; TONGA-NEXT:    v_mul_hi_u32 v4, v9, v5
991; TONGA-NEXT:    v_max_i32_e32 v5, v7, v0
992; TONGA-NEXT:    v_cvt_f32_u32_e32 v0, v5
993; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v1
994; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v9, v4
995; TONGA-NEXT:    v_rcp_iflag_f32_e32 v0, v0
996; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v2
997; TONGA-NEXT:    v_max_i32_e32 v2, v2, v9
998; TONGA-NEXT:    v_mul_hi_u32 v4, v2, v4
999; TONGA-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1000; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v0
1001; TONGA-NEXT:    v_cndmask_b32_e64 v0, v1, v12, s[2:3]
1002; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
1003; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
1004; TONGA-NEXT:    v_mul_lo_u32 v8, v4, v6
1005; TONGA-NEXT:    v_add_u32_e32 v13, vcc, 1, v10
1006; TONGA-NEXT:    v_cndmask_b32_e64 v1, v10, v13, s[0:1]
1007; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v5
1008; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v8
1009; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
1010; TONGA-NEXT:    v_mul_lo_u32 v10, v10, v9
1011; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v4
1012; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v6
1013; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v11
1014; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
1015; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v2, v6
1016; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v11
1017; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
1018; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v4
1019; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
1020; TONGA-NEXT:    v_cndmask_b32_e32 v2, v4, v8, vcc
1021; TONGA-NEXT:    v_mul_hi_u32 v4, v9, v10
1022; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, 0, v3
1023; TONGA-NEXT:    v_max_i32_e32 v6, v3, v6
1024; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v9, v4
1025; TONGA-NEXT:    v_mul_hi_u32 v4, v6, v4
1026; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
1027; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v14
1028; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v14
1029; TONGA-NEXT:    v_mul_lo_u32 v8, v4, v5
1030; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
1031; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
1032; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, v6, v8
1033; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v6, v5
1034; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v5
1035; TONGA-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
1036; TONGA-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
1037; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
1038; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v5
1039; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
1040; TONGA-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
1041; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v3
1042; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v4, v3
1043; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1044; TONGA-NEXT:    s_endpgm
1045;
1046; GFX9-LABEL: sdiv_v4i32:
1047; GFX9:       ; %bb.0:
1048; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1049; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1050; GFX9-NEXT:    s_mov_b32 s2, -1
1051; GFX9-NEXT:    s_mov_b32 s6, s2
1052; GFX9-NEXT:    s_mov_b32 s7, s3
1053; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1054; GFX9-NEXT:    s_mov_b32 s4, s10
1055; GFX9-NEXT:    s_mov_b32 s5, s11
1056; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
1057; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
1058; GFX9-NEXT:    s_waitcnt vmcnt(1)
1059; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1060; GFX9-NEXT:    s_abs_i32 s1, s0
1061; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
1062; GFX9-NEXT:    s_waitcnt vmcnt(0)
1063; GFX9-NEXT:    v_readfirstlane_b32 s5, v4
1064; GFX9-NEXT:    s_xor_b32 s0, s5, s0
1065; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
1066; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1067; GFX9-NEXT:    s_sub_i32 s0, 0, s1
1068; GFX9-NEXT:    s_abs_i32 s5, s5
1069; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
1070; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1071; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1072; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
1073; GFX9-NEXT:    s_mul_i32 s0, s0, s7
1074; GFX9-NEXT:    s_mul_hi_u32 s0, s7, s0
1075; GFX9-NEXT:    s_add_i32 s7, s7, s0
1076; GFX9-NEXT:    s_mul_hi_u32 s0, s5, s7
1077; GFX9-NEXT:    s_mul_i32 s7, s0, s1
1078; GFX9-NEXT:    s_sub_i32 s5, s5, s7
1079; GFX9-NEXT:    s_add_i32 s10, s0, 1
1080; GFX9-NEXT:    s_sub_i32 s7, s5, s1
1081; GFX9-NEXT:    s_cmp_ge_u32 s5, s1
1082; GFX9-NEXT:    s_cselect_b32 s0, s10, s0
1083; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
1084; GFX9-NEXT:    s_add_i32 s7, s0, 1
1085; GFX9-NEXT:    s_cmp_ge_u32 s5, s1
1086; GFX9-NEXT:    s_cselect_b32 s1, s7, s0
1087; GFX9-NEXT:    s_abs_i32 s5, s4
1088; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
1089; GFX9-NEXT:    s_xor_b32 s1, s1, s6
1090; GFX9-NEXT:    s_sub_i32 s10, 0, s5
1091; GFX9-NEXT:    s_sub_i32 s6, s1, s6
1092; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1093; GFX9-NEXT:    s_mov_b32 s0, s8
1094; GFX9-NEXT:    v_readfirstlane_b32 s8, v5
1095; GFX9-NEXT:    s_xor_b32 s4, s8, s4
1096; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1097; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1098; GFX9-NEXT:    s_abs_i32 s8, s8
1099; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
1100; GFX9-NEXT:    v_readfirstlane_b32 s7, v2
1101; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
1102; GFX9-NEXT:    s_mul_i32 s10, s10, s1
1103; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s10
1104; GFX9-NEXT:    s_add_i32 s1, s1, s10
1105; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s1
1106; GFX9-NEXT:    s_mul_i32 s10, s1, s5
1107; GFX9-NEXT:    s_sub_i32 s8, s8, s10
1108; GFX9-NEXT:    s_add_i32 s11, s1, 1
1109; GFX9-NEXT:    s_sub_i32 s10, s8, s5
1110; GFX9-NEXT:    s_cmp_ge_u32 s8, s5
1111; GFX9-NEXT:    s_cselect_b32 s1, s11, s1
1112; GFX9-NEXT:    s_cselect_b32 s8, s10, s8
1113; GFX9-NEXT:    s_add_i32 s10, s1, 1
1114; GFX9-NEXT:    s_cmp_ge_u32 s8, s5
1115; GFX9-NEXT:    s_cselect_b32 s5, s10, s1
1116; GFX9-NEXT:    s_abs_i32 s8, s7
1117; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1118; GFX9-NEXT:    s_xor_b32 s5, s5, s4
1119; GFX9-NEXT:    s_sub_i32 s11, 0, s8
1120; GFX9-NEXT:    s_sub_i32 s4, s5, s4
1121; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1122; GFX9-NEXT:    v_readfirstlane_b32 s10, v6
1123; GFX9-NEXT:    s_xor_b32 s7, s10, s7
1124; GFX9-NEXT:    s_abs_i32 s10, s10
1125; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1126; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1127; GFX9-NEXT:    s_ashr_i32 s7, s7, 31
1128; GFX9-NEXT:    s_mov_b32 s1, s9
1129; GFX9-NEXT:    v_readfirstlane_b32 s9, v3
1130; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
1131; GFX9-NEXT:    s_mul_i32 s11, s11, s5
1132; GFX9-NEXT:    s_mul_hi_u32 s11, s5, s11
1133; GFX9-NEXT:    s_add_i32 s5, s5, s11
1134; GFX9-NEXT:    s_mul_hi_u32 s5, s10, s5
1135; GFX9-NEXT:    s_mul_i32 s11, s5, s8
1136; GFX9-NEXT:    s_sub_i32 s10, s10, s11
1137; GFX9-NEXT:    s_add_i32 s12, s5, 1
1138; GFX9-NEXT:    s_sub_i32 s11, s10, s8
1139; GFX9-NEXT:    s_cmp_ge_u32 s10, s8
1140; GFX9-NEXT:    s_cselect_b32 s5, s12, s5
1141; GFX9-NEXT:    s_cselect_b32 s10, s11, s10
1142; GFX9-NEXT:    s_add_i32 s11, s5, 1
1143; GFX9-NEXT:    s_cmp_ge_u32 s10, s8
1144; GFX9-NEXT:    s_cselect_b32 s5, s11, s5
1145; GFX9-NEXT:    s_abs_i32 s8, s9
1146; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
1147; GFX9-NEXT:    v_readfirstlane_b32 s10, v7
1148; GFX9-NEXT:    s_xor_b32 s5, s5, s7
1149; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1150; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1151; GFX9-NEXT:    s_xor_b32 s4, s10, s9
1152; GFX9-NEXT:    s_sub_i32 s9, 0, s8
1153; GFX9-NEXT:    s_sub_i32 s5, s5, s7
1154; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1155; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1156; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1157; GFX9-NEXT:    s_abs_i32 s6, s10
1158; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
1159; GFX9-NEXT:    v_readfirstlane_b32 s7, v2
1160; GFX9-NEXT:    s_mul_i32 s9, s9, s7
1161; GFX9-NEXT:    s_mul_hi_u32 s9, s7, s9
1162; GFX9-NEXT:    s_add_i32 s7, s7, s9
1163; GFX9-NEXT:    s_mul_hi_u32 s7, s6, s7
1164; GFX9-NEXT:    s_mul_i32 s9, s7, s8
1165; GFX9-NEXT:    s_sub_i32 s6, s6, s9
1166; GFX9-NEXT:    s_add_i32 s10, s7, 1
1167; GFX9-NEXT:    s_sub_i32 s9, s6, s8
1168; GFX9-NEXT:    s_cmp_ge_u32 s6, s8
1169; GFX9-NEXT:    s_cselect_b32 s7, s10, s7
1170; GFX9-NEXT:    s_cselect_b32 s6, s9, s6
1171; GFX9-NEXT:    s_add_i32 s9, s7, 1
1172; GFX9-NEXT:    s_cmp_ge_u32 s6, s8
1173; GFX9-NEXT:    s_cselect_b32 s6, s9, s7
1174; GFX9-NEXT:    s_xor_b32 s6, s6, s4
1175; GFX9-NEXT:    s_sub_i32 s4, s6, s4
1176; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1177; GFX9-NEXT:    v_mov_b32_e32 v3, s4
1178; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1179; GFX9-NEXT:    s_endpgm
1180;
1181; EG-LABEL: sdiv_v4i32:
1182; EG:       ; %bb.0:
1183; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1184; EG-NEXT:    TEX 1 @6
1185; EG-NEXT:    ALU 101, @11, KC0[CB0:0-32], KC1[]
1186; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
1187; EG-NEXT:    CF_END
1188; EG-NEXT:    PAD
1189; EG-NEXT:    Fetch clause starting at 6:
1190; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
1191; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
1192; EG-NEXT:    ALU clause starting at 10:
1193; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1194; EG-NEXT:    ALU clause starting at 11:
1195; EG-NEXT:     SETGT_INT * T2.W, 0.0, T1.W,
1196; EG-NEXT:     ADD_INT * T1.W, T1.W, PV.W,
1197; EG-NEXT:     XOR_INT * T1.W, PV.W, T2.W,
1198; EG-NEXT:     SUB_INT T3.W, 0.0, PV.W,
1199; EG-NEXT:     RECIP_UINT * T2.X, PV.W,
1200; EG-NEXT:     SETGT_INT T4.W, 0.0, T0.W,
1201; EG-NEXT:     MULLO_INT * T2.Y, PV.W, PS,
1202; EG-NEXT:     SETGT_INT T2.Z, 0.0, T1.Y,
1203; EG-NEXT:     ADD_INT T0.W, T0.W, PV.W,
1204; EG-NEXT:     MULHI * T2.Y, T2.X, PS,
1205; EG-NEXT:     ADD_INT T3.Z, T2.X, PS,
1206; EG-NEXT:     XOR_INT T0.W, PV.W, T4.W,
1207; EG-NEXT:     ADD_INT * T3.W, T1.Y, PV.Z,
1208; EG-NEXT:     XOR_INT T3.W, PS, T2.Z,
1209; EG-NEXT:     MULHI * T1.Y, PV.W, PV.Z,
1210; EG-NEXT:     SUB_INT T5.W, 0.0, PV.W,
1211; EG-NEXT:     RECIP_UINT * T2.X, PV.W,
1212; EG-NEXT:     SETGT_INT T6.W, 0.0, T0.Y,
1213; EG-NEXT:     MULLO_INT * T2.Y, PV.W, PS,
1214; EG-NEXT:     ADD_INT T5.W, T0.Y, PV.W,
1215; EG-NEXT:     MULHI * T0.Y, T2.X, PS,
1216; EG-NEXT:     ADD_INT T0.Y, T2.X, PS,
1217; EG-NEXT:     XOR_INT T3.Z, PV.W, T6.W, BS:VEC_021/SCL_122
1218; EG-NEXT:     SETGT_INT T5.W, 0.0, T1.Z,
1219; EG-NEXT:     MULLO_INT * T2.X, T1.Y, T1.W,
1220; EG-NEXT:     ADD_INT T7.W, T1.Z, PV.W,
1221; EG-NEXT:     MULHI * T0.Y, PV.Z, PV.Y,
1222; EG-NEXT:     XOR_INT T7.W, PV.W, T5.W, BS:VEC_021/SCL_122
1223; EG-NEXT:     MULLO_INT * T1.Z, PS, T3.W,
1224; EG-NEXT:     SUB_INT T4.Z, 0.0, PV.W,
1225; EG-NEXT:     SETGT_INT T8.W, 0.0, T1.X,
1226; EG-NEXT:     RECIP_UINT * T2.Y, PV.W,
1227; EG-NEXT:     ADD_INT T9.W, T1.X, PV.W,
1228; EG-NEXT:     MULLO_INT * T1.X, PV.Z, PS,
1229; EG-NEXT:     SETGT_INT T4.Z, 0.0, T0.Z,
1230; EG-NEXT:     XOR_INT T9.W, PV.W, T8.W,
1231; EG-NEXT:     MULHI * T1.X, T2.Y, PS,
1232; EG-NEXT:     ADD_INT T1.X, T2.Y, PS,
1233; EG-NEXT:     SUB_INT T2.Y, 0.0, PV.W,
1234; EG-NEXT:     SUB_INT T1.Z, T3.Z, T1.Z,
1235; EG-NEXT:     ADD_INT T10.W, T0.Z, PV.Z, BS:VEC_201
1236; EG-NEXT:     RECIP_UINT * T0.Z, PV.W,
1237; EG-NEXT:     XOR_INT T3.X, PV.W, T4.Z,
1238; EG-NEXT:     ADD_INT T3.Y, T0.Y, 1,
1239; EG-NEXT:     SETGE_UINT T3.Z, PV.Z, T3.W,
1240; EG-NEXT:     SUB_INT T10.W, PV.Z, T3.W,
1241; EG-NEXT:     MULLO_INT * T2.Y, PV.Y, PS,
1242; EG-NEXT:     CNDE_INT T1.Z, PV.Z, T1.Z, PV.W,
1243; EG-NEXT:     CNDE_INT T10.W, PV.Z, T0.Y, PV.Y,
1244; EG-NEXT:     MULHI * T0.Y, PV.X, T1.X,
1245; EG-NEXT:     SETGT_INT T3.Y, 0.0, T0.X,
1246; EG-NEXT:     ADD_INT T3.Z, PV.W, 1,
1247; EG-NEXT:     SETGE_UINT T3.W, PV.Z, T3.W, BS:VEC_021/SCL_122
1248; EG-NEXT:     MULLO_INT * T1.X, PS, T7.W,
1249; EG-NEXT:     CNDE_INT T4.Y, PV.W, T10.W, PV.Z,
1250; EG-NEXT:     ADD_INT T1.Z, T0.X, PV.Y,
1251; EG-NEXT:     SUB_INT T3.W, T3.X, PS, BS:VEC_120/SCL_212
1252; EG-NEXT:     MULHI * T0.X, T0.Z, T2.Y,
1253; EG-NEXT:     ADD_INT T1.X, T0.Y, 1,
1254; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T7.W,
1255; EG-NEXT:     ADD_INT T0.Z, T0.Z, PS,
1256; EG-NEXT:     XOR_INT T10.W, PV.Z, T3.Y,
1257; EG-NEXT:     SUB_INT * T0.W, T0.W, T2.X,
1258; EG-NEXT:     SUB_INT T0.X, T3.W, T7.W,
1259; EG-NEXT:     ADD_INT T5.Y, T1.Y, 1,
1260; EG-NEXT:     SETGE_UINT T1.Z, PS, T1.W, BS:VEC_021/SCL_122
1261; EG-NEXT:     SUB_INT T11.W, PS, T1.W, BS:VEC_021/SCL_122
1262; EG-NEXT:     MULHI * T0.Z, PV.W, PV.Z,
1263; EG-NEXT:     CNDE_INT T2.X, PV.Z, T0.W, PV.W, BS:VEC_021/SCL_122
1264; EG-NEXT:     CNDE_INT T1.Y, PV.Z, T1.Y, PV.Y,
1265; EG-NEXT:     CNDE_INT T1.Z, T2.Y, T3.W, PV.X, BS:VEC_201
1266; EG-NEXT:     CNDE_INT T0.W, T2.Y, T0.Y, T1.X, BS:VEC_201
1267; EG-NEXT:     MULLO_INT * T0.X, PS, T9.W,
1268; EG-NEXT:     ADD_INT T1.X, PV.W, 1,
1269; EG-NEXT:     SETGE_UINT T0.Y, PV.Z, T7.W,
1270; EG-NEXT:     ADD_INT T1.Z, PV.Y, 1,
1271; EG-NEXT:     SETGE_UINT T1.W, PV.X, T1.W, BS:VEC_102/SCL_221
1272; EG-NEXT:     SUB_INT * T3.W, T10.W, PS,
1273; EG-NEXT:     ADD_INT T0.X, T0.Z, 1,
1274; EG-NEXT:     SETGE_UINT T2.Y, PS, T9.W, BS:VEC_102/SCL_221
1275; EG-NEXT:     SUB_INT T3.Z, PS, T9.W, BS:VEC_102/SCL_221
1276; EG-NEXT:     CNDE_INT T1.W, PV.W, T1.Y, PV.Z,
1277; EG-NEXT:     XOR_INT * T2.W, T4.W, T2.W,
1278; EG-NEXT:     XOR_INT T2.X, PV.W, PS,
1279; EG-NEXT:     CNDE_INT T1.Y, PV.Y, T3.W, PV.Z, BS:VEC_021/SCL_122
1280; EG-NEXT:     CNDE_INT T0.Z, PV.Y, T0.Z, PV.X,
1281; EG-NEXT:     CNDE_INT T0.W, T0.Y, T0.W, T1.X, BS:VEC_102/SCL_221
1282; EG-NEXT:     XOR_INT * T1.W, T4.Z, T5.W,
1283; EG-NEXT:     XOR_INT T0.X, T6.W, T2.Z,
1284; EG-NEXT:     XOR_INT T0.Y, PV.W, PS,
1285; EG-NEXT:     ADD_INT T1.Z, PV.Z, 1,
1286; EG-NEXT:     SETGE_UINT T0.W, PV.Y, T9.W, BS:VEC_021/SCL_122
1287; EG-NEXT:     SUB_INT * T2.W, PV.X, T2.W,
1288; EG-NEXT:     CNDE_INT T1.Y, PV.W, T0.Z, PV.Z,
1289; EG-NEXT:     SUB_INT T2.Z, PV.Y, T1.W,
1290; EG-NEXT:     XOR_INT T0.W, T3.Y, T8.W, BS:VEC_021/SCL_122
1291; EG-NEXT:     XOR_INT * T1.W, T4.Y, PV.X,
1292; EG-NEXT:     SUB_INT T2.Y, PS, T0.X,
1293; EG-NEXT:     XOR_INT * T1.W, PV.Y, PV.W,
1294; EG-NEXT:     SUB_INT T2.X, PV.W, T0.W,
1295; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1296; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Index 1 of a <4 x i32> pointer: the denominator vector sits directly after
; the numerator vector in the input buffer.
1297  %den_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
1298  %num = load <4 x i32>, ptr addrspace(1) %in
1299  %den = load <4 x i32>, ptr addrspace(1) %den_ptr
1300  %result = sdiv <4 x i32> %num, %den
1301  store <4 x i32> %result, ptr addrspace(1) %out
1302  ret void
1303}
1304
; Test: lane-wise signed division of a <4 x i32> loaded from %in by the
; constant splat <4, 4, 4, 4>; the quotient is stored to %out.
; The expected output contains no reciprocal-based expansion. On every target
; each lane is computed with shifts and one add: arithmetic shift right by 31,
; logical shift right by 30, add to the original value, arithmetic shift right
; by 2.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
1305define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1306; GCN-LABEL: sdiv_v4i32_4:
1307; GCN:       ; %bb.0:
1308; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1309; GCN-NEXT:    s_mov_b32 s7, 0xf000
1310; GCN-NEXT:    s_mov_b32 s6, -1
1311; GCN-NEXT:    s_mov_b32 s10, s6
1312; GCN-NEXT:    s_mov_b32 s11, s7
1313; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1314; GCN-NEXT:    s_mov_b32 s8, s2
1315; GCN-NEXT:    s_mov_b32 s9, s3
1316; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1317; GCN-NEXT:    s_mov_b32 s4, s0
1318; GCN-NEXT:    s_mov_b32 s5, s1
1319; GCN-NEXT:    s_waitcnt vmcnt(0)
1320; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1321; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
1322; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
1323; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
1324; GCN-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
1325; GCN-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
1326; GCN-NEXT:    v_lshrrev_b32_e32 v6, 30, v6
1327; GCN-NEXT:    v_lshrrev_b32_e32 v7, 30, v7
1328; GCN-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
1329; GCN-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
1330; GCN-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
1331; GCN-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
1332; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
1333; GCN-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
1334; GCN-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
1335; GCN-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
1336; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1337; GCN-NEXT:    s_endpgm
1338;
1339; TONGA-LABEL: sdiv_v4i32_4:
1340; TONGA:       ; %bb.0:
1341; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
1342; TONGA-NEXT:    s_mov_b32 s3, 0xf000
1343; TONGA-NEXT:    s_mov_b32 s2, -1
1344; TONGA-NEXT:    s_mov_b32 s10, s2
1345; TONGA-NEXT:    s_mov_b32 s11, s3
1346; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1347; TONGA-NEXT:    s_mov_b32 s8, s6
1348; TONGA-NEXT:    s_mov_b32 s9, s7
1349; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1350; TONGA-NEXT:    s_mov_b32 s0, s4
1351; TONGA-NEXT:    s_mov_b32 s1, s5
1352; TONGA-NEXT:    s_waitcnt vmcnt(0)
1353; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1354; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
1355; TONGA-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
1356; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
1357; TONGA-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
1358; TONGA-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
1359; TONGA-NEXT:    v_lshrrev_b32_e32 v6, 30, v6
1360; TONGA-NEXT:    v_lshrrev_b32_e32 v7, 30, v7
1361; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
1362; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v5, v1
1363; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
1364; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
1365; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
1366; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
1367; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
1368; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
1369; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1370; TONGA-NEXT:    s_endpgm
1371;
1372; GFX9-LABEL: sdiv_v4i32_4:
1373; GFX9:       ; %bb.0:
1374; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1375; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1376; GFX9-NEXT:    s_mov_b32 s6, -1
1377; GFX9-NEXT:    s_mov_b32 s10, s6
1378; GFX9-NEXT:    s_mov_b32 s11, s7
1379; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1380; GFX9-NEXT:    s_mov_b32 s8, s2
1381; GFX9-NEXT:    s_mov_b32 s9, s3
1382; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1383; GFX9-NEXT:    s_mov_b32 s4, s0
1384; GFX9-NEXT:    s_mov_b32 s5, s1
1385; GFX9-NEXT:    s_waitcnt vmcnt(0)
1386; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1387; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
1388; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
1389; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
1390; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
1391; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
1392; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 30, v6
1393; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 30, v7
1394; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
1395; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
1396; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
1397; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
1398; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
1399; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
1400; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
1401; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
1402; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1403; GFX9-NEXT:    s_endpgm
1404;
1405; EG-LABEL: sdiv_v4i32_4:
1406; EG:       ; %bb.0:
1407; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1408; EG-NEXT:    TEX 0 @6
1409; EG-NEXT:    ALU 24, @9, KC0[CB0:0-32], KC1[]
1410; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1411; EG-NEXT:    CF_END
1412; EG-NEXT:    PAD
1413; EG-NEXT:    Fetch clause starting at 6:
1414; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
1415; EG-NEXT:    ALU clause starting at 8:
1416; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1417; EG-NEXT:    ALU clause starting at 9:
1418; EG-NEXT:     ASHR T1.W, T0.W, literal.x,
1419; EG-NEXT:     ASHR * T2.W, T0.Z, literal.x,
1420; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1421; EG-NEXT:     LSHR * T1.W, PV.W, literal.x,
1422; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1423; EG-NEXT:     ADD_INT T1.Z, T0.W, PV.W,
1424; EG-NEXT:     LSHR T0.W, T2.W, literal.x, BS:VEC_120/SCL_212
1425; EG-NEXT:     ASHR * T1.W, T0.Y, literal.y,
1426; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
1427; EG-NEXT:     LSHR T1.Y, PS, literal.x,
1428; EG-NEXT:     ASHR T2.Z, T0.X, literal.y,
1429; EG-NEXT:     ADD_INT T0.W, T0.Z, PV.W,
1430; EG-NEXT:     ASHR * T1.W, PV.Z, literal.z,
1431; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
1432; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1433; EG-NEXT:     ASHR T1.Z, PV.W, literal.x,
1434; EG-NEXT:     LSHR T0.W, PV.Z, literal.y,
1435; EG-NEXT:     ADD_INT * T2.W, T0.Y, PV.Y,
1436; EG-NEXT:    2(2.802597e-45), 30(4.203895e-44)
1437; EG-NEXT:     ASHR T1.Y, PS, literal.x,
1438; EG-NEXT:     ADD_INT * T0.W, T0.X, PV.W,
1439; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1440; EG-NEXT:     ASHR T1.X, PV.W, literal.x,
1441; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1442; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1443  %num = load <4 x i32>, ptr addrspace(1) %in
1444  %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
1445  store <4 x i32> %result, ptr addrspace(1) %out
1446  ret void
1447}
1448
; v_sdiv_i8: signed division of two i8 values read from consecutive bytes of
; %in (numerator at byte 0, denominator at byte 1). The i8 quotient is
; sign-extended to i32 and stored to %out.
; The check lines are autogenerated (see the NOTE on the first line of this
; file). For every prefix they show the divide expanded through int-to-float
; conversion, a float reciprocal, truncation and a +/-1 quotient correction,
; followed by a sign-extension of the low 8 bits of the result.
1449define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1450; GCN-LABEL: v_sdiv_i8:
1451; GCN:       ; %bb.0:
1452; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1453; GCN-NEXT:    s_mov_b32 s7, 0xf000
1454; GCN-NEXT:    s_mov_b32 s6, -1
1455; GCN-NEXT:    s_mov_b32 s10, s6
1456; GCN-NEXT:    s_mov_b32 s11, s7
1457; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1458; GCN-NEXT:    s_mov_b32 s8, s2
1459; GCN-NEXT:    s_mov_b32 s9, s3
1460; GCN-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:1
1461; GCN-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0
1462; GCN-NEXT:    s_mov_b32 s4, s0
1463; GCN-NEXT:    s_mov_b32 s5, s1
1464; GCN-NEXT:    s_waitcnt vmcnt(1)
1465; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v0
1466; GCN-NEXT:    s_waitcnt vmcnt(0)
1467; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v1
1468; GCN-NEXT:    v_xor_b32_e32 v0, v1, v0
1469; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1470; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1471; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
1472; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
1473; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1474; GCN-NEXT:    v_mad_f32 v3, -v1, v2, v3
1475; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
1476; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1477; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1478; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1479; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 8
1480; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1481; GCN-NEXT:    s_endpgm
1482;
1483; TONGA-LABEL: v_sdiv_i8:
1484; TONGA:       ; %bb.0:
1485; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1486; TONGA-NEXT:    s_mov_b32 s7, 0xf000
1487; TONGA-NEXT:    s_mov_b32 s6, -1
1488; TONGA-NEXT:    s_mov_b32 s10, s6
1489; TONGA-NEXT:    s_mov_b32 s11, s7
1490; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1491; TONGA-NEXT:    s_mov_b32 s8, s2
1492; TONGA-NEXT:    s_mov_b32 s9, s3
1493; TONGA-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:1
1494; TONGA-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0
1495; TONGA-NEXT:    s_mov_b32 s4, s0
1496; TONGA-NEXT:    s_mov_b32 s5, s1
1497; TONGA-NEXT:    s_waitcnt vmcnt(1)
1498; TONGA-NEXT:    v_cvt_f32_i32_e32 v2, v0
1499; TONGA-NEXT:    s_waitcnt vmcnt(0)
1500; TONGA-NEXT:    v_cvt_f32_i32_e32 v3, v1
1501; TONGA-NEXT:    v_xor_b32_e32 v0, v1, v0
1502; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1503; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1504; TONGA-NEXT:    v_or_b32_e32 v0, 1, v0
1505; TONGA-NEXT:    v_mul_f32_e32 v1, v3, v4
1506; TONGA-NEXT:    v_trunc_f32_e32 v1, v1
1507; TONGA-NEXT:    v_mad_f32 v3, -v1, v2, v3
1508; TONGA-NEXT:    v_cvt_i32_f32_e32 v1, v1
1509; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1510; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1511; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
1512; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 8
1513; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1514; TONGA-NEXT:    s_endpgm
1515;
1516; GFX9-LABEL: v_sdiv_i8:
1517; GFX9:       ; %bb.0:
1518; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1519; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1520; GFX9-NEXT:    s_mov_b32 s6, -1
1521; GFX9-NEXT:    s_mov_b32 s10, s6
1522; GFX9-NEXT:    s_mov_b32 s11, s7
1523; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1524; GFX9-NEXT:    s_mov_b32 s8, s2
1525; GFX9-NEXT:    s_mov_b32 s9, s3
1526; GFX9-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:1
1527; GFX9-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0
1528; GFX9-NEXT:    s_mov_b32 s4, s0
1529; GFX9-NEXT:    s_mov_b32 s5, s1
1530; GFX9-NEXT:    s_waitcnt vmcnt(1)
1531; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v0
1532; GFX9-NEXT:    s_waitcnt vmcnt(0)
1533; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
1534; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
1535; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1536; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1537; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
1538; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v4
1539; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
1540; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v1
1541; GFX9-NEXT:    v_mad_f32 v1, -v1, v2, v3
1542; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
1543; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1544; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
1545; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 8
1546; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1547; GFX9-NEXT:    s_endpgm
1548;
1549; EG-LABEL: v_sdiv_i8:
1550; EG:       ; %bb.0:
1551; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1552; EG-NEXT:    TEX 1 @6
1553; EG-NEXT:    ALU 21, @11, KC0[CB0:0-32], KC1[]
1554; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1555; EG-NEXT:    CF_END
1556; EG-NEXT:    PAD
1557; EG-NEXT:    Fetch clause starting at 6:
1558; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 1, #1
1559; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1560; EG-NEXT:    ALU clause starting at 10:
1561; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1562; EG-NEXT:    ALU clause starting at 11:
1563; EG-NEXT:     BFE_INT * T0.W, T1.X, 0.0, literal.x,
1564; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1565; EG-NEXT:     INT_TO_FLT * T0.Y, PV.W,
1566; EG-NEXT:     BFE_INT T1.W, T0.X, 0.0, literal.x,
1567; EG-NEXT:     RECIP_IEEE * T0.X, PS,
1568; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1569; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
1570; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.X,
1571; EG-NEXT:     TRUNC T2.W, PV.W,
1572; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
1573; EG-NEXT:     ASHR T0.W, PS, literal.x,
1574; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.Y, T0.Z,
1575; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1576; EG-NEXT:     TRUNC T0.Z, T2.W,
1577; EG-NEXT:     SETGE T1.W, |PS|, |T0.Y|,
1578; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
1579; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
1580; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
1581; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1582; EG-NEXT:     BFE_INT T0.X, PV.W, 0.0, literal.x,
1583; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1584; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
1585  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
1586  %num = load i8, ptr addrspace(1) %in
1587  %den = load i8, ptr addrspace(1) %den_ptr
1588  %result = sdiv i8 %num, %den
1589  %result.ext = sext i8 %result to i32
1590  store i32 %result.ext, ptr addrspace(1) %out
1591  ret void
1592}
1593
; v_sdiv_i23: signed division of two i23 values. The numerator is loaded from
; %in and the denominator from element 1 of an i23 array at %in; the check
; lines show that element being read from byte offsets 4 and 6. The i23
; quotient is sign-extended to i32 and stored to %out.
; The check lines are autogenerated (see the NOTE on the first line of this
; file). For every prefix they show both operands sign-extended from 23 bits
; and the divide expanded through int-to-float conversion and a float
; reciprocal, with the result sign-extended from 23 bits again.
1594define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1595; GCN-LABEL: v_sdiv_i23:
1596; GCN:       ; %bb.0:
1597; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1598; GCN-NEXT:    s_mov_b32 s7, 0xf000
1599; GCN-NEXT:    s_mov_b32 s6, -1
1600; GCN-NEXT:    s_mov_b32 s10, s6
1601; GCN-NEXT:    s_mov_b32 s11, s7
1602; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1603; GCN-NEXT:    s_mov_b32 s8, s2
1604; GCN-NEXT:    s_mov_b32 s9, s3
1605; GCN-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
1606; GCN-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:6
1607; GCN-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:4
1608; GCN-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1609; GCN-NEXT:    s_mov_b32 s4, s0
1610; GCN-NEXT:    s_mov_b32 s5, s1
1611; GCN-NEXT:    s_waitcnt vmcnt(3)
1612; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1613; GCN-NEXT:    s_waitcnt vmcnt(2)
1614; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1615; GCN-NEXT:    s_waitcnt vmcnt(1)
1616; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
1617; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 23
1618; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v1
1619; GCN-NEXT:    s_waitcnt vmcnt(0)
1620; GCN-NEXT:    v_or_b32_e32 v0, v3, v0
1621; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
1622; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v0
1623; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1624; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
1625; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1626; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
1627; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
1628; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1629; GCN-NEXT:    v_mad_f32 v3, -v1, v2, v3
1630; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
1631; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1632; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1633; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1634; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
1635; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1636; GCN-NEXT:    s_endpgm
1637;
1638; TONGA-LABEL: v_sdiv_i23:
1639; TONGA:       ; %bb.0:
1640; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
1641; TONGA-NEXT:    s_mov_b32 s3, 0xf000
1642; TONGA-NEXT:    s_mov_b32 s2, -1
1643; TONGA-NEXT:    s_mov_b32 s10, s2
1644; TONGA-NEXT:    s_mov_b32 s11, s3
1645; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1646; TONGA-NEXT:    s_mov_b32 s8, s6
1647; TONGA-NEXT:    s_mov_b32 s9, s7
1648; TONGA-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
1649; TONGA-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:6
1650; TONGA-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:4
1651; TONGA-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1652; TONGA-NEXT:    s_mov_b32 s0, s4
1653; TONGA-NEXT:    s_mov_b32 s1, s5
1654; TONGA-NEXT:    s_waitcnt vmcnt(3)
1655; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1656; TONGA-NEXT:    s_waitcnt vmcnt(2)
1657; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1658; TONGA-NEXT:    s_waitcnt vmcnt(1)
1659; TONGA-NEXT:    v_or_b32_e32 v1, v2, v1
1660; TONGA-NEXT:    v_bfe_i32 v1, v1, 0, 23
1661; TONGA-NEXT:    v_cvt_f32_i32_e32 v2, v1
1662; TONGA-NEXT:    s_waitcnt vmcnt(0)
1663; TONGA-NEXT:    v_or_b32_e32 v0, v3, v0
1664; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 23
1665; TONGA-NEXT:    v_cvt_f32_i32_e32 v3, v0
1666; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1667; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
1668; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1669; TONGA-NEXT:    v_or_b32_e32 v0, 1, v0
1670; TONGA-NEXT:    v_mul_f32_e32 v1, v3, v4
1671; TONGA-NEXT:    v_trunc_f32_e32 v1, v1
1672; TONGA-NEXT:    v_mad_f32 v3, -v1, v2, v3
1673; TONGA-NEXT:    v_cvt_i32_f32_e32 v1, v1
1674; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1675; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1676; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
1677; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 23
1678; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1679; TONGA-NEXT:    s_endpgm
1680;
1681; GFX9-LABEL: v_sdiv_i23:
1682; GFX9:       ; %bb.0:
1683; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1684; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1685; GFX9-NEXT:    s_mov_b32 s2, -1
1686; GFX9-NEXT:    s_mov_b32 s6, s2
1687; GFX9-NEXT:    s_mov_b32 s7, s3
1688; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1689; GFX9-NEXT:    s_mov_b32 s4, s10
1690; GFX9-NEXT:    s_mov_b32 s5, s11
1691; GFX9-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:2
1692; GFX9-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:6
1693; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:4
1694; GFX9-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
1695; GFX9-NEXT:    s_mov_b32 s0, s8
1696; GFX9-NEXT:    s_mov_b32 s1, s9
1697; GFX9-NEXT:    s_waitcnt vmcnt(3)
1698; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1699; GFX9-NEXT:    s_waitcnt vmcnt(2)
1700; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1701; GFX9-NEXT:    s_waitcnt vmcnt(1)
1702; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
1703; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 23
1704; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v1
1705; GFX9-NEXT:    s_waitcnt vmcnt(0)
1706; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
1707; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 23
1708; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v0
1709; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1710; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
1711; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1712; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
1713; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v4
1714; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
1715; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v1
1716; GFX9-NEXT:    v_mad_f32 v1, -v1, v2, v3
1717; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
1718; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1719; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
1720; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 23
1721; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1722; GFX9-NEXT:    s_endpgm
1723;
1724; EG-LABEL: v_sdiv_i23:
1725; EG:       ; %bb.0:
1726; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1727; EG-NEXT:    TEX 3 @6
1728; EG-NEXT:    ALU 33, @15, KC0[CB0:0-32], KC1[]
1729; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1730; EG-NEXT:    CF_END
1731; EG-NEXT:    PAD
1732; EG-NEXT:    Fetch clause starting at 6:
1733; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
1734; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1735; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
1736; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1737; EG-NEXT:    ALU clause starting at 14:
1738; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1739; EG-NEXT:    ALU clause starting at 15:
1740; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1741; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1742; EG-NEXT:     OR_INT T0.W, T0.X, PV.W,
1743; EG-NEXT:     LSHL * T1.W, T3.X, literal.x,
1744; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1745; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1746; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1747; EG-NEXT:     ASHR T0.W, PV.W, literal.x,
1748; EG-NEXT:     OR_INT * T1.W, T2.X, T1.W,
1749; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1750; EG-NEXT:     LSHL T1.W, PS, literal.x,
1751; EG-NEXT:     INT_TO_FLT * T0.X, PV.W,
1752; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1753; EG-NEXT:     ASHR T1.W, PV.W, literal.x,
1754; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
1755; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1756; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
1757; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.Y,
1758; EG-NEXT:     TRUNC T2.W, PV.W,
1759; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
1760; EG-NEXT:     ASHR T0.W, PS, literal.x,
1761; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
1762; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1763; EG-NEXT:     TRUNC T0.Z, T2.W,
1764; EG-NEXT:     SETGE T1.W, |PS|, |T0.X|,
1765; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
1766; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
1767; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
1768; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1769; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1770; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1771; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
1772; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1773; EG-NEXT:    9(1.261169e-44), 2(2.802597e-45)
1774  %den_ptr = getelementptr i23, ptr addrspace(1) %in, i23 1
1775  %num = load i23, ptr addrspace(1) %in
1776  %den = load i23, ptr addrspace(1) %den_ptr
1777  %result = sdiv i23 %num, %den
1778  %result.ext = sext i23 %result to i32
1779  store i32 %result.ext, ptr addrspace(1) %out
1780  ret void
1781}
1782
; v_sdiv_i24: signed division of two i24 values. The numerator is loaded from
; %in and the denominator from element 1 of an i24 array at %in; the check
; lines show that element being read from byte offsets 4 and 6. The i24
; quotient is sign-extended to i32 and stored to %out.
; The check lines are autogenerated (see the NOTE on the first line of this
; file). For the GCN, TONGA and GFX9 prefixes the top byte of each operand is
; fetched with a sign-extending byte load; for every prefix the divide is
; expanded through int-to-float conversion and a float reciprocal, and the
; result is sign-extended from 24 bits.
1783define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1784; GCN-LABEL: v_sdiv_i24:
1785; GCN:       ; %bb.0:
1786; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1787; GCN-NEXT:    s_mov_b32 s7, 0xf000
1788; GCN-NEXT:    s_mov_b32 s6, -1
1789; GCN-NEXT:    s_mov_b32 s10, s6
1790; GCN-NEXT:    s_mov_b32 s11, s7
1791; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1792; GCN-NEXT:    s_mov_b32 s8, s2
1793; GCN-NEXT:    s_mov_b32 s9, s3
1794; GCN-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:6
1795; GCN-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1796; GCN-NEXT:    buffer_load_sbyte v2, off, s[8:11], 0 offset:2
1797; GCN-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1798; GCN-NEXT:    s_mov_b32 s4, s0
1799; GCN-NEXT:    s_mov_b32 s5, s1
1800; GCN-NEXT:    s_waitcnt vmcnt(3)
1801; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
1802; GCN-NEXT:    s_waitcnt vmcnt(2)
1803; GCN-NEXT:    v_or_b32_e32 v1, v1, v4
1804; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
1805; GCN-NEXT:    s_waitcnt vmcnt(1)
1806; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
1807; GCN-NEXT:    s_waitcnt vmcnt(0)
1808; GCN-NEXT:    v_or_b32_e32 v3, v3, v4
1809; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v3
1810; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1811; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
1812; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1813; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
1814; GCN-NEXT:    v_mul_f32_e32 v2, v3, v4
1815; GCN-NEXT:    v_trunc_f32_e32 v2, v2
1816; GCN-NEXT:    v_mad_f32 v3, -v2, v1, v3
1817; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
1818; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
1819; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1820; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1821; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
1822; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1823; GCN-NEXT:    s_endpgm
1824;
1825; TONGA-LABEL: v_sdiv_i24:
1826; TONGA:       ; %bb.0:
1827; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
1828; TONGA-NEXT:    s_mov_b32 s3, 0xf000
1829; TONGA-NEXT:    s_mov_b32 s2, -1
1830; TONGA-NEXT:    s_mov_b32 s10, s2
1831; TONGA-NEXT:    s_mov_b32 s11, s3
1832; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1833; TONGA-NEXT:    s_mov_b32 s8, s6
1834; TONGA-NEXT:    s_mov_b32 s9, s7
1835; TONGA-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:6
1836; TONGA-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1837; TONGA-NEXT:    buffer_load_sbyte v2, off, s[8:11], 0 offset:2
1838; TONGA-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1839; TONGA-NEXT:    s_mov_b32 s0, s4
1840; TONGA-NEXT:    s_mov_b32 s1, s5
1841; TONGA-NEXT:    s_waitcnt vmcnt(3)
1842; TONGA-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
1843; TONGA-NEXT:    s_waitcnt vmcnt(2)
1844; TONGA-NEXT:    v_or_b32_e32 v1, v1, v4
1845; TONGA-NEXT:    v_cvt_f32_i32_e32 v1, v1
1846; TONGA-NEXT:    s_waitcnt vmcnt(1)
1847; TONGA-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
1848; TONGA-NEXT:    s_waitcnt vmcnt(0)
1849; TONGA-NEXT:    v_or_b32_e32 v3, v3, v4
1850; TONGA-NEXT:    v_cvt_f32_i32_e32 v3, v3
1851; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1852; TONGA-NEXT:    v_xor_b32_e32 v0, v2, v0
1853; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1854; TONGA-NEXT:    v_or_b32_e32 v0, 1, v0
1855; TONGA-NEXT:    v_mul_f32_e32 v2, v3, v4
1856; TONGA-NEXT:    v_trunc_f32_e32 v2, v2
1857; TONGA-NEXT:    v_mad_f32 v3, -v2, v1, v3
1858; TONGA-NEXT:    v_cvt_i32_f32_e32 v2, v2
1859; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
1860; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1861; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1862; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 24
1863; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1864; TONGA-NEXT:    s_endpgm
1865;
1866; GFX9-LABEL: v_sdiv_i24:
1867; GFX9:       ; %bb.0:
1868; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1869; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1870; GFX9-NEXT:    s_mov_b32 s2, -1
1871; GFX9-NEXT:    s_mov_b32 s6, s2
1872; GFX9-NEXT:    s_mov_b32 s7, s3
1873; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1874; GFX9-NEXT:    s_mov_b32 s4, s10
1875; GFX9-NEXT:    s_mov_b32 s5, s11
1876; GFX9-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0 offset:6
1877; GFX9-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:4
1878; GFX9-NEXT:    buffer_load_sbyte v2, off, s[4:7], 0 offset:2
1879; GFX9-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
1880; GFX9-NEXT:    s_mov_b32 s0, s8
1881; GFX9-NEXT:    s_mov_b32 s1, s9
1882; GFX9-NEXT:    s_waitcnt vmcnt(3)
1883; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
1884; GFX9-NEXT:    s_waitcnt vmcnt(2)
1885; GFX9-NEXT:    v_or_b32_e32 v1, v1, v4
1886; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
1887; GFX9-NEXT:    s_waitcnt vmcnt(1)
1888; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
1889; GFX9-NEXT:    s_waitcnt vmcnt(0)
1890; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
1891; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
1892; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1893; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
1894; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1895; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
1896; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v4
1897; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1898; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v2
1899; GFX9-NEXT:    v_mad_f32 v2, -v2, v1, v3
1900; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
1901; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1902; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
1903; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 24
1904; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1905; GFX9-NEXT:    s_endpgm
1906;
1907; EG-LABEL: v_sdiv_i24:
1908; EG:       ; %bb.0:
1909; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1910; EG-NEXT:    TEX 3 @6
1911; EG-NEXT:    ALU 29, @15, KC0[CB0:0-32], KC1[]
1912; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1913; EG-NEXT:    CF_END
1914; EG-NEXT:    PAD
1915; EG-NEXT:    Fetch clause starting at 6:
1916; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
1917; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1918; EG-NEXT:     VTX_READ_16 T3.X, T0.X, 4, #1
1919; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 2, #1
1920; EG-NEXT:    ALU clause starting at 14:
1921; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1922; EG-NEXT:    ALU clause starting at 15:
1923; EG-NEXT:     BFE_INT * T0.W, T1.X, 0.0, literal.x,
1924; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1925; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1926; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1927; EG-NEXT:     BFE_INT T2.W, T0.X, 0.0, literal.x,
1928; EG-NEXT:     OR_INT * T1.W, T3.X, PV.W,
1929; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1930; EG-NEXT:     LSHL T3.W, PV.W, literal.x,
1931; EG-NEXT:     INT_TO_FLT * T0.X, PS,
1932; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1933; EG-NEXT:     OR_INT T1.W, T2.X, PV.W,
1934; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
1935; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
1936; EG-NEXT:     MUL_IEEE * T1.W, PS, T0.Y,
1937; EG-NEXT:     TRUNC T1.W, PV.W,
1938; EG-NEXT:     XOR_INT * T0.W, T2.W, T0.W,
1939; EG-NEXT:     ASHR T0.W, PS, literal.x,
1940; EG-NEXT:     MULADD_IEEE * T2.W, -PV.W, T0.X, T0.Z,
1941; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1942; EG-NEXT:     TRUNC T0.Z, T1.W,
1943; EG-NEXT:     SETGE T1.W, |PS|, |T0.X|,
1944; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
1945; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
1946; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
1947; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1948; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1949; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1950; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
1951; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1952; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
1953  %den_ptr = getelementptr i24, ptr addrspace(1) %in, i24 1
1954  %num = load i24, ptr addrspace(1) %in
1955  %den = load i24, ptr addrspace(1) %den_ptr
1956  %result = sdiv i24 %num, %den
1957  %result.ext = sext i24 %result to i32
1958  store i32 %result.ext, ptr addrspace(1) %out
1959  ret void
1960}
1961
; v_sdiv_i25: signed division of two i25 values. The numerator is loaded from
; %in and the denominator from element 1 of an i25 array at %in; the check
; lines show that element being read from byte offset 4. The i25 quotient is
; sign-extended to i32 and stored to %out.
; The check lines are autogenerated (see the NOTE on the first line of this
; file). Unlike the i8/i23/i24 kernels in this file, the expected code here
; takes absolute values of the operands and uses an unsigned
; reciprocal/multiply-high sequence with two conditional quotient
; corrections, then reapplies the sign. For the GFX9 prefix most of that
; sequence is on scalar registers after v_readfirstlane_b32.
1962define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1963; GCN-LABEL: v_sdiv_i25:
1964; GCN:       ; %bb.0:
1965; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
1966; GCN-NEXT:    s_mov_b32 s3, 0xf000
1967; GCN-NEXT:    s_mov_b32 s2, -1
1968; GCN-NEXT:    s_mov_b32 s10, s2
1969; GCN-NEXT:    s_mov_b32 s11, s3
1970; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1971; GCN-NEXT:    s_mov_b32 s8, s6
1972; GCN-NEXT:    s_mov_b32 s9, s7
1973; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1974; GCN-NEXT:    s_mov_b32 s0, s4
1975; GCN-NEXT:    s_mov_b32 s1, s5
1976; GCN-NEXT:    s_waitcnt vmcnt(0)
1977; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 25
1978; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
1979; GCN-NEXT:    v_max_i32_e32 v2, v1, v2
1980; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v2
1981; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
1982; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
1983; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1984; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
1985; GCN-NEXT:    v_max_i32_e32 v5, v0, v5
1986; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
1987; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1988; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
1989; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
1990; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
1991; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
1992; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
1993; GCN-NEXT:    v_mul_hi_u32 v3, v5, v3
1994; GCN-NEXT:    v_mul_lo_u32 v1, v3, v2
1995; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
1996; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
1997; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v2, v1
1998; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
1999; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2000; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2001; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
2002; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
2003; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2004; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
2005; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
2006; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
2007; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2008; GCN-NEXT:    s_endpgm
2009;
2010; TONGA-LABEL: v_sdiv_i25:
2011; TONGA:       ; %bb.0:
2012; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
2013; TONGA-NEXT:    s_mov_b32 s3, 0xf000
2014; TONGA-NEXT:    s_mov_b32 s2, -1
2015; TONGA-NEXT:    s_mov_b32 s10, s2
2016; TONGA-NEXT:    s_mov_b32 s11, s3
2017; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
2018; TONGA-NEXT:    s_mov_b32 s8, s6
2019; TONGA-NEXT:    s_mov_b32 s9, s7
2020; TONGA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2021; TONGA-NEXT:    s_mov_b32 s0, s4
2022; TONGA-NEXT:    s_mov_b32 s1, s5
2023; TONGA-NEXT:    s_waitcnt vmcnt(0)
2024; TONGA-NEXT:    v_bfe_i32 v1, v1, 0, 25
2025; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, 0, v1
2026; TONGA-NEXT:    v_max_i32_e32 v2, v1, v2
2027; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v2
2028; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
2029; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 25
2030; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2031; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
2032; TONGA-NEXT:    v_max_i32_e32 v5, v0, v5
2033; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
2034; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
2035; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
2036; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
2037; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
2038; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
2039; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
2040; TONGA-NEXT:    v_mul_hi_u32 v3, v5, v3
2041; TONGA-NEXT:    v_mul_lo_u32 v1, v3, v2
2042; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
2043; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v5, v1
2044; TONGA-NEXT:    v_subrev_u32_e32 v5, vcc, v2, v1
2045; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
2046; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2047; TONGA-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2048; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
2049; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
2050; TONGA-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2051; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v0
2052; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
2053; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 25
2054; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2055; TONGA-NEXT:    s_endpgm
2056;
2057; GFX9-LABEL: v_sdiv_i25:
2058; GFX9:       ; %bb.0:
2059; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2060; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2061; GFX9-NEXT:    s_mov_b32 s2, -1
2062; GFX9-NEXT:    s_mov_b32 s6, s2
2063; GFX9-NEXT:    s_mov_b32 s7, s3
2064; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2065; GFX9-NEXT:    s_mov_b32 s4, s10
2066; GFX9-NEXT:    s_mov_b32 s5, s11
2067; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2068; GFX9-NEXT:    s_mov_b32 s1, s9
2069; GFX9-NEXT:    s_waitcnt vmcnt(0)
2070; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2071; GFX9-NEXT:    s_bfe_i32 s4, s0, 0x190000
2072; GFX9-NEXT:    s_abs_i32 s5, s4
2073; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
2074; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
2075; GFX9-NEXT:    s_mov_b32 s0, s8
2076; GFX9-NEXT:    s_sub_i32 s7, 0, s5
2077; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2078; GFX9-NEXT:    s_bfe_i32 s6, s6, 0x190000
2079; GFX9-NEXT:    s_xor_b32 s4, s6, s4
2080; GFX9-NEXT:    s_abs_i32 s6, s6
2081; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
2082; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2083; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
2084; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
2085; GFX9-NEXT:    s_mul_i32 s7, s7, s8
2086; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
2087; GFX9-NEXT:    s_add_i32 s8, s8, s7
2088; GFX9-NEXT:    s_mul_hi_u32 s7, s6, s8
2089; GFX9-NEXT:    s_mul_i32 s8, s7, s5
2090; GFX9-NEXT:    s_sub_i32 s6, s6, s8
2091; GFX9-NEXT:    s_add_i32 s9, s7, 1
2092; GFX9-NEXT:    s_sub_i32 s8, s6, s5
2093; GFX9-NEXT:    s_cmp_ge_u32 s6, s5
2094; GFX9-NEXT:    s_cselect_b32 s7, s9, s7
2095; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
2096; GFX9-NEXT:    s_add_i32 s8, s7, 1
2097; GFX9-NEXT:    s_cmp_ge_u32 s6, s5
2098; GFX9-NEXT:    s_cselect_b32 s5, s8, s7
2099; GFX9-NEXT:    s_xor_b32 s5, s5, s4
2100; GFX9-NEXT:    s_sub_i32 s4, s5, s4
2101; GFX9-NEXT:    s_bfe_i32 s4, s4, 0x190000
2102; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2103; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2104; GFX9-NEXT:    s_endpgm
2105;
2106; EG-LABEL: v_sdiv_i25:
2107; EG:       ; %bb.0:
2108; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
2109; EG-NEXT:    TEX 1 @6
2110; EG-NEXT:    ALU 37, @12, KC0[CB0:0-32], KC1[]
2111; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2112; EG-NEXT:    CF_END
2113; EG-NEXT:    PAD
2114; EG-NEXT:    Fetch clause starting at 6:
2115; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
2116; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
2117; EG-NEXT:    ALU clause starting at 10:
2118; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2119; EG-NEXT:     MOV * T1.X, PV.X,
2120; EG-NEXT:    ALU clause starting at 12:
2121; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
2122; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2123; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
2124; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2125; EG-NEXT:     SETGT_INT * T1.W, 0.0, PV.W,
2126; EG-NEXT:     ADD_INT T0.W, T0.W, PV.W,
2127; EG-NEXT:     LSHL * T2.W, T1.X, literal.x,
2128; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2129; EG-NEXT:     XOR_INT * T0.W, PV.W, T1.W,
2130; EG-NEXT:     SUB_INT T0.Z, 0.0, PV.W,
2131; EG-NEXT:     ASHR T2.W, T2.W, literal.x,
2132; EG-NEXT:     RECIP_UINT * T0.X, PV.W,
2133; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2134; EG-NEXT:     SETGT_INT T3.W, 0.0, PV.W,
2135; EG-NEXT:     MULLO_INT * T0.Y, PV.Z, PS,
2136; EG-NEXT:     ADD_INT T2.W, T2.W, PV.W,
2137; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
2138; EG-NEXT:     ADD_INT T4.W, T0.X, PS,
2139; EG-NEXT:     XOR_INT * T2.W, PV.W, T3.W,
2140; EG-NEXT:     MULHI * T0.X, PS, PV.W,
2141; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2142; EG-NEXT:     SUB_INT * T2.W, T2.W, PS,
2143; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
2144; EG-NEXT:     SETGE_UINT T4.W, PV.W, T0.W,
2145; EG-NEXT:     SUB_INT * T5.W, PV.W, T0.W,
2146; EG-NEXT:     CNDE_INT T2.W, PV.W, T2.W, PS,
2147; EG-NEXT:     CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
2148; EG-NEXT:     ADD_INT T5.W, PS, 1,
2149; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.W,
2150; EG-NEXT:     CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
2151; EG-NEXT:     XOR_INT * T1.W, T3.W, T1.W,
2152; EG-NEXT:     XOR_INT * T0.W, PV.W, PS,
2153; EG-NEXT:     SUB_INT * T0.W, PV.W, T1.W,
2154; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2155; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2156; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
2157; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
2158; EG-NEXT:    7(9.809089e-45), 2(2.802597e-45)
2159  %den_ptr = getelementptr i25, ptr addrspace(1) %in, i25 1
2160  %num = load i25, ptr addrspace(1) %in
2161  %den = load i25, ptr addrspace(1) %den_ptr
2162  %result = sdiv i25 %num, %den
2163  %result.ext = sext i25 %result to i32
2164  store i32 %result.ext, ptr addrspace(1) %out
2165  ret void
2166}
2167
2168; Tests for 64-bit divide bypass (currently disabled: the three kernels below are commented out).
2169; define amdgpu_kernel void @test_get_quotient(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
2170;   %result = sdiv i64 %a, %b
2171;   store i64 %result, ptr addrspace(1) %out, align 8
2172;   ret void
2173; }
2174
2175; define amdgpu_kernel void @test_get_remainder(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
2176;   %result = srem i64 %a, %b
2177;   store i64 %result, ptr addrspace(1) %out, align 8
2178;   ret void
2179; }
2180
2181; define amdgpu_kernel void @test_get_quotient_and_remainder(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
2182;   %resultdiv = sdiv i64 %a, %b
2183;   %resultrem = srem i64 %a, %b
2184;   %result = add i64 %resultdiv, %resultrem
2185;   store i64 %result, ptr addrspace(1) %out, align 8
2186;   ret void
2187; }
2188
; scalarize_mulhs_4xi32: loads a <4 x i32> from %in, signed-divides every lane
; by the same constant (53668) and stores the <4 x i32> quotient to %out.
;
; The assertions below expect no full division expansion. For each of the four
; lanes they expect one signed multiply-high by 0x1389c755 (= 327796565, the
; literal printed in the EG checks), an arithmetic shift right by 12, a logical
; shift right by 31 of the multiply-high result, and an add of the two shifted
; values. GCN/TONGA/GFX9 check four separate v_mul_hi_i32 instructions; EG
; checks four MULHI_INT instructions.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
2189define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) {
2190; GCN-LABEL: scalarize_mulhs_4xi32:
2191; GCN:       ; %bb.0:
2192; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2193; GCN-NEXT:    s_mov_b32 s7, 0xf000
2194; GCN-NEXT:    s_mov_b32 s6, -1
2195; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2196; GCN-NEXT:    s_mov_b32 s4, s0
2197; GCN-NEXT:    s_mov_b32 s5, s1
2198; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2199; GCN-NEXT:    s_mov_b32 s0, 0x1389c755
2200; GCN-NEXT:    s_mov_b32 s4, s2
2201; GCN-NEXT:    s_mov_b32 s5, s3
2202; GCN-NEXT:    s_waitcnt vmcnt(0)
2203; GCN-NEXT:    v_mul_hi_i32 v0, v0, s0
2204; GCN-NEXT:    v_mul_hi_i32 v1, v1, s0
2205; GCN-NEXT:    v_mul_hi_i32 v2, v2, s0
2206; GCN-NEXT:    v_mul_hi_i32 v3, v3, s0
2207; GCN-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
2208; GCN-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
2209; GCN-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
2210; GCN-NEXT:    v_ashrrev_i32_e32 v1, 12, v1
2211; GCN-NEXT:    v_lshrrev_b32_e32 v6, 31, v2
2212; GCN-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
2213; GCN-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
2214; GCN-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
2215; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
2216; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
2217; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
2218; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
2219; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2220; GCN-NEXT:    s_endpgm
2221;
2222; TONGA-LABEL: scalarize_mulhs_4xi32:
2223; TONGA:       ; %bb.0:
2224; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2225; TONGA-NEXT:    s_mov_b32 s7, 0xf000
2226; TONGA-NEXT:    s_mov_b32 s6, -1
2227; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
2228; TONGA-NEXT:    s_mov_b32 s4, s0
2229; TONGA-NEXT:    s_mov_b32 s5, s1
2230; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2231; TONGA-NEXT:    s_mov_b32 s0, 0x1389c755
2232; TONGA-NEXT:    s_mov_b32 s4, s2
2233; TONGA-NEXT:    s_mov_b32 s5, s3
2234; TONGA-NEXT:    s_waitcnt vmcnt(0)
2235; TONGA-NEXT:    v_mul_hi_i32 v0, v0, s0
2236; TONGA-NEXT:    v_mul_hi_i32 v1, v1, s0
2237; TONGA-NEXT:    v_mul_hi_i32 v2, v2, s0
2238; TONGA-NEXT:    v_mul_hi_i32 v3, v3, s0
2239; TONGA-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
2240; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
2241; TONGA-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
2242; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 12, v1
2243; TONGA-NEXT:    v_lshrrev_b32_e32 v6, 31, v2
2244; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
2245; TONGA-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
2246; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
2247; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
2248; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
2249; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
2250; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
2251; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2252; TONGA-NEXT:    s_endpgm
2253;
2254; GFX9-LABEL: scalarize_mulhs_4xi32:
2255; GFX9:       ; %bb.0:
2256; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2257; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2258; GFX9-NEXT:    s_mov_b32 s6, -1
2259; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2260; GFX9-NEXT:    s_mov_b32 s4, s0
2261; GFX9-NEXT:    s_mov_b32 s5, s1
2262; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2263; GFX9-NEXT:    s_mov_b32 s0, 0x1389c755
2264; GFX9-NEXT:    s_mov_b32 s4, s2
2265; GFX9-NEXT:    s_mov_b32 s5, s3
2266; GFX9-NEXT:    s_waitcnt vmcnt(0)
2267; GFX9-NEXT:    v_mul_hi_i32 v0, v0, s0
2268; GFX9-NEXT:    v_mul_hi_i32 v1, v1, s0
2269; GFX9-NEXT:    v_mul_hi_i32 v2, v2, s0
2270; GFX9-NEXT:    v_mul_hi_i32 v3, v3, s0
2271; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
2272; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
2273; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
2274; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 12, v1
2275; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 31, v2
2276; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
2277; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
2278; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
2279; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
2280; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
2281; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
2282; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
2283; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2284; GFX9-NEXT:    s_endpgm
2285;
2286; EG-LABEL: scalarize_mulhs_4xi32:
2287; EG:       ; %bb.0:
2288; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
2289; EG-NEXT:    TEX 0 @6
2290; EG-NEXT:    ALU 25, @9, KC0[CB0:0-32], KC1[]
2291; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2292; EG-NEXT:    CF_END
2293; EG-NEXT:    PAD
2294; EG-NEXT:    Fetch clause starting at 6:
2295; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2296; EG-NEXT:    ALU clause starting at 8:
2297; EG-NEXT:     MOV * T0.X, KC0[2].Y,
2298; EG-NEXT:    ALU clause starting at 9:
2299; EG-NEXT:     MULHI_INT * T0.W, T0.W, literal.x,
2300; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2301; EG-NEXT:     ASHR T1.Z, PS, literal.x,
2302; EG-NEXT:     LSHR T0.W, PS, literal.y,
2303; EG-NEXT:     MULHI_INT * T0.Z, T0.Z, literal.z,
2304; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2305; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2306; EG-NEXT:     ASHR T1.Y, PS, literal.x,
2307; EG-NEXT:     LSHR T0.Z, PS, literal.y,
2308; EG-NEXT:     ADD_INT T0.W, PV.Z, PV.W,
2309; EG-NEXT:     MULHI_INT * T0.Y, T0.Y, literal.z,
2310; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2311; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2312; EG-NEXT:     ASHR T2.Y, PS, literal.x,
2313; EG-NEXT:     ADD_INT T0.Z, PV.Y, PV.Z,
2314; EG-NEXT:     LSHR T1.W, PS, literal.y,
2315; EG-NEXT:     MULHI_INT * T0.X, T0.X, literal.z,
2316; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2317; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2318; EG-NEXT:     ADD_INT T0.Y, PV.Y, PV.W,
2319; EG-NEXT:     ASHR T1.W, PS, literal.x,
2320; EG-NEXT:     LSHR * T2.W, PS, literal.y,
2321; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2322; EG-NEXT:     ADD_INT T0.X, PV.W, PS,
2323; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.x,
2324; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2325  %1 = load <4 x i32>, ptr addrspace(1) %in, align 16
2326  %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2327  store <4 x i32> %2, ptr addrspace(1) %out, align 16
2328  ret void
2329}
2330