xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
5
; sdiv and srem of the same i32 operands, each result stored to its own global
; pointer. In the expected code for all three targets a single reciprocal-based
; sequence (v_rcp_iflag_f32 followed by two compare/select correction steps)
; produces both values: the quotient is sign-corrected with the xor of the two
; operand sign masks, the remainder with the sign mask of %x.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) {
; GFX8-LABEL: sdivrem_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_ashr_i32 s6, s5, 31
; GFX8-NEXT:    s_add_i32 s0, s5, s6
; GFX8-NEXT:    s_xor_b32 s5, s0, s6
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
; GFX8-NEXT:    s_sub_i32 s0, 0, s5
; GFX8-NEXT:    s_ashr_i32 s7, s4, 31
; GFX8-NEXT:    s_add_i32 s4, s4, s7
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT:    s_xor_b32 s4, s4, s7
; GFX8-NEXT:    s_xor_b32 s6, s7, s6
; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s5
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s5, v3
; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s5, v3
; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v2
; GFX8-NEXT:    v_xor_b32_e32 v3, s7, v3
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v3
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dword v[0:1], v3
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sdivrem_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x10
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
; GFX9-NEXT:    s_add_i32 s1, s1, s4
; GFX9-NEXT:    s_xor_b32 s5, s1, s4
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
; GFX9-NEXT:    s_sub_i32 s1, 0, s5
; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
; GFX9-NEXT:    s_add_i32 s0, s0, s6
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    s_xor_b32 s7, s0, s6
; GFX9-NEXT:    s_xor_b32 s4, s6, s4
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s5
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_sub_u32_e32 v1, s7, v1
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
; GFX9-NEXT:    v_xor_b32_e32 v1, s6, v1
; GFX9-NEXT:    v_subrev_u32_e32 v1, s6, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    global_store_dword v2, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sdivrem_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x10
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
; GFX10-NEXT:    s_ashr_i32 s6, s0, 31
; GFX10-NEXT:    s_add_i32 s1, s1, s4
; GFX10-NEXT:    s_add_i32 s0, s0, s6
; GFX10-NEXT:    s_xor_b32 s5, s1, s4
; GFX10-NEXT:    s_xor_b32 s0, s0, s6
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s5
; GFX10-NEXT:    s_sub_i32 s1, 0, s5
; GFX10-NEXT:    s_xor_b32 s4, s6, s4
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_lo_u32 v1, s1, v0
; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s5
; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s5, v1
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s5, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s5, v1
; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s5, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
; GFX10-NEXT:    v_xor_b32_e32 v1, s6, v1
; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s6, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-NEXT:    global_store_dword v2, v1, s[2:3]
; GFX10-NEXT:    s_endpgm
  %div = sdiv i32 %x, %y
  store i32 %div, ptr addrspace(1) %out0
  %rem = srem i32 %x, %y
  store i32 %rem, ptr addrspace(1) %out1
  ret void
}
144
; 64-bit variant of the combined sdiv/srem test. The expected code takes the
; absolute values of both operands (s_ashr_i32 ..., 31 sign masks applied with
; add/addc and s_xor_b64), seeds the division with v_rcp_iflag_f32, and builds
; the result from 32-bit multiplies (v_mul_lo_u32, v_mul_hi_u32,
; v_mad_u64_u32) and carry chains before restoring the signs with xor/subtract.
; The CHECK lines are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; GFX8-LABEL: sdivrem_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[8:9], 0x0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
; GFX8-NEXT:    s_ashr_i32 s12, s11, 31
; GFX8-NEXT:    s_add_u32 s0, s8, s2
; GFX8-NEXT:    s_addc_u32 s1, s9, s2
; GFX8-NEXT:    s_add_u32 s8, s10, s12
; GFX8-NEXT:    s_mov_b32 s13, s12
; GFX8-NEXT:    s_addc_u32 s9, s11, s12
; GFX8-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
; GFX8-NEXT:    s_mov_b32 s3, s2
; GFX8-NEXT:    s_xor_b64 s[10:11], s[0:1], s[2:3]
; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT:    s_sub_u32 s14, 0, s8
; GFX8-NEXT:    s_subb_u32 s15, 0, s9
; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v0
; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v2
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v0
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
; GFX8-NEXT:    v_mul_lo_u32 v6, v3, v1
; GFX8-NEXT:    v_mul_lo_u32 v7, v4, v1
; GFX8-NEXT:    v_mul_hi_u32 v8, v3, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, v4, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v1, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v0
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    v_mul_lo_u32 v6, v4, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v6, v0
; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v5
; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
; GFX8-NEXT:    v_mul_hi_u32 v1, v4, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
; GFX8-NEXT:    v_mul_lo_u32 v2, s11, v0
; GFX8-NEXT:    v_mul_lo_u32 v3, s10, v1
; GFX8-NEXT:    v_mul_hi_u32 v4, s10, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, s11, v0
; GFX8-NEXT:    v_mul_hi_u32 v5, s11, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    v_mul_lo_u32 v4, s11, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT:    v_mul_hi_u32 v3, s10, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v2
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
; GFX8-NEXT:    v_mov_b32_e32 v6, s11
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s10, v0
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
; GFX8-NEXT:    v_mov_b32_e32 v5, s9
; GFX8-NEXT:    v_subb_u32_e64 v2, s[0:1], v6, v1, vcc
; GFX8-NEXT:    v_sub_u32_e64 v1, s[0:1], s11, v1
; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v0
; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v2
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s8, v0
; GFX8-NEXT:    v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v4
; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s8, v7
; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[0:1]
; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[0:1]
; GFX8-NEXT:    v_cndmask_b32_e64 v5, v0, v5, s[0:1]
; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[0:1]
; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v4
; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v3
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v5
; GFX8-NEXT:    v_xor_b32_e32 v4, s2, v2
; GFX8-NEXT:    v_mov_b32_e32 v5, s2
; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s2, v3
; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
; GFX8-NEXT:    v_mov_b32_e32 v4, s4
; GFX8-NEXT:    v_mov_b32_e32 v5, s5
; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: sdivrem_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s2, s17, 31
; GFX9-NEXT:    s_ashr_i32 s4, s19, 31
; GFX9-NEXT:    s_add_u32 s0, s16, s2
; GFX9-NEXT:    s_addc_u32 s1, s17, s2
; GFX9-NEXT:    s_add_u32 s6, s18, s4
; GFX9-NEXT:    s_mov_b32 s5, s4
; GFX9-NEXT:    s_addc_u32 s7, s19, s4
; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
; GFX9-NEXT:    s_mov_b32 s3, s2
; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    s_sub_u32 s10, 0, s6
; GFX9-NEXT:    s_subb_u32 s11, 0, s7
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
; GFX9-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v0
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v1
; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v1
; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v1
; GFX9-NEXT:    v_mul_hi_u32 v1, v4, v1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
; GFX9-NEXT:    v_mov_b32_e32 v7, s7
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mul_lo_u32 v6, v4, v1
; GFX9-NEXT:    v_add_u32_e32 v2, v5, v2
; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v1
; GFX9-NEXT:    v_mul_hi_u32 v1, v4, v1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v5
; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
; GFX9-NEXT:    v_mul_hi_u32 v6, s9, v1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v1
; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
; GFX9-NEXT:    v_add3_u32 v3, v3, v2, v6
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
; GFX9-NEXT:    v_mov_b32_e32 v6, s9
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s8, v0
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v2
; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v0
; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v2
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[0:1]
; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s6, v0
; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v5
; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v9
; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v8
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v9
; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s6, v8
; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, v6, s[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[0:1]
; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v5
; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v3
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v6
; GFX9-NEXT:    v_xor_b32_e32 v5, s2, v2
; GFX9-NEXT:    v_mov_b32_e32 v6, s2
; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v3
; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[12:13]
; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[14:15]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: sdivrem_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ashr_i32 s2, s17, 31
; GFX10-NEXT:    s_ashr_i32 s4, s19, 31
; GFX10-NEXT:    s_add_u32 s0, s16, s2
; GFX10-NEXT:    s_addc_u32 s1, s17, s2
; GFX10-NEXT:    s_add_u32 s6, s18, s4
; GFX10-NEXT:    s_mov_b32 s5, s4
; GFX10-NEXT:    s_addc_u32 s7, s19, s4
; GFX10-NEXT:    s_mov_b32 s3, s2
; GFX10-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s6
; GFX10-NEXT:    s_sub_u32 s8, 0, s6
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX10-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
; GFX10-NEXT:    v_trunc_f32_e32 v2, v1
; GFX10-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v0
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s9, s8, v3, 0
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s9, s8, v4, v[1:2]
; GFX10-NEXT:    s_subb_u32 s9, 0, s7
; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
; GFX10-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s10, s9, v3, v[1:2]
; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v1
; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v1
; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v5
; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
; GFX10-NEXT:    v_add_co_u32 v6, s10, v7, v6
; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s10
; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
; GFX10-NEXT:    v_add_co_u32 v2, s10, v6, v8
; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v6
; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s10
; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v3, v0
; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s10, s8, v3, 0
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s8, s8, v4, v[1:2]
; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s8, s9, v3, v[1:2]
; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v1
; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v1
; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v1
; GFX10-NEXT:    v_mul_hi_u32 v1, v4, v1
; GFX10-NEXT:    v_add_co_u32 v2, s8, v2, v5
; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s8
; GFX10-NEXT:    v_add_co_u32 v6, s8, v7, v6
; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s8
; GFX10-NEXT:    v_add_co_u32 v0, s8, v2, v0
; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s8
; GFX10-NEXT:    v_add_co_u32 v2, s8, v6, v8
; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s8
; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v6
; GFX10-NEXT:    v_add_co_u32 v0, s8, v2, v0
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v3, v0
; GFX10-NEXT:    v_add3_u32 v1, v5, v2, v1
; GFX10-NEXT:    v_mul_lo_u32 v2, s1, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
; GFX10-NEXT:    v_mul_hi_u32 v4, s0, v0
; GFX10-NEXT:    v_mul_hi_u32 v0, s1, v0
; GFX10-NEXT:    v_mul_lo_u32 v3, s0, v1
; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v1
; GFX10-NEXT:    v_add_co_u32 v2, s8, v2, v3
; GFX10-NEXT:    v_mul_hi_u32 v3, s0, v1
; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s8
; GFX10-NEXT:    v_add_co_u32 v2, s8, v2, v4
; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
; GFX10-NEXT:    v_add_co_u32 v0, s8, v5, v0
; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s8
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v6, v2
; GFX10-NEXT:    v_add_co_u32 v0, s8, v0, v3
; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s8
; GFX10-NEXT:    v_add_co_u32 v5, s8, v0, v2
; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v1
; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s8
; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s8, s6, v5, 0
; GFX10-NEXT:    v_add3_u32 v3, v3, v6, v2
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s8, s6, v3, v[1:2]
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s8, s7, v5, v[1:2]
; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v5, 1
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s1, v1
; GFX10-NEXT:    v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v0
; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, s6
; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v1
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v8
; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v9
; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
; GFX10-NEXT:    v_add_co_u32 v13, s0, v2, 1
; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v9
; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v1
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s0
; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s6
; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v13, vcc_lo
; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v8, v10, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s0
; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s0
; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s0
; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    v_xor_b32_e32 v2, s4, v2
; GFX10-NEXT:    v_xor_b32_e32 v3, s5, v3
; GFX10-NEXT:    v_xor_b32_e32 v5, s2, v0
; GFX10-NEXT:    v_xor_b32_e32 v6, s2, v1
; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, s4
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v3, vcc_lo
; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, s2
; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v6, vcc_lo
; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[12:13]
; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[14:15]
; GFX10-NEXT:    s_endpgm
  %div = sdiv i64 %x, %y
  store i64 %div, ptr addrspace(1) %out0
  %rem = srem i64 %x, %y
  store i64 %rem, ptr addrspace(1) %out1
  ret void
}
615
; Signed division and remainder of the <2 x i32> kernel arguments %x and %y:
; the quotient is stored to %out0 and the remainder to %out1.
;
; The GFX8, GFX9 and GFX10 check blocks below are the autogenerated expected
; llc output for the three llc invocations at the top of the file (tonga,
; gfx900 and gfx1010). Regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the expected code shows, per lane: the operands are made non-negative
; (s_ashr_i32 by 31, s_add_i32, s_xor_b32), a scaled reciprocal of the divisor
; is formed through v_rcp_iflag_f32, the quotient/remainder pair goes through
; two compare-and-select correction rounds, and the signs are reapplied with
; an xor followed by a reversed subtract before the two vector stores.
616define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
617; GFX8-LABEL: sdivrem_v2i32:
618; GFX8:       ; %bb.0:
619; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[8:9], 0x0
620; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX8-NEXT:    s_ashr_i32 s2, s10, 31
622; GFX8-NEXT:    s_add_i32 s0, s10, s2
623; GFX8-NEXT:    s_xor_b32 s3, s0, s2
624; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
625; GFX8-NEXT:    s_ashr_i32 s10, s11, 31
626; GFX8-NEXT:    s_add_i32 s0, s11, s10
627; GFX8-NEXT:    s_xor_b32 s11, s0, s10
628; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
629; GFX8-NEXT:    s_sub_i32 s0, 0, s3
630; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s11
631; GFX8-NEXT:    s_ashr_i32 s12, s8, 31
632; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
633; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
634; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
635; GFX8-NEXT:    s_sub_i32 s1, 0, s11
636; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v0
637; GFX8-NEXT:    s_add_i32 s0, s8, s12
638; GFX8-NEXT:    s_xor_b32 s0, s0, s12
639; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
640; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
641; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
642; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
643; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
644; GFX8-NEXT:    v_mul_lo_u32 v2, s1, v1
645; GFX8-NEXT:    v_mul_lo_u32 v3, v0, s3
646; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
647; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
648; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s0, v3
649; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
650; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
651; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
652; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
653; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
654; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
655; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
656; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
657; GFX8-NEXT:    s_xor_b32 s0, s12, s2
658; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
659; GFX8-NEXT:    s_add_i32 s1, s9, s2
660; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
661; GFX8-NEXT:    s_xor_b32 s1, s1, s2
662; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
663; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
664; GFX8-NEXT:    v_xor_b32_e32 v2, s12, v3
665; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
666; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
667; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s11
668; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s12, v2
669; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
670; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
671; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
672; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
673; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
674; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
675; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
676; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
677; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
678; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
679; GFX8-NEXT:    s_xor_b32 s0, s2, s10
680; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
681; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
682; GFX8-NEXT:    v_mov_b32_e32 v4, s4
683; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
684; GFX8-NEXT:    v_mov_b32_e32 v5, s5
685; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
686; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
687; GFX8-NEXT:    v_mov_b32_e32 v0, s6
688; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
689; GFX8-NEXT:    v_mov_b32_e32 v1, s7
690; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
691; GFX8-NEXT:    s_endpgm
692;
693; GFX9-LABEL: sdivrem_v2i32:
694; GFX9:       ; %bb.0:
695; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
696; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
697; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
698; GFX9-NEXT:    s_add_i32 s6, s6, s8
699; GFX9-NEXT:    s_xor_b32 s6, s6, s8
700; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
701; GFX9-NEXT:    s_ashr_i32 s9, s7, 31
702; GFX9-NEXT:    s_add_i32 s7, s7, s9
703; GFX9-NEXT:    s_xor_b32 s7, s7, s9
704; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
705; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
706; GFX9-NEXT:    s_sub_i32 s12, 0, s6
707; GFX9-NEXT:    s_ashr_i32 s10, s4, 31
708; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
709; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
710; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
711; GFX9-NEXT:    s_add_i32 s4, s4, s10
712; GFX9-NEXT:    s_xor_b32 s4, s4, s10
713; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
714; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
715; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
716; GFX9-NEXT:    s_sub_i32 s12, 0, s7
717; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
718; GFX9-NEXT:    s_ashr_i32 s11, s5, 31
719; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
720; GFX9-NEXT:    s_add_i32 s5, s5, s11
721; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
722; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
723; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
724; GFX9-NEXT:    s_xor_b32 s5, s5, s11
725; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
726; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
727; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
728; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
729; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
730; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
731; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
732; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v3
733; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
734; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
735; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
736; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
737; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v2
738; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
739; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s7
740; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
741; GFX9-NEXT:    s_xor_b32 s4, s10, s8
742; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
743; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
744; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
745; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
746; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
747; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
748; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
749; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
750; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
751; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
752; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
753; GFX9-NEXT:    s_xor_b32 s4, s11, s9
754; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
755; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
756; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
757; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
758; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
759; GFX9-NEXT:    v_mov_b32_e32 v4, 0
760; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
761; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
762; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
763; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
764; GFX9-NEXT:    s_endpgm
765;
766; GFX10-LABEL: sdivrem_v2i32:
767; GFX10:       ; %bb.0:
768; GFX10-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x0
769; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
770; GFX10-NEXT:    s_ashr_i32 s1, s18, 31
771; GFX10-NEXT:    s_ashr_i32 s2, s19, 31
772; GFX10-NEXT:    s_add_i32 s0, s18, s1
773; GFX10-NEXT:    s_add_i32 s3, s19, s2
774; GFX10-NEXT:    s_xor_b32 s4, s0, s1
775; GFX10-NEXT:    s_xor_b32 s3, s3, s2
776; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
777; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
778; GFX10-NEXT:    s_sub_i32 s0, 0, s4
779; GFX10-NEXT:    s_sub_i32 s5, 0, s3
780; GFX10-NEXT:    s_ashr_i32 s6, s17, 31
781; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
782; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
783; GFX10-NEXT:    s_add_i32 s7, s17, s6
784; GFX10-NEXT:    s_xor_b32 s7, s7, s6
785; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
786; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
787; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
788; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
789; GFX10-NEXT:    v_mul_lo_u32 v2, s0, v0
790; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v1
791; GFX10-NEXT:    s_ashr_i32 s5, s16, 31
792; GFX10-NEXT:    s_add_i32 s0, s16, s5
793; GFX10-NEXT:    s_xor_b32 s1, s5, s1
794; GFX10-NEXT:    s_xor_b32 s0, s0, s5
795; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
796; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
797; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
798; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
799; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
800; GFX10-NEXT:    v_mul_hi_u32 v1, s7, v1
801; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s4
802; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
803; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
804; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
805; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
806; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s7, v3
807; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
808; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
809; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
810; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
811; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
812; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
813; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
814; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
815; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
816; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
817; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s4, v2
818; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
819; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s4, v2
820; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
821; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
822; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
823; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
824; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
825; GFX10-NEXT:    s_xor_b32 s0, s6, s2
826; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
827; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
828; GFX10-NEXT:    v_xor_b32_e32 v2, s5, v2
829; GFX10-NEXT:    v_xor_b32_e32 v3, s6, v3
830; GFX10-NEXT:    v_mov_b32_e32 v4, 0
831; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
832; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
833; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s5, v2
834; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s6, v3
835; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[12:13]
836; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[14:15]
837; GFX10-NEXT:    s_endpgm
  ; sdiv and srem take the same operands; the expected output above contains
  ; one v_rcp_iflag_f32 per vector lane, and both stored results are derived
  ; from it.
838  %div = sdiv <2 x i32> %x, %y
839  store <2 x i32> %div, ptr addrspace(1) %out0
840  %rem = srem <2 x i32> %x, %y
841  store <2 x i32> %rem, ptr addrspace(1) %out1
842  ret void
843}
844
; Signed division and remainder of the <4 x i32> kernel arguments %x and %y:
; the quotient is stored to %out0 and the remainder to %out1.
;
; The GFX8, GFX9 and GFX10 check blocks below are the autogenerated expected
; llc output for the three llc invocations at the top of the file (tonga,
; gfx900 and gfx1010). Regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
;
; What the expected code shows, per lane: the operands are made non-negative
; (s_ashr_i32 by 31, s_add_i32, s_xor_b32), a scaled reciprocal of the divisor
; is formed through v_rcp_iflag_f32, the quotient/remainder pair goes through
; two compare-and-select correction rounds, and the signs are reapplied with
; an xor followed by a reversed subtract. The vector operands are read with
; an s_load_dwordx8 at offset 0x10 and the two output pointers with a separate
; s_load_dwordx4 at offset 0x0.
845define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
846; GFX8-LABEL: sdivrem_v4i32:
847; GFX8:       ; %bb.0:
848; GFX8-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x10
849; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
850; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
851; GFX8-NEXT:    s_ashr_i32 s2, s16, 31
852; GFX8-NEXT:    s_add_i32 s0, s16, s2
853; GFX8-NEXT:    s_xor_b32 s3, s0, s2
854; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
855; GFX8-NEXT:    s_sub_i32 s1, 0, s3
856; GFX8-NEXT:    s_ashr_i32 s9, s17, 31
857; GFX8-NEXT:    s_add_i32 s0, s17, s9
858; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
859; GFX8-NEXT:    s_xor_b32 s10, s0, s9
860; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s10
861; GFX8-NEXT:    s_ashr_i32 s8, s12, 31
862; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
863; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
864; GFX8-NEXT:    s_add_i32 s0, s12, s8
865; GFX8-NEXT:    s_xor_b32 s0, s0, s8
866; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
867; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
868; GFX8-NEXT:    s_sub_i32 s11, 0, s10
869; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
870; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
871; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
872; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
873; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
874; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s3
875; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
876; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
877; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
878; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
879; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
880; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
881; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
882; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
883; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
884; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
885; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
886; GFX8-NEXT:    v_mul_lo_u32 v3, s11, v1
887; GFX8-NEXT:    s_xor_b32 s0, s8, s2
888; GFX8-NEXT:    s_ashr_i32 s2, s13, 31
889; GFX8-NEXT:    s_add_i32 s1, s13, s2
890; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
891; GFX8-NEXT:    s_xor_b32 s1, s1, s2
892; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
893; GFX8-NEXT:    v_xor_b32_e32 v2, s8, v2
894; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
895; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
896; GFX8-NEXT:    s_ashr_i32 s3, s18, 31
897; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
898; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s10
899; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s8, v2
900; GFX8-NEXT:    s_add_i32 s0, s18, s3
901; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v3
902; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v1
903; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
904; GFX8-NEXT:    s_xor_b32 s8, s0, s3
905; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
906; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, s8
907; GFX8-NEXT:    v_subrev_u32_e64 v5, s[0:1], s10, v2
908; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
909; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
910; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 1, v1
911; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
912; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
913; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
914; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
915; GFX8-NEXT:    v_subrev_u32_e64 v5, s[0:1], s10, v2
916; GFX8-NEXT:    s_sub_i32 s0, 0, s8
917; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
918; GFX8-NEXT:    v_mul_lo_u32 v5, s0, v3
919; GFX8-NEXT:    s_xor_b32 s0, s2, s9
920; GFX8-NEXT:    s_ashr_i32 s9, s14, 31
921; GFX8-NEXT:    s_add_i32 s1, s14, s9
922; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v5
923; GFX8-NEXT:    s_xor_b32 s1, s1, s9
924; GFX8-NEXT:    v_xor_b32_e32 v2, s2, v2
925; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
926; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
927; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
928; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s2, v2
929; GFX8-NEXT:    s_ashr_i32 s2, s19, 31
930; GFX8-NEXT:    v_mul_lo_u32 v6, v3, s8
931; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
932; GFX8-NEXT:    s_add_i32 s0, s19, s2
933; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v6
934; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v3
935; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
936; GFX8-NEXT:    s_xor_b32 s10, s0, s2
937; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
938; GFX8-NEXT:    v_cvt_f32_u32_e32 v6, s10
939; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s8, v2
940; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
941; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
942; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v3
943; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
944; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
945; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
946; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
947; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s8, v2
948; GFX8-NEXT:    s_sub_i32 s0, 0, s10
949; GFX8-NEXT:    v_cndmask_b32_e32 v7, v2, v7, vcc
950; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v6
951; GFX8-NEXT:    s_xor_b32 s0, s9, s3
952; GFX8-NEXT:    s_ashr_i32 s3, s15, 31
953; GFX8-NEXT:    s_add_i32 s1, s15, s3
954; GFX8-NEXT:    v_mul_hi_u32 v2, v6, v2
955; GFX8-NEXT:    s_xor_b32 s1, s1, s3
956; GFX8-NEXT:    v_xor_b32_e32 v3, s0, v3
957; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
958; GFX8-NEXT:    v_mul_hi_u32 v8, s1, v2
959; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v3
960; GFX8-NEXT:    v_xor_b32_e32 v3, s9, v7
961; GFX8-NEXT:    v_mul_lo_u32 v7, v8, s10
962; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s9, v3
963; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v7
964; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v8
965; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
966; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
967; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s10, v3
968; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
969; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v7
970; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
971; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
972; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s10, v3
973; GFX8-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
974; GFX8-NEXT:    s_xor_b32 s0, s3, s2
975; GFX8-NEXT:    v_xor_b32_e32 v3, s0, v7
976; GFX8-NEXT:    v_xor_b32_e32 v7, s3, v8
977; GFX8-NEXT:    v_mov_b32_e32 v9, s5
978; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s0, v3
979; GFX8-NEXT:    v_mov_b32_e32 v8, s4
980; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
981; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s3, v7
982; GFX8-NEXT:    v_mov_b32_e32 v0, s6
983; GFX8-NEXT:    v_mov_b32_e32 v1, s7
984; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
985; GFX8-NEXT:    s_endpgm
986;
987; GFX9-LABEL: sdivrem_v4i32:
988; GFX9:       ; %bb.0:
989; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
990; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
991; GFX9-NEXT:    s_ashr_i32 s12, s4, 31
992; GFX9-NEXT:    s_add_i32 s4, s4, s12
993; GFX9-NEXT:    s_xor_b32 s4, s4, s12
994; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
995; GFX9-NEXT:    s_ashr_i32 s13, s5, 31
996; GFX9-NEXT:    s_add_i32 s5, s5, s13
997; GFX9-NEXT:    s_xor_b32 s5, s5, s13
998; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
999; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
1000; GFX9-NEXT:    s_sub_i32 s15, 0, s4
1001; GFX9-NEXT:    s_ashr_i32 s14, s0, 31
1002; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1003; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1004; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1005; GFX9-NEXT:    s_add_i32 s0, s0, s14
1006; GFX9-NEXT:    s_xor_b32 s0, s0, s14
1007; GFX9-NEXT:    v_mul_lo_u32 v2, s15, v0
1008; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1009; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1010; GFX9-NEXT:    s_sub_i32 s15, 0, s5
1011; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1012; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
1013; GFX9-NEXT:    v_mul_lo_u32 v3, s15, v1
1014; GFX9-NEXT:    s_ashr_i32 s15, s1, 31
1015; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1016; GFX9-NEXT:    v_mul_hi_u32 v0, s0, v0
1017; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
1018; GFX9-NEXT:    s_add_i32 s1, s1, s15
1019; GFX9-NEXT:    s_xor_b32 s1, s1, s15
1020; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
1021; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
1022; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
1023; GFX9-NEXT:    v_mul_hi_u32 v1, s1, v1
1024; GFX9-NEXT:    v_sub_u32_e32 v3, s0, v3
1025; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
1026; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1027; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v3
1028; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
1029; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
1030; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
1031; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1032; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v2
1033; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
1034; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s5
1035; GFX9-NEXT:    s_xor_b32 s0, s14, s12
1036; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
1037; GFX9-NEXT:    v_subrev_u32_e32 v0, s0, v0
1038; GFX9-NEXT:    v_xor_b32_e32 v2, s14, v2
1039; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
1040; GFX9-NEXT:    v_subrev_u32_e32 v4, s14, v2
1041; GFX9-NEXT:    v_sub_u32_e32 v2, s1, v3
1042; GFX9-NEXT:    s_add_i32 s1, s6, s0
1043; GFX9-NEXT:    s_xor_b32 s1, s1, s0
1044; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s1
1045; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1046; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
1047; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1048; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1049; GFX9-NEXT:    v_subrev_u32_e32 v5, s5, v2
1050; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1051; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1052; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
1053; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1054; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
1055; GFX9-NEXT:    s_sub_i32 s4, 0, s1
1056; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1057; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v3
1058; GFX9-NEXT:    s_xor_b32 s4, s15, s13
1059; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
1060; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
1061; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
1062; GFX9-NEXT:    s_add_i32 s6, s7, s4
1063; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
1064; GFX9-NEXT:    s_xor_b32 s6, s6, s4
1065; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s6
1066; GFX9-NEXT:    v_subrev_u32_e32 v6, s5, v2
1067; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
1068; GFX9-NEXT:    s_add_i32 s2, s2, s5
1069; GFX9-NEXT:    s_xor_b32 s2, s2, s5
1070; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
1071; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v3
1072; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v7
1073; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1074; GFX9-NEXT:    v_xor_b32_e32 v2, s15, v2
1075; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s1
1076; GFX9-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
1077; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
1078; GFX9-NEXT:    v_subrev_u32_e32 v5, s15, v2
1079; GFX9-NEXT:    v_sub_u32_e32 v2, s2, v6
1080; GFX9-NEXT:    s_sub_i32 s2, 0, s6
1081; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v7
1082; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
1083; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v2
1084; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1085; GFX9-NEXT:    v_subrev_u32_e32 v6, s1, v2
1086; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1087; GFX9-NEXT:    v_mul_hi_u32 v8, v7, v8
1088; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
1089; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v2
1090; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1091; GFX9-NEXT:    v_subrev_u32_e32 v6, s1, v2
1092; GFX9-NEXT:    s_ashr_i32 s1, s3, 31
1093; GFX9-NEXT:    s_add_i32 s2, s3, s1
1094; GFX9-NEXT:    s_xor_b32 s2, s2, s1
1095; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
1096; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v7
1097; GFX9-NEXT:    s_xor_b32 s0, s5, s0
1098; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v6, vcc
1099; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v3
1100; GFX9-NEXT:    v_mul_lo_u32 v3, v7, s6
1101; GFX9-NEXT:    v_add_u32_e32 v8, 1, v7
1102; GFX9-NEXT:    v_subrev_u32_e32 v2, s0, v2
1103; GFX9-NEXT:    s_xor_b32 s0, s1, s4
1104; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v3
1105; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
1106; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
1107; GFX9-NEXT:    v_subrev_u32_e32 v8, s6, v3
1108; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
1109; GFX9-NEXT:    v_add_u32_e32 v8, 1, v7
1110; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
1111; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
1112; GFX9-NEXT:    v_subrev_u32_e32 v8, s6, v3
1113; GFX9-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
1114; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v7
1115; GFX9-NEXT:    v_xor_b32_e32 v6, s5, v6
1116; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v3
1117; GFX9-NEXT:    v_xor_b32_e32 v7, s1, v8
1118; GFX9-NEXT:    v_mov_b32_e32 v8, 0
1119; GFX9-NEXT:    v_subrev_u32_e32 v6, s5, v6
1120; GFX9-NEXT:    v_subrev_u32_e32 v7, s1, v7
1121; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1122; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[8:9]
1123; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[10:11]
1124; GFX9-NEXT:    s_endpgm
1125;
1126; GFX10-LABEL: sdivrem_v4i32:
1127; GFX10:       ; %bb.0:
1128; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
1129; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1130; GFX10-NEXT:    s_ashr_i32 s10, s4, 31
1131; GFX10-NEXT:    s_ashr_i32 s11, s5, 31
1132; GFX10-NEXT:    s_ashr_i32 s12, s6, 31
1133; GFX10-NEXT:    s_ashr_i32 s13, s7, 31
1134; GFX10-NEXT:    s_add_i32 s4, s4, s10
1135; GFX10-NEXT:    s_add_i32 s5, s5, s11
1136; GFX10-NEXT:    s_add_i32 s6, s6, s12
1137; GFX10-NEXT:    s_add_i32 s7, s7, s13
1138; GFX10-NEXT:    s_xor_b32 s14, s4, s10
1139; GFX10-NEXT:    s_xor_b32 s15, s5, s11
1140; GFX10-NEXT:    s_xor_b32 s16, s6, s12
1141; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s14
1142; GFX10-NEXT:    s_xor_b32 s17, s7, s13
1143; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
1144; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s16
1145; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s17
1146; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1147; GFX10-NEXT:    s_sub_i32 s4, 0, s14
1148; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1149; GFX10-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1150; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1151; GFX10-NEXT:    s_sub_i32 s5, 0, s15
1152; GFX10-NEXT:    s_sub_i32 s6, 0, s16
1153; GFX10-NEXT:    s_ashr_i32 s18, s0, 31
1154; GFX10-NEXT:    s_ashr_i32 s19, s1, 31
1155; GFX10-NEXT:    s_ashr_i32 s20, s2, 31
1156; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1157; GFX10-NEXT:    s_ashr_i32 s21, s3, 31
1158; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1159; GFX10-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1160; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
1161; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
1162; GFX10-NEXT:    s_add_i32 s0, s0, s18
1163; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
1164; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
1165; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
1166; GFX10-NEXT:    v_mul_lo_u32 v4, s4, v0
1167; GFX10-NEXT:    s_sub_i32 s4, 0, s17
1168; GFX10-NEXT:    v_mul_lo_u32 v5, s5, v1
1169; GFX10-NEXT:    v_mul_lo_u32 v6, s6, v2
1170; GFX10-NEXT:    v_mul_lo_u32 v7, s4, v3
1171; GFX10-NEXT:    s_add_i32 s1, s1, s19
1172; GFX10-NEXT:    s_add_i32 s2, s2, s20
1173; GFX10-NEXT:    s_add_i32 s3, s3, s21
1174; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v4
1175; GFX10-NEXT:    s_xor_b32 s0, s0, s18
1176; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v5
1177; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v6
1178; GFX10-NEXT:    v_mul_hi_u32 v7, v3, v7
1179; GFX10-NEXT:    s_xor_b32 s1, s1, s19
1180; GFX10-NEXT:    s_xor_b32 s2, s2, s20
1181; GFX10-NEXT:    s_xor_b32 s3, s3, s21
1182; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v4
1183; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
1184; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v5
1185; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v6
1186; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v7
1187; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
1188; GFX10-NEXT:    s_xor_b32 s8, s18, s10
1189; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
1190; GFX10-NEXT:    v_mul_hi_u32 v2, s2, v2
1191; GFX10-NEXT:    v_mul_hi_u32 v3, s3, v3
1192; GFX10-NEXT:    s_xor_b32 s9, s19, s11
1193; GFX10-NEXT:    s_xor_b32 s10, s20, s12
1194; GFX10-NEXT:    v_mul_lo_u32 v4, v0, s14
1195; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
1196; GFX10-NEXT:    v_mul_lo_u32 v5, v1, s15
1197; GFX10-NEXT:    v_mul_lo_u32 v6, v2, s16
1198; GFX10-NEXT:    v_mul_lo_u32 v7, v3, s17
1199; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v1
1200; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v2
1201; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
1202; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s0, v4
1203; GFX10-NEXT:    v_sub_nc_u32_e32 v5, s1, v5
1204; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s2, v6
1205; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s3, v7
1206; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v4
1207; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v5
1208; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s16, v6
1209; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s17, v7
1210; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
1211; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s14, v4
1212; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
1213; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s15, v5
1214; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
1215; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s16, v6
1216; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s2
1217; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s17, v7
1218; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
1219; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
1220; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s1
1221; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
1222; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s2
1223; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v1
1224; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v2
1225; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
1226; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v4
1227; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v5
1228; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s16, v6
1229; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s17, v7
1230; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
1231; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s14, v4
1232; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
1233; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s15, v5
1234; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
1235; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s16, v6
1236; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s2
1237; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s17, v7
1238; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
1239; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
1240; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s1
1241; GFX10-NEXT:    s_xor_b32 s0, s21, s13
1242; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s2
1243; GFX10-NEXT:    v_xor_b32_e32 v0, s8, v0
1244; GFX10-NEXT:    v_xor_b32_e32 v1, s9, v1
1245; GFX10-NEXT:    v_xor_b32_e32 v2, s10, v2
1246; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v3
1247; GFX10-NEXT:    v_xor_b32_e32 v4, s18, v4
1248; GFX10-NEXT:    v_xor_b32_e32 v5, s19, v5
1249; GFX10-NEXT:    v_xor_b32_e32 v6, s20, v6
1250; GFX10-NEXT:    v_xor_b32_e32 v7, s21, v7
1251; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s8, v0
1252; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s9, v1
1253; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s10, v2
1254; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s0, v3
1255; GFX10-NEXT:    v_mov_b32_e32 v8, 0
1256; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, s18, v4
1257; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s19, v5
1258; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s20, v6
1259; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s21, v7
1260; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
1262; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
1263; GFX10-NEXT:    s_endpgm
  ; sdiv and srem take the same operands; the expected output above contains
  ; one v_rcp_iflag_f32 per vector lane (four per target), and both stored
  ; results are derived from it.
1264  %div = sdiv <4 x i32> %x, %y
1265  store <4 x i32> %div, ptr addrspace(1) %out0
1266  %rem = srem <4 x i32> %x, %y
1267  store <4 x i32> %rem, ptr addrspace(1) %out1
1268  ret void
1269}
1270
1271define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
1272; GFX8-LABEL: sdivrem_v2i64:
1273; GFX8:       ; %bb.0:
1274; GFX8-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x0
1275; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
1276; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1277; GFX8-NEXT:    s_ashr_i32 s4, s17, 31
1278; GFX8-NEXT:    s_ashr_i32 s6, s1, 31
1279; GFX8-NEXT:    s_add_u32 s10, s16, s4
1280; GFX8-NEXT:    s_addc_u32 s11, s17, s4
1281; GFX8-NEXT:    s_add_u32 s0, s0, s6
1282; GFX8-NEXT:    s_mov_b32 s7, s6
1283; GFX8-NEXT:    s_addc_u32 s1, s1, s6
1284; GFX8-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
1285; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
1286; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
1287; GFX8-NEXT:    s_mov_b32 s5, s4
1288; GFX8-NEXT:    s_xor_b64 s[10:11], s[10:11], s[4:5]
1289; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
1290; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
1291; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1292; GFX8-NEXT:    s_sub_u32 s16, 0, s8
1293; GFX8-NEXT:    s_subb_u32 s17, 0, s9
1294; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1295; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1296; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
1297; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
1298; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
1299; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v0
1300; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v2
1301; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
1302; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
1303; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v0
1304; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
1305; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
1306; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
1307; GFX8-NEXT:    v_mul_lo_u32 v6, v3, v1
1308; GFX8-NEXT:    v_mul_lo_u32 v7, v4, v1
1309; GFX8-NEXT:    v_mul_hi_u32 v8, v3, v1
1310; GFX8-NEXT:    v_mul_hi_u32 v1, v4, v1
1311; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
1312; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1313; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
1314; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1315; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
1316; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1317; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
1318; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
1319; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1320; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
1321; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1322; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1323; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
1324; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
1325; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v0
1326; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v1, vcc
1327; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
1328; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
1329; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v0
1330; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
1331; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
1332; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
1333; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v1
1334; GFX8-NEXT:    s_xor_b64 s[16:17], s[4:5], s[6:7]
1335; GFX8-NEXT:    s_ashr_i32 s6, s19, 31
1336; GFX8-NEXT:    s_mov_b32 s7, s6
1337; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
1338; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1339; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
1340; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1341; GFX8-NEXT:    v_mul_lo_u32 v6, v4, v1
1342; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
1343; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v1
1344; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v6, v0
1345; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1346; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v5
1347; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1348; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
1349; GFX8-NEXT:    v_mul_hi_u32 v1, v4, v1
1350; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1351; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1352; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
1353; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
1354; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
1355; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
1356; GFX8-NEXT:    v_mul_lo_u32 v2, s11, v0
1357; GFX8-NEXT:    v_mul_lo_u32 v3, s10, v1
1358; GFX8-NEXT:    v_mul_hi_u32 v4, s10, v0
1359; GFX8-NEXT:    v_mul_hi_u32 v0, s11, v0
1360; GFX8-NEXT:    v_mul_hi_u32 v5, s11, v1
1361; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
1362; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1363; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
1364; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1365; GFX8-NEXT:    v_mul_lo_u32 v4, s11, v1
1366; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1367; GFX8-NEXT:    v_mul_hi_u32 v3, s10, v1
1368; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
1369; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
1370; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
1371; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1372; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
1373; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
1374; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
1375; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1376; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1377; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v2
1378; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
1379; GFX8-NEXT:    v_mov_b32_e32 v6, s11
1380; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s10, v0
1381; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
1382; GFX8-NEXT:    v_mov_b32_e32 v5, s9
1383; GFX8-NEXT:    s_ashr_i32 s10, s3, 31
1384; GFX8-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
1385; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], s11, v1
1386; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
1387; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
1388; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
1389; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
1390; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
1391; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
1392; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s8, v7
1393; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[0:1]
1394; GFX8-NEXT:    v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc
1395; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], 1, v4
1396; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
1397; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v9
1398; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
1399; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
1400; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
1401; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v9
1402; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
1403; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v1
1404; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
1405; GFX8-NEXT:    s_add_u32 s0, s18, s6
1406; GFX8-NEXT:    s_addc_u32 s1, s19, s6
1407; GFX8-NEXT:    s_add_u32 s2, s2, s10
1408; GFX8-NEXT:    s_mov_b32 s11, s10
1409; GFX8-NEXT:    s_addc_u32 s3, s3, s10
1410; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[10:11]
1411; GFX8-NEXT:    v_cvt_f32_u32_e32 v14, s3
1412; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
1413; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
1414; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, s8, v8
1415; GFX8-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v0, vcc
1416; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v14
1417; GFX8-NEXT:    v_add_f32_e32 v0, v0, v5
1418; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1419; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
1420; GFX8-NEXT:    v_cndmask_b32_e32 v5, v1, v12, vcc
1421; GFX8-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
1422; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1423; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1424; GFX8-NEXT:    v_trunc_f32_e32 v11, v1
1425; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v11
1426; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
1427; GFX8-NEXT:    v_cvt_u32_f32_e32 v12, v0
1428; GFX8-NEXT:    s_sub_u32 s5, 0, s2
1429; GFX8-NEXT:    s_subb_u32 s20, 0, s3
1430; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
1431; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0
1432; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
1433; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1434; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v11
1435; GFX8-NEXT:    v_cndmask_b32_e64 v10, v3, v10, s[0:1]
1436; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v15, vcc
1437; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[0:1]
1438; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2]
1439; GFX8-NEXT:    v_mul_lo_u32 v3, v5, v0
1440; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2]
1441; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v16, vcc
1442; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
1443; GFX8-NEXT:    v_mul_lo_u32 v8, v12, v1
1444; GFX8-NEXT:    v_mul_hi_u32 v2, v12, v0
1445; GFX8-NEXT:    v_mul_hi_u32 v0, v5, v0
1446; GFX8-NEXT:    v_xor_b32_e32 v9, s17, v10
1447; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v8
1448; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1449; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1450; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1451; GFX8-NEXT:    v_mul_lo_u32 v3, v5, v1
1452; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
1453; GFX8-NEXT:    v_mul_hi_u32 v8, v12, v1
1454; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
1455; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1456; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
1457; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1458; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v8
1459; GFX8-NEXT:    v_mul_hi_u32 v1, v5, v1
1460; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1461; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1462; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1463; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
1464; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v12, v0
1465; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0
1466; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
1467; GFX8-NEXT:    v_xor_b32_e32 v1, s16, v4
1468; GFX8-NEXT:    v_mov_b32_e32 v0, v3
1469; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1]
1470; GFX8-NEXT:    v_mov_b32_e32 v10, s17
1471; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s16, v1
1472; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
1473; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v9, v10, vcc
1474; GFX8-NEXT:    v_xor_b32_e32 v4, s4, v7
1475; GFX8-NEXT:    v_mul_lo_u32 v7, v5, v2
1476; GFX8-NEXT:    v_mul_lo_u32 v9, v8, v3
1477; GFX8-NEXT:    v_mul_hi_u32 v11, v8, v2
1478; GFX8-NEXT:    v_mul_hi_u32 v2, v5, v2
1479; GFX8-NEXT:    v_xor_b32_e32 v6, s4, v6
1480; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
1481; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
1482; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v11
1483; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1484; GFX8-NEXT:    v_mul_lo_u32 v11, v5, v3
1485; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
1486; GFX8-NEXT:    v_mul_hi_u32 v9, v8, v3
1487; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v11, v2
1488; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
1489; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v9
1490; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
1491; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v11, v9
1492; GFX8-NEXT:    v_mul_hi_u32 v3, v5, v3
1493; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
1494; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1495; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
1496; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
1497; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
1498; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
1499; GFX8-NEXT:    v_mov_b32_e32 v10, s4
1500; GFX8-NEXT:    v_mul_lo_u32 v7, s9, v2
1501; GFX8-NEXT:    v_mul_lo_u32 v8, s8, v3
1502; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
1503; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v6, v10, vcc
1504; GFX8-NEXT:    v_mul_hi_u32 v6, s8, v2
1505; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
1506; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1507; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
1508; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1509; GFX8-NEXT:    v_mul_lo_u32 v7, s9, v3
1510; GFX8-NEXT:    v_mul_hi_u32 v2, s9, v2
1511; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
1512; GFX8-NEXT:    v_mul_hi_u32 v8, s8, v3
1513; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
1514; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1515; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
1516; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1517; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
1518; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v2, v6
1519; GFX8-NEXT:    v_mul_hi_u32 v9, s9, v3
1520; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0
1521; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1522; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
1523; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
1524; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4]
1525; GFX8-NEXT:    v_mov_b32_e32 v10, s9
1526; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s8, v2
1527; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7]
1528; GFX8-NEXT:    v_mov_b32_e32 v3, s3
1529; GFX8-NEXT:    v_subb_u32_e64 v7, s[0:1], v10, v6, vcc
1530; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s9, v6
1531; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v7
1532; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
1533; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
1534; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
1535; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v7
1536; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v3, vcc
1537; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[0:1]
1538; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s2, v2
1539; GFX8-NEXT:    v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
1540; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v12
1541; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
1542; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v11
1543; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
1544; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v12
1545; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
1546; GFX8-NEXT:    v_add_u32_e64 v14, s[0:1], 1, v8
1547; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v6, v3, vcc
1548; GFX8-NEXT:    v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1]
1549; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v14
1550; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
1551; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
1552; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s2, v11
1553; GFX8-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
1554; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
1555; GFX8-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
1556; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
1557; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
1558; GFX8-NEXT:    v_cndmask_b32_e64 v8, v9, v14, s[0:1]
1559; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v13, vcc
1560; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
1561; GFX8-NEXT:    v_cndmask_b32_e64 v9, v2, v9, s[0:1]
1562; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[0:1]
1563; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[10:11]
1564; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v6
1565; GFX8-NEXT:    v_xor_b32_e32 v3, s1, v8
1566; GFX8-NEXT:    v_mov_b32_e32 v6, s1
1567; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v2
1568; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
1569; GFX8-NEXT:    v_xor_b32_e32 v6, s6, v9
1570; GFX8-NEXT:    v_xor_b32_e32 v7, s6, v7
1571; GFX8-NEXT:    v_mov_b32_e32 v8, s6
1572; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s6, v6
1573; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v8, vcc
1574; GFX8-NEXT:    v_mov_b32_e32 v8, s12
1575; GFX8-NEXT:    v_mov_b32_e32 v9, s13
1576; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1577; GFX8-NEXT:    s_nop 0
1578; GFX8-NEXT:    v_mov_b32_e32 v0, s14
1579; GFX8-NEXT:    v_mov_b32_e32 v1, s15
1580; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
1581; GFX8-NEXT:    s_endpgm
1582;
1583; GFX9-LABEL: sdivrem_v2i64:
1584; GFX9:       ; %bb.0:
1585; GFX9-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x0
1586; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
1587; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1588; GFX9-NEXT:    s_ashr_i32 s4, s17, 31
1589; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
1590; GFX9-NEXT:    s_add_u32 s10, s16, s4
1591; GFX9-NEXT:    s_addc_u32 s11, s17, s4
1592; GFX9-NEXT:    s_add_u32 s0, s0, s6
1593; GFX9-NEXT:    s_mov_b32 s7, s6
1594; GFX9-NEXT:    s_addc_u32 s1, s1, s6
1595; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
1596; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
1597; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
1598; GFX9-NEXT:    s_mov_b32 s5, s4
1599; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[4:5]
1600; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
1601; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
1602; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1603; GFX9-NEXT:    s_sub_u32 s16, 0, s8
1604; GFX9-NEXT:    s_subb_u32 s17, 0, s9
1605; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1606; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1607; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
1608; GFX9-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
1609; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
1610; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
1611; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
1612; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
1613; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
1614; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v0
1615; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
1616; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
1617; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
1618; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v1
1619; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v1
1620; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v1
1621; GFX9-NEXT:    v_mul_hi_u32 v1, v4, v1
1622; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
1623; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1624; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
1625; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1626; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
1627; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1628; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
1629; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
1630; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1631; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1632; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
1633; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1634; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
1635; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
1636; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
1637; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
1638; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
1639; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
1640; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
1641; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
1642; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
1643; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v1
1644; GFX9-NEXT:    s_xor_b64 s[16:17], s[4:5], s[6:7]
1645; GFX9-NEXT:    s_ashr_i32 s6, s19, 31
1646; GFX9-NEXT:    s_mov_b32 s7, s6
1647; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
1648; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1649; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
1650; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1651; GFX9-NEXT:    v_mul_lo_u32 v6, v4, v1
1652; GFX9-NEXT:    v_add_u32_e32 v2, v5, v2
1653; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v1
1654; GFX9-NEXT:    v_mul_hi_u32 v1, v4, v1
1655; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
1656; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1657; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v5
1658; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1659; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1660; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
1661; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1662; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
1663; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
1664; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
1665; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v0
1666; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
1667; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
1668; GFX9-NEXT:    v_mul_hi_u32 v0, s11, v0
1669; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v1
1670; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
1671; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1672; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1673; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1674; GFX9-NEXT:    v_mul_lo_u32 v4, s11, v1
1675; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
1676; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v1
1677; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
1678; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
1679; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
1680; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1681; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v2
1682; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
1683; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1684; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
1685; GFX9-NEXT:    v_add3_u32 v3, v3, v2, v6
1686; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
1687; GFX9-NEXT:    v_mov_b32_e32 v6, s11
1688; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s10, v0
1689; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
1690; GFX9-NEXT:    v_mov_b32_e32 v4, s9
1691; GFX9-NEXT:    s_ashr_i32 s10, s3, 31
1692; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
1693; GFX9-NEXT:    v_sub_u32_e32 v0, s11, v1
1694; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
1695; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
1696; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
1697; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
1698; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
1699; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
1700; GFX9-NEXT:    v_subrev_co_u32_e32 v9, vcc, s8, v7
1701; GFX9-NEXT:    v_cndmask_b32_e64 v8, v1, v2, s[0:1]
1702; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc
1703; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], 1, v5
1704; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
1705; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v10
1706; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
1707; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v9
1708; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
1709; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v10
1710; GFX9-NEXT:    v_cndmask_b32_e64 v12, v1, v12, s[0:1]
1711; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v2
1712; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
1713; GFX9-NEXT:    s_add_u32 s0, s18, s6
1714; GFX9-NEXT:    s_addc_u32 s1, s19, s6
1715; GFX9-NEXT:    s_add_u32 s2, s2, s10
1716; GFX9-NEXT:    s_mov_b32 s11, s10
1717; GFX9-NEXT:    s_addc_u32 s3, s3, s10
1718; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[10:11]
1719; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
1720; GFX9-NEXT:    v_cvt_f32_u32_e32 v15, s2
1721; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
1722; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
1723; GFX9-NEXT:    v_add_f32_e32 v1, v1, v15
1724; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1725; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s8, v9
1726; GFX9-NEXT:    v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
1727; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
1728; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1729; GFX9-NEXT:    v_trunc_f32_e32 v16, v1
1730; GFX9-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v16
1731; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
1732; GFX9-NEXT:    v_cvt_u32_f32_e32 v17, v0
1733; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
1734; GFX9-NEXT:    s_sub_u32 s5, 0, s2
1735; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
1736; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0
1737; GFX9-NEXT:    v_cndmask_b32_e32 v12, v2, v13, vcc
1738; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v16
1739; GFX9-NEXT:    s_subb_u32 s20, 0, s3
1740; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
1741; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
1742; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2]
1743; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
1744; GFX9-NEXT:    v_cndmask_b32_e64 v8, v3, v11, s[0:1]
1745; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2]
1746; GFX9-NEXT:    v_mul_lo_u32 v2, v13, v0
1747; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v15, vcc
1748; GFX9-NEXT:    v_mul_lo_u32 v3, v17, v1
1749; GFX9-NEXT:    v_mul_hi_u32 v10, v17, v0
1750; GFX9-NEXT:    v_mul_hi_u32 v0, v13, v0
1751; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v12, s[0:1]
1752; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
1753; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1754; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v10
1755; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1756; GFX9-NEXT:    v_mul_lo_u32 v10, v13, v1
1757; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
1758; GFX9-NEXT:    v_mul_hi_u32 v3, v17, v1
1759; GFX9-NEXT:    v_mul_hi_u32 v1, v13, v1
1760; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
1761; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
1762; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
1763; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1764; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1765; GFX9-NEXT:    v_add_u32_e32 v3, v10, v3
1766; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1767; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v17, v0
1768; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
1769; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0
1770; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v13, v1, vcc
1771; GFX9-NEXT:    v_mov_b32_e32 v0, v3
1772; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v4, s[0:1]
1773; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
1774; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1]
1775; GFX9-NEXT:    v_xor_b32_e32 v5, s16, v5
1776; GFX9-NEXT:    v_xor_b32_e32 v8, s17, v8
1777; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1]
1778; GFX9-NEXT:    v_mov_b32_e32 v9, s17
1779; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s16, v5
1780; GFX9-NEXT:    v_xor_b32_e32 v4, s4, v7
1781; GFX9-NEXT:    v_mul_lo_u32 v5, v11, v2
1782; GFX9-NEXT:    v_mul_lo_u32 v7, v10, v3
1783; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v8, v9, vcc
1784; GFX9-NEXT:    v_mul_hi_u32 v8, v10, v2
1785; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
1786; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1787; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
1788; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1789; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
1790; GFX9-NEXT:    v_mul_hi_u32 v2, v11, v2
1791; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
1792; GFX9-NEXT:    v_mul_hi_u32 v7, v10, v3
1793; GFX9-NEXT:    v_mul_hi_u32 v3, v11, v3
1794; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
1795; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1796; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
1797; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1798; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
1799; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
1800; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1801; GFX9-NEXT:    v_add3_u32 v3, v7, v5, v3
1802; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v10, v2
1803; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
1804; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v2
1805; GFX9-NEXT:    v_mul_lo_u32 v7, s8, v3
1806; GFX9-NEXT:    v_mul_hi_u32 v9, s8, v2
1807; GFX9-NEXT:    v_mul_hi_u32 v2, s9, v2
1808; GFX9-NEXT:    v_mul_hi_u32 v12, s9, v3
1809; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
1810; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1811; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
1812; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1813; GFX9-NEXT:    v_mul_lo_u32 v9, s9, v3
1814; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
1815; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v3
1816; GFX9-NEXT:    v_xor_b32_e32 v6, s4, v6
1817; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v9, v2
1818; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
1819; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
1820; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1821; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v2, v5
1822; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
1823; GFX9-NEXT:    v_mov_b32_e32 v8, s4
1824; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
1825; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s4, v4
1826; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v8, vcc
1827; GFX9-NEXT:    v_add_u32_e32 v6, v9, v7
1828; GFX9-NEXT:    v_add3_u32 v8, v6, v11, v12
1829; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4]
1830; GFX9-NEXT:    v_mov_b32_e32 v9, s9
1831; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s8, v2
1832; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7]
1833; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1834; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc
1835; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v7
1836; GFX9-NEXT:    v_sub_u32_e32 v6, s9, v6
1837; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
1838; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
1839; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
1840; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v7
1841; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v3, vcc
1842; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[0:1]
1843; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s2, v2
1844; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
1845; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v12
1846; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
1847; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v11
1848; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
1849; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v12
1850; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
1851; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v10
1852; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
1853; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
1854; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 1, v14
1855; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
1856; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
1857; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
1858; GFX9-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
1859; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s2, v11
1860; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
1861; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
1862; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v15, vcc
1863; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
1864; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[0:1]
1865; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s[0:1]
1866; GFX9-NEXT:    v_cndmask_b32_e64 v9, v2, v9, s[0:1]
1867; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[0:1]
1868; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[10:11]
1869; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v6
1870; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v8
1871; GFX9-NEXT:    v_mov_b32_e32 v6, s1
1872; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
1873; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
1874; GFX9-NEXT:    v_xor_b32_e32 v6, s6, v9
1875; GFX9-NEXT:    v_mov_b32_e32 v13, 0
1876; GFX9-NEXT:    v_xor_b32_e32 v7, s6, v7
1877; GFX9-NEXT:    v_mov_b32_e32 v8, s6
1878; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s6, v6
1879; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v8, vcc
1880; GFX9-NEXT:    global_store_dwordx4 v13, v[0:3], s[12:13]
1881; GFX9-NEXT:    global_store_dwordx4 v13, v[4:7], s[14:15]
1882; GFX9-NEXT:    s_endpgm
1883;
1884; GFX10-LABEL: sdivrem_v2i64:
1885; GFX10:       ; %bb.0:
1886; GFX10-NEXT:    s_clause 0x1
1887; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
1888; GFX10-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x0
1889; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1890; GFX10-NEXT:    s_ashr_i32 s8, s1, 31
1891; GFX10-NEXT:    s_ashr_i32 s4, s17, 31
1892; GFX10-NEXT:    s_mov_b32 s9, s8
1893; GFX10-NEXT:    s_add_u32 s10, s16, s4
1894; GFX10-NEXT:    s_addc_u32 s11, s17, s4
1895; GFX10-NEXT:    s_add_u32 s0, s0, s8
1896; GFX10-NEXT:    s_addc_u32 s1, s1, s8
1897; GFX10-NEXT:    s_mov_b32 s5, s4
1898; GFX10-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
1899; GFX10-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
1900; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s7
1901; GFX10-NEXT:    s_sub_u32 s21, 0, s6
1902; GFX10-NEXT:    s_subb_u32 s20, 0, s7
1903; GFX10-NEXT:    s_xor_b64 s[16:17], s[4:5], s[8:9]
1904; GFX10-NEXT:    s_ashr_i32 s8, s19, 31
1905; GFX10-NEXT:    s_ashr_i32 s10, s3, 31
1906; GFX10-NEXT:    s_add_u32 s18, s18, s8
1907; GFX10-NEXT:    s_addc_u32 s19, s19, s8
1908; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s6
1909; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
1910; GFX10-NEXT:    s_add_u32 s2, s2, s10
1911; GFX10-NEXT:    s_mov_b32 s11, s10
1912; GFX10-NEXT:    s_addc_u32 s3, s3, s10
1913; GFX10-NEXT:    s_mov_b32 s9, s8
1914; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[10:11]
1915; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
1916; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
1917; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s2
1918; GFX10-NEXT:    s_xor_b64 s[18:19], s[18:19], s[8:9]
1919; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1920; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
1921; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
1922; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1923; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1924; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
1925; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
1926; GFX10-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v1
1927; GFX10-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
1928; GFX10-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
1929; GFX10-NEXT:    v_cvt_u32_f32_e32 v9, v2
1930; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
1931; GFX10-NEXT:    v_trunc_f32_e32 v6, v4
1932; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v0
1933; GFX10-NEXT:    v_mul_f32_e32 v4, 0xcf800000, v6
1934; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s21, v7, 0
1935; GFX10-NEXT:    v_add_f32_e32 v3, v4, v3
1936; GFX10-NEXT:    s_sub_u32 s5, 0, s2
1937; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v3
1938; GFX10-NEXT:    v_mul_hi_u32 v10, v9, v0
1939; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s22, s5, v8, 0
1940; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2]
1941; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v6
1942; GFX10-NEXT:    v_mov_b32_e32 v1, v3
1943; GFX10-NEXT:    v_mul_hi_u32 v6, v7, v0
1944; GFX10-NEXT:    s_subb_u32 s22, 0, s3
1945; GFX10-NEXT:    v_mul_hi_u32 v12, v8, v2
1946; GFX10-NEXT:    v_mul_lo_u32 v11, v5, v2
1947; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5]
1948; GFX10-NEXT:    v_mul_lo_u32 v4, v9, v0
1949; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2]
1950; GFX10-NEXT:    v_mul_hi_u32 v2, v5, v2
1951; GFX10-NEXT:    v_mul_lo_u32 v13, v7, v3
1952; GFX10-NEXT:    v_mul_lo_u32 v14, v9, v3
1953; GFX10-NEXT:    v_mul_hi_u32 v15, v7, v3
1954; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1]
1955; GFX10-NEXT:    v_mul_hi_u32 v1, v9, v3
1956; GFX10-NEXT:    v_add_co_u32 v3, s23, v4, v13
1957; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
1958; GFX10-NEXT:    v_add_co_u32 v10, s23, v14, v10
1959; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s23
1960; GFX10-NEXT:    v_mul_lo_u32 v14, v8, v0
1961; GFX10-NEXT:    v_add_co_u32 v3, s23, v3, v6
1962; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s23
1963; GFX10-NEXT:    v_add_co_u32 v6, s23, v10, v15
1964; GFX10-NEXT:    v_mul_lo_u32 v15, v5, v0
1965; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
1966; GFX10-NEXT:    v_mul_hi_u32 v16, v8, v0
1967; GFX10-NEXT:    v_mul_hi_u32 v17, v5, v0
1968; GFX10-NEXT:    v_add_nc_u32_e32 v0, v4, v3
1969; GFX10-NEXT:    v_add_co_u32 v4, s23, v11, v14
1970; GFX10-NEXT:    v_add_nc_u32_e32 v3, v13, v10
1971; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
1972; GFX10-NEXT:    v_add_co_u32 v2, s23, v15, v2
1973; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s23
1974; GFX10-NEXT:    v_add_co_u32 v0, s23, v6, v0
1975; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s23
1976; GFX10-NEXT:    v_add_co_u32 v4, s23, v4, v12
1977; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
1978; GFX10-NEXT:    v_add_co_u32 v2, s23, v2, v16
1979; GFX10-NEXT:    v_add3_u32 v1, v3, v6, v1
1980; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v7, v0
1981; GFX10-NEXT:    v_add_nc_u32_e32 v3, v10, v4
1982; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s23
1983; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo
1984; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s21, v6, 0
1985; GFX10-NEXT:    v_add_co_u32 v2, s23, v2, v3
1986; GFX10-NEXT:    v_add_nc_u32_e32 v4, v11, v12
1987; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s23
1988; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1989; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v2
1990; GFX10-NEXT:    v_mul_hi_u32 v11, v7, v0
1991; GFX10-NEXT:    v_add3_u32 v3, v4, v3, v17
1992; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo
1993; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s23, s5, v8, 0
1994; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2]
1995; GFX10-NEXT:    v_mov_b32_e32 v1, v3
1996; GFX10-NEXT:    v_mul_lo_u32 v12, v9, v2
1997; GFX10-NEXT:    v_mul_hi_u32 v13, v8, v2
1998; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5]
1999; GFX10-NEXT:    v_mul_lo_u32 v4, v7, v0
2000; GFX10-NEXT:    v_mul_hi_u32 v5, v6, v0
2001; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2]
2002; GFX10-NEXT:    v_mul_hi_u32 v2, v9, v2
2003; GFX10-NEXT:    v_mul_lo_u32 v14, v6, v3
2004; GFX10-NEXT:    v_mul_lo_u32 v15, v7, v3
2005; GFX10-NEXT:    v_mul_hi_u32 v16, v6, v3
2006; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1]
2007; GFX10-NEXT:    v_mul_hi_u32 v1, v7, v3
2008; GFX10-NEXT:    v_add_co_u32 v3, s5, v4, v14
2009; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
2010; GFX10-NEXT:    v_add_co_u32 v11, s5, v15, v11
2011; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s5
2012; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v5
2013; GFX10-NEXT:    v_mul_lo_u32 v15, v8, v0
2014; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s5
2015; GFX10-NEXT:    v_add_co_u32 v5, s5, v11, v16
2016; GFX10-NEXT:    v_mul_lo_u32 v16, v9, v0
2017; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
2018; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
2019; GFX10-NEXT:    v_mul_hi_u32 v17, v8, v0
2020; GFX10-NEXT:    v_mul_hi_u32 v0, v9, v0
2021; GFX10-NEXT:    v_add_nc_u32_e32 v4, v14, v11
2022; GFX10-NEXT:    v_add_co_u32 v11, s5, v12, v15
2023; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s5
2024; GFX10-NEXT:    v_add_co_u32 v2, s5, v16, v2
2025; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s5
2026; GFX10-NEXT:    v_add_co_u32 v3, s5, v5, v3
2027; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
2028; GFX10-NEXT:    v_add_co_u32 v11, s5, v11, v13
2029; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
2030; GFX10-NEXT:    v_add_co_u32 v2, s5, v2, v17
2031; GFX10-NEXT:    v_add3_u32 v1, v4, v5, v1
2032; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v6, v3
2033; GFX10-NEXT:    v_add_nc_u32_e32 v4, v12, v11
2034; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s5
2035; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo
2036; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v3
2037; GFX10-NEXT:    v_add_co_u32 v2, s5, v2, v4
2038; GFX10-NEXT:    v_add_nc_u32_e32 v5, v14, v13
2039; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
2040; GFX10-NEXT:    v_mul_lo_u32 v11, s0, v1
2041; GFX10-NEXT:    v_mul_hi_u32 v7, s0, v3
2042; GFX10-NEXT:    v_mul_hi_u32 v3, s1, v3
2043; GFX10-NEXT:    v_mul_lo_u32 v12, s1, v1
2044; GFX10-NEXT:    v_add3_u32 v0, v5, v4, v0
2045; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
2046; GFX10-NEXT:    v_mul_hi_u32 v4, s0, v1
2047; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v1
2048; GFX10-NEXT:    v_add_co_u32 v1, s5, v6, v11
2049; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo
2050; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
2051; GFX10-NEXT:    v_add_co_u32 v3, s5, v12, v3
2052; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
2053; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v7
2054; GFX10-NEXT:    v_mul_lo_u32 v0, s19, v2
2055; GFX10-NEXT:    v_mul_lo_u32 v12, s18, v8
2056; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
2057; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v4
2058; GFX10-NEXT:    v_mul_hi_u32 v9, s18, v2
2059; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
2060; GFX10-NEXT:    v_mul_hi_u32 v2, s19, v2
2061; GFX10-NEXT:    v_mul_lo_u32 v7, s19, v8
2062; GFX10-NEXT:    v_add_nc_u32_e32 v1, v6, v1
2063; GFX10-NEXT:    v_add_co_u32 v6, s5, v0, v12
2064; GFX10-NEXT:    v_mul_hi_u32 v13, s18, v8
2065; GFX10-NEXT:    v_add_nc_u32_e32 v4, v11, v4
2066; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
2067; GFX10-NEXT:    v_add_co_u32 v12, s5, v3, v1
2068; GFX10-NEXT:    v_add_co_u32 v2, s20, v7, v2
2069; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s5
2070; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s6, v12, 0
2071; GFX10-NEXT:    v_add_co_u32 v6, s5, v6, v9
2072; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
2073; GFX10-NEXT:    v_add_co_u32 v9, s5, v2, v13
2074; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s20
2075; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
2076; GFX10-NEXT:    v_add3_u32 v4, v4, v7, v5
2077; GFX10-NEXT:    v_add_nc_u32_e32 v6, v11, v6
2078; GFX10-NEXT:    v_mul_hi_u32 v5, s19, v8
2079; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v12, 1
2080; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v2
2081; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2]
2082; GFX10-NEXT:    v_add_co_u32 v6, s5, v9, v6
2083; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
2084; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo
2085; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v7, 1
2086; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2]
2087; GFX10-NEXT:    v_add3_u32 v5, v3, v9, v5
2088; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s5, s2, v6, 0
2089; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo
2090; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, s0, v0
2091; GFX10-NEXT:    v_sub_nc_u32_e32 v9, s1, v1
2092; GFX10-NEXT:    v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo
2093; GFX10-NEXT:    v_mov_b32_e32 v0, v3
2094; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
2095; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v14, s6
2096; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo
2097; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v14
2098; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
2099; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s0
2100; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v3
2101; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s0
2102; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v16
2103; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s0
2104; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v15
2105; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s0
2106; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1]
2107; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v16
2108; GFX10-NEXT:    v_cndmask_b32_e64 v1, v19, v18, s0
2109; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v15
2110; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
2111; GFX10-NEXT:    v_cndmask_b32_e64 v17, v20, v17, s0
2112; GFX10-NEXT:    v_sub_co_u32 v1, s0, v3, s6
2113; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0
2114; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
2115; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
2116; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1]
2117; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v17
2118; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc_lo
2119; GFX10-NEXT:    v_cndmask_b32_e64 v1, v12, v7, s0
2120; GFX10-NEXT:    v_cndmask_b32_e32 v7, v16, v9, vcc_lo
2121; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, s18, v2
2122; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s0
2123; GFX10-NEXT:    v_sub_co_ci_u32_e64 v8, s1, s19, v0, vcc_lo
2124; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s19, v0
2125; GFX10-NEXT:    v_cndmask_b32_e64 v3, v14, v3, s0
2126; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s0
2127; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v8
2128; GFX10-NEXT:    v_xor_b32_e32 v1, s16, v1
2129; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo
2130; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
2131; GFX10-NEXT:    v_xor_b32_e32 v4, s17, v4
2132; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s0
2133; GFX10-NEXT:    v_xor_b32_e32 v3, s4, v3
2134; GFX10-NEXT:    v_xor_b32_e32 v7, s4, v7
2135; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc_lo
2136; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v2, s2
2137; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo
2138; GFX10-NEXT:    v_sub_co_u32 v0, s0, v1, s16
2139; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s17, v4, s0
2140; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v8
2141; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo
2142; GFX10-NEXT:    v_cndmask_b32_e64 v4, v9, v12, s0
2143; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v14
2144; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s0
2145; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v13
2146; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
2147; GFX10-NEXT:    v_add_co_u32 v15, s0, v6, 1
2148; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v5, s0
2149; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v14
2150; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s0
2151; GFX10-NEXT:    v_add_co_u32 v12, s0, v15, 1
2152; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
2153; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
2154; GFX10-NEXT:    v_sub_co_u32 v9, s0, v13, s2
2155; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0
2156; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v12, vcc_lo
2157; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v4
2158; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc_lo
2159; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v9, vcc_lo
2160; GFX10-NEXT:    v_cndmask_b32_e32 v9, v14, v11, vcc_lo
2161; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s0
2162; GFX10-NEXT:    v_cndmask_b32_e64 v11, v5, v15, s0
2163; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s0
2164; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s0
2165; GFX10-NEXT:    s_xor_b64 s[0:1], s[8:9], s[10:11]
2166; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v3, s4
2167; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v6
2168; GFX10-NEXT:    v_xor_b32_e32 v6, s1, v11
2169; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo
2170; GFX10-NEXT:    v_xor_b32_e32 v7, s8, v2
2171; GFX10-NEXT:    v_xor_b32_e32 v8, s8, v8
2172; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v3, s0
2173; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo
2174; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s8
2175; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v8, vcc_lo
2176; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[12:13]
2177; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[14:15]
2178; GFX10-NEXT:    s_endpgm
2179  %div = sdiv <2 x i64> %x, %y
2180  store <2 x i64> %div, ptr addrspace(1) %out0
2181  %rem = srem <2 x i64> %x, %y
2182  store <2 x i64> %rem, ptr addrspace(1) %out1
2183  ret void
2184}
2185
; sdiv_i8: kernel that stores `sdiv i8 %x, %y` to %out0 and `srem i8 %x, %y`
; to %out1. Despite the "sdiv" name, both the quotient and the remainder are
; computed and checked here.
;
; The per-target check lines below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
;
; In the expected output for every target, both operands come from a single
; dword load and are sign-extended to 32 bits (s_sext_i32_i8 and s_bfe_i32).
; The division is then done as a 32-bit sequence: a float reciprocal estimate
; (v_rcp_iflag_f32) followed by two compare/select correction steps. Each
; result is written with a byte store.
; NOTE(review): from the sign handling it looks like the s_sext_i32_i8 value is
; %x and the s_bfe_i32 value is %y -- confirm against the kernarg layout.
2186define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) {
2187; GFX8-LABEL: sdiv_i8:
2188; GFX8:       ; %bb.0:
2189; GFX8-NEXT:    s_load_dword s4, s[8:9], 0x10
2190; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2191; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x80008
2192; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
2193; GFX8-NEXT:    s_add_i32 s0, s0, s5
2194; GFX8-NEXT:    s_xor_b32 s6, s0, s5
2195; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
2196; GFX8-NEXT:    s_sub_i32 s0, 0, s6
2197; GFX8-NEXT:    s_sext_i32_i8 s4, s4
2198; GFX8-NEXT:    s_ashr_i32 s7, s4, 31
2199; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2200; GFX8-NEXT:    s_add_i32 s4, s4, s7
2201; GFX8-NEXT:    s_xor_b32 s4, s4, s7
2202; GFX8-NEXT:    s_xor_b32 s5, s7, s5
2203; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2204; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
2205; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
2206; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2207; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
2208; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2209; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
2210; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2211; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2212; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2213; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s6
2214; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
2215; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
2216; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
2217; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2218; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v3
2219; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2220; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
2221; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
2222; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2223; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v3
2224; GFX8-NEXT:    v_xor_b32_e32 v2, s5, v2
2225; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2226; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s5, v2
2227; GFX8-NEXT:    v_xor_b32_e32 v3, s7, v3
2228; GFX8-NEXT:    flat_store_byte v[0:1], v2
2229; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2230; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v3
2231; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2232; GFX8-NEXT:    flat_store_byte v[0:1], v3
2233; GFX8-NEXT:    s_endpgm
2234;
2235; GFX9-LABEL: sdiv_i8:
2236; GFX9:       ; %bb.0:
2237; GFX9-NEXT:    s_load_dword s0, s[8:9], 0x10
2238; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2239; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2240; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x80008
2241; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
2242; GFX9-NEXT:    s_add_i32 s1, s1, s4
2243; GFX9-NEXT:    s_xor_b32 s5, s1, s4
2244; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
2245; GFX9-NEXT:    s_sub_i32 s1, 0, s5
2246; GFX9-NEXT:    s_sext_i32_i8 s0, s0
2247; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
2248; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2249; GFX9-NEXT:    s_add_i32 s0, s0, s6
2250; GFX9-NEXT:    s_xor_b32 s7, s0, s6
2251; GFX9-NEXT:    s_xor_b32 s4, s6, s4
2252; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2253; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2254; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
2255; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2256; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
2257; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
2258; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
2259; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s5
2260; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
2261; GFX9-NEXT:    v_sub_u32_e32 v1, s7, v1
2262; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
2263; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2264; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
2265; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2266; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
2267; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
2268; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2269; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
2270; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2271; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
2272; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
2273; GFX9-NEXT:    v_xor_b32_e32 v1, s6, v1
2274; GFX9-NEXT:    v_subrev_u32_e32 v1, s6, v1
2275; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2276; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
2277; GFX9-NEXT:    global_store_byte v2, v1, s[2:3]
2278; GFX9-NEXT:    s_endpgm
2279;
2280; GFX10-LABEL: sdiv_i8:
2281; GFX10:       ; %bb.0:
2282; GFX10-NEXT:    s_load_dword s0, s[8:9], 0x10
2283; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2284; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x80008
2285; GFX10-NEXT:    s_sext_i32_i8 s0, s0
2286; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
2287; GFX10-NEXT:    s_ashr_i32 s6, s0, 31
2288; GFX10-NEXT:    s_add_i32 s1, s1, s4
2289; GFX10-NEXT:    s_add_i32 s0, s0, s6
2290; GFX10-NEXT:    s_xor_b32 s5, s1, s4
2291; GFX10-NEXT:    s_xor_b32 s0, s0, s6
2292; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s5
2293; GFX10-NEXT:    s_sub_i32 s1, 0, s5
2294; GFX10-NEXT:    s_xor_b32 s4, s6, s4
2295; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2296; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2297; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
2298; GFX10-NEXT:    v_mul_lo_u32 v1, s1, v0
2299; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
2300; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
2301; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
2302; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s5
2303; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
2304; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
2305; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2306; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s5, v1
2307; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s5, v1
2308; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2309; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2310; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
2311; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s5, v1
2312; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s5, v1
2313; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2314; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2315; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2316; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
2317; GFX10-NEXT:    v_xor_b32_e32 v1, s6, v1
2318; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
2319; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s6, v1
2320; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2321; GFX10-NEXT:    global_store_byte v2, v0, s[0:1]
2322; GFX10-NEXT:    global_store_byte v2, v1, s[2:3]
2323; GFX10-NEXT:    s_endpgm
2324  %div = sdiv i8 %x, %y
2325  store i8 %div, ptr addrspace(1) %out0
2326  %rem = srem i8 %x, %y
2327  store i8 %rem, ptr addrspace(1) %out1
2328  ret void
2329}
2330
; sdivrem_v2i8: kernel that stores `sdiv <2 x i8> %x, %y` to %out0 and
; `srem <2 x i8> %x, %y` to %out1.
;
; The per-target check lines below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
;
; In the expected output for every target, the four i8 elements are pulled out
; of a single loaded dword and sign-extended to 32 bits (s_sext_i32_i8 plus
; s_bfe_i32 with 0x80008 / 0x80010 / 0x80018). Each element pair is divided
; with a 32-bit sequence: a float reciprocal estimate (v_rcp_iflag_f32)
; followed by two compare/select correction steps. The two quotient bytes and
; the two remainder bytes are then packed with an 0xff mask and v_or_b32_sdwa,
; so each output is written with one short store.
2331define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) {
2332; GFX8-LABEL: sdivrem_v2i8:
2333; GFX8:       ; %bb.0:
2334; GFX8-NEXT:    s_load_dword s2, s[8:9], 0x10
2335; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2336; GFX8-NEXT:    s_bfe_i32 s0, s2, 0x80010
2337; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
2338; GFX8-NEXT:    s_add_i32 s0, s0, s3
2339; GFX8-NEXT:    s_xor_b32 s10, s0, s3
2340; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s10
2341; GFX8-NEXT:    s_sub_i32 s4, 0, s10
2342; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x80018
2343; GFX8-NEXT:    s_ashr_i32 s12, s1, 31
2344; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2345; GFX8-NEXT:    s_add_i32 s1, s1, s12
2346; GFX8-NEXT:    s_xor_b32 s13, s1, s12
2347; GFX8-NEXT:    s_sext_i32_i8 s0, s2
2348; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2349; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
2350; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s13
2351; GFX8-NEXT:    s_ashr_i32 s11, s0, 31
2352; GFX8-NEXT:    s_add_i32 s0, s0, s11
2353; GFX8-NEXT:    v_mul_lo_u32 v1, s4, v0
2354; GFX8-NEXT:    s_xor_b32 s0, s0, s11
2355; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2356; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
2357; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
2358; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2359; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
2360; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
2361; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
2362; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s10
2363; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
2364; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
2365; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
2366; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2367; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s10, v2
2368; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2369; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
2370; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
2371; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2372; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s10, v2
2373; GFX8-NEXT:    s_sub_i32 s1, 0, s13
2374; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2375; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v1
2376; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x80008
2377; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
2378; GFX8-NEXT:    s_add_i32 s1, s1, s2
2379; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
2380; GFX8-NEXT:    s_xor_b32 s1, s1, s2
2381; GFX8-NEXT:    s_xor_b32 s0, s11, s3
2382; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
2383; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
2384; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
2385; GFX8-NEXT:    v_xor_b32_e32 v2, s11, v2
2386; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
2387; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s13
2388; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s11, v2
2389; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
2390; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
2391; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
2392; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2393; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s13, v3
2394; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2395; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
2396; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
2397; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2398; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s13, v3
2399; GFX8-NEXT:    s_xor_b32 s0, s2, s12
2400; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
2401; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2402; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
2403; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
2404; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
2405; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
2406; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2407; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2408; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2409; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
2410; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2411; GFX8-NEXT:    flat_store_short v[0:1], v4
2412; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v3
2413; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
2414; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2415; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2416; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2417; GFX8-NEXT:    flat_store_short v[0:1], v2
2418; GFX8-NEXT:    s_endpgm
2419;
2420; GFX9-LABEL: sdivrem_v2i8:
2421; GFX9:       ; %bb.0:
2422; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
2423; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2424; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x80010
2425; GFX9-NEXT:    s_ashr_i32 s5, s0, 31
2426; GFX9-NEXT:    s_add_i32 s0, s0, s5
2427; GFX9-NEXT:    s_xor_b32 s6, s0, s5
2428; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
2429; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2430; GFX9-NEXT:    s_bfe_i32 s8, s4, 0x80018
2431; GFX9-NEXT:    s_ashr_i32 s9, s8, 31
2432; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2433; GFX9-NEXT:    s_add_i32 s8, s8, s9
2434; GFX9-NEXT:    s_xor_b32 s8, s8, s9
2435; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
2436; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2437; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2438; GFX9-NEXT:    s_sub_i32 s10, 0, s6
2439; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2440; GFX9-NEXT:    s_sext_i32_i8 s7, s4
2441; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v0
2442; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
2443; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2444; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2445; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
2446; GFX9-NEXT:    s_add_i32 s7, s7, s10
2447; GFX9-NEXT:    s_xor_b32 s7, s7, s10
2448; GFX9-NEXT:    s_sub_i32 s11, 0, s8
2449; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
2450; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
2451; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v1
2452; GFX9-NEXT:    s_bfe_i32 s4, s4, 0x80008
2453; GFX9-NEXT:    s_ashr_i32 s11, s4, 31
2454; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
2455; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
2456; GFX9-NEXT:    s_add_i32 s4, s4, s11
2457; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
2458; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
2459; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
2460; GFX9-NEXT:    s_xor_b32 s4, s4, s11
2461; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
2462; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
2463; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
2464; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
2465; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2466; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
2467; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
2468; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
2469; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
2470; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
2471; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
2472; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
2473; GFX9-NEXT:    s_xor_b32 s5, s10, s5
2474; GFX9-NEXT:    v_xor_b32_e32 v0, s5, v0
2475; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
2476; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2477; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2478; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v3
2479; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2480; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
2481; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2482; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2483; GFX9-NEXT:    s_xor_b32 s4, s11, s9
2484; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
2485; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v3
2486; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
2487; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2488; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
2489; GFX9-NEXT:    v_subrev_u32_e32 v0, s5, v0
2490; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
2491; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
2492; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
2493; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2494; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2495; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
2496; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2497; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
2498; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v3
2499; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
2500; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
2501; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2502; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
2503; GFX9-NEXT:    s_endpgm
2504;
2505; GFX10-LABEL: sdivrem_v2i8:
2506; GFX10:       ; %bb.0:
2507; GFX10-NEXT:    s_load_dword s0, s[8:9], 0x10
2508; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2509; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x80018
2510; GFX10-NEXT:    s_bfe_i32 s3, s0, 0x80010
2511; GFX10-NEXT:    s_ashr_i32 s2, s1, 31
2512; GFX10-NEXT:    s_ashr_i32 s10, s3, 31
2513; GFX10-NEXT:    s_add_i32 s1, s1, s2
2514; GFX10-NEXT:    s_add_i32 s3, s3, s10
2515; GFX10-NEXT:    s_xor_b32 s1, s1, s2
2516; GFX10-NEXT:    s_xor_b32 s3, s3, s10
2517; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
2518; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
2519; GFX10-NEXT:    s_sub_i32 s4, 0, s1
2520; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2521; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2522; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2523; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2524; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
2525; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
2526; GFX10-NEXT:    v_mul_lo_u32 v2, s4, v0
2527; GFX10-NEXT:    s_sub_i32 s4, 0, s3
2528; GFX10-NEXT:    v_mul_lo_u32 v3, s4, v1
2529; GFX10-NEXT:    s_bfe_i32 s4, s0, 0x80008
2530; GFX10-NEXT:    s_sext_i32_i8 s0, s0
2531; GFX10-NEXT:    s_ashr_i32 s11, s4, 31
2532; GFX10-NEXT:    s_ashr_i32 s12, s0, 31
2533; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
2534; GFX10-NEXT:    s_add_i32 s4, s4, s11
2535; GFX10-NEXT:    s_add_i32 s0, s0, s12
2536; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
2537; GFX10-NEXT:    s_xor_b32 s4, s4, s11
2538; GFX10-NEXT:    s_xor_b32 s0, s0, s12
2539; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
2540; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2541; GFX10-NEXT:    v_mul_hi_u32 v0, s4, v0
2542; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
2543; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s1
2544; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
2545; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
2546; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
2547; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
2548; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
2549; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
2550; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
2551; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
2552; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
2553; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
2554; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
2555; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
2556; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
2557; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
2558; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
2559; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
2560; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
2561; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
2562; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
2563; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
2564; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
2565; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
2566; GFX10-NEXT:    s_xor_b32 s1, s11, s2
2567; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
2568; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
2569; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
2570; GFX10-NEXT:    v_xor_b32_e32 v2, s11, v2
2571; GFX10-NEXT:    s_xor_b32 s0, s12, s10
2572; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
2573; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
2574; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
2575; GFX10-NEXT:    v_xor_b32_e32 v3, s12, v3
2576; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s11, v2
2577; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
2578; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2579; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s12, v3
2580; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2581; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2582; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2583; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2584; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2585; GFX10-NEXT:    global_store_short v1, v0, s[4:5]
2586; GFX10-NEXT:    global_store_short v1, v2, s[6:7]
2587; GFX10-NEXT:    s_endpgm
2588  %div = sdiv <2 x i8> %x, %y
2589  store <2 x i8> %div, ptr addrspace(1) %out0
2590  %rem = srem <2 x i8> %x, %y
2591  store <2 x i8> %rem, ptr addrspace(1) %out1
2592  ret void
2593}
2594
2595define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) {
2596; GFX8-LABEL: sdiv_i16:
2597; GFX8:       ; %bb.0:
2598; GFX8-NEXT:    s_load_dword s4, s[8:9], 0x10
2599; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2600; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x100010
2601; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
2602; GFX8-NEXT:    s_add_i32 s0, s0, s5
2603; GFX8-NEXT:    s_xor_b32 s6, s0, s5
2604; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
2605; GFX8-NEXT:    s_sub_i32 s0, 0, s6
2606; GFX8-NEXT:    s_sext_i32_i16 s4, s4
2607; GFX8-NEXT:    s_ashr_i32 s7, s4, 31
2608; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2609; GFX8-NEXT:    s_add_i32 s4, s4, s7
2610; GFX8-NEXT:    s_xor_b32 s4, s4, s7
2611; GFX8-NEXT:    s_xor_b32 s5, s7, s5
2612; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2613; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
2614; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
2615; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2616; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
2617; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2618; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
2619; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2620; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2621; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2622; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s6
2623; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
2624; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
2625; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
2626; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2627; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v3
2628; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2629; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
2630; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
2631; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2632; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v3
2633; GFX8-NEXT:    v_xor_b32_e32 v2, s5, v2
2634; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2635; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s5, v2
2636; GFX8-NEXT:    v_xor_b32_e32 v3, s7, v3
2637; GFX8-NEXT:    flat_store_short v[0:1], v2
2638; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2639; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v3
2640; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2641; GFX8-NEXT:    flat_store_short v[0:1], v3
2642; GFX8-NEXT:    s_endpgm
2643;
2644; GFX9-LABEL: sdiv_i16:
2645; GFX9:       ; %bb.0:
2646; GFX9-NEXT:    s_load_dword s0, s[8:9], 0x10
2647; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2648; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2649; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x100010
2650; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
2651; GFX9-NEXT:    s_add_i32 s1, s1, s4
2652; GFX9-NEXT:    s_xor_b32 s5, s1, s4
2653; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
2654; GFX9-NEXT:    s_sub_i32 s1, 0, s5
2655; GFX9-NEXT:    s_sext_i32_i16 s0, s0
2656; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
2657; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2658; GFX9-NEXT:    s_add_i32 s0, s0, s6
2659; GFX9-NEXT:    s_xor_b32 s7, s0, s6
2660; GFX9-NEXT:    s_xor_b32 s4, s6, s4
2661; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2662; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2663; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
2664; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2665; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
2666; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
2667; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
2668; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s5
2669; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
2670; GFX9-NEXT:    v_sub_u32_e32 v1, s7, v1
2671; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
2672; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2673; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
2674; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2675; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
2676; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
2677; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2678; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
2679; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2680; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
2681; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
2682; GFX9-NEXT:    v_xor_b32_e32 v1, s6, v1
2683; GFX9-NEXT:    v_subrev_u32_e32 v1, s6, v1
2684; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2685; GFX9-NEXT:    global_store_short v2, v0, s[0:1]
2686; GFX9-NEXT:    global_store_short v2, v1, s[2:3]
2687; GFX9-NEXT:    s_endpgm
2688;
2689; GFX10-LABEL: sdiv_i16:
2690; GFX10:       ; %bb.0:
2691; GFX10-NEXT:    s_load_dword s0, s[8:9], 0x10
2692; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2693; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x100010
2694; GFX10-NEXT:    s_sext_i32_i16 s0, s0
2695; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
2696; GFX10-NEXT:    s_ashr_i32 s6, s0, 31
2697; GFX10-NEXT:    s_add_i32 s1, s1, s4
2698; GFX10-NEXT:    s_add_i32 s0, s0, s6
2699; GFX10-NEXT:    s_xor_b32 s5, s1, s4
2700; GFX10-NEXT:    s_xor_b32 s0, s0, s6
2701; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s5
2702; GFX10-NEXT:    s_sub_i32 s1, 0, s5
2703; GFX10-NEXT:    s_xor_b32 s4, s6, s4
2704; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2705; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2706; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
2707; GFX10-NEXT:    v_mul_lo_u32 v1, s1, v0
2708; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
2709; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
2710; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
2711; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s5
2712; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
2713; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
2714; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2715; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s5, v1
2716; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s5, v1
2717; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2718; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2719; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
2720; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s5, v1
2721; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s5, v1
2722; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2723; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2724; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2725; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
2726; GFX10-NEXT:    v_xor_b32_e32 v1, s6, v1
2727; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
2728; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s6, v1
2729; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2730; GFX10-NEXT:    global_store_short v2, v0, s[0:1]
2731; GFX10-NEXT:    global_store_short v2, v1, s[2:3]
2732; GFX10-NEXT:    s_endpgm
2733  %div = sdiv i16 %x, %y
2734  store i16 %div, ptr addrspace(1) %out0
2735  %rem = srem i16 %x, %y
2736  store i16 %rem, ptr addrspace(1) %out1
2737  ret void
2738}
2739
; Test: signed division and signed remainder of the same <2 x i16> operands in
; one kernel; the quotient vector is stored to %out0 and the remainder vector
; to %out1. The assertion lines in this function are autogenerated (see the
; note at the top of the file) for the three RUN configurations of this test
; (tonga, gfx900, gfx1010); regenerate them rather than editing them by hand.
2740define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) {
2741; GFX8-LABEL: sdivrem_v2i16:
2742; GFX8:       ; %bb.0:
2743; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x10
2744; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2745; GFX8-NEXT:    s_sext_i32_i16 s0, s3
2746; GFX8-NEXT:    s_ashr_i32 s10, s0, 31
2747; GFX8-NEXT:    s_add_i32 s0, s0, s10
2748; GFX8-NEXT:    s_xor_b32 s11, s0, s10
2749; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s11
2750; GFX8-NEXT:    s_sub_i32 s4, 0, s11
2751; GFX8-NEXT:    s_bfe_i32 s1, s3, 0x100010
2752; GFX8-NEXT:    s_ashr_i32 s12, s1, 31
2753; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2754; GFX8-NEXT:    s_add_i32 s1, s1, s12
2755; GFX8-NEXT:    s_xor_b32 s13, s1, s12
2756; GFX8-NEXT:    s_sext_i32_i16 s0, s2
2757; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2758; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
2759; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s13
2760; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
2761; GFX8-NEXT:    s_add_i32 s0, s0, s3
2762; GFX8-NEXT:    v_mul_lo_u32 v1, s4, v0
2763; GFX8-NEXT:    s_xor_b32 s0, s0, s3
2764; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2765; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
2766; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
2767; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2768; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
2769; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
2770; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
2771; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s11
2772; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
2773; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
2774; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
2775; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2776; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s11, v2
2777; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2778; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
2779; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
2780; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2781; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s11, v2
2782; GFX8-NEXT:    s_sub_i32 s1, 0, s13
2783; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2784; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v1
2785; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x100010
2786; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
2787; GFX8-NEXT:    s_add_i32 s1, s1, s2
2788; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
2789; GFX8-NEXT:    s_xor_b32 s1, s1, s2
2790; GFX8-NEXT:    s_xor_b32 s0, s3, s10
2791; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
2792; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
2793; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
2794; GFX8-NEXT:    v_xor_b32_e32 v2, s3, v2
2795; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
2796; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s13
2797; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v2
2798; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
2799; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
2800; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
2801; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2802; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s13, v3
2803; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2804; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
2805; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
2806; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2807; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s13, v3
2808; GFX8-NEXT:    s_xor_b32 s0, s2, s12
2809; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
2810; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2811; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
2812; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
2813; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2814; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
2815; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2816; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2817; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v3
2818; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2819; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2820; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2821; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2822; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2823; GFX8-NEXT:    flat_store_dword v[0:1], v4
2824; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2825; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2826; GFX8-NEXT:    flat_store_dword v[0:1], v2
2827; GFX8-NEXT:    s_endpgm
2828;
2829; GFX9-LABEL: sdivrem_v2i16:
2830; GFX9:       ; %bb.0:
2831; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
2832; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2833; GFX9-NEXT:    s_sext_i32_i16 s0, s5
2834; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
2835; GFX9-NEXT:    s_add_i32 s0, s0, s6
2836; GFX9-NEXT:    s_xor_b32 s7, s0, s6
2837; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
2838; GFX9-NEXT:    s_bfe_i32 s5, s5, 0x100010
2839; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2840; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
2841; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2842; GFX9-NEXT:    s_add_i32 s5, s5, s9
2843; GFX9-NEXT:    s_xor_b32 s5, s5, s9
2844; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
2845; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2846; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2847; GFX9-NEXT:    s_sub_i32 s10, 0, s7
2848; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2849; GFX9-NEXT:    s_sext_i32_i16 s8, s4
2850; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v0
2851; GFX9-NEXT:    s_ashr_i32 s10, s8, 31
2852; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2853; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2854; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
2855; GFX9-NEXT:    s_add_i32 s8, s8, s10
2856; GFX9-NEXT:    s_xor_b32 s8, s8, s10
2857; GFX9-NEXT:    s_sub_i32 s11, 0, s5
2858; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
2859; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
2860; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v1
2861; GFX9-NEXT:    s_bfe_i32 s4, s4, 0x100010
2862; GFX9-NEXT:    s_ashr_i32 s11, s4, 31
2863; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s7
2864; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
2865; GFX9-NEXT:    s_add_i32 s4, s4, s11
2866; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
2867; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
2868; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
2869; GFX9-NEXT:    s_xor_b32 s4, s4, s11
2870; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
2871; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
2872; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
2873; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
2874; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2875; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
2876; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
2877; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
2878; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
2879; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
2880; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s5
2881; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
2882; GFX9-NEXT:    s_xor_b32 s6, s10, s6
2883; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
2884; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
2885; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
2886; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2887; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
2888; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2889; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
2890; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
2891; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2892; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
2893; GFX9-NEXT:    s_xor_b32 s4, s11, s9
2894; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
2895; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
2896; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2897; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
2898; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
2899; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
2900; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
2901; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2902; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
2903; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
2904; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
2905; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2906; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
2907; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2908; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
2909; GFX9-NEXT:    global_store_dword v2, v1, s[2:3]
2910; GFX9-NEXT:    s_endpgm
2911;
2912; GFX10-LABEL: sdivrem_v2i16:
2913; GFX10:       ; %bb.0:
2914; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x10
2915; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2916; GFX10-NEXT:    s_sext_i32_i16 s2, s1
2917; GFX10-NEXT:    s_bfe_i32 s1, s1, 0x100010
2918; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
2919; GFX10-NEXT:    s_ashr_i32 s10, s1, 31
2920; GFX10-NEXT:    s_add_i32 s2, s2, s3
2921; GFX10-NEXT:    s_add_i32 s1, s1, s10
2922; GFX10-NEXT:    s_xor_b32 s2, s2, s3
2923; GFX10-NEXT:    s_xor_b32 s1, s1, s10
2924; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
2925; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s1
2926; GFX10-NEXT:    s_sub_i32 s4, 0, s2
2927; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2928; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2929; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2930; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2931; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
2932; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
2933; GFX10-NEXT:    v_mul_lo_u32 v2, s4, v0
2934; GFX10-NEXT:    s_sub_i32 s4, 0, s1
2935; GFX10-NEXT:    v_mul_lo_u32 v3, s4, v1
2936; GFX10-NEXT:    s_sext_i32_i16 s4, s0
2937; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x100010
2938; GFX10-NEXT:    s_ashr_i32 s11, s4, 31
2939; GFX10-NEXT:    s_ashr_i32 s12, s0, 31
2940; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
2941; GFX10-NEXT:    s_add_i32 s4, s4, s11
2942; GFX10-NEXT:    s_add_i32 s0, s0, s12
2943; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
2944; GFX10-NEXT:    s_xor_b32 s4, s4, s11
2945; GFX10-NEXT:    s_xor_b32 s0, s0, s12
2946; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
2947; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2948; GFX10-NEXT:    v_mul_hi_u32 v0, s4, v0
2949; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
2950; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
2951; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
2952; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s1
2953; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
2954; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
2955; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
2956; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
2957; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s2, v2
2958; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
2959; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v3
2960; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
2961; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, s1, v3
2962; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
2963; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
2964; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v0
2965; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s0
2966; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
2967; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, s2, v2
2968; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
2969; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v3
2970; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
2971; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v3
2972; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
2973; GFX10-NEXT:    s_xor_b32 s1, s11, s3
2974; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
2975; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
2976; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s0
2977; GFX10-NEXT:    v_xor_b32_e32 v2, s11, v2
2978; GFX10-NEXT:    s_xor_b32 s0, s12, s10
2979; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
2980; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
2981; GFX10-NEXT:    v_xor_b32_e32 v3, s12, v3
2982; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s11, v2
2983; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
2984; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2985; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s12, v3
2986; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2987; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
2988; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2989; GFX10-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
2990; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2991; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
2992; GFX10-NEXT:    global_store_dword v1, v2, s[6:7]
2993; GFX10-NEXT:    s_endpgm
; The sdiv and srem below take the same %x and %y. Each expected sequence above
; contains two v_rcp_iflag_f32 instructions (one per vector element) and derives
; both the quotient and the remainder from them, then packs the two 16-bit
; results of each operation into one dword and issues one dword store per output.
2994  %div = sdiv <2 x i16> %x, %y
2995  store <2 x i16> %div, ptr addrspace(1) %out0
2996  %rem = srem <2 x i16> %x, %y
2997  store <2 x i16> %rem, ptr addrspace(1) %out1
2998  ret void
2999}
3000
; Test: signed division and signed remainder of the same i3 operands (an
; integer width narrower than a byte) in one kernel; the quotient is stored to
; %out0 and the remainder to %out1. The assertion lines in this function are
; autogenerated (see the note at the top of the file) for the three RUN
; configurations of this test (tonga, gfx900, gfx1010); regenerate them rather
; than editing them by hand.
3001define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) {
3002; GFX8-LABEL: sdivrem_i3:
3003; GFX8:       ; %bb.0:
3004; GFX8-NEXT:    s_load_dword s4, s[8:9], 0x10
3005; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3006; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x30008
3007; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
3008; GFX8-NEXT:    s_add_i32 s0, s0, s5
3009; GFX8-NEXT:    s_xor_b32 s6, s0, s5
3010; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
3011; GFX8-NEXT:    s_sub_i32 s0, 0, s6
3012; GFX8-NEXT:    s_bfe_i32 s4, s4, 0x30000
3013; GFX8-NEXT:    s_ashr_i32 s7, s4, 31
3014; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3015; GFX8-NEXT:    s_add_i32 s4, s4, s7
3016; GFX8-NEXT:    s_xor_b32 s4, s4, s7
3017; GFX8-NEXT:    s_xor_b32 s5, s7, s5
3018; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3019; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
3020; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
3021; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3022; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
3023; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
3024; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
3025; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3026; GFX8-NEXT:    v_mov_b32_e32 v0, s0
3027; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3028; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s6
3029; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
3030; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
3031; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
3032; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
3033; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v3
3034; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
3035; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
3036; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
3037; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
3038; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v3
3039; GFX8-NEXT:    v_xor_b32_e32 v2, s5, v2
3040; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
3041; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s5, v2
3042; GFX8-NEXT:    v_xor_b32_e32 v3, s7, v3
3043; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
3044; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v3
3045; GFX8-NEXT:    flat_store_byte v[0:1], v2
3046; GFX8-NEXT:    v_mov_b32_e32 v0, s2
3047; GFX8-NEXT:    v_and_b32_e32 v2, 7, v3
3048; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3049; GFX8-NEXT:    flat_store_byte v[0:1], v2
3050; GFX8-NEXT:    s_endpgm
3051;
3052; GFX9-LABEL: sdivrem_i3:
3053; GFX9:       ; %bb.0:
3054; GFX9-NEXT:    s_load_dword s0, s[8:9], 0x10
3055; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3056; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3057; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x30008
3058; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
3059; GFX9-NEXT:    s_add_i32 s1, s1, s4
3060; GFX9-NEXT:    s_xor_b32 s5, s1, s4
3061; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
3062; GFX9-NEXT:    s_sub_i32 s1, 0, s5
3063; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x30000
3064; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
3065; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3066; GFX9-NEXT:    s_add_i32 s0, s0, s6
3067; GFX9-NEXT:    s_xor_b32 s7, s0, s6
3068; GFX9-NEXT:    s_xor_b32 s4, s6, s4
3069; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3070; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
3071; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
3072; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3073; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
3074; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
3075; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
3076; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s5
3077; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
3078; GFX9-NEXT:    v_sub_u32_e32 v1, s7, v1
3079; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
3080; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3081; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
3082; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
3083; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
3084; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
3085; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3086; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
3087; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
3088; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
3089; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
3090; GFX9-NEXT:    v_xor_b32_e32 v1, s6, v1
3091; GFX9-NEXT:    v_subrev_u32_e32 v1, s6, v1
3092; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3093; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3094; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
3095; GFX9-NEXT:    v_and_b32_e32 v0, 7, v1
3096; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
3097; GFX9-NEXT:    s_endpgm
3098;
3099; GFX10-LABEL: sdivrem_i3:
3100; GFX10:       ; %bb.0:
3101; GFX10-NEXT:    s_load_dword s0, s[8:9], 0x10
3102; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3103; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x30008
3104; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x30000
3105; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
3106; GFX10-NEXT:    s_ashr_i32 s5, s0, 31
3107; GFX10-NEXT:    s_add_i32 s1, s1, s4
3108; GFX10-NEXT:    s_add_i32 s0, s0, s5
3109; GFX10-NEXT:    s_xor_b32 s1, s1, s4
3110; GFX10-NEXT:    s_xor_b32 s0, s0, s5
3111; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
3112; GFX10-NEXT:    s_sub_i32 s2, 0, s1
3113; GFX10-NEXT:    s_xor_b32 s4, s5, s4
3114; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3115; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3116; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
3117; GFX10-NEXT:    v_mul_lo_u32 v1, s2, v0
3118; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
3119; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
3120; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
3121; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s1
3122; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
3123; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
3124; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
3125; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
3126; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
3127; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
3128; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
3129; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
3130; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
3131; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3132; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
3133; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
3134; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3135; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
3136; GFX10-NEXT:    v_xor_b32_e32 v1, s5, v1
3137; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
3138; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s5, v1
3139; GFX10-NEXT:    v_and_b32_e32 v0, 7, v0
3140; GFX10-NEXT:    v_and_b32_e32 v1, 7, v1
3141; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3142; GFX10-NEXT:    global_store_byte v2, v0, s[0:1]
3143; GFX10-NEXT:    global_store_byte v2, v1, s[2:3]
3144; GFX10-NEXT:    s_endpgm
; The sdiv and srem below take the same %x and %y. Each expected sequence above
; obtains both operands from a single loaded dword via two s_bfe_i32
; instructions, contains one v_rcp_iflag_f32 from which both results are
; derived, masks each result with 7, and writes it with a byte store.
3145  %div = sdiv i3 %x, %y
3146  store i3 %div, ptr addrspace(1) %out0
3147  %rem = srem i3 %x, %y
3148  store i3 %rem, ptr addrspace(1) %out1
3149  ret void
3150}
3151
; Test: signed division and signed remainder of the same i27 operands (an
; integer width that is not a power of two) in one kernel; the quotient is
; stored to %out0 and the remainder to %out1. The assertion lines in this
; function are autogenerated (see the note at the top of the file) for the
; three RUN configurations of this test (tonga, gfx900, gfx1010); regenerate
; them rather than editing them by hand.
3152define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) {
3153; GFX8-LABEL: sdivrem_i27:
3154; GFX8:       ; %bb.0:
3155; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
3156; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3157; GFX8-NEXT:    s_bfe_i32 s0, s5, 0x1b0000
3158; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
3159; GFX8-NEXT:    s_add_i32 s0, s0, s5
3160; GFX8-NEXT:    s_xor_b32 s6, s0, s5
3161; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
3162; GFX8-NEXT:    s_sub_i32 s0, 0, s6
3163; GFX8-NEXT:    s_bfe_i32 s4, s4, 0x1b0000
3164; GFX8-NEXT:    s_ashr_i32 s7, s4, 31
3165; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3166; GFX8-NEXT:    s_add_i32 s4, s4, s7
3167; GFX8-NEXT:    s_xor_b32 s4, s4, s7
3168; GFX8-NEXT:    s_xor_b32 s5, s7, s5
3169; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3170; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
3171; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v0
3172; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3173; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
3174; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
3175; GFX8-NEXT:    v_mul_hi_u32 v2, s4, v0
3176; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3177; GFX8-NEXT:    v_mov_b32_e32 v0, s0
3178; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3179; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s6
3180; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
3181; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
3182; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
3183; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
3184; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v3
3185; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
3186; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
3187; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
3188; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
3189; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s6, v3
3190; GFX8-NEXT:    v_xor_b32_e32 v2, s5, v2
3191; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
3192; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s5, v2
3193; GFX8-NEXT:    v_xor_b32_e32 v3, s7, v3
3194; GFX8-NEXT:    v_and_b32_e32 v2, 0x7ffffff, v2
3195; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v3
3196; GFX8-NEXT:    flat_store_dword v[0:1], v2
3197; GFX8-NEXT:    v_mov_b32_e32 v0, s2
3198; GFX8-NEXT:    v_and_b32_e32 v2, 0x7ffffff, v3
3199; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3200; GFX8-NEXT:    flat_store_dword v[0:1], v2
3201; GFX8-NEXT:    s_endpgm
3202;
3203; GFX9-LABEL: sdivrem_i27:
3204; GFX9:       ; %bb.0:
3205; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x10
3206; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3207; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3208; GFX9-NEXT:    s_bfe_i32 s1, s1, 0x1b0000
3209; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
3210; GFX9-NEXT:    s_add_i32 s1, s1, s4
3211; GFX9-NEXT:    s_xor_b32 s5, s1, s4
3212; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
3213; GFX9-NEXT:    s_sub_i32 s1, 0, s5
3214; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x1b0000
3215; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
3216; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3217; GFX9-NEXT:    s_add_i32 s0, s0, s6
3218; GFX9-NEXT:    s_xor_b32 s7, s0, s6
3219; GFX9-NEXT:    s_xor_b32 s4, s6, s4
3220; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3221; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
3222; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
3223; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3224; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
3225; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
3226; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
3227; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s5
3228; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
3229; GFX9-NEXT:    v_sub_u32_e32 v1, s7, v1
3230; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
3231; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3232; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
3233; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
3234; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
3235; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
3236; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3237; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
3238; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
3239; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
3240; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
3241; GFX9-NEXT:    v_xor_b32_e32 v1, s6, v1
3242; GFX9-NEXT:    v_subrev_u32_e32 v1, s6, v1
3243; GFX9-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
3244; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3245; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
3246; GFX9-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v1
3247; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
3248; GFX9-NEXT:    s_endpgm
3249;
3250; GFX10-LABEL: sdivrem_i27:
3251; GFX10:       ; %bb.0:
3252; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x10
3253; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3254; GFX10-NEXT:    s_bfe_i32 s1, s1, 0x1b0000
3255; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x1b0000
3256; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
3257; GFX10-NEXT:    s_ashr_i32 s5, s0, 31
3258; GFX10-NEXT:    s_add_i32 s1, s1, s4
3259; GFX10-NEXT:    s_add_i32 s0, s0, s5
3260; GFX10-NEXT:    s_xor_b32 s1, s1, s4
3261; GFX10-NEXT:    s_xor_b32 s0, s0, s5
3262; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
3263; GFX10-NEXT:    s_sub_i32 s2, 0, s1
3264; GFX10-NEXT:    s_xor_b32 s4, s5, s4
3265; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3266; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3267; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
3268; GFX10-NEXT:    v_mul_lo_u32 v1, s2, v0
3269; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
3270; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
3271; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
3272; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s1
3273; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
3274; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
3275; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
3276; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
3277; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
3278; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
3279; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
3280; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
3281; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
3282; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3283; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
3284; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
3285; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3286; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
3287; GFX10-NEXT:    v_xor_b32_e32 v1, s5, v1
3288; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
3289; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s5, v1
3290; GFX10-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
3291; GFX10-NEXT:    v_and_b32_e32 v1, 0x7ffffff, v1
3292; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3293; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
3294; GFX10-NEXT:    global_store_dword v2, v1, s[2:3]
3295; GFX10-NEXT:    s_endpgm
; The sdiv and srem below take the same %x and %y. Each expected sequence above
; loads the two operands as a dword pair, applies s_bfe_i32 to each, contains
; one v_rcp_iflag_f32 from which both results are derived, masks each result
; with 0x7ffffff, and writes it with a dword store.
3296  %div = sdiv i27 %x, %y
3297  store i27 %div, ptr addrspace(1) %out0
3298  %rem = srem i27 %x, %y
3299  store i27 %rem, ptr addrspace(1) %out1
3300  ret void
3301}
3302