xref: /llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (revision 463e93b95f0887145b51edb81b770eeb4463abc5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
5
; udiv_i32: unsigned 32-bit division of kernel arguments %x by %y; the
; quotient is stored to %out.
; The CHECK lines pin the IR emitted by opt -amdgpu-codegenprepare: %y is
; converted to float, its reciprocal is scaled by a constant and converted
; back to an integer estimate, the estimate is adjusted by the high half of
; estimate * (-%y * estimate), a trial quotient is taken as the high half of
; %x * estimate, and two icmp-uge/select steps bump the quotient by 1 while
; reducing the running remainder by %y.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
6define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
7; CHECK-LABEL: @udiv_i32(
8; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
9; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
10; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
11; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
12; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
13; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
14; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
15; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
16; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
17; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
18; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
19; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
20; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
21; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
22; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
23; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
24; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
25; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
26; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
27; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
28; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
29; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
30; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
31; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
32; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
33; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
34; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
35; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
36; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
37; CHECK-NEXT:    store i32 [[TMP29]], ptr addrspace(1) [[OUT:%.*]], align 4
38; CHECK-NEXT:    ret void
39;
40; GFX6-LABEL: udiv_i32:
41; GFX6:       ; %bb.0:
42; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
43; GFX6-NEXT:    s_mov_b32 s7, 0xf000
44; GFX6-NEXT:    s_mov_b32 s6, -1
45; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
47; GFX6-NEXT:    s_sub_i32 s4, 0, s3
48; GFX6-NEXT:    s_mov_b32 s5, s1
49; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
50; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
51; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
52; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
53; GFX6-NEXT:    s_mov_b32 s4, s0
54; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
55; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
56; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
57; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
58; GFX6-NEXT:    s_mul_i32 s0, s0, s3
59; GFX6-NEXT:    s_sub_i32 s0, s2, s0
60; GFX6-NEXT:    s_sub_i32 s1, s0, s3
61; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
62; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
63; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
64; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
65; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
66; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
67; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
68; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
69; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
70; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
71; GFX6-NEXT:    s_endpgm
72;
73; GFX9-LABEL: udiv_i32:
74; GFX9:       ; %bb.0:
75; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
76; GFX9-NEXT:    v_mov_b32_e32 v1, 0
77; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
79; GFX9-NEXT:    s_sub_i32 s4, 0, s3
80; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
81; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
82; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
83; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
84; GFX9-NEXT:    s_mul_i32 s4, s4, s5
85; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
86; GFX9-NEXT:    s_add_i32 s5, s5, s4
87; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
88; GFX9-NEXT:    s_mul_i32 s5, s4, s3
89; GFX9-NEXT:    s_sub_i32 s2, s2, s5
90; GFX9-NEXT:    s_add_i32 s6, s4, 1
91; GFX9-NEXT:    s_sub_i32 s5, s2, s3
92; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
93; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
94; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
95; GFX9-NEXT:    s_add_i32 s5, s4, 1
96; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
97; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
98; GFX9-NEXT:    v_mov_b32_e32 v0, s2
99; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
100; GFX9-NEXT:    s_endpgm
101  %r = udiv i32 %x, %y
102  store i32 %r, ptr addrspace(1) %out
103  ret void
104}
105
; urem_i32: unsigned 32-bit remainder of kernel arguments %x by %y; the
; remainder is stored to %out.
; The CHECK lines pin the IR emitted by opt -amdgpu-codegenprepare: the same
; float-reciprocal estimate and high-half multiply sequence as the unsigned
; i32 division test produces a trial quotient, the trial remainder is
; %x - quotient * %y, and two icmp-uge/select steps subtract %y from the
; remainder while it is still >= %y.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
106define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
107; CHECK-LABEL: @urem_i32(
108; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
109; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
110; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
111; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
112; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
113; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
114; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
115; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
116; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
117; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
118; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
119; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
120; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
121; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
122; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
123; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
124; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
125; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
126; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
127; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
128; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
129; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
130; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
131; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
132; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
133; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
134; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
135; CHECK-NEXT:    store i32 [[TMP27]], ptr addrspace(1) [[OUT:%.*]], align 4
136; CHECK-NEXT:    ret void
137;
138; GFX6-LABEL: urem_i32:
139; GFX6:       ; %bb.0:
140; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
141; GFX6-NEXT:    s_mov_b32 s7, 0xf000
142; GFX6-NEXT:    s_mov_b32 s6, -1
143; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
144; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
145; GFX6-NEXT:    s_sub_i32 s4, 0, s3
146; GFX6-NEXT:    s_mov_b32 s5, s1
147; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
148; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
149; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
150; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
151; GFX6-NEXT:    s_mov_b32 s4, s0
152; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
153; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
154; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
155; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
156; GFX6-NEXT:    s_mul_i32 s0, s0, s3
157; GFX6-NEXT:    s_sub_i32 s0, s2, s0
158; GFX6-NEXT:    s_sub_i32 s1, s0, s3
159; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
160; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
161; GFX6-NEXT:    s_sub_i32 s1, s0, s3
162; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
163; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
164; GFX6-NEXT:    v_mov_b32_e32 v0, s0
165; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
166; GFX6-NEXT:    s_endpgm
167;
168; GFX9-LABEL: urem_i32:
169; GFX9:       ; %bb.0:
170; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
171; GFX9-NEXT:    v_mov_b32_e32 v1, 0
172; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
173; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
174; GFX9-NEXT:    s_sub_i32 s4, 0, s3
175; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
176; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
177; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
178; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
179; GFX9-NEXT:    s_mul_i32 s4, s4, s5
180; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
181; GFX9-NEXT:    s_add_i32 s5, s5, s4
182; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
183; GFX9-NEXT:    s_mul_i32 s4, s4, s3
184; GFX9-NEXT:    s_sub_i32 s2, s2, s4
185; GFX9-NEXT:    s_sub_i32 s4, s2, s3
186; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
187; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
188; GFX9-NEXT:    s_sub_i32 s4, s2, s3
189; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
190; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
191; GFX9-NEXT:    v_mov_b32_e32 v0, s2
192; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
193; GFX9-NEXT:    s_endpgm
194  %r = urem i32 %x, %y
195  store i32 %r, ptr addrspace(1) %out
196  ret void
197}
198
; sdiv_i32: signed 32-bit division of kernel arguments %x by %y; the quotient
; is stored to %out.
; The CHECK lines pin the IR emitted by opt -amdgpu-codegenprepare: sign masks
; are taken with ashr 31 and xor'ed together, each operand has its mask added
; and then xor'ed in (llc selects s_abs_i32 for this in the GFX6/GFX9 output),
; the unsigned reciprocal-estimate sequence with two icmp-uge/select
; corrections runs on those values, and the quotient is finally xor'ed with
; and reduced by the combined sign mask.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
199define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
200; CHECK-LABEL: @sdiv_i32(
201; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
202; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
203; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
204; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
205; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
206; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
207; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
208; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
209; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
210; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
211; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
212; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
213; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
214; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
215; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
216; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
217; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
218; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
219; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
220; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
221; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
222; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
223; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
224; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
225; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
226; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
227; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
228; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
229; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
230; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
231; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
232; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
233; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
234; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
235; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
236; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
237; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
238; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
239; CHECK-NEXT:    store i32 [[TMP38]], ptr addrspace(1) [[OUT:%.*]], align 4
240; CHECK-NEXT:    ret void
241;
242; GFX6-LABEL: sdiv_i32:
243; GFX6:       ; %bb.0:
244; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
245; GFX6-NEXT:    s_mov_b32 s7, 0xf000
246; GFX6-NEXT:    s_mov_b32 s6, -1
247; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX6-NEXT:    s_abs_i32 s8, s3
249; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
250; GFX6-NEXT:    s_sub_i32 s4, 0, s8
251; GFX6-NEXT:    s_mov_b32 s5, s1
252; GFX6-NEXT:    s_xor_b32 s1, s2, s3
253; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
254; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
255; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
256; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
257; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
258; GFX6-NEXT:    s_mov_b32 s4, s0
259; GFX6-NEXT:    s_abs_i32 s0, s2
260; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
261; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
262; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
263; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
264; GFX6-NEXT:    s_mul_i32 s2, s2, s8
265; GFX6-NEXT:    s_sub_i32 s0, s0, s2
266; GFX6-NEXT:    s_sub_i32 s2, s0, s8
267; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
268; GFX6-NEXT:    s_cmp_ge_u32 s0, s8
269; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
270; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
271; GFX6-NEXT:    s_cselect_b32 s0, s2, s0
272; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
273; GFX6-NEXT:    s_cmp_ge_u32 s0, s8
274; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
275; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
276; GFX6-NEXT:    v_xor_b32_e32 v0, s1, v0
277; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s1, v0
278; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
279; GFX6-NEXT:    s_endpgm
280;
281; GFX9-LABEL: sdiv_i32:
282; GFX9:       ; %bb.0:
283; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
284; GFX9-NEXT:    v_mov_b32_e32 v1, 0
285; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
286; GFX9-NEXT:    s_abs_i32 s4, s3
287; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
288; GFX9-NEXT:    s_sub_i32 s5, 0, s4
289; GFX9-NEXT:    s_xor_b32 s3, s2, s3
290; GFX9-NEXT:    s_abs_i32 s2, s2
291; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
292; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
293; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
294; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
295; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
296; GFX9-NEXT:    s_mul_i32 s5, s5, s6
297; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
298; GFX9-NEXT:    s_add_i32 s6, s6, s5
299; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
300; GFX9-NEXT:    s_mul_i32 s6, s5, s4
301; GFX9-NEXT:    s_sub_i32 s2, s2, s6
302; GFX9-NEXT:    s_add_i32 s7, s5, 1
303; GFX9-NEXT:    s_sub_i32 s6, s2, s4
304; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
305; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
306; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
307; GFX9-NEXT:    s_add_i32 s6, s5, 1
308; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
309; GFX9-NEXT:    s_cselect_b32 s2, s6, s5
310; GFX9-NEXT:    s_xor_b32 s2, s2, s3
311; GFX9-NEXT:    s_sub_i32 s2, s2, s3
312; GFX9-NEXT:    v_mov_b32_e32 v0, s2
313; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
314; GFX9-NEXT:    s_endpgm
315  %r = sdiv i32 %x, %y
316  store i32 %r, ptr addrspace(1) %out
317  ret void
318}
319
; srem_i32: signed 32-bit remainder of kernel arguments %x by %y; the
; remainder is stored to %out.
; The CHECK lines pin the IR emitted by opt -amdgpu-codegenprepare: sign masks
; are taken with ashr 31, each operand has its mask added and then xor'ed in
; (llc selects s_abs_i32 for this in the GFX6/GFX9 output), the unsigned
; reciprocal-estimate sequence with two icmp-uge/select remainder corrections
; runs on those values, and the remainder is finally xor'ed with and reduced
; by the sign mask of %x only.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
320define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
321; CHECK-LABEL: @srem_i32(
322; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
323; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
324; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
325; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
326; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
327; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
328; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
329; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
330; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
331; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
332; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
333; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
334; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
335; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
336; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
337; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
338; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
339; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
340; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
341; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
342; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
343; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
344; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
345; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
346; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
347; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
348; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
349; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
350; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
351; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
352; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
353; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
354; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
355; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
356; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
357; CHECK-NEXT:    store i32 [[TMP35]], ptr addrspace(1) [[OUT:%.*]], align 4
358; CHECK-NEXT:    ret void
359;
360; GFX6-LABEL: srem_i32:
361; GFX6:       ; %bb.0:
362; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
363; GFX6-NEXT:    s_mov_b32 s7, 0xf000
364; GFX6-NEXT:    s_mov_b32 s6, -1
365; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
366; GFX6-NEXT:    s_abs_i32 s3, s3
367; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
368; GFX6-NEXT:    s_sub_i32 s4, 0, s3
369; GFX6-NEXT:    s_abs_i32 s8, s2
370; GFX6-NEXT:    s_mov_b32 s5, s1
371; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
372; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
373; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
374; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
375; GFX6-NEXT:    s_mov_b32 s4, s0
376; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
377; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
378; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
379; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
380; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
381; GFX6-NEXT:    s_mul_i32 s1, s1, s3
382; GFX6-NEXT:    s_sub_i32 s1, s8, s1
383; GFX6-NEXT:    s_sub_i32 s2, s1, s3
384; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
385; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
386; GFX6-NEXT:    s_sub_i32 s2, s1, s3
387; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
388; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
389; GFX6-NEXT:    s_xor_b32 s1, s1, s0
390; GFX6-NEXT:    s_sub_i32 s0, s1, s0
391; GFX6-NEXT:    v_mov_b32_e32 v0, s0
392; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
393; GFX6-NEXT:    s_endpgm
394;
395; GFX9-LABEL: srem_i32:
396; GFX9:       ; %bb.0:
397; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
398; GFX9-NEXT:    v_mov_b32_e32 v1, 0
399; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
400; GFX9-NEXT:    s_abs_i32 s3, s3
401; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
402; GFX9-NEXT:    s_sub_i32 s5, 0, s3
403; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
404; GFX9-NEXT:    s_abs_i32 s2, s2
405; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
406; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
407; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
408; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
409; GFX9-NEXT:    s_mul_i32 s5, s5, s6
410; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
411; GFX9-NEXT:    s_add_i32 s6, s6, s5
412; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
413; GFX9-NEXT:    s_mul_i32 s5, s5, s3
414; GFX9-NEXT:    s_sub_i32 s2, s2, s5
415; GFX9-NEXT:    s_sub_i32 s5, s2, s3
416; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
417; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
418; GFX9-NEXT:    s_sub_i32 s5, s2, s3
419; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
420; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
421; GFX9-NEXT:    s_xor_b32 s2, s2, s4
422; GFX9-NEXT:    s_sub_i32 s2, s2, s4
423; GFX9-NEXT:    v_mov_b32_e32 v0, s2
424; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
425; GFX9-NEXT:    s_endpgm
426  %r = srem i32 %x, %y
427  store i32 %r, ptr addrspace(1) %out
428  ret void
429}
430
; udiv_i16: unsigned 16-bit division of kernel arguments %x by %y; the
; quotient is stored to %out.
; The CHECK lines pin the IR emitted by opt -amdgpu-codegenprepare: both
; operands are zero-extended to i32 and converted to float, the quotient is
; trunc(x * rcp(y)), llvm.amdgcn.fmad.ftz forms -q * y + x, and 1 is added to
; the integer quotient when |that value| >= |y|. The result is masked with
; 65535 and truncated to i16.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
431define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
432; CHECK-LABEL: @udiv_i16(
433; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
434; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
435; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
436; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
437; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
438; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
439; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
440; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
441; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
442; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
443; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
444; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
445; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
446; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
447; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
448; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
449; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
450; CHECK-NEXT:    store i16 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 2
451; CHECK-NEXT:    ret void
452;
453; GFX6-LABEL: udiv_i16:
454; GFX6:       ; %bb.0:
455; GFX6-NEXT:    s_load_dword s0, s[4:5], 0xb
456; GFX6-NEXT:    s_mov_b32 s3, 0xf000
457; GFX6-NEXT:    s_mov_b32 s2, -1
458; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
460; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s1
461; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
462; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s0
463; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
464; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
465; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
466; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
467; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
468; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
469; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
470; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
471; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
472; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
473; GFX6-NEXT:    s_endpgm
474;
475; GFX9-LABEL: udiv_i16:
476; GFX9:       ; %bb.0:
477; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x2c
478; GFX9-NEXT:    v_mov_b32_e32 v3, 0
479; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
481; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
482; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
483; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
484; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
485; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
486; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
487; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
488; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
489; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
490; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
491; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
492; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
493; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
494; GFX9-NEXT:    s_endpgm
495  %r = udiv i16 %x, %y
496  store i16 %r, ptr addrspace(1) %out
497  ret void
498}
499
; urem_i16: unsigned 16-bit remainder of kernel arguments %x by %y; the
; remainder is stored to %out.
; The CHECK lines pin the IR emitted by opt -amdgpu-codegenprepare: the
; quotient is built from the float sequence trunc(x * rcp(y)) plus a 0/1
; correction selected by |fmad(-q, y, x)| >= |y|, then the remainder is
; x - quotient * y on the zero-extended i32 operands, masked with 65535 and
; truncated to i16.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
500define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
501; CHECK-LABEL: @urem_i16(
502; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
503; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
504; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
505; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
506; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
507; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
508; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
509; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
510; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
511; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
512; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
513; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
514; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
515; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
516; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
517; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
518; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
519; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
520; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
521; CHECK-NEXT:    store i16 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 2
522; CHECK-NEXT:    ret void
523;
524; GFX6-LABEL: urem_i16:
525; GFX6:       ; %bb.0:
526; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
527; GFX6-NEXT:    s_mov_b32 s3, 0xf000
528; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX6-NEXT:    s_lshr_b32 s2, s6, 16
530; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
531; GFX6-NEXT:    s_and_b32 s0, s6, 0xffff
532; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s0
533; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
534; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
535; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
536; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
537; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
538; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
539; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
540; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
541; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
542; GFX6-NEXT:    s_mov_b32 s2, -1
543; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
544; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
545; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
546; GFX6-NEXT:    s_endpgm
547;
548; GFX9-LABEL: urem_i16:
549; GFX9:       ; %bb.0:
550; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
551; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
552; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
553; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
554; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
555; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
556; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
557; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
558; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
559; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
560; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
561; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
562; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
563; GFX9-NEXT:    v_mov_b32_e32 v1, 0
564; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
565; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
566; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
567; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
568; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
569; GFX9-NEXT:    s_endpgm
570  %r = urem i16 %x, %y
571  store i16 %r, ptr addrspace(1) %out
572  ret void
573}
574
; sdiv_i16: signed 16-bit division of kernel arguments %x by %y; the quotient
; is stored to %out.
; The CHECK lines pin the IR emitted by opt -amdgpu-codegenprepare: both
; operands are sign-extended to i32, a correction value is formed as
; ((x ^ y) ashr 30) | 1, the operands are converted with sitofp, the quotient
; is trunc(x * rcp(y)) converted with fptosi, and the correction value is
; added when |fmad(-q, y, x)| >= |y|. The result is sign-extended from 16 bits
; with shl 16 / ashr 16 and truncated to i16.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
575define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
576; CHECK-LABEL: @sdiv_i16(
577; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
578; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
579; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
580; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
581; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
582; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
583; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
584; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
585; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
586; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
587; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
588; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
589; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
590; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
591; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
592; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
593; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
594; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
595; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
596; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
597; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
598; CHECK-NEXT:    store i16 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 2
599; CHECK-NEXT:    ret void
600;
601; GFX6-LABEL: sdiv_i16:
602; GFX6:       ; %bb.0:
603; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
604; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
605; GFX6-NEXT:    s_mov_b32 s3, 0xf000
606; GFX6-NEXT:    s_mov_b32 s2, -1
607; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
608; GFX6-NEXT:    s_ashr_i32 s4, s6, 16
609; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
610; GFX6-NEXT:    s_sext_i32_i16 s5, s6
611; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
612; GFX6-NEXT:    s_xor_b32 s4, s5, s4
613; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
614; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
615; GFX6-NEXT:    s_or_b32 s6, s4, 1
616; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
617; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
618; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
619; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
620; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
621; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
622; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
623; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
624; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
625; GFX6-NEXT:    s_endpgm
626;
627; GFX9-LABEL: sdiv_i16:
628; GFX9:       ; %bb.0:
629; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
630; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
631; GFX9-NEXT:    v_mov_b32_e32 v1, 0
632; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
633; GFX9-NEXT:    s_ashr_i32 s3, s2, 16
634; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s3
635; GFX9-NEXT:    s_sext_i32_i16 s2, s2
636; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s2
637; GFX9-NEXT:    s_xor_b32 s2, s2, s3
638; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
639; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
640; GFX9-NEXT:    s_or_b32 s4, s2, 1
641; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
642; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
643; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
644; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
645; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
646; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
647; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
648; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
649; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
650; GFX9-NEXT:    s_endpgm
651  %r = sdiv i16 %x, %y
652  store i16 %r, ptr addrspace(1) %out
653  ret void
654}
655
; Signed 16-bit remainder of two kernel arguments, stored to %out.
; The opt assertions expect the srem to be expanded instead of left as an
; integer operation: both operands are sign-extended to i32 and converted to
; float, a truncated quotient is formed with rcp/fmul/trunc, and an fmad-based
; residual compared against |y| selects a correction of either 0 or
; ((x ^ y) >> 30) | 1, i.e. +1 or -1 depending on the sign of the quotient.
; The remainder is then x - q * y, re-sign-extended from 16 bits (shl/ashr 16)
; before the final trunc. The llc assertions cover the tahiti and gfx900 ISA.
656define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
657; CHECK-LABEL: @srem_i16(
658; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
659; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
660; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
661; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
662; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
663; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
664; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
665; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
666; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
667; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
668; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
669; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
670; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
671; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
672; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
673; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
674; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
675; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
676; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
677; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
678; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
679; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
680; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
681; CHECK-NEXT:    store i16 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 2
682; CHECK-NEXT:    ret void
683;
684; GFX6-LABEL: srem_i16:
685; GFX6:       ; %bb.0:
686; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
687; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
688; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX6-NEXT:    s_ashr_i32 s7, s6, 16
690; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s7
691; GFX6-NEXT:    s_sext_i32_i16 s2, s6
692; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
693; GFX6-NEXT:    s_xor_b32 s2, s2, s7
694; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
695; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
696; GFX6-NEXT:    s_or_b32 s4, s2, 1
697; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
698; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
699; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
700; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
701; GFX6-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
702; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], exec
703; GFX6-NEXT:    s_cselect_b32 s2, s4, 0
704; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
705; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s7
706; GFX6-NEXT:    s_mov_b32 s3, 0xf000
707; GFX6-NEXT:    s_mov_b32 s2, -1
708; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
709; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
710; GFX6-NEXT:    s_endpgm
711;
712; GFX9-LABEL: srem_i16:
713; GFX9:       ; %bb.0:
714; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
715; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
716; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX9-NEXT:    s_ashr_i32 s7, s6, 16
718; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s7
719; GFX9-NEXT:    s_sext_i32_i16 s2, s6
720; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
721; GFX9-NEXT:    s_xor_b32 s2, s2, s7
722; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
723; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
724; GFX9-NEXT:    s_or_b32 s4, s2, 1
725; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
726; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
727; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
728; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
729; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
730; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
731; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
732; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
733; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s7
734; GFX9-NEXT:    v_mov_b32_e32 v1, 0
735; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
736; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
737; GFX9-NEXT:    s_endpgm
738  %r = srem i16 %x, %y
739  store i16 %r, ptr addrspace(1) %out
740  ret void
741}
742
; Unsigned 8-bit division of two kernel arguments, stored to %out.
; The opt assertions expect the udiv to be expanded through float arithmetic:
; both operands are zero-extended to i32 and converted with uitofp, a truncated
; quotient is formed with rcp/fmul/trunc, and an fmad-based residual compared
; against |y| adds 1 or 0 to it. The result is masked with 255 before the trunc.
; In the llc assertions both targets convert byte 0 and byte 1 of the single
; loaded argument dword (v_cvt_f32_ubyte0 / v_cvt_f32_ubyte1) and apply the
; correction with an add-with-carry on the compare result.
743define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
744; CHECK-LABEL: @udiv_i8(
745; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
746; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
747; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
748; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
749; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
750; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
751; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
752; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
753; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
754; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
755; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
756; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
757; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
758; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
759; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
760; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
761; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
762; CHECK-NEXT:    store i8 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 1
763; CHECK-NEXT:    ret void
764;
765; GFX6-LABEL: udiv_i8:
766; GFX6:       ; %bb.0:
767; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
768; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
769; GFX6-NEXT:    s_mov_b32 s3, 0xf000
770; GFX6-NEXT:    s_mov_b32 s2, -1
771; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
772; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s6
773; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
774; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s6
775; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
776; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
777; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
778; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
779; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
780; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
781; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
782; GFX6-NEXT:    s_endpgm
783;
784; GFX9-LABEL: udiv_i8:
785; GFX9:       ; %bb.0:
786; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
787; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
788; GFX9-NEXT:    v_mov_b32_e32 v2, 0
789; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
791; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
792; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
793; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
794; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
795; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
796; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
797; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
798; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
799; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
800; GFX9-NEXT:    s_endpgm
801  %r = udiv i8 %x, %y
802  store i8 %r, ptr addrspace(1) %out
803  ret void
804}
805
; Unsigned 8-bit remainder of two kernel arguments, stored to %out.
; The opt assertions expect the same float-based quotient as the unsigned i8
; division (zext to i32, uitofp, rcp/fmul/trunc, fmad residual compared against
; |y| adding 1 or 0), followed by x - q * y, which is masked with 255 before the
; final trunc.
; In the llc assertions the float operands come from v_cvt_f32_ubyte0 /
; v_cvt_f32_ubyte1 on the single loaded argument dword, while the integer
; multiply uses that dword shifted right by 8 (s_lshr_b32) as its second operand.
806define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
807; CHECK-LABEL: @urem_i8(
808; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
809; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
810; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
811; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
812; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
813; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
814; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
815; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
816; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
817; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
818; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
819; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
820; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
821; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
822; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
823; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
824; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
825; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
826; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
827; CHECK-NEXT:    store i8 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 1
828; CHECK-NEXT:    ret void
829;
830; GFX6-LABEL: urem_i8:
831; GFX6:       ; %bb.0:
832; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
833; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
834; GFX6-NEXT:    s_mov_b32 s3, 0xf000
835; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
836; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s6
837; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
838; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s6
839; GFX6-NEXT:    s_lshr_b32 s2, s6, 8
840; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
841; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
842; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
843; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
844; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
845; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
846; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
847; GFX6-NEXT:    s_mov_b32 s2, -1
848; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
849; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
850; GFX6-NEXT:    s_endpgm
851;
852; GFX9-LABEL: urem_i8:
853; GFX9:       ; %bb.0:
854; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
855; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
856; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
857; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
858; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
859; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
860; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
861; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
862; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
863; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
864; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
865; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
866; GFX9-NEXT:    v_mov_b32_e32 v1, 0
867; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
868; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
869; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
870; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
871; GFX9-NEXT:    s_endpgm
872  %r = urem i8 %x, %y
873  store i8 %r, ptr addrspace(1) %out
874  ret void
875}
876
; Signed 8-bit division of two kernel arguments, stored to %out.
; The opt assertions expect the sdiv to be expanded through float arithmetic:
; both operands are sign-extended to i32 and converted with sitofp, a truncated
; quotient is formed with rcp/fmul/trunc, and an fmad-based residual compared
; against |y| selects a correction of either 0 or ((x ^ y) >> 30) | 1, i.e. +1
; or -1 depending on the sign of the quotient. The corrected quotient is
; re-sign-extended from 8 bits (shl/ashr 24) before the final trunc.
; In the llc assertions the two operands are taken from the single loaded
; argument dword with s_sext_i32_i8 and s_bfe_i32.
877define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
878; CHECK-LABEL: @sdiv_i8(
879; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
880; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
881; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
882; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
883; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
884; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
885; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
886; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
887; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
888; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
889; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
890; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
891; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
892; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
893; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
894; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
895; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
896; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
897; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
898; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
899; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
900; CHECK-NEXT:    store i8 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 1
901; CHECK-NEXT:    ret void
902;
903; GFX6-LABEL: sdiv_i8:
904; GFX6:       ; %bb.0:
905; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
906; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
907; GFX6-NEXT:    s_mov_b32 s3, 0xf000
908; GFX6-NEXT:    s_mov_b32 s2, -1
909; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
910; GFX6-NEXT:    s_bfe_i32 s4, s6, 0x80008
911; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
912; GFX6-NEXT:    s_sext_i32_i8 s5, s6
913; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
914; GFX6-NEXT:    s_xor_b32 s4, s5, s4
915; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
916; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
917; GFX6-NEXT:    s_or_b32 s6, s4, 1
918; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
919; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
920; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
921; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
922; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
923; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
924; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
925; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
926; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
927; GFX6-NEXT:    s_endpgm
928;
929; GFX9-LABEL: sdiv_i8:
930; GFX9:       ; %bb.0:
931; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
932; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
933; GFX9-NEXT:    v_mov_b32_e32 v1, 0
934; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
935; GFX9-NEXT:    s_bfe_i32 s3, s2, 0x80008
936; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s3
937; GFX9-NEXT:    s_sext_i32_i8 s2, s2
938; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s2
939; GFX9-NEXT:    s_xor_b32 s2, s2, s3
940; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
941; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
942; GFX9-NEXT:    s_or_b32 s4, s2, 1
943; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
944; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
945; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
946; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
947; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
948; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
949; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
950; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
951; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
952; GFX9-NEXT:    s_endpgm
953  %r = sdiv i8 %x, %y
954  store i8 %r, ptr addrspace(1) %out
955  ret void
956}
957
; Signed 8-bit remainder of two kernel arguments, stored to %out.
; The opt assertions expect the same float-based quotient as the signed i8
; division (sext to i32, sitofp, rcp/fmul/trunc, fmad residual compared against
; |y| selecting 0 or ((x ^ y) >> 30) | 1), followed by x - q * y, which is
; re-sign-extended from 8 bits (shl/ashr 24) before the final trunc.
; In the llc assertions the float operands come from s_sext_i32_i8 and
; s_bfe_i32 on the single loaded argument dword, while the integer multiply
; uses that dword shifted right by 8 (s_lshr_b32) as its second operand.
958define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
959; CHECK-LABEL: @srem_i8(
960; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
961; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
962; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
963; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
964; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
965; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
966; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
967; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
968; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
969; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
970; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
971; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
972; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
973; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
974; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
975; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
976; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
977; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
978; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
979; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
980; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
981; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
982; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
983; CHECK-NEXT:    store i8 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 1
984; CHECK-NEXT:    ret void
985;
986; GFX6-LABEL: srem_i8:
987; GFX6:       ; %bb.0:
988; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
989; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
990; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
991; GFX6-NEXT:    s_bfe_i32 s2, s6, 0x80008
992; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
993; GFX6-NEXT:    s_sext_i32_i8 s3, s6
994; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
995; GFX6-NEXT:    s_xor_b32 s2, s3, s2
996; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
997; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
998; GFX6-NEXT:    s_lshr_b32 s4, s6, 8
999; GFX6-NEXT:    s_or_b32 s5, s2, 1
1000; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
1001; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
1002; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
1003; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
1004; GFX6-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
1005; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], exec
1006; GFX6-NEXT:    s_cselect_b32 s2, s5, 0
1007; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1008; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
1009; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1010; GFX6-NEXT:    s_mov_b32 s2, -1
1011; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
1012; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1013; GFX6-NEXT:    s_endpgm
1014;
1015; GFX9-LABEL: srem_i8:
1016; GFX9:       ; %bb.0:
1017; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
1018; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1019; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1020; GFX9-NEXT:    s_bfe_i32 s2, s6, 0x80008
1021; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
1022; GFX9-NEXT:    s_sext_i32_i8 s3, s6
1023; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s3
1024; GFX9-NEXT:    s_xor_b32 s2, s3, s2
1025; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1026; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
1027; GFX9-NEXT:    s_lshr_b32 s4, s6, 8
1028; GFX9-NEXT:    s_or_b32 s5, s2, 1
1029; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
1030; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1031; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
1032; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
1033; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
1034; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
1035; GFX9-NEXT:    s_cselect_b32 s2, s5, 0
1036; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
1037; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s4
1038; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1039; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
1040; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
1041; GFX9-NEXT:    s_endpgm
1042  %r = srem i8 %x, %y
1043  store i8 %r, ptr addrspace(1) %out
1044  ret void
1045}
1046
1047define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
1048; CHECK-LABEL: @udiv_v4i32(
1049; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1050; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1051; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1052; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1053; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1054; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1055; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1056; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1057; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1058; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1059; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1060; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1061; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1062; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1063; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1064; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1065; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1066; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1067; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1068; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1069; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1070; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1071; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1072; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1073; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
1074; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
1075; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1076; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
1077; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
1078; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
1079; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
1080; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP31]], i64 0
1081; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
1082; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1083; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
1084; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
1085; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
1086; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
1087; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
1088; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
1089; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
1090; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
1091; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
1092; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1093; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
1094; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
1095; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
1096; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
1097; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
1098; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
1099; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1100; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
1101; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
1102; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
1103; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
1104; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
1105; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
1106; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
1107; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
1108; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
1109; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
1110; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
1111; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
1112; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
1113; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
1114; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1115; CHECK-NEXT:    [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
1116; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
1117; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
1118; CHECK-NEXT:    [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
1119; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 0, [[TMP66]]
1120; CHECK-NEXT:    [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
1121; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
1122; CHECK-NEXT:    [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
1123; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
1124; CHECK-NEXT:    [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
1125; CHECK-NEXT:    [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
1126; CHECK-NEXT:    [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
1127; CHECK-NEXT:    [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
1128; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
1129; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
1130; CHECK-NEXT:    [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
1131; CHECK-NEXT:    [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
1132; CHECK-NEXT:    [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
1133; CHECK-NEXT:    [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
1134; CHECK-NEXT:    [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
1135; CHECK-NEXT:    [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
1136; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
1137; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP85]], 1
1138; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
1139; CHECK-NEXT:    [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
1140; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
1141; CHECK-NEXT:    [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
1142; CHECK-NEXT:    [[TMP94:%.*]] = add i32 [[TMP90]], 1
1143; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
1144; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
1145; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
1146; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1147; CHECK-NEXT:    [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
1148; CHECK-NEXT:    [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
1149; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
1150; CHECK-NEXT:    [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
1151; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 0, [[TMP98]]
1152; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
1153; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
1154; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1155; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1156; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1157; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1158; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1159; CHECK-NEXT:    [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
1160; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
1161; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
1162; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
1163; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
1164; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
1165; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
1166; CHECK-NEXT:    [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
1167; CHECK-NEXT:    [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
1168; CHECK-NEXT:    [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
1169; CHECK-NEXT:    [[TMP121:%.*]] = add i32 [[TMP117]], 1
1170; CHECK-NEXT:    [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
1171; CHECK-NEXT:    [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
1172; CHECK-NEXT:    [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
1173; CHECK-NEXT:    [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
1174; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
1175; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
1176; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
1177; CHECK-NEXT:    store <4 x i32> [[TMP128]], ptr addrspace(1) [[OUT:%.*]], align 16
1178; CHECK-NEXT:    ret void
1179;
1180; GFX6-LABEL: udiv_v4i32:
1181; GFX6:       ; %bb.0:
1182; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
1183; GFX6-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x9
1184; GFX6-NEXT:    s_mov_b32 s19, 0xf000
1185; GFX6-NEXT:    s_mov_b32 s18, -1
1186; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
1188; GFX6-NEXT:    s_sub_i32 s0, 0, s12
1189; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s13
1190; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s14
1191; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1192; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s15
1193; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1194; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1195; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1196; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1197; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
1198; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
1199; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
1200; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1201; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
1202; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
1203; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1204; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
1205; GFX6-NEXT:    s_mul_i32 s0, s0, s12
1206; GFX6-NEXT:    s_sub_i32 s0, s8, s0
1207; GFX6-NEXT:    s_sub_i32 s1, s0, s12
1208; GFX6-NEXT:    s_cmp_ge_u32 s0, s12
1209; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
1210; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
1211; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1212; GFX6-NEXT:    s_cmp_ge_u32 s0, s12
1213; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
1214; GFX6-NEXT:    s_sub_i32 s2, 0, s13
1215; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
1216; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1217; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
1218; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
1219; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1220; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
1221; GFX6-NEXT:    v_mul_hi_u32 v1, s9, v1
1222; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
1223; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1224; GFX6-NEXT:    v_readfirstlane_b32 s2, v1
1225; GFX6-NEXT:    s_mul_i32 s2, s2, s13
1226; GFX6-NEXT:    s_sub_i32 s2, s9, s2
1227; GFX6-NEXT:    s_sub_i32 s3, s2, s13
1228; GFX6-NEXT:    s_cmp_ge_u32 s2, s13
1229; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1230; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
1231; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1232; GFX6-NEXT:    s_cmp_ge_u32 s2, s13
1233; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
1234; GFX6-NEXT:    s_sub_i32 s6, 0, s14
1235; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v3
1236; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1237; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1238; GFX6-NEXT:    v_mul_hi_u32 v5, v3, v5
1239; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
1240; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
1241; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v3
1242; GFX6-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v6
1243; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
1244; GFX6-NEXT:    v_readfirstlane_b32 s6, v3
1245; GFX6-NEXT:    s_mul_i32 s6, s6, s14
1246; GFX6-NEXT:    s_sub_i32 s6, s10, s6
1247; GFX6-NEXT:    s_sub_i32 s7, s6, s14
1248; GFX6-NEXT:    s_cmp_ge_u32 s6, s14
1249; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
1250; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
1251; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1252; GFX6-NEXT:    s_cmp_ge_u32 s6, s14
1253; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
1254; GFX6-NEXT:    s_sub_i32 s8, 0, s15
1255; GFX6-NEXT:    v_mul_lo_u32 v7, s8, v5
1256; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1257; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
1258; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
1259; GFX6-NEXT:    v_cndmask_b32_e64 v2, v3, v6, s[6:7]
1260; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
1261; GFX6-NEXT:    v_mul_hi_u32 v5, s11, v5
1262; GFX6-NEXT:    v_readfirstlane_b32 s0, v5
1263; GFX6-NEXT:    s_mul_i32 s0, s0, s15
1264; GFX6-NEXT:    s_sub_i32 s0, s11, s0
1265; GFX6-NEXT:    s_sub_i32 s1, s0, s15
1266; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
1267; GFX6-NEXT:    s_cmp_ge_u32 s0, s15
1268; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1269; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
1270; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
1271; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
1272; GFX6-NEXT:    s_cmp_ge_u32 s0, s15
1273; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1274; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1275; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
1276; GFX6-NEXT:    s_endpgm
1277;
1278; GFX9-LABEL: udiv_v4i32:
1279; GFX9:       ; %bb.0:
1280; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
1281; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1282; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1283; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1284; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
1285; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
1286; GFX9-NEXT:    s_sub_i32 s2, 0, s12
1287; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s14
1288; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1289; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1290; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1291; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1292; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1293; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1294; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
1295; GFX9-NEXT:    s_mul_i32 s2, s2, s3
1296; GFX9-NEXT:    s_mul_hi_u32 s2, s3, s2
1297; GFX9-NEXT:    s_add_i32 s3, s3, s2
1298; GFX9-NEXT:    s_mul_hi_u32 s2, s8, s3
1299; GFX9-NEXT:    s_mul_i32 s3, s2, s12
1300; GFX9-NEXT:    s_sub_i32 s3, s8, s3
1301; GFX9-NEXT:    s_add_i32 s5, s2, 1
1302; GFX9-NEXT:    s_sub_i32 s6, s3, s12
1303; GFX9-NEXT:    s_cmp_ge_u32 s3, s12
1304; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
1305; GFX9-NEXT:    s_cselect_b32 s3, s6, s3
1306; GFX9-NEXT:    s_add_i32 s5, s2, 1
1307; GFX9-NEXT:    s_cmp_ge_u32 s3, s12
1308; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
1309; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
1310; GFX9-NEXT:    s_sub_i32 s3, 0, s13
1311; GFX9-NEXT:    s_mul_i32 s3, s3, s4
1312; GFX9-NEXT:    s_mul_hi_u32 s3, s4, s3
1313; GFX9-NEXT:    s_add_i32 s4, s4, s3
1314; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v2
1315; GFX9-NEXT:    s_mul_hi_u32 s3, s9, s4
1316; GFX9-NEXT:    s_mul_i32 s4, s3, s13
1317; GFX9-NEXT:    s_sub_i32 s4, s9, s4
1318; GFX9-NEXT:    s_add_i32 s5, s3, 1
1319; GFX9-NEXT:    s_sub_i32 s6, s4, s13
1320; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1321; GFX9-NEXT:    s_cmp_ge_u32 s4, s13
1322; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1323; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
1324; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
1325; GFX9-NEXT:    s_add_i32 s5, s3, 1
1326; GFX9-NEXT:    s_cmp_ge_u32 s4, s13
1327; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
1328; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
1329; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s15
1330; GFX9-NEXT:    s_sub_i32 s4, 0, s14
1331; GFX9-NEXT:    s_mul_i32 s4, s4, s5
1332; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
1333; GFX9-NEXT:    s_add_i32 s5, s5, s4
1334; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1335; GFX9-NEXT:    s_mul_hi_u32 s4, s10, s5
1336; GFX9-NEXT:    s_mul_i32 s5, s4, s14
1337; GFX9-NEXT:    s_sub_i32 s5, s10, s5
1338; GFX9-NEXT:    s_add_i32 s6, s4, 1
1339; GFX9-NEXT:    s_sub_i32 s7, s5, s14
1340; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1341; GFX9-NEXT:    s_cmp_ge_u32 s5, s14
1342; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1343; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
1344; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
1345; GFX9-NEXT:    s_add_i32 s6, s4, 1
1346; GFX9-NEXT:    s_cmp_ge_u32 s5, s14
1347; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
1348; GFX9-NEXT:    s_sub_i32 s5, 0, s15
1349; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
1350; GFX9-NEXT:    s_mul_i32 s5, s5, s6
1351; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
1352; GFX9-NEXT:    s_add_i32 s6, s6, s5
1353; GFX9-NEXT:    s_mul_hi_u32 s5, s11, s6
1354; GFX9-NEXT:    s_mul_i32 s6, s5, s15
1355; GFX9-NEXT:    s_sub_i32 s6, s11, s6
1356; GFX9-NEXT:    s_add_i32 s7, s5, 1
1357; GFX9-NEXT:    s_sub_i32 s8, s6, s15
1358; GFX9-NEXT:    s_cmp_ge_u32 s6, s15
1359; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
1360; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
1361; GFX9-NEXT:    s_add_i32 s7, s5, 1
1362; GFX9-NEXT:    s_cmp_ge_u32 s6, s15
1363; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
1364; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1365; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1366; GFX9-NEXT:    v_mov_b32_e32 v2, s4
1367; GFX9-NEXT:    v_mov_b32_e32 v3, s5
1368; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1369; GFX9-NEXT:    s_endpgm
1370  %r = udiv <4 x i32> %x, %y
1371  store <4 x i32> %r, ptr addrspace(1) %out
1372  ret void
1373}
1374
; urem_v4i32: unsigned remainder on <4 x i32> kernel arguments, stored to %out.
;
; The opt run (-amdgpu-codegenprepare, see the RUN lines at the top of the file)
; is expected to scalarize the urem: each of the four lanes is extracted,
; expanded independently, and re-inserted into the result vector. Per lane the
; expected expansion is:
;   z = fptoui(rcp(uitofp(y)) * K)   - K is just below 2^32 (0x4f7ffffe as f32)
;   z = z + mulhi(z, (0 - y) * z)    - one refinement step on the reciprocal
;   q = mulhi(x, z)                  - quotient estimate
;   r = x - q * y
;   r = (r >= y) ? r - y : r         - applied twice, unsigned compare
; where mulhi is the upper 32 bits of the zero-extended 64-bit product.
;
; The two llc runs (prefixes GFX6 and GFX9) pin the resulting machine code. In
; the expected GFX6 output the high multiplies are v_mul_hi_u32 followed by
; v_readfirstlane_b32; in the expected GFX9 output they are s_mul_hi_u32.
;
; All assertion lines below are autogenerated (see the NOTE on the first line of
; the file); regenerate them with that script instead of editing them by hand.
1375define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
1376; CHECK-LABEL: @urem_v4i32(
1377; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1378; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1379; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1380; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1381; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1382; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1383; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1384; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1385; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1386; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1387; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1388; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1389; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1390; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1391; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1392; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1393; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1394; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1395; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1396; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1397; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1398; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1399; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1400; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1401; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1402; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
1403; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
1404; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
1405; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
1406; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i64 0
1407; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
1408; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1409; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
1410; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
1411; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
1412; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
1413; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
1414; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
1415; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
1416; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
1417; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
1418; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
1419; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
1420; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1421; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
1422; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
1423; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1424; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1425; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1426; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1427; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1428; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1429; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1430; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1431; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1432; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1433; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1434; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1435; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1436; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1437; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1438; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1439; CHECK-NEXT:    [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1440; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1441; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1442; CHECK-NEXT:    [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1443; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1444; CHECK-NEXT:    [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1445; CHECK-NEXT:    [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1446; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1447; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1448; CHECK-NEXT:    [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1449; CHECK-NEXT:    [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1450; CHECK-NEXT:    [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1451; CHECK-NEXT:    [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1452; CHECK-NEXT:    [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1453; CHECK-NEXT:    [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1454; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1455; CHECK-NEXT:    [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1456; CHECK-NEXT:    [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1457; CHECK-NEXT:    [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1458; CHECK-NEXT:    [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1459; CHECK-NEXT:    [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1460; CHECK-NEXT:    [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1461; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1462; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1463; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1464; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1465; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1466; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1467; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1468; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1469; CHECK-NEXT:    [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1470; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1471; CHECK-NEXT:    [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1472; CHECK-NEXT:    [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1473; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1474; CHECK-NEXT:    [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1475; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1476; CHECK-NEXT:    [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1477; CHECK-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1478; CHECK-NEXT:    [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1479; CHECK-NEXT:    [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1480; CHECK-NEXT:    [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1481; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1482; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1483; CHECK-NEXT:    [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1484; CHECK-NEXT:    [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1485; CHECK-NEXT:    [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1486; CHECK-NEXT:    [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1487; CHECK-NEXT:    [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1488; CHECK-NEXT:    [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1489; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1490; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1491; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1492; CHECK-NEXT:    [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1493; CHECK-NEXT:    [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1494; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1495; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1496; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1497; CHECK-NEXT:    store <4 x i32> [[TMP120]], ptr addrspace(1) [[OUT:%.*]], align 16
1498; CHECK-NEXT:    ret void
1499;
1500; GFX6-LABEL: urem_v4i32:
1501; GFX6:       ; %bb.0:
1502; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
1503; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1504; GFX6-NEXT:    s_mov_b32 s2, -1
1505; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1506; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
1507; GFX6-NEXT:    s_sub_i32 s0, 0, s12
1508; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s13
1509; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1510; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1511; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1512; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1513; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v0
1514; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
1515; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1516; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
1517; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
1518; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1519; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s14
1520; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
1521; GFX6-NEXT:    s_mul_i32 s0, s0, s12
1522; GFX6-NEXT:    s_sub_i32 s0, s8, s0
1523; GFX6-NEXT:    s_sub_i32 s1, s0, s12
1524; GFX6-NEXT:    s_cmp_ge_u32 s0, s12
1525; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
1526; GFX6-NEXT:    s_sub_i32 s1, s0, s12
1527; GFX6-NEXT:    s_cmp_ge_u32 s0, s12
1528; GFX6-NEXT:    s_cselect_b32 s6, s1, s0
1529; GFX6-NEXT:    s_sub_i32 s0, 0, s13
1530; GFX6-NEXT:    v_mul_lo_u32 v0, s0, v1
1531; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1532; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
1533; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1534; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
1535; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
1536; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1537; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s15
1538; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
1539; GFX6-NEXT:    s_mul_i32 s0, s0, s13
1540; GFX6-NEXT:    s_sub_i32 s0, s9, s0
1541; GFX6-NEXT:    s_sub_i32 s1, s0, s13
1542; GFX6-NEXT:    s_cmp_ge_u32 s0, s13
1543; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
1544; GFX6-NEXT:    s_sub_i32 s1, s0, s13
1545; GFX6-NEXT:    s_cmp_ge_u32 s0, s13
1546; GFX6-NEXT:    s_cselect_b32 s7, s1, s0
1547; GFX6-NEXT:    s_sub_i32 s0, 0, s14
1548; GFX6-NEXT:    v_mul_lo_u32 v0, s0, v1
1549; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1550; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
1551; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1552; GFX6-NEXT:    v_mul_hi_u32 v0, s10, v0
1553; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
1554; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1555; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
1556; GFX6-NEXT:    s_mul_i32 s0, s0, s14
1557; GFX6-NEXT:    s_sub_i32 s0, s10, s0
1558; GFX6-NEXT:    s_sub_i32 s1, s0, s14
1559; GFX6-NEXT:    s_cmp_ge_u32 s0, s14
1560; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
1561; GFX6-NEXT:    s_sub_i32 s1, s0, s14
1562; GFX6-NEXT:    s_cmp_ge_u32 s0, s14
1563; GFX6-NEXT:    s_cselect_b32 s8, s1, s0
1564; GFX6-NEXT:    s_sub_i32 s0, 0, s15
1565; GFX6-NEXT:    v_mul_lo_u32 v0, s0, v1
1566; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1567; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
1568; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1569; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v0
1570; GFX6-NEXT:    v_mov_b32_e32 v0, s6
1571; GFX6-NEXT:    v_mov_b32_e32 v1, s7
1572; GFX6-NEXT:    v_readfirstlane_b32 s4, v2
1573; GFX6-NEXT:    s_mul_i32 s4, s4, s15
1574; GFX6-NEXT:    s_sub_i32 s4, s11, s4
1575; GFX6-NEXT:    s_sub_i32 s5, s4, s15
1576; GFX6-NEXT:    s_cmp_ge_u32 s4, s15
1577; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
1578; GFX6-NEXT:    s_sub_i32 s5, s4, s15
1579; GFX6-NEXT:    s_cmp_ge_u32 s4, s15
1580; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
1581; GFX6-NEXT:    v_mov_b32_e32 v2, s8
1582; GFX6-NEXT:    v_mov_b32_e32 v3, s4
1583; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1584; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1585; GFX6-NEXT:    s_endpgm
1586;
1587; GFX9-LABEL: urem_v4i32:
1588; GFX9:       ; %bb.0:
1589; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
1590; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1591; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1592; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1593; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
1594; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
1595; GFX9-NEXT:    s_sub_i32 s2, 0, s12
1596; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s14
1597; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1598; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1599; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1600; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1601; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1602; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1603; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1604; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1605; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
1606; GFX9-NEXT:    s_mul_i32 s2, s2, s3
1607; GFX9-NEXT:    s_mul_hi_u32 s2, s3, s2
1608; GFX9-NEXT:    s_add_i32 s3, s3, s2
1609; GFX9-NEXT:    s_mul_hi_u32 s2, s8, s3
1610; GFX9-NEXT:    s_mul_i32 s2, s2, s12
1611; GFX9-NEXT:    s_sub_i32 s2, s8, s2
1612; GFX9-NEXT:    s_sub_i32 s3, s2, s12
1613; GFX9-NEXT:    s_cmp_ge_u32 s2, s12
1614; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
1615; GFX9-NEXT:    s_sub_i32 s3, s2, s12
1616; GFX9-NEXT:    s_cmp_ge_u32 s2, s12
1617; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
1618; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
1619; GFX9-NEXT:    s_sub_i32 s3, 0, s13
1620; GFX9-NEXT:    s_mul_i32 s3, s3, s4
1621; GFX9-NEXT:    s_mul_hi_u32 s3, s4, s3
1622; GFX9-NEXT:    s_add_i32 s4, s4, s3
1623; GFX9-NEXT:    s_mul_hi_u32 s3, s9, s4
1624; GFX9-NEXT:    s_mul_i32 s3, s3, s13
1625; GFX9-NEXT:    s_sub_i32 s3, s9, s3
1626; GFX9-NEXT:    s_sub_i32 s4, s3, s13
1627; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1628; GFX9-NEXT:    s_cmp_ge_u32 s3, s13
1629; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
1630; GFX9-NEXT:    s_sub_i32 s4, s3, s13
1631; GFX9-NEXT:    s_cmp_ge_u32 s3, s13
1632; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s15
1633; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
1634; GFX9-NEXT:    s_sub_i32 s4, 0, s14
1635; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
1636; GFX9-NEXT:    s_mul_i32 s4, s4, s5
1637; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
1638; GFX9-NEXT:    s_add_i32 s5, s5, s4
1639; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1640; GFX9-NEXT:    s_mul_hi_u32 s4, s10, s5
1641; GFX9-NEXT:    s_mul_i32 s4, s4, s14
1642; GFX9-NEXT:    s_sub_i32 s4, s10, s4
1643; GFX9-NEXT:    s_sub_i32 s5, s4, s14
1644; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1645; GFX9-NEXT:    s_cmp_ge_u32 s4, s14
1646; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1647; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
1648; GFX9-NEXT:    s_sub_i32 s5, s4, s14
1649; GFX9-NEXT:    s_cmp_ge_u32 s4, s14
1650; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
1651; GFX9-NEXT:    s_sub_i32 s5, 0, s15
1652; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
1653; GFX9-NEXT:    s_mul_i32 s5, s5, s6
1654; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
1655; GFX9-NEXT:    s_add_i32 s6, s6, s5
1656; GFX9-NEXT:    s_mul_hi_u32 s5, s11, s6
1657; GFX9-NEXT:    s_mul_i32 s5, s5, s15
1658; GFX9-NEXT:    s_sub_i32 s5, s11, s5
1659; GFX9-NEXT:    s_sub_i32 s6, s5, s15
1660; GFX9-NEXT:    s_cmp_ge_u32 s5, s15
1661; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
1662; GFX9-NEXT:    s_sub_i32 s6, s5, s15
1663; GFX9-NEXT:    s_cmp_ge_u32 s5, s15
1664; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
1665; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1666; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1667; GFX9-NEXT:    v_mov_b32_e32 v2, s4
1668; GFX9-NEXT:    v_mov_b32_e32 v3, s5
1669; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1670; GFX9-NEXT:    s_endpgm
1671  %r = urem <4 x i32> %x, %y
1672  store <4 x i32> %r, ptr addrspace(1) %out
1673  ret void
1674}
1675
1676define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
1677; CHECK-LABEL: @sdiv_v4i32(
1678; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1679; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1680; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1681; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1682; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1683; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1684; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1685; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1686; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1687; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1688; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1689; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
1690; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1691; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
1692; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
1693; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
1694; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1695; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1696; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1697; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1698; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1699; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
1700; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
1701; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
1702; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1703; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1704; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1705; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1706; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
1707; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
1708; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
1709; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
1710; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
1711; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
1712; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
1713; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
1714; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
1715; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
1716; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
1717; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
1718; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP40]], i64 0
1719; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
1720; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1721; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
1722; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
1723; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
1724; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
1725; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
1726; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
1727; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
1728; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
1729; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
1730; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
1731; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
1732; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
1733; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
1734; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
1735; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
1736; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
1737; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
1738; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
1739; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
1740; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
1741; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
1742; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
1743; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
1744; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
1745; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
1746; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
1747; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
1748; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
1749; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
1750; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
1751; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
1752; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
1753; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
1754; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
1755; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
1756; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
1757; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
1758; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
1759; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
1760; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
1761; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1762; CHECK-NEXT:    [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
1763; CHECK-NEXT:    [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
1764; CHECK-NEXT:    [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
1765; CHECK-NEXT:    [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
1766; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
1767; CHECK-NEXT:    [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
1768; CHECK-NEXT:    [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
1769; CHECK-NEXT:    [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
1770; CHECK-NEXT:    [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
1771; CHECK-NEXT:    [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
1772; CHECK-NEXT:    [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
1773; CHECK-NEXT:    [[TMP96:%.*]] = sub i32 0, [[TMP91]]
1774; CHECK-NEXT:    [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
1775; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
1776; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1777; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1778; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1779; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1780; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1781; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
1782; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
1783; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1784; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1785; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1786; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1787; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1788; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
1789; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
1790; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
1791; CHECK-NEXT:    [[TMP114:%.*]] = add i32 [[TMP110]], 1
1792; CHECK-NEXT:    [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
1793; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
1794; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
1795; CHECK-NEXT:    [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
1796; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], 1
1797; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
1798; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
1799; CHECK-NEXT:    [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
1800; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
1801; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
1802; CHECK-NEXT:    [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1803; CHECK-NEXT:    [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
1804; CHECK-NEXT:    [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
1805; CHECK-NEXT:    [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
1806; CHECK-NEXT:    [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
1807; CHECK-NEXT:    [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
1808; CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
1809; CHECK-NEXT:    [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
1810; CHECK-NEXT:    [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
1811; CHECK-NEXT:    [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
1812; CHECK-NEXT:    [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
1813; CHECK-NEXT:    [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
1814; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 0, [[TMP132]]
1815; CHECK-NEXT:    [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
1816; CHECK-NEXT:    [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
1817; CHECK-NEXT:    [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
1818; CHECK-NEXT:    [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
1819; CHECK-NEXT:    [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
1820; CHECK-NEXT:    [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
1821; CHECK-NEXT:    [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
1822; CHECK-NEXT:    [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
1823; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
1824; CHECK-NEXT:    [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
1825; CHECK-NEXT:    [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
1826; CHECK-NEXT:    [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
1827; CHECK-NEXT:    [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
1828; CHECK-NEXT:    [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
1829; CHECK-NEXT:    [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
1830; CHECK-NEXT:    [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
1831; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
1832; CHECK-NEXT:    [[TMP155:%.*]] = add i32 [[TMP151]], 1
1833; CHECK-NEXT:    [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
1834; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
1835; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
1836; CHECK-NEXT:    [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
1837; CHECK-NEXT:    [[TMP160:%.*]] = add i32 [[TMP156]], 1
1838; CHECK-NEXT:    [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
1839; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
1840; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
1841; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
1842; CHECK-NEXT:    store <4 x i32> [[TMP164]], ptr addrspace(1) [[OUT:%.*]], align 16
1843; CHECK-NEXT:    ret void
1844;
1845; GFX6-LABEL: sdiv_v4i32:
1846; GFX6:       ; %bb.0:
1847; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
1848; GFX6-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x9
1849; GFX6-NEXT:    s_mov_b32 s19, 0xf000
1850; GFX6-NEXT:    s_mov_b32 s18, -1
1851; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1852; GFX6-NEXT:    s_abs_i32 s0, s12
1853; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
1854; GFX6-NEXT:    s_sub_i32 s1, 0, s0
1855; GFX6-NEXT:    s_xor_b32 s2, s8, s12
1856; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1857; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1858; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1859; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v0
1860; GFX6-NEXT:    s_abs_i32 s1, s8
1861; GFX6-NEXT:    s_ashr_i32 s8, s2, 31
1862; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
1863; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1864; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
1865; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
1866; GFX6-NEXT:    s_mul_i32 s2, s2, s0
1867; GFX6-NEXT:    s_sub_i32 s1, s1, s2
1868; GFX6-NEXT:    s_sub_i32 s2, s1, s0
1869; GFX6-NEXT:    s_cmp_ge_u32 s1, s0
1870; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
1871; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
1872; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1873; GFX6-NEXT:    s_cmp_ge_u32 s1, s0
1874; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
1875; GFX6-NEXT:    s_abs_i32 s2, s13
1876; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
1877; GFX6-NEXT:    s_sub_i32 s3, 0, s2
1878; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1879; GFX6-NEXT:    s_xor_b32 s6, s9, s13
1880; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1881; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
1882; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
1883; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1884; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1885; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
1886; GFX6-NEXT:    v_mul_lo_u32 v3, s3, v2
1887; GFX6-NEXT:    s_abs_i32 s3, s9
1888; GFX6-NEXT:    s_ashr_i32 s9, s6, 31
1889; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
1890; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1891; GFX6-NEXT:    v_mul_hi_u32 v2, s3, v2
1892; GFX6-NEXT:    v_readfirstlane_b32 s6, v2
1893; GFX6-NEXT:    s_mul_i32 s6, s6, s2
1894; GFX6-NEXT:    s_sub_i32 s3, s3, s6
1895; GFX6-NEXT:    s_sub_i32 s6, s3, s2
1896; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
1897; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
1898; GFX6-NEXT:    s_cselect_b32 s3, s6, s3
1899; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1900; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
1901; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
1902; GFX6-NEXT:    s_abs_i32 s6, s14
1903; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
1904; GFX6-NEXT:    s_sub_i32 s7, 0, s6
1905; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
1906; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
1907; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1908; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
1909; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1910; GFX6-NEXT:    v_mul_lo_u32 v5, s7, v4
1911; GFX6-NEXT:    s_abs_i32 s7, s10
1912; GFX6-NEXT:    s_xor_b32 s10, s10, s14
1913; GFX6-NEXT:    s_ashr_i32 s10, s10, 31
1914; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1915; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
1916; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v4
1917; GFX6-NEXT:    v_readfirstlane_b32 s12, v4
1918; GFX6-NEXT:    s_mul_i32 s12, s12, s6
1919; GFX6-NEXT:    s_sub_i32 s7, s7, s12
1920; GFX6-NEXT:    s_sub_i32 s12, s7, s6
1921; GFX6-NEXT:    s_cmp_ge_u32 s7, s6
1922; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1923; GFX6-NEXT:    s_cselect_b32 s7, s12, s7
1924; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1925; GFX6-NEXT:    s_cmp_ge_u32 s7, s6
1926; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
1927; GFX6-NEXT:    s_abs_i32 s12, s15
1928; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s12
1929; GFX6-NEXT:    s_sub_i32 s0, 0, s12
1930; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
1931; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1932; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v6
1933; GFX6-NEXT:    s_abs_i32 s1, s11
1934; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
1935; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1936; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v1
1937; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[2:3]
1938; GFX6-NEXT:    v_cndmask_b32_e64 v3, v4, v5, s[6:7]
1939; GFX6-NEXT:    v_xor_b32_e32 v1, s9, v1
1940; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v6
1941; GFX6-NEXT:    s_xor_b32 s0, s11, s15
1942; GFX6-NEXT:    v_xor_b32_e32 v3, s10, v3
1943; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
1944; GFX6-NEXT:    v_mul_hi_u32 v2, v6, v2
1945; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
1946; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
1947; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v2
1948; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s10, v3
1949; GFX6-NEXT:    v_readfirstlane_b32 s2, v4
1950; GFX6-NEXT:    s_mul_i32 s2, s2, s12
1951; GFX6-NEXT:    s_sub_i32 s1, s1, s2
1952; GFX6-NEXT:    s_sub_i32 s2, s1, s12
1953; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v4
1954; GFX6-NEXT:    s_cmp_ge_u32 s1, s12
1955; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1956; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
1957; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
1958; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
1959; GFX6-NEXT:    s_cmp_ge_u32 s1, s12
1960; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
1961; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1962; GFX6-NEXT:    v_xor_b32_e32 v3, s0, v3
1963; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v3
1964; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
1965; GFX6-NEXT:    s_endpgm
1966;
1967; GFX9-LABEL: sdiv_v4i32:
1968; GFX9:       ; %bb.0:
1969; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
1970; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1971; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1972; GFX9-NEXT:    s_abs_i32 s0, s12
1973; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
1974; GFX9-NEXT:    s_sub_i32 s3, 0, s0
1975; GFX9-NEXT:    s_abs_i32 s2, s8
1976; GFX9-NEXT:    s_xor_b32 s1, s8, s12
1977; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1978; GFX9-NEXT:    s_ashr_i32 s1, s1, 31
1979; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1980; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1981; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
1982; GFX9-NEXT:    s_mul_i32 s3, s3, s6
1983; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
1984; GFX9-NEXT:    s_add_i32 s6, s6, s3
1985; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
1986; GFX9-NEXT:    s_mul_i32 s6, s3, s0
1987; GFX9-NEXT:    s_sub_i32 s2, s2, s6
1988; GFX9-NEXT:    s_add_i32 s7, s3, 1
1989; GFX9-NEXT:    s_sub_i32 s6, s2, s0
1990; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
1991; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
1992; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
1993; GFX9-NEXT:    s_add_i32 s6, s3, 1
1994; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
1995; GFX9-NEXT:    s_cselect_b32 s0, s6, s3
1996; GFX9-NEXT:    s_abs_i32 s2, s13
1997; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
1998; GFX9-NEXT:    s_xor_b32 s0, s0, s1
1999; GFX9-NEXT:    s_sub_i32 s7, 0, s2
2000; GFX9-NEXT:    s_sub_i32 s8, s0, s1
2001; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2002; GFX9-NEXT:    s_abs_i32 s6, s9
2003; GFX9-NEXT:    s_xor_b32 s3, s9, s13
2004; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
2005; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2006; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2007; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2008; GFX9-NEXT:    s_mul_i32 s7, s7, s0
2009; GFX9-NEXT:    s_mul_hi_u32 s1, s0, s7
2010; GFX9-NEXT:    s_add_i32 s0, s0, s1
2011; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s0
2012; GFX9-NEXT:    s_mul_i32 s1, s0, s2
2013; GFX9-NEXT:    s_sub_i32 s1, s6, s1
2014; GFX9-NEXT:    s_add_i32 s7, s0, 1
2015; GFX9-NEXT:    s_sub_i32 s6, s1, s2
2016; GFX9-NEXT:    s_cmp_ge_u32 s1, s2
2017; GFX9-NEXT:    s_cselect_b32 s0, s7, s0
2018; GFX9-NEXT:    s_cselect_b32 s1, s6, s1
2019; GFX9-NEXT:    s_add_i32 s6, s0, 1
2020; GFX9-NEXT:    s_cmp_ge_u32 s1, s2
2021; GFX9-NEXT:    s_cselect_b32 s0, s6, s0
2022; GFX9-NEXT:    s_abs_i32 s1, s14
2023; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
2024; GFX9-NEXT:    s_xor_b32 s0, s0, s3
2025; GFX9-NEXT:    s_sub_i32 s7, 0, s1
2026; GFX9-NEXT:    s_sub_i32 s3, s0, s3
2027; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2028; GFX9-NEXT:    s_abs_i32 s6, s10
2029; GFX9-NEXT:    s_xor_b32 s2, s10, s14
2030; GFX9-NEXT:    s_ashr_i32 s2, s2, 31
2031; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2032; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2033; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2034; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2035; GFX9-NEXT:    s_mul_i32 s7, s7, s0
2036; GFX9-NEXT:    s_mul_hi_u32 s7, s0, s7
2037; GFX9-NEXT:    s_add_i32 s0, s0, s7
2038; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s0
2039; GFX9-NEXT:    s_mul_i32 s7, s0, s1
2040; GFX9-NEXT:    s_sub_i32 s6, s6, s7
2041; GFX9-NEXT:    s_add_i32 s9, s0, 1
2042; GFX9-NEXT:    s_sub_i32 s7, s6, s1
2043; GFX9-NEXT:    s_cmp_ge_u32 s6, s1
2044; GFX9-NEXT:    s_cselect_b32 s0, s9, s0
2045; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
2046; GFX9-NEXT:    s_add_i32 s7, s0, 1
2047; GFX9-NEXT:    s_cmp_ge_u32 s6, s1
2048; GFX9-NEXT:    s_cselect_b32 s6, s7, s0
2049; GFX9-NEXT:    s_abs_i32 s7, s15
2050; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s7
2051; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2052; GFX9-NEXT:    s_xor_b32 s5, s6, s2
2053; GFX9-NEXT:    s_sub_i32 s6, 0, s7
2054; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2055; GFX9-NEXT:    s_sub_i32 s2, s5, s2
2056; GFX9-NEXT:    s_abs_i32 s4, s11
2057; GFX9-NEXT:    s_xor_b32 s3, s11, s15
2058; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
2059; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2060; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2061; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
2062; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
2063; GFX9-NEXT:    s_mul_i32 s6, s6, s5
2064; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s6
2065; GFX9-NEXT:    s_add_i32 s5, s5, s6
2066; GFX9-NEXT:    s_mul_hi_u32 s5, s4, s5
2067; GFX9-NEXT:    s_mul_i32 s6, s5, s7
2068; GFX9-NEXT:    s_sub_i32 s4, s4, s6
2069; GFX9-NEXT:    s_add_i32 s8, s5, 1
2070; GFX9-NEXT:    s_sub_i32 s6, s4, s7
2071; GFX9-NEXT:    s_cmp_ge_u32 s4, s7
2072; GFX9-NEXT:    s_cselect_b32 s5, s8, s5
2073; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
2074; GFX9-NEXT:    s_add_i32 s6, s5, 1
2075; GFX9-NEXT:    s_cmp_ge_u32 s4, s7
2076; GFX9-NEXT:    s_cselect_b32 s4, s6, s5
2077; GFX9-NEXT:    s_xor_b32 s4, s4, s3
2078; GFX9-NEXT:    s_sub_i32 s3, s4, s3
2079; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2080; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2081; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2082; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2083; GFX9-NEXT:    s_endpgm
2084  %r = sdiv <4 x i32> %x, %y
2085  store <4 x i32> %r, ptr addrspace(1) %out
2086  ret void
2087}
2088
; srem_v4i32: signed remainder of two <4 x i32> kernel arguments, stored to %out.
;
; The CHECK lines (opt -amdgpu-codegenprepare, see the RUN lines at the top of
; the file) show each of the four lanes being expanded separately:
;   * both operands are made non-negative with the ashr 31 / add / xor sequence;
;   * the divisor is converted to float, passed to llvm.amdgcn.rcp.f32, scaled
;     by a constant and converted back to i32, then adjusted by adding the high
;     half of a 64-bit multiply;
;   * the high half of dividend * estimate is multiplied by the divisor and
;     subtracted from the dividend, followed by two conditional subtractions
;     of the divisor (icmp uge / sub / select);
;   * the sign mask of the dividend lane is reapplied with xor / sub before the
;     lane is inserted into the result vector.
; The GFX6 and GFX9 lines are the expected llc output for the same function.
; The assertions are autogenerated (see the NOTE on the first line of the
; file), so they should be regenerated rather than edited by hand.
2089define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
2090; CHECK-LABEL: @srem_v4i32(
2091; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
2092; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
2093; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
2094; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
2095; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
2096; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
2097; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
2098; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
2099; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
2100; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2101; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
2102; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
2103; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
2104; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
2105; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
2106; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
2107; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
2108; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
2109; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
2110; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
2111; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
2112; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
2113; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
2114; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
2115; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
2116; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
2117; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
2118; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
2119; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
2120; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
2121; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
2122; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
2123; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
2124; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
2125; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
2126; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
2127; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
2128; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i64 0
2129; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
2130; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
2131; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
2132; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
2133; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
2134; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
2135; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
2136; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
2137; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
2138; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
2139; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
2140; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
2141; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
2142; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
2143; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
2144; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
2145; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
2146; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
2147; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
2148; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
2149; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
2150; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
2151; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
2152; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
2153; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
2154; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
2155; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
2156; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
2157; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
2158; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
2159; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
2160; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
2161; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
2162; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
2163; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
2164; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
2165; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
2166; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
2167; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
2168; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
2169; CHECK-NEXT:    [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
2170; CHECK-NEXT:    [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
2171; CHECK-NEXT:    [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
2172; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
2173; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
2174; CHECK-NEXT:    [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
2175; CHECK-NEXT:    [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
2176; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
2177; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
2178; CHECK-NEXT:    [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
2179; CHECK-NEXT:    [[TMP89:%.*]] = sub i32 0, [[TMP84]]
2180; CHECK-NEXT:    [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
2181; CHECK-NEXT:    [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
2182; CHECK-NEXT:    [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
2183; CHECK-NEXT:    [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
2184; CHECK-NEXT:    [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
2185; CHECK-NEXT:    [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
2186; CHECK-NEXT:    [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
2187; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
2188; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
2189; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
2190; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
2191; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
2192; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
2193; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
2194; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
2195; CHECK-NEXT:    [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
2196; CHECK-NEXT:    [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
2197; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
2198; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
2199; CHECK-NEXT:    [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
2200; CHECK-NEXT:    [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
2201; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
2202; CHECK-NEXT:    [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
2203; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
2204; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
2205; CHECK-NEXT:    [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
2206; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
2207; CHECK-NEXT:    [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
2208; CHECK-NEXT:    [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
2209; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
2210; CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
2211; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
2212; CHECK-NEXT:    [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
2213; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
2214; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
2215; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
2216; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
2217; CHECK-NEXT:    [[TMP127:%.*]] = sub i32 0, [[TMP122]]
2218; CHECK-NEXT:    [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
2219; CHECK-NEXT:    [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
2220; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
2221; CHECK-NEXT:    [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
2222; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
2223; CHECK-NEXT:    [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
2224; CHECK-NEXT:    [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
2225; CHECK-NEXT:    [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
2226; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
2227; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
2228; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
2229; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
2230; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
2231; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
2232; CHECK-NEXT:    [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
2233; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
2234; CHECK-NEXT:    [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
2235; CHECK-NEXT:    [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
2236; CHECK-NEXT:    [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
2237; CHECK-NEXT:    [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
2238; CHECK-NEXT:    [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
2239; CHECK-NEXT:    [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
2240; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
2241; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
2242; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
2243; CHECK-NEXT:    store <4 x i32> [[TMP152]], ptr addrspace(1) [[OUT:%.*]], align 16
2244; CHECK-NEXT:    ret void
2245;
2246; GFX6-LABEL: srem_v4i32:
2247; GFX6:       ; %bb.0:
2248; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
2249; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2250; GFX6-NEXT:    s_abs_i32 s0, s12
2251; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
2252; GFX6-NEXT:    s_sub_i32 s1, 0, s0
2253; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
2254; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2255; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2256; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
2257; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v0
2258; GFX6-NEXT:    s_abs_i32 s1, s8
2259; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
2260; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2261; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
2262; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
2263; GFX6-NEXT:    s_mul_i32 s3, s3, s0
2264; GFX6-NEXT:    s_sub_i32 s1, s1, s3
2265; GFX6-NEXT:    s_sub_i32 s3, s1, s0
2266; GFX6-NEXT:    s_cmp_ge_u32 s1, s0
2267; GFX6-NEXT:    s_cselect_b32 s1, s3, s1
2268; GFX6-NEXT:    s_sub_i32 s3, s1, s0
2269; GFX6-NEXT:    s_cmp_ge_u32 s1, s0
2270; GFX6-NEXT:    s_cselect_b32 s0, s3, s1
2271; GFX6-NEXT:    s_abs_i32 s1, s13
2272; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s1
2273; GFX6-NEXT:    s_sub_i32 s3, 0, s1
2274; GFX6-NEXT:    s_xor_b32 s0, s0, s2
2275; GFX6-NEXT:    s_sub_i32 s7, s0, s2
2276; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2277; GFX6-NEXT:    s_ashr_i32 s6, s9, 31
2278; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2279; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
2280; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
2281; GFX6-NEXT:    s_abs_i32 s3, s9
2282; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
2283; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2284; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
2285; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
2286; GFX6-NEXT:    s_mul_i32 s0, s0, s1
2287; GFX6-NEXT:    s_sub_i32 s0, s3, s0
2288; GFX6-NEXT:    s_sub_i32 s2, s0, s1
2289; GFX6-NEXT:    s_cmp_ge_u32 s0, s1
2290; GFX6-NEXT:    s_cselect_b32 s0, s2, s0
2291; GFX6-NEXT:    s_sub_i32 s2, s0, s1
2292; GFX6-NEXT:    s_cmp_ge_u32 s0, s1
2293; GFX6-NEXT:    s_cselect_b32 s0, s2, s0
2294; GFX6-NEXT:    s_abs_i32 s1, s14
2295; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s1
2296; GFX6-NEXT:    s_sub_i32 s2, 0, s1
2297; GFX6-NEXT:    s_xor_b32 s0, s0, s6
2298; GFX6-NEXT:    s_sub_i32 s6, s0, s6
2299; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2300; GFX6-NEXT:    s_ashr_i32 s8, s10, 31
2301; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2302; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2303; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
2304; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
2305; GFX6-NEXT:    s_abs_i32 s2, s10
2306; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
2307; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2308; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
2309; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
2310; GFX6-NEXT:    s_mul_i32 s0, s0, s1
2311; GFX6-NEXT:    s_sub_i32 s0, s2, s0
2312; GFX6-NEXT:    s_sub_i32 s2, s0, s1
2313; GFX6-NEXT:    s_cmp_ge_u32 s0, s1
2314; GFX6-NEXT:    s_cselect_b32 s0, s2, s0
2315; GFX6-NEXT:    s_sub_i32 s2, s0, s1
2316; GFX6-NEXT:    s_cmp_ge_u32 s0, s1
2317; GFX6-NEXT:    s_cselect_b32 s9, s2, s0
2318; GFX6-NEXT:    s_abs_i32 s10, s15
2319; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
2320; GFX6-NEXT:    s_sub_i32 s0, 0, s10
2321; GFX6-NEXT:    s_mov_b32 s2, -1
2322; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2323; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2324; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v0
2325; GFX6-NEXT:    v_mov_b32_e32 v0, s7
2326; GFX6-NEXT:    v_mul_lo_u32 v1, s0, v2
2327; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2328; GFX6-NEXT:    s_abs_i32 s4, s11
2329; GFX6-NEXT:    s_ashr_i32 s5, s11, 31
2330; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v1
2331; GFX6-NEXT:    v_mov_b32_e32 v1, s6
2332; GFX6-NEXT:    s_xor_b32 s6, s9, s8
2333; GFX6-NEXT:    s_sub_i32 s6, s6, s8
2334; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
2335; GFX6-NEXT:    v_mul_hi_u32 v2, s4, v2
2336; GFX6-NEXT:    v_readfirstlane_b32 s7, v2
2337; GFX6-NEXT:    s_mul_i32 s7, s7, s10
2338; GFX6-NEXT:    s_sub_i32 s4, s4, s7
2339; GFX6-NEXT:    s_sub_i32 s7, s4, s10
2340; GFX6-NEXT:    s_cmp_ge_u32 s4, s10
2341; GFX6-NEXT:    s_cselect_b32 s4, s7, s4
2342; GFX6-NEXT:    s_sub_i32 s7, s4, s10
2343; GFX6-NEXT:    s_cmp_ge_u32 s4, s10
2344; GFX6-NEXT:    s_cselect_b32 s4, s7, s4
2345; GFX6-NEXT:    s_xor_b32 s4, s4, s5
2346; GFX6-NEXT:    s_sub_i32 s4, s4, s5
2347; GFX6-NEXT:    v_mov_b32_e32 v2, s6
2348; GFX6-NEXT:    v_mov_b32_e32 v3, s4
2349; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2350; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2351; GFX6-NEXT:    s_endpgm
2352;
2353; GFX9-LABEL: srem_v4i32:
2354; GFX9:       ; %bb.0:
2355; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
2356; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2357; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2358; GFX9-NEXT:    s_abs_i32 s0, s12
2359; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
2360; GFX9-NEXT:    s_sub_i32 s3, 0, s0
2361; GFX9-NEXT:    s_abs_i32 s2, s8
2362; GFX9-NEXT:    s_ashr_i32 s1, s8, 31
2363; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2364; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2365; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2366; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
2367; GFX9-NEXT:    s_mul_i32 s3, s3, s6
2368; GFX9-NEXT:    s_mul_hi_u32 s3, s6, s3
2369; GFX9-NEXT:    s_add_i32 s6, s6, s3
2370; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s6
2371; GFX9-NEXT:    s_mul_i32 s3, s3, s0
2372; GFX9-NEXT:    s_sub_i32 s2, s2, s3
2373; GFX9-NEXT:    s_sub_i32 s3, s2, s0
2374; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
2375; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
2376; GFX9-NEXT:    s_sub_i32 s3, s2, s0
2377; GFX9-NEXT:    s_cmp_ge_u32 s2, s0
2378; GFX9-NEXT:    s_cselect_b32 s0, s3, s2
2379; GFX9-NEXT:    s_abs_i32 s2, s13
2380; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
2381; GFX9-NEXT:    s_xor_b32 s0, s0, s1
2382; GFX9-NEXT:    s_sub_i32 s7, 0, s2
2383; GFX9-NEXT:    s_sub_i32 s8, s0, s1
2384; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2385; GFX9-NEXT:    s_abs_i32 s6, s9
2386; GFX9-NEXT:    s_ashr_i32 s3, s9, 31
2387; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2388; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2389; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2390; GFX9-NEXT:    s_mul_i32 s7, s7, s0
2391; GFX9-NEXT:    s_mul_hi_u32 s1, s0, s7
2392; GFX9-NEXT:    s_add_i32 s0, s0, s1
2393; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s0
2394; GFX9-NEXT:    s_mul_i32 s0, s0, s2
2395; GFX9-NEXT:    s_sub_i32 s0, s6, s0
2396; GFX9-NEXT:    s_sub_i32 s1, s0, s2
2397; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
2398; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
2399; GFX9-NEXT:    s_sub_i32 s1, s0, s2
2400; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
2401; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
2402; GFX9-NEXT:    s_abs_i32 s1, s14
2403; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
2404; GFX9-NEXT:    s_xor_b32 s0, s0, s3
2405; GFX9-NEXT:    s_sub_i32 s7, 0, s1
2406; GFX9-NEXT:    s_sub_i32 s3, s0, s3
2407; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2408; GFX9-NEXT:    s_abs_i32 s6, s10
2409; GFX9-NEXT:    s_ashr_i32 s2, s10, 31
2410; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2411; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2412; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2413; GFX9-NEXT:    s_mul_i32 s7, s7, s0
2414; GFX9-NEXT:    s_mul_hi_u32 s7, s0, s7
2415; GFX9-NEXT:    s_add_i32 s0, s0, s7
2416; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s0
2417; GFX9-NEXT:    s_mul_i32 s0, s0, s1
2418; GFX9-NEXT:    s_sub_i32 s0, s6, s0
2419; GFX9-NEXT:    s_sub_i32 s6, s0, s1
2420; GFX9-NEXT:    s_cmp_ge_u32 s0, s1
2421; GFX9-NEXT:    s_cselect_b32 s0, s6, s0
2422; GFX9-NEXT:    s_sub_i32 s6, s0, s1
2423; GFX9-NEXT:    s_cmp_ge_u32 s0, s1
2424; GFX9-NEXT:    s_cselect_b32 s6, s6, s0
2425; GFX9-NEXT:    s_abs_i32 s7, s15
2426; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
2427; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2428; GFX9-NEXT:    s_xor_b32 s5, s6, s2
2429; GFX9-NEXT:    s_sub_i32 s6, 0, s7
2430; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v1
2431; GFX9-NEXT:    s_sub_i32 s2, s5, s2
2432; GFX9-NEXT:    s_abs_i32 s4, s11
2433; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2434; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
2435; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2436; GFX9-NEXT:    s_ashr_i32 s3, s11, 31
2437; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2438; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
2439; GFX9-NEXT:    s_mul_i32 s6, s6, s5
2440; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s6
2441; GFX9-NEXT:    s_add_i32 s5, s5, s6
2442; GFX9-NEXT:    s_mul_hi_u32 s5, s4, s5
2443; GFX9-NEXT:    s_mul_i32 s5, s5, s7
2444; GFX9-NEXT:    s_sub_i32 s4, s4, s5
2445; GFX9-NEXT:    s_sub_i32 s5, s4, s7
2446; GFX9-NEXT:    s_cmp_ge_u32 s4, s7
2447; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
2448; GFX9-NEXT:    s_sub_i32 s5, s4, s7
2449; GFX9-NEXT:    s_cmp_ge_u32 s4, s7
2450; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
2451; GFX9-NEXT:    s_xor_b32 s4, s4, s3
2452; GFX9-NEXT:    s_sub_i32 s3, s4, s3
2453; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2454; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2455; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2456; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2457; GFX9-NEXT:    s_endpgm
; Input IR under test: a single vector srem whose result is stored to %out.
2458  %r = srem <4 x i32> %x, %y
2459  store <4 x i32> %r, ptr addrspace(1) %out
2460  ret void
2461}
2462
; udiv_v4i16: unsigned division of two <4 x i16> kernel arguments; the
; <4 x i16> quotient is stored to %out.
; The opt-level assertions below expect the vector udiv to be scalarized. For
; each of the four lanes the operands are extracted, zero-extended to i32 and
; converted to float; a quotient estimate is formed with llvm.amdgcn.rcp.f32,
; fmul and llvm.trunc.f32; the estimate is bumped by 0 or 1 depending on
; whether the llvm.amdgcn.fmad.ftz.f32 residual is at least the divisor in
; magnitude; the result is masked with 65535, truncated to i16 and inserted
; into the result vector.
; The two llc assertion groups record the ISA expected for the same kernel on
; the tahiti and gfx900 RUN lines.
2463define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
2464; CHECK-LABEL: @udiv_v4i16(
2465; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2466; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2467; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2468; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2469; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2470; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2471; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2472; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2473; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2474; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2475; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2476; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2477; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2478; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2479; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2480; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2481; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2482; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2483; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2484; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> poison, i16 [[TMP19]], i64 0
2485; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
2486; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2487; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2488; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2489; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2490; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2491; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2492; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2493; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2494; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2495; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2496; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2497; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2498; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2499; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2500; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2501; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2502; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2503; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2504; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2505; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
2506; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2507; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2508; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2509; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2510; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2511; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2512; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2513; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2514; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2515; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2516; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2517; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2518; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2519; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2520; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2521; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2522; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2523; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2524; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2525; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
2526; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2527; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
2528; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
2529; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
2530; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
2531; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
2532; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
2533; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
2534; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
2535; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
2536; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
2537; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
2538; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2539; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
2540; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
2541; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
2542; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
2543; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
2544; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
2545; CHECK-NEXT:    store <4 x i16> [[TMP80]], ptr addrspace(1) [[OUT:%.*]], align 8
2546; CHECK-NEXT:    ret void
2547;
2548; GFX6-LABEL: udiv_v4i16:
2549; GFX6:       ; %bb.0:
2550; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
2551; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2552; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2553; GFX6-NEXT:    s_mov_b32 s2, -1
2554; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2555; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
2556; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
2557; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
2558; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
2559; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
2560; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
2561; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2562; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
2563; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
2564; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2565; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
2566; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2567; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
2568; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2569; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2570; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2571; GFX6-NEXT:    s_and_b32 s4, s11, 0xffff
2572; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
2573; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
2574; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
2575; GFX6-NEXT:    s_and_b32 s4, s9, 0xffff
2576; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
2577; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2578; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
2579; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2580; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2581; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
2582; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
2583; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2584; GFX6-NEXT:    s_lshr_b32 s4, s11, 16
2585; GFX6-NEXT:    v_mad_f32 v3, -v1, v4, v5
2586; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
2587; GFX6-NEXT:    s_lshr_b32 s4, s9, 16
2588; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
2589; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2590; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2591; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2592; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2593; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2594; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
2595; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2596; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
2597; GFX6-NEXT:    v_mad_f32 v3, -v3, v5, v6
2598; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2599; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2600; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2601; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2602; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2603; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
2604; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2605; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2606; GFX6-NEXT:    s_endpgm
2607;
2608; GFX9-LABEL: udiv_v4i16:
2609; GFX9:       ; %bb.0:
2610; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
2611; GFX9-NEXT:    v_mov_b32_e32 v6, 0
2612; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2613; GFX9-NEXT:    s_and_b32 s7, s2, 0xffff
2614; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
2615; GFX9-NEXT:    s_and_b32 s6, s0, 0xffff
2616; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
2617; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
2618; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
2619; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
2620; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
2621; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
2622; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
2623; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
2624; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
2625; GFX9-NEXT:    s_and_b32 s0, s3, 0xffff
2626; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
2627; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
2628; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
2629; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
2630; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2631; GFX9-NEXT:    s_and_b32 s0, s1, 0xffff
2632; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
2633; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
2634; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
2635; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2636; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
2637; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
2638; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
2639; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
2640; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2641; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
2642; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
2643; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2644; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
2645; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s0
2646; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
2647; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
2648; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2649; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2650; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
2651; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2652; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
2653; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
2654; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2655; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
2656; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2657; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
2658; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2659; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2660; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
2661; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
2662; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2663; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
2664; GFX9-NEXT:    s_endpgm
; Input IR under test: one vector udiv whose result is stored to %out.
2665  %r = udiv <4 x i16> %x, %y
2666  store <4 x i16> %r, ptr addrspace(1) %out
2667  ret void
2668}
2669
; urem_v4i16: unsigned remainder of two <4 x i16> kernel arguments; the
; <4 x i16> remainder is stored to %out.
; The opt-level assertions below expect the vector urem to be scalarized. For
; each of the four lanes the operands are extracted, zero-extended to i32 and
; converted to float; a quotient is formed with llvm.amdgcn.rcp.f32, fmul and
; llvm.trunc.f32 and bumped by 0 or 1 depending on whether the
; llvm.amdgcn.fmad.ftz.f32 residual is at least the divisor in magnitude; the
; remainder is then computed in integers as x - quotient * y, masked with
; 65535, truncated to i16 and inserted into the result vector.
; The two llc assertion groups record the ISA expected for the same kernel on
; the tahiti and gfx900 RUN lines.
2670define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
2671; CHECK-LABEL: @urem_v4i16(
2672; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2673; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2674; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2675; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2676; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2677; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2678; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2679; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2680; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2681; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2682; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2683; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2684; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2685; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2686; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2687; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2688; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2689; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2690; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2691; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2692; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2693; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> poison, i16 [[TMP21]], i64 0
2694; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
2695; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2696; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2697; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2698; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2699; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2700; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2701; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2702; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2703; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
2704; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2705; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2706; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2707; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2708; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2709; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2710; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2711; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2712; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2713; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2714; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2715; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
2716; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
2717; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2718; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2719; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2720; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2721; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2722; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2723; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2724; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2725; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
2726; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2727; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2728; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2729; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2730; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2731; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2732; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2733; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2734; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2735; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2736; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2737; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
2738; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
2739; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2740; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
2741; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
2742; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
2743; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
2744; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
2745; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
2746; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
2747; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
2748; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
2749; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
2750; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
2751; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
2752; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
2753; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
2754; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
2755; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
2756; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
2757; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
2758; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
2759; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
2760; CHECK-NEXT:    store <4 x i16> [[TMP88]], ptr addrspace(1) [[OUT:%.*]], align 8
2761; CHECK-NEXT:    ret void
2762;
2763; GFX6-LABEL: urem_v4i16:
2764; GFX6:       ; %bb.0:
2765; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
2766; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2767; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2768; GFX6-NEXT:    s_mov_b32 s2, -1
2769; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2770; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
2771; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
2772; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
2773; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
2774; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
2775; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
2776; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2777; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
2778; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
2779; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2780; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
2781; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2782; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
2783; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
2784; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2785; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2786; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2787; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
2788; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
2789; GFX6-NEXT:    v_mad_f32 v1, -v1, v2, v4
2790; GFX6-NEXT:    s_and_b32 s6, s11, 0xffff
2791; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v2
2792; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
2793; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
2794; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s5
2795; GFX6-NEXT:    s_and_b32 s5, s9, 0xffff
2796; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s5
2797; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2798; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v1
2799; GFX6-NEXT:    s_lshr_b32 s4, s11, 16
2800; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
2801; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
2802; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
2803; GFX6-NEXT:    s_lshr_b32 s5, s9, 16
2804; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s5
2805; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2806; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2807; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
2808; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v3
2809; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2810; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2811; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
2812; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2813; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
2814; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2815; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v6
2816; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2817; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2818; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
2819; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
2820; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2821; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s9, v1
2822; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
2823; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2824; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2825; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
2826; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
2827; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2828; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2829; GFX6-NEXT:    s_endpgm
2830;
2831; GFX9-LABEL: urem_v4i16:
2832; GFX9:       ; %bb.0:
2833; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
2834; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
2835; GFX9-NEXT:    v_mov_b32_e32 v6, 0
2836; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX9-NEXT:    s_and_b32 s9, s2, 0xffff
2838; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
2839; GFX9-NEXT:    s_and_b32 s8, s0, 0xffff
2840; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
2841; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
2842; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
2843; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
2844; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
2845; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
2846; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
2847; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
2848; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
2849; GFX9-NEXT:    s_and_b32 s4, s3, 0xffff
2850; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
2851; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
2852; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
2853; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
2854; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2855; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
2856; GFX9-NEXT:    s_and_b32 s5, s1, 0xffff
2857; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
2858; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
2859; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2860; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s5
2861; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2862; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
2863; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
2864; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
2865; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
2866; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
2867; GFX9-NEXT:    s_lshr_b32 s2, s3, 16
2868; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
2869; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
2870; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
2871; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
2872; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2873; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
2874; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2875; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
2876; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
2877; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
2878; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2879; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
2880; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
2881; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2882; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s4
2883; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
2884; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s2
2885; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
2886; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v1
2887; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v2
2888; GFX9-NEXT:    v_sub_u32_e32 v2, s1, v3
2889; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2890; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2891; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
2892; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
2893; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
2894; GFX9-NEXT:    s_endpgm
; Input IR under test: one vector urem whose result is stored to %out.
2895  %r = urem <4 x i16> %x, %y
2896  store <4 x i16> %r, ptr addrspace(1) %out
2897  ret void
2898}
2899
2900define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
2901; CHECK-LABEL: @sdiv_v4i16(
2902; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2903; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2904; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2905; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2906; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2907; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2908; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2909; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2910; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2911; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2912; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2913; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2914; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2915; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2916; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2917; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2918; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2919; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2920; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2921; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2922; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2923; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2924; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2925; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> poison, i16 [[TMP23]], i64 0
2926; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2927; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2928; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2929; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2930; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2931; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2932; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2933; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2934; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2935; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2936; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2937; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2938; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2939; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2940; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2941; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2942; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2943; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2944; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2945; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2946; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2947; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2948; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2949; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2950; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2951; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2952; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2953; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2954; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2955; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2956; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2957; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2958; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2959; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2960; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2961; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2962; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2963; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2964; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2965; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2966; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2967; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2968; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2969; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2970; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2971; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2972; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2973; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2974; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2975; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2976; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2977; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2978; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2979; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2980; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
2981; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2982; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2983; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2984; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2985; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2986; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
2987; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2988; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2989; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2990; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2991; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2992; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2993; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2994; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2995; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2996; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2997; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2998; CHECK-NEXT:    store <4 x i16> [[TMP96]], ptr addrspace(1) [[OUT:%.*]], align 8
2999; CHECK-NEXT:    ret void
3000;
3001; GFX6-LABEL: sdiv_v4i16:
3002; GFX6:       ; %bb.0:
3003; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
3004; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3005; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3006; GFX6-NEXT:    s_mov_b32 s2, -1
3007; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3008; GFX6-NEXT:    s_sext_i32_i16 s4, s10
3009; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
3010; GFX6-NEXT:    s_sext_i32_i16 s5, s8
3011; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
3012; GFX6-NEXT:    s_xor_b32 s4, s5, s4
3013; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3014; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3015; GFX6-NEXT:    s_or_b32 s6, s4, 1
3016; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3017; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3018; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3019; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3020; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3021; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3022; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
3023; GFX6-NEXT:    s_ashr_i32 s5, s10, 16
3024; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
3025; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v2
3026; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
3027; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
3028; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3029; GFX6-NEXT:    s_xor_b32 s4, s4, s5
3030; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3031; GFX6-NEXT:    s_or_b32 s6, s4, 1
3032; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
3033; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3034; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
3035; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3036; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3037; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
3038; GFX6-NEXT:    s_sext_i32_i16 s5, s11
3039; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
3040; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
3041; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
3042; GFX6-NEXT:    s_sext_i32_i16 s4, s9
3043; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
3044; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3045; GFX6-NEXT:    s_xor_b32 s4, s4, s5
3046; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3047; GFX6-NEXT:    s_or_b32 s6, s4, 1
3048; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
3049; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3050; GFX6-NEXT:    v_mad_f32 v1, -v4, v0, v1
3051; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3052; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3053; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3054; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
3055; GFX6-NEXT:    s_ashr_i32 s5, s11, 16
3056; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
3057; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v4
3058; GFX6-NEXT:    s_ashr_i32 s4, s9, 16
3059; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
3060; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v0
3061; GFX6-NEXT:    s_xor_b32 s4, s4, s5
3062; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3063; GFX6-NEXT:    s_or_b32 s6, s4, 1
3064; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3065; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3066; GFX6-NEXT:    v_mad_f32 v4, -v5, v0, v4
3067; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3068; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v0|
3069; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3070; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
3071; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v5
3072; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3073; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3074; GFX6-NEXT:    v_or_b32_e32 v1, v1, v0
3075; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
3076; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3077; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
3078; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3079; GFX6-NEXT:    s_endpgm
3080;
3081; GFX9-LABEL: sdiv_v4i16:
3082; GFX9:       ; %bb.0:
3083; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
3084; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
3085; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3086; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3087; GFX9-NEXT:    s_sext_i32_i16 s4, s2
3088; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
3089; GFX9-NEXT:    s_sext_i32_i16 s5, s0
3090; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s5
3091; GFX9-NEXT:    s_xor_b32 s4, s5, s4
3092; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3093; GFX9-NEXT:    s_ashr_i32 s4, s4, 30
3094; GFX9-NEXT:    s_or_b32 s8, s4, 1
3095; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3096; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3097; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3098; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3099; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3100; GFX9-NEXT:    s_cselect_b32 s4, s8, 0
3101; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
3102; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
3103; GFX9-NEXT:    s_ashr_i32 s0, s0, 16
3104; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
3105; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3106; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3107; GFX9-NEXT:    s_xor_b32 s0, s0, s2
3108; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3109; GFX9-NEXT:    s_sext_i32_i16 s2, s3
3110; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
3111; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3112; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
3113; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3114; GFX9-NEXT:    v_add_u32_e32 v3, s4, v3
3115; GFX9-NEXT:    s_or_b32 s0, s0, 1
3116; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3117; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
3118; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3119; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
3120; GFX9-NEXT:    v_add_u32_e32 v4, s0, v4
3121; GFX9-NEXT:    s_sext_i32_i16 s0, s1
3122; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
3123; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
3124; GFX9-NEXT:    s_xor_b32 s0, s0, s2
3125; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3126; GFX9-NEXT:    s_or_b32 s0, s0, 1
3127; GFX9-NEXT:    v_mul_f32_e32 v5, v1, v5
3128; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3129; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
3130; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3131; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3132; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3133; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
3134; GFX9-NEXT:    s_ashr_i32 s2, s3, 16
3135; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
3136; GFX9-NEXT:    v_add_u32_e32 v1, s0, v5
3137; GFX9-NEXT:    s_ashr_i32 s0, s1, 16
3138; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
3139; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v0
3140; GFX9-NEXT:    s_xor_b32 s0, s0, s2
3141; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3142; GFX9-NEXT:    s_or_b32 s2, s0, 1
3143; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3144; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3145; GFX9-NEXT:    v_mad_f32 v5, -v6, v0, v5
3146; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3147; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
3148; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3149; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
3150; GFX9-NEXT:    v_add_u32_e32 v0, s0, v6
3151; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3152; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
3153; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
3154; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
3155; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
3156; GFX9-NEXT:    s_endpgm
3157  %r = sdiv <4 x i16> %x, %y
3158  store <4 x i16> %r, ptr addrspace(1) %out
3159  ret void
3160}
3161
; Signed remainder on <4 x i16>, stored to %out.
; CHECK lines (opt -amdgpu-codegenprepare) expect the operation to be scalarized:
; each lane is extracted, sign-extended to i32, and the quotient is estimated in
; float as trunc(x * rcp(y)). The estimate is then adjusted by
; or(ashr(xor(x, y), 30), 1) when |fmad(-q, y, x)| >= |y|. The remainder is
; formed as x - q * y, re-sign-extended from 16 bits (shl/ashr by 16), truncated
; to i16 and inserted back into the result vector.
; GFX6/GFX9 lines are the llc output for tahiti/gfx900 (see the RUN lines).
; All assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
3162define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
3163; CHECK-LABEL: @srem_v4i16(
3164; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
3165; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
3166; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3167; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3168; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3169; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3170; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3171; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3172; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3173; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3174; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3175; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3176; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3177; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3178; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3179; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3180; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3181; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3182; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3183; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3184; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3185; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3186; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3187; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3188; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3189; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0
3190; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
3191; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
3192; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3193; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3194; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3195; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3196; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3197; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3198; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3199; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3200; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3201; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3202; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3203; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3204; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3205; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3206; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3207; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3208; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3209; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3210; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3211; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3212; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3213; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3214; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3215; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3216; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
3217; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
3218; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3219; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3220; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3221; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3222; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3223; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3224; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3225; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3226; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3227; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3228; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3229; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3230; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3231; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3232; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3233; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3234; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3235; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3236; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3237; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3238; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3239; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3240; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3241; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3242; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
3243; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
3244; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
3245; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
3246; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
3247; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
3248; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
3249; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
3250; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
3251; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
3252; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
3253; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
3254; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
3255; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
3256; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
3257; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
3258; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
3259; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
3260; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
3261; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
3262; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
3263; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
3264; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
3265; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
3266; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
3267; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
3268; CHECK-NEXT:    store <4 x i16> [[TMP104]], ptr addrspace(1) [[OUT:%.*]], align 8
3269; CHECK-NEXT:    ret void
3270;
3271; GFX6-LABEL: srem_v4i16:
3272; GFX6:       ; %bb.0:
3273; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
3274; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3275; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3276; GFX6-NEXT:    s_mov_b32 s2, -1
3277; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3278; GFX6-NEXT:    s_sext_i32_i16 s4, s10
3279; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
3280; GFX6-NEXT:    s_sext_i32_i16 s5, s8
3281; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
3282; GFX6-NEXT:    s_xor_b32 s4, s5, s4
3283; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3284; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3285; GFX6-NEXT:    s_or_b32 s6, s4, 1
3286; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3287; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3288; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3289; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3290; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3291; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3292; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
3293; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
3294; GFX6-NEXT:    s_ashr_i32 s4, s10, 16
3295; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
3296; GFX6-NEXT:    s_ashr_i32 s5, s8, 16
3297; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
3298; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
3299; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3300; GFX6-NEXT:    s_xor_b32 s4, s5, s4
3301; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3302; GFX6-NEXT:    s_lshr_b32 s6, s8, 16
3303; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
3304; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3305; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
3306; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
3307; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
3308; GFX6-NEXT:    s_lshr_b32 s7, s10, 16
3309; GFX6-NEXT:    s_or_b32 s8, s4, 1
3310; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
3311; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3312; GFX6-NEXT:    s_cselect_b32 s4, s8, 0
3313; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v3
3314; GFX6-NEXT:    s_sext_i32_i16 s4, s11
3315; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
3316; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
3317; GFX6-NEXT:    s_sext_i32_i16 s5, s9
3318; GFX6-NEXT:    s_xor_b32 s4, s5, s4
3319; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v1
3320; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
3321; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3322; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3323; GFX6-NEXT:    s_or_b32 s6, s4, 1
3324; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3325; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
3326; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3327; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
3328; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3329; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v2|
3330; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3331; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
3332; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v4
3333; GFX6-NEXT:    s_ashr_i32 s4, s11, 16
3334; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
3335; GFX6-NEXT:    s_ashr_i32 s5, s9, 16
3336; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
3337; GFX6-NEXT:    s_xor_b32 s4, s5, s4
3338; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3339; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3340; GFX6-NEXT:    s_lshr_b32 s6, s9, 16
3341; GFX6-NEXT:    s_lshr_b32 s7, s11, 16
3342; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3343; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3344; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
3345; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3346; GFX6-NEXT:    s_or_b32 s8, s4, 1
3347; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
3348; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3349; GFX6-NEXT:    s_cselect_b32 s4, s8, 0
3350; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v5
3351; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
3352; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
3353; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s9, v1
3354; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
3355; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3356; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3357; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3358; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
3359; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3360; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3361; GFX6-NEXT:    s_endpgm
3362;
3363; GFX9-LABEL: srem_v4i16:
3364; GFX9:       ; %bb.0:
3365; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
3366; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
3367; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3368; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3369; GFX9-NEXT:    s_sext_i32_i16 s8, s2
3370; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
3371; GFX9-NEXT:    s_sext_i32_i16 s9, s0
3372; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
3373; GFX9-NEXT:    s_xor_b32 s4, s9, s8
3374; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3375; GFX9-NEXT:    s_ashr_i32 s4, s4, 30
3376; GFX9-NEXT:    s_or_b32 s10, s4, 1
3377; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3378; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3379; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3380; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3381; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3382; GFX9-NEXT:    s_cselect_b32 s4, s10, 0
3383; GFX9-NEXT:    s_ashr_i32 s10, s0, 16
3384; GFX9-NEXT:    s_ashr_i32 s0, s2, 16
3385; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3386; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3387; GFX9-NEXT:    s_xor_b32 s2, s10, s0
3388; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
3389; GFX9-NEXT:    v_add_u32_e32 v1, s4, v3
3390; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s10
3391; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3392; GFX9-NEXT:    s_or_b32 s2, s2, 1
3393; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
3394; GFX9-NEXT:    s_sext_i32_i16 s8, s1
3395; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
3396; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3397; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
3398; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3399; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
3400; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3401; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
3402; GFX9-NEXT:    v_add_u32_e32 v0, s2, v4
3403; GFX9-NEXT:    s_sext_i32_i16 s2, s3
3404; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s2
3405; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s8
3406; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
3407; GFX9-NEXT:    s_xor_b32 s0, s8, s2
3408; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3409; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3410; GFX9-NEXT:    s_or_b32 s0, s0, 1
3411; GFX9-NEXT:    v_sub_u32_e32 v0, s10, v0
3412; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
3413; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3414; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
3415; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
3416; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3417; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3418; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
3419; GFX9-NEXT:    s_ashr_i32 s3, s3, 16
3420; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s3
3421; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
3422; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s2
3423; GFX9-NEXT:    s_ashr_i32 s2, s1, 16
3424; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s2
3425; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3426; GFX9-NEXT:    s_xor_b32 s0, s2, s3
3427; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3428; GFX9-NEXT:    s_or_b32 s4, s0, 1
3429; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3430; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3431; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
3432; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3433; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
3434; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3435; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3436; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
3437; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s3
3438; GFX9-NEXT:    v_sub_u32_e32 v5, s9, v1
3439; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v3
3440; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3441; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v4
3442; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
3443; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v5
3444; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
3445; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
3446; GFX9-NEXT:    s_endpgm
3447  %r = srem <4 x i16> %x, %y
3448  store <4 x i16> %r, ptr addrspace(1) %out
3449  ret void
3450}
3451
; Unsigned divide of two i3 operands, stored to %out.
; CHECK lines (opt -amdgpu-codegenprepare) expect both operands zero-extended
; to i32 and converted to float, the quotient estimated as trunc(x * rcp(y)),
; incremented by 1 when |fmad(-q, y, x)| >= |y|, and the sum masked with 7
; before being truncated back to i3.
; GFX6/GFX9 lines are the llc output for tahiti/gfx900 (see the RUN lines).
; All assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
3452define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
3453; CHECK-LABEL: @udiv_i3(
3454; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3455; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3456; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3457; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3458; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3459; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3460; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3461; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3462; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3463; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3464; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3465; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3466; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3467; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3468; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3469; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
3470; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
3471; CHECK-NEXT:    store i3 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 1
3472; CHECK-NEXT:    ret void
3473;
3474; GFX6-LABEL: udiv_i3:
3475; GFX6:       ; %bb.0:
3476; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
3477; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3478; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3479; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3480; GFX6-NEXT:    s_bfe_u32 s2, s6, 0x30008
3481; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
3482; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3483; GFX6-NEXT:    s_and_b32 s4, s6, 7
3484; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
3485; GFX6-NEXT:    s_mov_b32 s2, -1
3486; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3487; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3488; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3489; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3490; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3491; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3492; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3493; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3494; GFX6-NEXT:    s_endpgm
3495;
3496; GFX9-LABEL: udiv_i3:
3497; GFX9:       ; %bb.0:
3498; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
3499; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3500; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3501; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3502; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x30008
3503; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
3504; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3505; GFX9-NEXT:    s_and_b32 s2, s2, 7
3506; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
3507; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
3508; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3509; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
3510; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
3511; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3512; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
3513; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3514; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
3515; GFX9-NEXT:    s_endpgm
3516  %r = udiv i3 %x, %y
3517  store i3 %r, ptr addrspace(1) %out
3518  ret void
3519}
3520
; Unsigned remainder of two i3 operands, stored to %out.
; CHECK lines (opt -amdgpu-codegenprepare) expect both operands zero-extended
; to i32 and converted to float, the quotient estimated as trunc(x * rcp(y))
; and incremented by 1 when |fmad(-q, y, x)| >= |y|. The remainder is then
; x - q * y, masked with 7 before being truncated back to i3.
; GFX6/GFX9 lines are the llc output for tahiti/gfx900 (see the RUN lines).
; All assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
3521define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
3522; CHECK-LABEL: @urem_i3(
3523; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3524; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3525; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3526; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3527; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3528; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3529; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3530; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3531; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3532; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3533; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3534; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3535; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3536; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3537; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3538; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
3539; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
3540; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
3541; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
3542; CHECK-NEXT:    store i3 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 1
3543; CHECK-NEXT:    ret void
3544;
3545; GFX6-LABEL: urem_i3:
3546; GFX6:       ; %bb.0:
3547; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
3548; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3549; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3550; GFX6-NEXT:    s_bfe_u32 s2, s6, 0x30008
3551; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
3552; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3553; GFX6-NEXT:    s_and_b32 s3, s6, 7
3554; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s3
3555; GFX6-NEXT:    s_lshr_b32 s2, s6, 8
3556; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3557; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3558; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3559; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3560; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3561; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3562; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3563; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
3564; GFX6-NEXT:    s_mov_b32 s2, -1
3565; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
3566; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3567; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3568; GFX6-NEXT:    s_endpgm
3569;
3570; GFX9-LABEL: urem_i3:
3571; GFX9:       ; %bb.0:
3572; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
3573; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3574; GFX9-NEXT:    s_bfe_u32 s0, s2, 0x30008
3575; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
3576; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3577; GFX9-NEXT:    s_and_b32 s1, s2, 7
3578; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s1
3579; GFX9-NEXT:    s_lshr_b32 s0, s2, 8
3580; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
3581; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3582; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
3583; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
3584; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3585; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3586; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3587; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
3588; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3589; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
3590; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3591; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3592; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3593; GFX9-NEXT:    s_endpgm
3594  %r = urem i3 %x, %y
3595  store i3 %r, ptr addrspace(1) %out
3596  ret void
3597}
3598
; Signed divide of two i3 operands, stored to %out.
; CHECK lines (opt -amdgpu-codegenprepare) expect both operands sign-extended
; to i32 and converted to float, the quotient estimated as trunc(x * rcp(y)),
; and adjusted by or(ashr(xor(x, y), 30), 1) when |fmad(-q, y, x)| >= |y|.
; The result is re-sign-extended from 3 bits (shl/ashr by 29) before being
; truncated back to i3.
; GFX6/GFX9 lines are the llc output for tahiti/gfx900 (see the RUN lines).
; All assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand.
3599define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
3600; CHECK-LABEL: @sdiv_i3(
3601; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3602; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3603; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3604; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3605; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3606; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3607; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3608; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3609; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3610; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3611; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3612; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3613; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3614; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3615; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3616; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3617; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3618; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3619; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
3620; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
3621; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
3622; CHECK-NEXT:    store i3 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 1
3623; CHECK-NEXT:    ret void
3624;
3625; GFX6-LABEL: sdiv_i3:
3626; GFX6:       ; %bb.0:
3627; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
3628; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3629; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3630; GFX6-NEXT:    s_mov_b32 s2, -1
3631; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3632; GFX6-NEXT:    s_bfe_i32 s4, s6, 0x30008
3633; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
3634; GFX6-NEXT:    s_bfe_i32 s5, s6, 0x30000
3635; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
3636; GFX6-NEXT:    s_xor_b32 s4, s5, s4
3637; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3638; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3639; GFX6-NEXT:    s_or_b32 s6, s4, 1
3640; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3641; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3642; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3643; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3644; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3645; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
3646; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
3647; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
3648; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3649; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3650; GFX6-NEXT:    s_endpgm
3651;
3652; GFX9-LABEL: sdiv_i3:
3653; GFX9:       ; %bb.0:
3654; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
3655; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3656; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3657; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3658; GFX9-NEXT:    s_bfe_i32 s3, s2, 0x30008
3659; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s3
3660; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x30000
3661; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s2
3662; GFX9-NEXT:    s_xor_b32 s2, s2, s3
3663; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3664; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
3665; GFX9-NEXT:    s_or_b32 s4, s2, 1
3666; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
3667; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3668; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
3669; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3670; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
3671; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
3672; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
3673; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
3674; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3675; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3676; GFX9-NEXT:    s_endpgm
3677  %r = sdiv i3 %x, %y
3678  store i3 %r, ptr addrspace(1) %out
3679  ret void
3680}
3681
; Signed remainder of two i3 kernel arguments; the result is stored through %out.
; The opt assertions below expect both operands to be sign-extended to i32 and
; the quotient to be computed with a float reciprocal sequence (rcp, fmul,
; trunc, fmad.ftz) plus a correction term that is either 0 or
; ((x ^ y) >> 30) | 1. The remainder is then formed as x - q * y, sign-extended
; in-register from 3 bits (shl/ashr by 29) and truncated back to i3.
; The two llc runs pin the resulting tahiti and gfx900 machine code.
3682define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
3683; CHECK-LABEL: @srem_i3(
3684; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3685; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3686; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3687; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3688; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3689; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3690; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3691; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3692; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3693; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3694; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3695; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3696; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3697; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3698; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3699; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3700; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3701; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3702; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
3703; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
3704; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
3705; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
3706; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
3707; CHECK-NEXT:    store i3 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 1
3708; CHECK-NEXT:    ret void
3709;
3710; GFX6-LABEL: srem_i3:
3711; GFX6:       ; %bb.0:
3712; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
3713; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3714; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3715; GFX6-NEXT:    s_bfe_i32 s2, s6, 0x30008
3716; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
3717; GFX6-NEXT:    s_bfe_i32 s3, s6, 0x30000
3718; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
3719; GFX6-NEXT:    s_xor_b32 s2, s3, s2
3720; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3721; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
3722; GFX6-NEXT:    s_lshr_b32 s4, s6, 8
3723; GFX6-NEXT:    s_or_b32 s5, s2, 1
3724; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3725; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3726; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3727; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3728; GFX6-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
3729; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], exec
3730; GFX6-NEXT:    s_cselect_b32 s2, s5, 0
3731; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
3732; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
3733; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3734; GFX6-NEXT:    s_mov_b32 s2, -1
3735; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
3736; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3737; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3738; GFX6-NEXT:    s_endpgm
3739;
3740; GFX9-LABEL: srem_i3:
3741; GFX9:       ; %bb.0:
3742; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
3743; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3744; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x30008
3745; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3746; GFX9-NEXT:    s_bfe_i32 s1, s2, 0x30000
3747; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
3748; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3749; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3750; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3751; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
3752; GFX9-NEXT:    s_or_b32 s6, s0, 1
3753; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
3754; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3755; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
3756; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
3757; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3758; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3759; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
3760; GFX9-NEXT:    v_add_u32_e32 v0, s0, v2
3761; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
3762; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3763; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3764; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
3765; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3766; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3767; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3768; GFX9-NEXT:    s_endpgm
3769  %r = srem i3 %x, %y
3770  store i3 %r, ptr addrspace(1) %out
3771  ret void
3772}
3773
; Unsigned division of two <3 x i16> kernel arguments; the result is stored
; through %out.
; The opt assertions below expect the vector operation to be split into three
; scalar lanes (extractelement 0, 1 and 2). Each lane is zero-extended to i32,
; divided with a float reciprocal sequence (rcp, fmul, trunc, fmad.ftz) plus a
; 0/1 correction, masked with 65535, truncated to i16 and inserted back into
; the result vector.
; The two llc runs pin the resulting tahiti and gfx900 machine code.
3774define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
3775; CHECK-LABEL: @udiv_v3i16(
3776; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3777; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3778; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3779; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3780; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3781; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3782; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3783; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3784; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3785; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3786; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3787; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3788; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3789; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3790; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3791; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3792; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3793; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
3794; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
3795; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> poison, i16 [[TMP19]], i64 0
3796; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
3797; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3798; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
3799; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
3800; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3801; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3802; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3803; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3804; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3805; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3806; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3807; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3808; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3809; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3810; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3811; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3812; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3813; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
3814; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
3815; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
3816; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
3817; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3818; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
3819; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
3820; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3821; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3822; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3823; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3824; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3825; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3826; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3827; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3828; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3829; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3830; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3831; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3832; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3833; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
3834; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
3835; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
3836; CHECK-NEXT:    store <3 x i16> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8
3837; CHECK-NEXT:    ret void
3838;
3839; GFX6-LABEL: udiv_v3i16:
3840; GFX6:       ; %bb.0:
3841; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
3842; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3843; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3844; GFX6-NEXT:    s_mov_b32 s2, -1
3845; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3846; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
3847; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
3848; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
3849; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
3850; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
3851; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
3852; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3853; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
3854; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
3855; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3856; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
3857; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3858; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
3859; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3860; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
3861; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3862; GFX6-NEXT:    s_and_b32 s4, s11, 0xffff
3863; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
3864; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
3865; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
3866; GFX6-NEXT:    s_and_b32 s4, s9, 0xffff
3867; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
3868; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
3869; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3870; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
3871; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
3872; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3873; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
3874; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3875; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
3876; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3877; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
3878; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3879; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
3880; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3881; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3882; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
3883; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3884; GFX6-NEXT:    s_endpgm
3885;
3886; GFX9-LABEL: udiv_v3i16:
3887; GFX9:       ; %bb.0:
3888; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
3889; GFX9-NEXT:    v_mov_b32_e32 v6, 0
3890; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3891; GFX9-NEXT:    s_and_b32 s7, s2, 0xffff
3892; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
3893; GFX9-NEXT:    s_and_b32 s6, s0, 0xffff
3894; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
3895; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
3896; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
3897; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3898; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
3899; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
3900; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
3901; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
3902; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3903; GFX9-NEXT:    s_and_b32 s0, s3, 0xffff
3904; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
3905; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
3906; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
3907; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
3908; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
3909; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
3910; GFX9-NEXT:    s_and_b32 s0, s1, 0xffff
3911; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
3912; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
3913; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
3914; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
3915; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3916; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3917; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
3918; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
3919; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3920; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
3921; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
3922; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
3923; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3924; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3925; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
3926; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
3927; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3928; GFX9-NEXT:    global_store_short v6, v2, s[6:7] offset:4
3929; GFX9-NEXT:    global_store_dword v6, v0, s[6:7]
3930; GFX9-NEXT:    s_endpgm
3931  %r = udiv <3 x i16> %x, %y
3932  store <3 x i16> %r, ptr addrspace(1) %out
3933  ret void
3934}
3935
; Unsigned remainder of two <3 x i16> kernel arguments; the result is stored
; through %out.
; The opt assertions below expect the vector operation to be split into three
; scalar lanes (extractelement 0, 1 and 2). Each lane is zero-extended to i32
; and its quotient is computed with a float reciprocal sequence (rcp, fmul,
; trunc, fmad.ftz) plus a 0/1 correction. The remainder is then formed as
; x - q * y, masked with 65535, truncated to i16 and inserted back into the
; result vector.
; The two llc runs pin the resulting tahiti and gfx900 machine code.
3936define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
3937; CHECK-LABEL: @urem_v3i16(
3938; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3939; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3940; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3941; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3942; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3943; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3944; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3945; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3946; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3947; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3948; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3949; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3950; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3951; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3952; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3953; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3954; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3955; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3956; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3957; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
3958; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
3959; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> poison, i16 [[TMP21]], i64 0
3960; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
3961; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3962; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
3963; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
3964; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3965; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3966; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3967; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3968; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3969; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3970; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3971; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3972; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3973; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3974; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3975; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3976; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3977; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3978; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3979; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
3980; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
3981; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
3982; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
3983; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3984; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
3985; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
3986; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3987; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3988; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3989; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3990; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3991; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3992; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3993; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3994; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3995; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3996; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3997; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3998; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3999; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
4000; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
4001; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
4002; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
4003; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
4004; CHECK-NEXT:    store <3 x i16> [[TMP66]], ptr addrspace(1) [[OUT:%.*]], align 8
4005; CHECK-NEXT:    ret void
4006;
4007; GFX6-LABEL: urem_v3i16:
4008; GFX6:       ; %bb.0:
4009; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
4010; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4011; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4012; GFX6-NEXT:    s_mov_b32 s2, -1
4013; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4014; GFX6-NEXT:    s_and_b32 s5, s10, 0xffff
4015; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
4016; GFX6-NEXT:    s_lshr_b32 s5, s10, 16
4017; GFX6-NEXT:    s_and_b32 s4, s8, 0xffff
4018; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
4019; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s4
4020; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4021; GFX6-NEXT:    s_lshr_b32 s4, s8, 16
4022; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
4023; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
4024; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
4025; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4026; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
4027; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
4028; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
4029; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
4030; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4031; GFX6-NEXT:    s_and_b32 s6, s11, 0xffff
4032; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
4033; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
4034; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
4035; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
4036; GFX6-NEXT:    s_and_b32 s6, s9, 0xffff
4037; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s6
4038; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
4039; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
4040; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4041; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
4042; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
4043; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4044; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
4045; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4046; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
4047; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
4048; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s5
4049; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
4050; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s11
4051; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
4052; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4053; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v2
4054; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4055; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4056; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
4057; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4058; GFX6-NEXT:    s_endpgm
4059;
4060; GFX9-LABEL: urem_v3i16:
4061; GFX9:       ; %bb.0:
4062; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
4063; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
4064; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4065; GFX9-NEXT:    s_and_b32 s9, s2, 0xffff
4066; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
4067; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
4068; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
4069; GFX9-NEXT:    s_and_b32 s8, s0, 0xffff
4070; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
4071; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
4072; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4073; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s0
4074; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4075; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
4076; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
4077; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4078; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
4079; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4080; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
4081; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v4
4082; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
4083; GFX9-NEXT:    v_mad_f32 v2, -v5, v1, v3
4084; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s3
4085; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
4086; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4087; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v5
4088; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s1
4089; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
4090; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
4091; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
4092; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
4093; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4094; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
4095; GFX9-NEXT:    v_mad_f32 v2, -v2, v3, v5
4096; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
4097; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
4098; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
4099; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
4100; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
4101; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
4102; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4103; GFX9-NEXT:    v_sub_u32_e32 v1, s0, v1
4104; GFX9-NEXT:    v_sub_u32_e32 v2, s1, v2
4105; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4106; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
4107; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
4108; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
4109; GFX9-NEXT:    s_endpgm
4110  %r = urem <3 x i16> %x, %y
4111  store <3 x i16> %r, ptr addrspace(1) %out
4112  ret void
4113}
4114
4115define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
4116; CHECK-LABEL: @sdiv_v3i16(
4117; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4118; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4119; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4120; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4121; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4122; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4123; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4124; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4125; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4126; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4127; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4128; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4129; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4130; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4131; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4132; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4133; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4134; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4135; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4136; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4137; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
4138; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
4139; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
4140; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> poison, i16 [[TMP23]], i64 0
4141; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
4142; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4143; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
4144; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
4145; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4146; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4147; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4148; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4149; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4150; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4151; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4152; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4153; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4154; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4155; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4156; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4157; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4158; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4159; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4160; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4161; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
4162; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
4163; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
4164; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
4165; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
4166; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4167; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
4168; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
4169; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4170; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4171; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4172; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4173; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4174; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4175; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4176; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4177; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4178; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4179; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4180; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4181; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4182; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4183; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4184; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4185; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
4186; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
4187; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
4188; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
4189; CHECK-NEXT:    store <3 x i16> [[TMP72]], ptr addrspace(1) [[OUT:%.*]], align 8
4190; CHECK-NEXT:    ret void
4191;
4192; GFX6-LABEL: sdiv_v3i16:
4193; GFX6:       ; %bb.0:
4194; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
4195; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4196; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4197; GFX6-NEXT:    s_mov_b32 s2, -1
4198; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4199; GFX6-NEXT:    s_sext_i32_i16 s4, s10
4200; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
4201; GFX6-NEXT:    s_sext_i32_i16 s5, s8
4202; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
4203; GFX6-NEXT:    s_xor_b32 s4, s5, s4
4204; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4205; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4206; GFX6-NEXT:    s_or_b32 s6, s4, 1
4207; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4208; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4209; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4210; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
4211; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4212; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4213; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
4214; GFX6-NEXT:    s_ashr_i32 s5, s10, 16
4215; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
4216; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v2
4217; GFX6-NEXT:    s_ashr_i32 s4, s8, 16
4218; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
4219; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4220; GFX6-NEXT:    s_xor_b32 s4, s4, s5
4221; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4222; GFX6-NEXT:    s_or_b32 s6, s4, 1
4223; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
4224; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4225; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
4226; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
4227; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4228; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
4229; GFX6-NEXT:    s_sext_i32_i16 s5, s11
4230; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
4231; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
4232; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
4233; GFX6-NEXT:    s_sext_i32_i16 s4, s9
4234; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
4235; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4236; GFX6-NEXT:    s_xor_b32 s4, s4, s5
4237; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4238; GFX6-NEXT:    s_or_b32 s6, s4, 1
4239; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4240; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4241; GFX6-NEXT:    v_mad_f32 v3, -v4, v0, v3
4242; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
4243; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
4244; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4245; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
4246; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v4
4247; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
4248; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4249; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
4250; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
4251; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
4252; GFX6-NEXT:    s_endpgm
4253;
4254; GFX9-LABEL: sdiv_v3i16:
4255; GFX9:       ; %bb.0:
4256; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
4257; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
4258; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4259; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4260; GFX9-NEXT:    s_sext_i32_i16 s4, s2
4261; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
4262; GFX9-NEXT:    s_sext_i32_i16 s5, s0
4263; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s5
4264; GFX9-NEXT:    s_xor_b32 s4, s5, s4
4265; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4266; GFX9-NEXT:    s_ashr_i32 s4, s4, 30
4267; GFX9-NEXT:    s_or_b32 s8, s4, 1
4268; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4269; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4270; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4271; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
4272; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4273; GFX9-NEXT:    s_cselect_b32 s4, s8, 0
4274; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
4275; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4276; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
4277; GFX9-NEXT:    s_ashr_i32 s0, s0, 16
4278; GFX9-NEXT:    v_add_u32_e32 v2, s4, v3
4279; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
4280; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4281; GFX9-NEXT:    s_xor_b32 s0, s0, s2
4282; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4283; GFX9-NEXT:    s_sext_i32_i16 s2, s3
4284; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4285; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4286; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
4287; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4288; GFX9-NEXT:    s_or_b32 s0, s0, 1
4289; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
4290; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
4291; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4292; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
4293; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
4294; GFX9-NEXT:    s_sext_i32_i16 s0, s1
4295; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
4296; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
4297; GFX9-NEXT:    s_xor_b32 s0, s0, s2
4298; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4299; GFX9-NEXT:    s_or_b32 s2, s0, 1
4300; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4301; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4302; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
4303; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
4304; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
4305; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4306; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
4307; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
4308; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4309; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
4310; GFX9-NEXT:    global_store_short v1, v0, s[6:7] offset:4
4311; GFX9-NEXT:    global_store_dword v1, v2, s[6:7]
4312; GFX9-NEXT:    s_endpgm
4313  %r = sdiv <3 x i16> %x, %y
4314  store <3 x i16> %r, ptr addrspace(1) %out
4315  ret void
4316}
4317
; Signed remainder of two <3 x i16> kernel arguments, stored to %out.
;
; Expected IR from the opt invocation: the vector srem is scalarized lane by
; lane. Each lane is sign-extended to i32, a quotient estimate is computed in
; float (sitofp, rcp, fmul, trunc, fmad), and a correction of
; ((x ^ y) >> 30) | 1 is added when |residual| >= |divisor|. The remainder is
; then formed as x - q * y, sign-extended from 16 bits (shl/ashr by 16) and
; truncated back to i16.
;
; Expected ISA from the tahiti and gfx900 llc invocations: lanes 0 and 1 are
; packed into one dword store and lane 2 is written with a short store at
; offset 4.
4318define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
4319; CHECK-LABEL: @srem_v3i16(
4320; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4321; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4322; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4323; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4324; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4325; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4326; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4327; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4328; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4329; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4330; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4331; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4332; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4333; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4334; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4335; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4336; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4337; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4338; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4339; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4340; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
4341; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
4342; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
4343; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
4344; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
4345; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> poison, i16 [[TMP25]], i64 0
4346; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
4347; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4348; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
4349; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
4350; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
4351; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
4352; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
4353; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
4354; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
4355; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
4356; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
4357; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
4358; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
4359; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
4360; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
4361; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
4362; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
4363; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
4364; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
4365; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
4366; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
4367; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
4368; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
4369; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
4370; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
4371; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
4372; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
4373; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4374; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
4375; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
4376; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
4377; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
4378; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
4379; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
4380; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
4381; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
4382; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
4383; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
4384; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
4385; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
4386; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
4387; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
4388; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
4389; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
4390; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
4391; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
4392; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
4393; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
4394; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
4395; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
4396; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
4397; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
4398; CHECK-NEXT:    store <3 x i16> [[TMP78]], ptr addrspace(1) [[OUT:%.*]], align 8
4399; CHECK-NEXT:    ret void
4400;
4401; GFX6-LABEL: srem_v3i16:
4402; GFX6:       ; %bb.0:
4403; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
4404; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4405; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4406; GFX6-NEXT:    s_mov_b32 s2, -1
4407; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4408; GFX6-NEXT:    s_sext_i32_i16 s4, s10
4409; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
4410; GFX6-NEXT:    s_sext_i32_i16 s5, s8
4411; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
4412; GFX6-NEXT:    s_xor_b32 s4, s5, s4
4413; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4414; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4415; GFX6-NEXT:    s_or_b32 s6, s4, 1
4416; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4417; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4418; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4419; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4420; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
4421; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4422; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
4423; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
4424; GFX6-NEXT:    s_ashr_i32 s4, s10, 16
4425; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
4426; GFX6-NEXT:    s_ashr_i32 s5, s8, 16
4427; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
4428; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s10
4429; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
4430; GFX6-NEXT:    s_xor_b32 s4, s5, s4
4431; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4432; GFX6-NEXT:    s_lshr_b32 s6, s8, 16
4433; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
4434; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4435; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
4436; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
4437; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
4438; GFX6-NEXT:    s_lshr_b32 s7, s10, 16
4439; GFX6-NEXT:    s_or_b32 s8, s4, 1
4440; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
4441; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4442; GFX6-NEXT:    s_cselect_b32 s4, s8, 0
4443; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v3
4444; GFX6-NEXT:    s_sext_i32_i16 s4, s11
4445; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
4446; GFX6-NEXT:    s_sext_i32_i16 s5, s9
4447; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
4448; GFX6-NEXT:    s_xor_b32 s4, s5, s4
4449; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4450; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4451; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
4452; GFX6-NEXT:    s_or_b32 s7, s4, 1
4453; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4454; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4455; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
4456; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
4457; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
4458; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4459; GFX6-NEXT:    s_cselect_b32 s4, s7, 0
4460; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
4461; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s11
4462; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
4463; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4464; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v2
4465; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4466; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4467; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
4468; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4469; GFX6-NEXT:    s_endpgm
4470;
4471; GFX9-LABEL: srem_v3i16:
4472; GFX9:       ; %bb.0:
4473; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
4474; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
4475; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4476; GFX9-NEXT:    s_sext_i32_i16 s8, s2
4477; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
4478; GFX9-NEXT:    s_sext_i32_i16 s9, s0
4479; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
4480; GFX9-NEXT:    s_xor_b32 s4, s9, s8
4481; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4482; GFX9-NEXT:    s_ashr_i32 s4, s4, 30
4483; GFX9-NEXT:    s_or_b32 s10, s4, 1
4484; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
4485; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4486; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
4487; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
4488; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4489; GFX9-NEXT:    s_cselect_b32 s4, s10, 0
4490; GFX9-NEXT:    s_ashr_i32 s10, s0, 16
4491; GFX9-NEXT:    s_ashr_i32 s0, s2, 16
4492; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
4493; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
4494; GFX9-NEXT:    s_xor_b32 s2, s10, s0
4495; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
4496; GFX9-NEXT:    v_add_u32_e32 v1, s4, v2
4497; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s10
4498; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4499; GFX9-NEXT:    s_or_b32 s2, s2, 1
4500; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
4501; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4502; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4503; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4504; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4505; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
4506; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
4507; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
4508; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
4509; GFX9-NEXT:    s_sext_i32_i16 s2, s3
4510; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s2
4511; GFX9-NEXT:    s_sext_i32_i16 s3, s1
4512; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s3
4513; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
4514; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4515; GFX9-NEXT:    s_xor_b32 s0, s3, s2
4516; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4517; GFX9-NEXT:    s_or_b32 s4, s0, 1
4518; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4519; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4520; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
4521; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4522; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
4523; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4524; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4525; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
4526; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
4527; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
4528; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4529; GFX9-NEXT:    v_sub_u32_e32 v0, s10, v0
4530; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
4531; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4532; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
4533; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
4534; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
4535; GFX9-NEXT:    s_endpgm
4536  %r = srem <3 x i16> %x, %y
4537  store <3 x i16> %r, ptr addrspace(1) %out
4538  ret void
4539}
4540
; Unsigned division of two <3 x i15> kernel arguments, stored to %out.
;
; Expected IR from the opt invocation: the vector udiv is scalarized lane by
; lane. Each lane is zero-extended to i32, a quotient estimate is computed in
; float (uitofp, rcp, fmul, trunc, fmad), and 1 is added when
; |residual| >= |divisor|. The result is masked to 15 bits (and with 32767)
; and truncated back to i15.
;
; Expected ISA from the tahiti and gfx900 llc invocations: the three results
; are packed at shifts of 0, 15 and 30 bits, then written as one dword store
; plus a short store at offset 4 whose value is masked with 0x1fff.
4541define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
4542; CHECK-LABEL: @udiv_v3i15(
4543; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4544; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4545; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4546; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4547; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4548; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4549; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4550; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4551; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4552; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4553; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4554; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4555; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4556; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4557; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4558; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4559; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4560; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
4561; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
4562; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> poison, i15 [[TMP19]], i64 0
4563; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
4564; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4565; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
4566; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
4567; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
4568; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
4569; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
4570; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
4571; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
4572; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
4573; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
4574; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
4575; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
4576; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
4577; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
4578; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
4579; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
4580; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
4581; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
4582; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
4583; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
4584; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4585; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
4586; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
4587; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
4588; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
4589; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
4590; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
4591; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
4592; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
4593; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
4594; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
4595; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
4596; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
4597; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
4598; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
4599; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
4600; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
4601; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
4602; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
4603; CHECK-NEXT:    store <3 x i15> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8
4604; CHECK-NEXT:    ret void
4605;
4606; GFX6-LABEL: udiv_v3i15:
4607; GFX6:       ; %bb.0:
4608; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
4609; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
4610; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4611; GFX6-NEXT:    s_mov_b32 s2, -1
4612; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4613; GFX6-NEXT:    s_and_b32 s6, s10, 0x7fff
4614; GFX6-NEXT:    s_and_b32 s7, s4, 0x7fff
4615; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
4616; GFX6-NEXT:    v_mov_b32_e32 v2, s4
4617; GFX6-NEXT:    s_bfe_u32 s4, s4, 0xf000f
4618; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
4619; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4620; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
4621; GFX6-NEXT:    s_bfe_u32 s7, s10, 0xf000f
4622; GFX6-NEXT:    v_alignbit_b32 v2, s5, v2, 30
4623; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4624; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s7
4625; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
4626; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
4627; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4628; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4629; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4630; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v2
4631; GFX6-NEXT:    v_mov_b32_e32 v0, s10
4632; GFX6-NEXT:    v_alignbit_b32 v0, s11, v0, 30
4633; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4634; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
4635; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4636; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4637; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4638; GFX6-NEXT:    v_mad_f32 v4, -v1, v5, v6
4639; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4640; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
4641; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
4642; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
4643; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4644; GFX6-NEXT:    v_mul_f32_e32 v1, v0, v6
4645; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4646; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v1
4647; GFX6-NEXT:    v_mad_f32 v0, -v1, v2, v0
4648; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
4649; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
4650; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
4651; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
4652; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4653; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4654; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
4655; GFX6-NEXT:    s_mov_b32 s0, s8
4656; GFX6-NEXT:    s_mov_b32 s1, s9
4657; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4658; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4659; GFX6-NEXT:    s_waitcnt expcnt(0)
4660; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4661; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
4662; GFX6-NEXT:    s_endpgm
4663;
4664; GFX9-LABEL: udiv_v3i15:
4665; GFX9:       ; %bb.0:
4666; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4667; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4668; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4669; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4670; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4671; GFX9-NEXT:    s_and_b32 s5, s6, 0x7fff
4672; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
4673; GFX9-NEXT:    s_and_b32 s4, s2, 0x7fff
4674; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4675; GFX9-NEXT:    s_bfe_u32 s3, s6, 0xf000f
4676; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
4677; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4678; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s3
4679; GFX9-NEXT:    s_bfe_u32 s2, s2, 0xf000f
4680; GFX9-NEXT:    v_mov_b32_e32 v3, s6
4681; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
4682; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4683; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
4684; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4685; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
4686; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4687; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4688; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4689; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
4690; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4691; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
4692; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4693; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4694; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
4695; GFX9-NEXT:    v_mad_f32 v5, -v1, v6, v7
4696; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
4697; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
4698; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
4699; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
4700; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
4701; GFX9-NEXT:    v_mul_f32_e32 v1, v0, v7
4702; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4703; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
4704; GFX9-NEXT:    v_mad_f32 v0, -v1, v3, v0
4705; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
4706; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
4707; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4708; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
4709; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4710; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4711; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4712; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4713; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
4714; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4715; GFX9-NEXT:    global_store_short v2, v0, s[0:1] offset:4
4716; GFX9-NEXT:    s_endpgm
4717  %r = udiv <3 x i15> %x, %y
4718  store <3 x i15> %r, ptr addrspace(1) %out
4719  ret void
4720}
4721
4722define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
4723; CHECK-LABEL: @urem_v3i15(
4724; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4725; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4726; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4727; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4728; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4729; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4730; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4731; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4732; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4733; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4734; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4735; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4736; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4737; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4738; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4739; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4740; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4741; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
4742; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
4743; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
4744; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
4745; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> poison, i15 [[TMP21]], i64 0
4746; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
4747; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4748; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
4749; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
4750; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
4751; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
4752; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
4753; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
4754; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
4755; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
4756; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
4757; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
4758; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4759; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
4760; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
4761; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
4762; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
4763; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
4764; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
4765; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
4766; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
4767; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
4768; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
4769; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4770; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
4771; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
4772; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
4773; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
4774; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
4775; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
4776; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
4777; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
4778; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
4779; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
4780; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
4781; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
4782; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
4783; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
4784; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
4785; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
4786; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
4787; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
4788; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
4789; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
4790; CHECK-NEXT:    store <3 x i15> [[TMP66]], ptr addrspace(1) [[OUT:%.*]], align 8
4791; CHECK-NEXT:    ret void
4792;
4793; GFX6-LABEL: urem_v3i15:
4794; GFX6:       ; %bb.0:
4795; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
4796; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
4797; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4798; GFX6-NEXT:    s_mov_b32 s2, -1
4799; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4800; GFX6-NEXT:    s_mov_b32 s0, s8
4801; GFX6-NEXT:    s_and_b32 s8, s4, 0x7fff
4802; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
4803; GFX6-NEXT:    s_and_b32 s7, s10, 0x7fff
4804; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s7
4805; GFX6-NEXT:    v_mov_b32_e32 v2, s4
4806; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4807; GFX6-NEXT:    v_alignbit_b32 v2, s5, v2, 30
4808; GFX6-NEXT:    s_bfe_u32 s5, s4, 0xf000f
4809; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s5
4810; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4811; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4812; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4813; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4814; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4815; GFX6-NEXT:    s_bfe_u32 s8, s10, 0xf000f
4816; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
4817; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
4818; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
4819; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
4820; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
4821; GFX6-NEXT:    v_mov_b32_e32 v0, s10
4822; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s10, v1
4823; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
4824; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
4825; GFX6-NEXT:    v_alignbit_b32 v0, s11, v0, 30
4826; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4827; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
4828; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
4829; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4830; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
4831; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4832; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
4833; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
4834; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4835; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
4836; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4837; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
4838; GFX6-NEXT:    s_lshr_b32 s4, s4, 15
4839; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
4840; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
4841; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
4842; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4843; GFX6-NEXT:    s_lshr_b32 s6, s10, 15
4844; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v1
4845; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
4846; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
4847; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4848; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v6
4849; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4850; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
4851; GFX6-NEXT:    s_mov_b32 s1, s9
4852; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4853; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4854; GFX6-NEXT:    s_waitcnt expcnt(0)
4855; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4856; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
4857; GFX6-NEXT:    s_endpgm
4858;
4859; GFX9-LABEL: urem_v3i15:
4860; GFX9:       ; %bb.0:
4861; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4862; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
4863; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4864; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4865; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4866; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4867; GFX9-NEXT:    s_and_b32 s3, s6, 0x7fff
4868; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
4869; GFX9-NEXT:    s_and_b32 s4, s2, 0x7fff
4870; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
4871; GFX9-NEXT:    s_bfe_u32 s4, s6, 0xf000f
4872; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4873; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s4
4874; GFX9-NEXT:    v_mov_b32_e32 v3, s6
4875; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
4876; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4877; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4878; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4879; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4880; GFX9-NEXT:    s_bfe_u32 s5, s2, 0xf000f
4881; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
4882; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4883; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s5
4884; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4885; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
4886; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
4887; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4888; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
4889; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
4890; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
4891; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4892; GFX9-NEXT:    v_mad_f32 v7, -v4, v6, v7
4893; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
4894; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
4895; GFX9-NEXT:    v_mul_f32_e32 v6, v8, v9
4896; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
4897; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
4898; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
4899; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v8
4900; GFX9-NEXT:    s_lshr_b32 s3, s6, 15
4901; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
4902; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s3
4903; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
4904; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
4905; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
4906; GFX9-NEXT:    s_lshr_b32 s3, s2, 15
4907; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v4
4908; GFX9-NEXT:    v_sub_u32_e32 v5, s2, v1
4909; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
4910; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
4911; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4912; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v5
4913; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4914; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4915; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4916; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
4917; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4918; GFX9-NEXT:    global_store_short v2, v0, s[0:1] offset:4
4919; GFX9-NEXT:    s_endpgm
4920  %r = urem <3 x i15> %x, %y
4921  store <3 x i15> %r, ptr addrspace(1) %out
4922  ret void
4923}
4924
; Signed division of two <3 x i15> kernel arguments, stored to %out.
; The IR-level assertions below show the divide being scalarized: each lane is
; extracted, sign-extended to i32, converted to float and divided through
; llvm.amdgcn.rcp.f32 + trunc, then adjusted by a +1/-1 term (derived from the
; xor of the operands) when |fmad remainder| >= |divisor|. The shl/ashr by 17
; pair re-sign-extends the 15-bit quotient before it is truncated and
; reinserted into the result vector.
; The GFX6/GFX9 assertions pin the resulting ISA, including the packing of the
; three 15-bit lanes into one dword store plus one short store at offset 4.
4925define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
4926; CHECK-LABEL: @sdiv_v3i15(
4927; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4928; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4929; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
4930; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
4931; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4932; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4933; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4934; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4935; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4936; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4937; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4938; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4939; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4940; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4941; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4942; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4943; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4944; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4945; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4946; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4947; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
4948; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
4949; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
4950; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> poison, i15 [[TMP23]], i64 0
4951; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
4952; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4953; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
4954; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
4955; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4956; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4957; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4958; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4959; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4960; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4961; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4962; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4963; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4964; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4965; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4966; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4967; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4968; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4969; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4970; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4971; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
4972; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
4973; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
4974; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
4975; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
4976; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4977; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
4978; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
4979; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4980; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4981; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4982; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4983; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4984; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4985; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4986; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4987; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4988; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4989; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4990; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4991; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4992; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4993; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4994; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4995; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
4996; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
4997; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
4998; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
4999; CHECK-NEXT:    store <3 x i15> [[TMP72]], ptr addrspace(1) [[OUT:%.*]], align 8
5000; CHECK-NEXT:    ret void
5001;
5002; GFX6-LABEL: sdiv_v3i15:
5003; GFX6:       ; %bb.0:
5004; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
5005; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
5006; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5007; GFX6-NEXT:    s_mov_b32 s2, -1
5008; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5009; GFX6-NEXT:    v_mov_b32_e32 v0, s10
5010; GFX6-NEXT:    s_bfe_i32 s6, s4, 0xf0000
5011; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s6
5012; GFX6-NEXT:    v_mov_b32_e32 v1, s4
5013; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 30
5014; GFX6-NEXT:    s_bfe_i32 s5, s10, 0xf0000
5015; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
5016; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5017; GFX6-NEXT:    s_xor_b32 s5, s5, s6
5018; GFX6-NEXT:    s_ashr_i32 s5, s5, 30
5019; GFX6-NEXT:    s_or_b32 s5, s5, 1
5020; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
5021; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
5022; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
5023; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v3|, |v2|
5024; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
5025; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
5026; GFX6-NEXT:    s_cselect_b32 s5, s5, 0
5027; GFX6-NEXT:    s_bfe_i32 s4, s4, 0xf000f
5028; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
5029; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s5, v4
5030; GFX6-NEXT:    s_bfe_i32 s5, s10, 0xf000f
5031; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
5032; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
5033; GFX6-NEXT:    s_xor_b32 s4, s5, s4
5034; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
5035; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
5036; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
5037; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5038; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
5039; GFX6-NEXT:    s_or_b32 s6, s4, 1
5040; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
5041; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
5042; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, v1
5043; GFX6-NEXT:    v_alignbit_b32 v0, s11, v0, 30
5044; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
5045; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
5046; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
5047; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s4, v5
5048; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
5049; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
5050; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
5051; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5052; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
5053; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
5054; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
5055; GFX6-NEXT:    v_mad_f32 v5, -v1, v2, v5
5056; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
5057; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v2|
5058; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5059; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
5060; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5061; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
5062; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5063; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5064; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
5065; GFX6-NEXT:    s_mov_b32 s0, s8
5066; GFX6-NEXT:    s_mov_b32 s1, s9
5067; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5068; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5069; GFX6-NEXT:    s_waitcnt expcnt(0)
5070; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5071; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
5072; GFX6-NEXT:    s_endpgm
5073;
5074; GFX9-LABEL: sdiv_v3i15:
5075; GFX9:       ; %bb.0:
5076; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5077; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5078; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5079; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5080; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5081; GFX9-NEXT:    s_bfe_i32 s4, s6, 0xf0000
5082; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
5083; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5084; GFX9-NEXT:    s_bfe_i32 s3, s2, 0xf0000
5085; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s3
5086; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5087; GFX9-NEXT:    s_xor_b32 s3, s3, s4
5088; GFX9-NEXT:    s_ashr_i32 s3, s3, 30
5089; GFX9-NEXT:    s_or_b32 s3, s3, 1
5090; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
5091; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
5092; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
5093; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
5094; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
5095; GFX9-NEXT:    s_cselect_b32 s3, s3, 0
5096; GFX9-NEXT:    s_bfe_i32 s4, s6, 0xf000f
5097; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
5098; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
5099; GFX9-NEXT:    s_bfe_i32 s2, s2, 0xf000f
5100; GFX9-NEXT:    v_mov_b32_e32 v1, s6
5101; GFX9-NEXT:    v_add_u32_e32 v4, s3, v5
5102; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s2
5103; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
5104; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
5105; GFX9-NEXT:    s_xor_b32 s2, s2, s4
5106; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
5107; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
5108; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5109; GFX9-NEXT:    v_mad_f32 v5, -v6, v3, v5
5110; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
5111; GFX9-NEXT:    s_or_b32 s4, s2, 1
5112; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
5113; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v5|, |v3|
5114; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
5115; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
5116; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
5117; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
5118; GFX9-NEXT:    v_add_u32_e32 v5, s2, v6
5119; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v0
5120; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
5121; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
5122; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5123; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
5124; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
5125; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
5126; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v1
5127; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v6
5128; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
5129; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5130; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
5131; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
5132; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
5133; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5134; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
5135; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
5136; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
5137; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
5138; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5139; GFX9-NEXT:    global_store_short v2, v0, s[0:1] offset:4
5140; GFX9-NEXT:    s_endpgm
5141  %r = sdiv <3 x i15> %x, %y
5142  store <3 x i15> %r, ptr addrspace(1) %out
5143  ret void
5144}
5145
; Signed remainder of two <3 x i15> kernel arguments, stored to %out.
; The IR-level assertions below show the operation being scalarized per lane
; with the same float-reciprocal quotient sequence used for the signed divide
; (sext to i32, sitofp, llvm.amdgcn.rcp.f32, trunc, +1/-1 adjustment), after
; which the remainder is rebuilt as x - quotient * y (the mul/sub pair) and
; re-sign-extended from 15 bits with shl/ashr by 17 before truncation.
; The GFX6/GFX9 assertions pin the resulting ISA, including the packing of the
; three 15-bit lanes into one dword store plus one short store at offset 4.
5146define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
5147; CHECK-LABEL: @srem_v3i15(
5148; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
5149; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
5150; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
5151; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
5152; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
5153; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
5154; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
5155; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
5156; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
5157; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
5158; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
5159; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
5160; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
5161; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
5162; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
5163; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
5164; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
5165; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
5166; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
5167; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
5168; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
5169; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
5170; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
5171; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
5172; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
5173; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> poison, i15 [[TMP25]], i64 0
5174; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
5175; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
5176; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
5177; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
5178; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
5179; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
5180; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
5181; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
5182; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
5183; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5184; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
5185; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
5186; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
5187; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
5188; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
5189; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
5190; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
5191; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
5192; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
5193; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
5194; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
5195; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
5196; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
5197; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
5198; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
5199; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
5200; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
5201; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
5202; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
5203; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
5204; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
5205; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
5206; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
5207; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
5208; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
5209; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
5210; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
5211; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
5212; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
5213; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
5214; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
5215; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
5216; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
5217; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
5218; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
5219; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
5220; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
5221; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
5222; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
5223; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
5224; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
5225; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
5226; CHECK-NEXT:    store <3 x i15> [[TMP78]], ptr addrspace(1) [[OUT:%.*]], align 8
5227; CHECK-NEXT:    ret void
5228;
5229; GFX6-LABEL: srem_v3i15:
5230; GFX6:       ; %bb.0:
5231; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
5232; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
5233; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5234; GFX6-NEXT:    s_mov_b32 s2, -1
5235; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5236; GFX6-NEXT:    s_bfe_i32 s6, s10, 0xf0000
5237; GFX6-NEXT:    v_mov_b32_e32 v2, s4
5238; GFX6-NEXT:    v_alignbit_b32 v2, s5, v2, 30
5239; GFX6-NEXT:    s_bfe_i32 s5, s4, 0xf0000
5240; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
5241; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s6
5242; GFX6-NEXT:    s_xor_b32 s5, s6, s5
5243; GFX6-NEXT:    s_ashr_i32 s5, s5, 30
5244; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5245; GFX6-NEXT:    s_mov_b32 s0, s8
5246; GFX6-NEXT:    s_mov_b32 s1, s9
5247; GFX6-NEXT:    s_lshr_b32 s8, s10, 15
5248; GFX6-NEXT:    v_mul_f32_e32 v6, v5, v6
5249; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
5250; GFX6-NEXT:    v_mad_f32 v5, -v6, v4, v5
5251; GFX6-NEXT:    v_cvt_i32_f32_e32 v6, v6
5252; GFX6-NEXT:    s_lshr_b32 s9, s4, 15
5253; GFX6-NEXT:    s_or_b32 s5, s5, 1
5254; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v5|, |v4|
5255; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
5256; GFX6-NEXT:    s_cselect_b32 s5, s5, 0
5257; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s5, v6
5258; GFX6-NEXT:    v_mul_lo_u32 v4, v4, s4
5259; GFX6-NEXT:    s_bfe_i32 s4, s4, 0xf000f
5260; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s4
5261; GFX6-NEXT:    s_bfe_i32 s5, s10, 0xf000f
5262; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, s5
5263; GFX6-NEXT:    s_xor_b32 s4, s5, s4
5264; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5265; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v2
5266; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
5267; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 15
5268; GFX6-NEXT:    v_mul_f32_e32 v7, v6, v7
5269; GFX6-NEXT:    v_trunc_f32_e32 v7, v7
5270; GFX6-NEXT:    v_mad_f32 v6, -v7, v5, v6
5271; GFX6-NEXT:    s_or_b32 s6, s4, 1
5272; GFX6-NEXT:    v_cvt_i32_f32_e32 v7, v7
5273; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v6|, |v5|
5274; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v2
5275; GFX6-NEXT:    v_mov_b32_e32 v0, s10
5276; GFX6-NEXT:    v_alignbit_b32 v0, s11, v0, 30
5277; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
5278; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
5279; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
5280; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
5281; GFX6-NEXT:    v_add_i32_e32 v5, vcc, s4, v7
5282; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v0
5283; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
5284; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v2
5285; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
5286; GFX6-NEXT:    v_mul_f32_e32 v2, v7, v8
5287; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
5288; GFX6-NEXT:    v_mad_f32 v7, -v2, v6, v7
5289; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
5290; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5291; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
5292; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
5293; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5294; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s9
5295; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5296; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v3
5297; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s8, v5
5298; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
5299; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
5300; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5301; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
5302; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
5303; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
5304; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5305; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5306; GFX6-NEXT:    s_waitcnt expcnt(0)
5307; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5308; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
5309; GFX6-NEXT:    s_endpgm
5310;
5311; GFX9-LABEL: srem_v3i15:
5312; GFX9:       ; %bb.0:
5313; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5314; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
5315; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5316; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5317; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5318; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5319; GFX9-NEXT:    s_bfe_i32 s3, s6, 0xf0000
5320; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s3
5321; GFX9-NEXT:    s_bfe_i32 s4, s2, 0xf0000
5322; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s4
5323; GFX9-NEXT:    s_xor_b32 s3, s4, s3
5324; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5325; GFX9-NEXT:    v_mov_b32_e32 v1, s6
5326; GFX9-NEXT:    s_ashr_i32 s3, s3, 30
5327; GFX9-NEXT:    s_lshr_b32 s8, s2, 15
5328; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
5329; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5330; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
5331; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
5332; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
5333; GFX9-NEXT:    s_lshr_b32 s7, s6, 15
5334; GFX9-NEXT:    s_or_b32 s3, s3, 1
5335; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v5|, |v4|
5336; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
5337; GFX9-NEXT:    s_cselect_b32 s3, s3, 0
5338; GFX9-NEXT:    v_add_u32_e32 v4, s3, v6
5339; GFX9-NEXT:    s_bfe_i32 s3, s6, 0xf000f
5340; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s3
5341; GFX9-NEXT:    s_bfe_i32 s4, s2, 0xf000f
5342; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, s4
5343; GFX9-NEXT:    s_xor_b32 s3, s4, s3
5344; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5345; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v1
5346; GFX9-NEXT:    s_ashr_i32 s3, s3, 30
5347; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
5348; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v7
5349; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
5350; GFX9-NEXT:    v_mad_f32 v6, -v7, v5, v6
5351; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v7
5352; GFX9-NEXT:    s_or_b32 s3, s3, 1
5353; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v6|, |v5|
5354; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v1
5355; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
5356; GFX9-NEXT:    s_cselect_b32 s3, s3, 0
5357; GFX9-NEXT:    v_add_u32_e32 v5, s3, v7
5358; GFX9-NEXT:    v_bfe_i32 v7, v0, 0, 15
5359; GFX9-NEXT:    v_cvt_f32_i32_e32 v8, v7
5360; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v6
5361; GFX9-NEXT:    v_xor_b32_e32 v1, v7, v1
5362; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
5363; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
5364; GFX9-NEXT:    v_mul_f32_e32 v7, v8, v9
5365; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
5366; GFX9-NEXT:    v_cvt_i32_f32_e32 v9, v7
5367; GFX9-NEXT:    v_mad_f32 v7, -v7, v6, v8
5368; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
5369; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5370; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
5371; GFX9-NEXT:    v_mul_lo_u32 v5, v5, s7
5372; GFX9-NEXT:    v_add_u32_e32 v1, v9, v1
5373; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
5374; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
5375; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v4
5376; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v5
5377; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
5378; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
5379; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5380; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
5381; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
5382; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
5383; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
5384; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
5385; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5386; GFX9-NEXT:    global_store_short v2, v0, s[0:1] offset:4
5387; GFX9-NEXT:    s_endpgm
5388  %r = srem <3 x i15> %x, %y
5389  store <3 x i15> %r, ptr addrspace(1) %out
5390  ret void
5391}
5392
; Unsigned i32 divide by the odd constant 1235195.
; The IR-level assertions show the udiv passing through amdgpu-codegenprepare
; unchanged. The GFX6/GFX9 assertions show codegen replacing the divide with a
; multiply-high by 0xb2a50881 followed by a sub / shift-right-1 / add fixup and
; a final logical shift right by 20 (VALU ops on GFX6, scalar ops on GFX9).
5393define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
5394; CHECK-LABEL: @udiv_i32_oddk_denom(
5395; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
5396; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5397; CHECK-NEXT:    ret void
5398;
5399; GFX6-LABEL: udiv_i32_oddk_denom:
5400; GFX6:       ; %bb.0:
5401; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
5402; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5403; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5404; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5405; GFX6-NEXT:    s_mov_b32 s2, -1
5406; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5407; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
5408; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s6, v0
5409; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5410; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
5411; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5412; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5413; GFX6-NEXT:    s_endpgm
5414;
5415; GFX9-LABEL: udiv_i32_oddk_denom:
5416; GFX9:       ; %bb.0:
5417; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
5418; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5419; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5420; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5421; GFX9-NEXT:    s_mul_hi_u32 s3, s2, 0xb2a50881
5422; GFX9-NEXT:    s_sub_i32 s2, s2, s3
5423; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
5424; GFX9-NEXT:    s_add_i32 s2, s2, s3
5425; GFX9-NEXT:    s_lshr_b32 s2, s2, 20
5426; GFX9-NEXT:    v_mov_b32_e32 v1, s2
5427; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
5428; GFX9-NEXT:    s_endpgm
5429  %r = udiv i32 %x, 1235195
5430  store i32 %r, ptr addrspace(1) %out
5431  ret void
5432}
5433
; Unsigned i32 divide by the power-of-two constant 4096 (2^12).
; The IR-level assertions show the udiv passing through amdgpu-codegenprepare
; unchanged. The GFX6/GFX9 assertions show codegen lowering it to a single
; s_lshr_b32 by 12.
5434define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
5435; CHECK-LABEL: @udiv_i32_pow2k_denom(
5436; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
5437; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5438; CHECK-NEXT:    ret void
5439;
5440; GFX6-LABEL: udiv_i32_pow2k_denom:
5441; GFX6:       ; %bb.0:
5442; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
5443; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5444; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5445; GFX6-NEXT:    s_mov_b32 s2, -1
5446; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5447; GFX6-NEXT:    s_lshr_b32 s4, s6, 12
5448; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5449; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5450; GFX6-NEXT:    s_endpgm
5451;
5452; GFX9-LABEL: udiv_i32_pow2k_denom:
5453; GFX9:       ; %bb.0:
5454; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
5455; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5456; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5457; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5458; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
5459; GFX9-NEXT:    v_mov_b32_e32 v1, s2
5460; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
5461; GFX9-NEXT:    s_endpgm
5462  %r = udiv i32 %x, 4096
5463  store i32 %r, ptr addrspace(1) %out
5464  ret void
5465}
5466
; Unsigned i32 division of %x by (4096 << %y), stored to %out.
; The IR-level checks expect the shl and udiv to be left untouched; the GFX6
; and GFX9 checks expect 12 to be added to %y and %x to be logically shifted
; right by that sum.
5467define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
5468; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
5469; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5470; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
5471; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5472; CHECK-NEXT:    ret void
5473;
5474; GFX6-LABEL: udiv_i32_pow2_shl_denom:
5475; GFX6:       ; %bb.0:
5476; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5477; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5478; GFX6-NEXT:    s_mov_b32 s6, -1
5479; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5480; GFX6-NEXT:    s_mov_b32 s4, s0
5481; GFX6-NEXT:    s_add_i32 s0, s3, 12
5482; GFX6-NEXT:    s_lshr_b32 s0, s2, s0
5483; GFX6-NEXT:    s_mov_b32 s5, s1
5484; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5485; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5486; GFX6-NEXT:    s_endpgm
5487;
5488; GFX9-LABEL: udiv_i32_pow2_shl_denom:
5489; GFX9:       ; %bb.0:
5490; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5491; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5492; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5493; GFX9-NEXT:    s_add_i32 s3, s3, 12
5494; GFX9-NEXT:    s_lshr_b32 s2, s2, s3
5495; GFX9-NEXT:    v_mov_b32_e32 v1, s2
5496; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
5497; GFX9-NEXT:    s_endpgm
5498  %shl.y = shl i32 4096, %y
5499  %r = udiv i32 %x, %shl.y
5500  store i32 %r, ptr addrspace(1) %out
5501  ret void
5502}
5503
; Unsigned <2 x i32> division of %x by the constant vector <4096, 4096>.
; The IR-level checks expect the vector udiv to be split into one
; extractelement / udiv-by-4096 / insertelement triple per element; the GFX6
; and GFX9 checks expect each element to become a scalar logical shift right
; by 12.
5504define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
5505; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
5506; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5507; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5508; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
5509; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5510; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
5511; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5512; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
5513; CHECK-NEXT:    ret void
5514;
5515; GFX6-LABEL: udiv_v2i32_pow2k_denom:
5516; GFX6:       ; %bb.0:
5517; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5518; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5519; GFX6-NEXT:    s_mov_b32 s6, -1
5520; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5521; GFX6-NEXT:    s_mov_b32 s4, s0
5522; GFX6-NEXT:    s_mov_b32 s5, s1
5523; GFX6-NEXT:    s_lshr_b32 s0, s2, 12
5524; GFX6-NEXT:    s_lshr_b32 s1, s3, 12
5525; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5526; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5527; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5528; GFX6-NEXT:    s_endpgm
5529;
5530; GFX9-LABEL: udiv_v2i32_pow2k_denom:
5531; GFX9:       ; %bb.0:
5532; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5533; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5534; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5535; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
5536; GFX9-NEXT:    s_lshr_b32 s3, s3, 12
5537; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5538; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5539; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
5540; GFX9-NEXT:    s_endpgm
5541  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
5542  store <2 x i32> %r, ptr addrspace(1) %out
5543  ret void
5544}
5545
; Unsigned <2 x i32> division of %x by the constant vector <4096, 4095>, i.e.
; one power-of-two lane and one non-power-of-two lane.
; The IR-level checks expect the vector udiv to be split per element, each
; scalar udiv keeping its own constant. The GFX6 and GFX9 checks expect lane 0
; to become a logical shift right by 12, and lane 1 to become a high multiply
; by 0x100101 followed by a sub / shift-by-1 / add / shift-by-11 sequence.
5546define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
5547; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
5548; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5549; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5550; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
5551; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5552; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
5553; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5554; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
5555; CHECK-NEXT:    ret void
5556;
5557; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom:
5558; GFX6:       ; %bb.0:
5559; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5560; GFX6-NEXT:    v_mov_b32_e32 v0, 0x100101
5561; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5562; GFX6-NEXT:    s_mov_b32 s6, -1
5563; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5564; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
5565; GFX6-NEXT:    s_mov_b32 s4, s0
5566; GFX6-NEXT:    s_lshr_b32 s0, s2, 12
5567; GFX6-NEXT:    s_mov_b32 s5, s1
5568; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s3, v0
5569; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5570; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
5571; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
5572; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5573; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5574; GFX6-NEXT:    s_endpgm
5575;
5576; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
5577; GFX9:       ; %bb.0:
5578; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5579; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5580; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5581; GFX9-NEXT:    s_mul_hi_u32 s4, s3, 0x100101
5582; GFX9-NEXT:    s_sub_i32 s3, s3, s4
5583; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
5584; GFX9-NEXT:    s_add_i32 s3, s3, s4
5585; GFX9-NEXT:    s_lshr_b32 s2, s2, 12
5586; GFX9-NEXT:    s_lshr_b32 s3, s3, 11
5587; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5588; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5589; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
5590; GFX9-NEXT:    s_endpgm
5591  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
5592  store <2 x i32> %r, ptr addrspace(1) %out
5593  ret void
5594}
5595
; Unsigned <2 x i32> division of %x by (<4096, 4096> << %y), stored to %out.
; The IR-level checks expect each lane to be expanded into the full
; reciprocal-based sequence: uitofp of the divisor, llvm.amdgcn.rcp.f32, scale
; and fptoui, a refinement step using 64-bit multiplies, then the quotient
; estimate followed by two conditional "+1" corrections. The GFX6 and GFX9
; checks expect the matching v_rcp_iflag_f32-based code for both lanes.
5596define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
5597; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
5598; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]]
5599; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5600; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5601; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5602; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5603; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5604; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5605; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5606; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5607; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5608; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5609; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5610; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5611; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5612; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5613; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5614; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5615; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5616; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5617; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5618; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5619; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5620; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5621; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5622; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5623; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
5624; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
5625; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5626; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
5627; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
5628; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
5629; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
5630; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP31]], i64 0
5631; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
5632; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5633; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
5634; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5635; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
5636; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
5637; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
5638; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
5639; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
5640; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
5641; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
5642; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5643; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
5644; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
5645; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
5646; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
5647; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
5648; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
5649; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5650; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
5651; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
5652; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
5653; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
5654; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
5655; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
5656; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
5657; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
5658; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
5659; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
5660; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
5661; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
5662; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
5663; CHECK-NEXT:    store <2 x i32> [[TMP64]], ptr addrspace(1) [[OUT:%.*]], align 8
5664; CHECK-NEXT:    ret void
5665;
5666; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
5667; GFX6:       ; %bb.0:
5668; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xb
5669; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
5670; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5671; GFX6-NEXT:    s_mov_b32 s6, -1
5672; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5673; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s10
5674; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
5675; GFX6-NEXT:    s_sub_i32 s1, 0, s0
5676; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s11
5677; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
5678; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5679; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
5680; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
5681; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
5682; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v0
5683; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
5684; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5685; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
5686; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
5687; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
5688; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
5689; GFX6-NEXT:    s_mul_i32 s1, s1, s0
5690; GFX6-NEXT:    s_sub_i32 s1, s8, s1
5691; GFX6-NEXT:    s_sub_i32 s3, s1, s0
5692; GFX6-NEXT:    s_cmp_ge_u32 s1, s0
5693; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
5694; GFX6-NEXT:    s_cselect_b32 s1, s3, s1
5695; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
5696; GFX6-NEXT:    s_cmp_ge_u32 s1, s0
5697; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
5698; GFX6-NEXT:    s_sub_i32 s3, 0, s2
5699; GFX6-NEXT:    v_mul_lo_u32 v3, s3, v1
5700; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5701; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
5702; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
5703; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
5704; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5705; GFX6-NEXT:    v_mul_hi_u32 v1, s9, v1
5706; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
5707; GFX6-NEXT:    s_mul_i32 s0, s0, s2
5708; GFX6-NEXT:    s_sub_i32 s0, s9, s0
5709; GFX6-NEXT:    s_sub_i32 s1, s0, s2
5710; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
5711; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
5712; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
5713; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5714; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
5715; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
5716; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
5717; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
5718; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5719; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5720; GFX6-NEXT:    s_endpgm
5721;
5722; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
5723; GFX9:       ; %bb.0:
5724; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
5725; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5726; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5727; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s2
5728; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
5729; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s3
5730; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
5731; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
5732; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5733; GFX9-NEXT:    s_sub_i32 s4, 0, s7
5734; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5735; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
5736; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
5737; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
5738; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
5739; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
5740; GFX9-NEXT:    s_mul_i32 s4, s4, s5
5741; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
5742; GFX9-NEXT:    s_add_i32 s5, s5, s4
5743; GFX9-NEXT:    s_mul_hi_u32 s4, s0, s5
5744; GFX9-NEXT:    s_mul_i32 s5, s4, s7
5745; GFX9-NEXT:    s_sub_i32 s0, s0, s5
5746; GFX9-NEXT:    s_add_i32 s9, s4, 1
5747; GFX9-NEXT:    s_sub_i32 s5, s0, s7
5748; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
5749; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
5750; GFX9-NEXT:    s_cselect_b32 s0, s5, s0
5751; GFX9-NEXT:    s_add_i32 s5, s4, 1
5752; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
5753; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
5754; GFX9-NEXT:    s_cselect_b32 s0, s5, s4
5755; GFX9-NEXT:    s_sub_i32 s4, 0, s6
5756; GFX9-NEXT:    s_mul_i32 s4, s4, s8
5757; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
5758; GFX9-NEXT:    s_add_i32 s8, s8, s4
5759; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s8
5760; GFX9-NEXT:    s_mul_i32 s5, s4, s6
5761; GFX9-NEXT:    s_sub_i32 s1, s1, s5
5762; GFX9-NEXT:    s_add_i32 s7, s4, 1
5763; GFX9-NEXT:    s_sub_i32 s5, s1, s6
5764; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
5765; GFX9-NEXT:    s_cselect_b32 s4, s7, s4
5766; GFX9-NEXT:    s_cselect_b32 s1, s5, s1
5767; GFX9-NEXT:    s_add_i32 s5, s4, 1
5768; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
5769; GFX9-NEXT:    s_cselect_b32 s1, s5, s4
5770; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5771; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5772; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5773; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5774; GFX9-NEXT:    s_endpgm
5775  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
5776  %r = udiv <2 x i32> %x, %shl.y
5777  store <2 x i32> %r, ptr addrspace(1) %out
5778  ret void
5779}
5780
; Unsigned i32 remainder of %x by the odd constant 1235195 (0x12d8fb).
; The IR-level checks expect the urem to be left untouched. The GFX6 and GFX9
; checks expect a quotient computed from a high multiply by 0xb2a50881 plus a
; sub / shift-by-1 / add / shift-by-20 sequence, then that quotient multiplied
; by 0x12d8fb and subtracted from %x.
5781define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
5782; CHECK-LABEL: @urem_i32_oddk_denom(
5783; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
5784; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5785; CHECK-NEXT:    ret void
5786;
5787; GFX6-LABEL: urem_i32_oddk_denom:
5788; GFX6:       ; %bb.0:
5789; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
5790; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5791; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5792; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
5793; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5794; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5795; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
5796; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s6, v0
5797; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5798; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
5799; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5800; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
5801; GFX6-NEXT:    s_mov_b32 s2, -1
5802; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
5803; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5804; GFX6-NEXT:    s_endpgm
5805;
5806; GFX9-LABEL: urem_i32_oddk_denom:
5807; GFX9:       ; %bb.0:
5808; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
5809; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5810; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5811; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5812; GFX9-NEXT:    s_mul_hi_u32 s3, s2, 0xb2a50881
5813; GFX9-NEXT:    s_sub_i32 s4, s2, s3
5814; GFX9-NEXT:    s_lshr_b32 s4, s4, 1
5815; GFX9-NEXT:    s_add_i32 s4, s4, s3
5816; GFX9-NEXT:    s_lshr_b32 s3, s4, 20
5817; GFX9-NEXT:    s_mul_i32 s3, s3, 0x12d8fb
5818; GFX9-NEXT:    s_sub_i32 s2, s2, s3
5819; GFX9-NEXT:    v_mov_b32_e32 v1, s2
5820; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
5821; GFX9-NEXT:    s_endpgm
5822  %r = urem i32 %x, 1235195
5823  store i32 %r, ptr addrspace(1) %out
5824  ret void
5825}
5826
; Unsigned i32 remainder of %x by the constant 4096, stored to %out.
; The IR-level checks expect the urem to be left untouched; the GFX6 and GFX9
; checks expect the remainder to become a single scalar AND with 0xfff.
5827define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
5828; CHECK-LABEL: @urem_i32_pow2k_denom(
5829; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
5830; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5831; CHECK-NEXT:    ret void
5832;
5833; GFX6-LABEL: urem_i32_pow2k_denom:
5834; GFX6:       ; %bb.0:
5835; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
5836; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5837; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5838; GFX6-NEXT:    s_mov_b32 s2, -1
5839; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5840; GFX6-NEXT:    s_and_b32 s4, s6, 0xfff
5841; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5842; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5843; GFX6-NEXT:    s_endpgm
5844;
5845; GFX9-LABEL: urem_i32_pow2k_denom:
5846; GFX9:       ; %bb.0:
5847; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
5848; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5849; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5850; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5851; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
5852; GFX9-NEXT:    v_mov_b32_e32 v1, s2
5853; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
5854; GFX9-NEXT:    s_endpgm
5855  %r = urem i32 %x, 4096
5856  store i32 %r, ptr addrspace(1) %out
5857  ret void
5858}
5859
; Unsigned i32 remainder of %x by (4096 << %y), stored to %out.
; The IR-level checks expect the shl and urem to be left untouched; the GFX6
; and GFX9 checks expect the divisor to be formed with a shift, decremented by
; one (add of -1), and used as an AND mask on %x.
5860define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
5861; CHECK-LABEL: @urem_i32_pow2_shl_denom(
5862; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5863; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
5864; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5865; CHECK-NEXT:    ret void
5866;
5867; GFX6-LABEL: urem_i32_pow2_shl_denom:
5868; GFX6:       ; %bb.0:
5869; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5870; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5871; GFX6-NEXT:    s_mov_b32 s6, -1
5872; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5873; GFX6-NEXT:    s_mov_b32 s4, s0
5874; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s3
5875; GFX6-NEXT:    s_add_i32 s0, s0, -1
5876; GFX6-NEXT:    s_and_b32 s0, s2, s0
5877; GFX6-NEXT:    s_mov_b32 s5, s1
5878; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5879; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5880; GFX6-NEXT:    s_endpgm
5881;
5882; GFX9-LABEL: urem_i32_pow2_shl_denom:
5883; GFX9:       ; %bb.0:
5884; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5885; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5886; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5887; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
5888; GFX9-NEXT:    s_add_i32 s3, s3, -1
5889; GFX9-NEXT:    s_and_b32 s2, s2, s3
5890; GFX9-NEXT:    v_mov_b32_e32 v1, s2
5891; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
5892; GFX9-NEXT:    s_endpgm
5893  %shl.y = shl i32 4096, %y
5894  %r = urem i32 %x, %shl.y
5895  store i32 %r, ptr addrspace(1) %out
5896  ret void
5897}
5898
; Unsigned <2 x i32> remainder of %x by the constant vector <4096, 4096>.
; The IR-level checks expect the vector urem to be split into one
; extractelement / urem-by-4096 / insertelement triple per element; the GFX6
; and GFX9 checks expect each element to become a scalar AND with 0xfff.
5899define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
5900; CHECK-LABEL: @urem_v2i32_pow2k_denom(
5901; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5902; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
5903; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
5904; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5905; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
5906; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5907; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
5908; CHECK-NEXT:    ret void
5909;
5910; GFX6-LABEL: urem_v2i32_pow2k_denom:
5911; GFX6:       ; %bb.0:
5912; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5913; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5914; GFX6-NEXT:    s_mov_b32 s6, -1
5915; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5916; GFX6-NEXT:    s_mov_b32 s4, s0
5917; GFX6-NEXT:    s_mov_b32 s5, s1
5918; GFX6-NEXT:    s_and_b32 s0, s2, 0xfff
5919; GFX6-NEXT:    s_and_b32 s1, s3, 0xfff
5920; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5921; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5922; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5923; GFX6-NEXT:    s_endpgm
5924;
5925; GFX9-LABEL: urem_v2i32_pow2k_denom:
5926; GFX9:       ; %bb.0:
5927; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5928; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5929; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5930; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
5931; GFX9-NEXT:    s_and_b32 s3, s3, 0xfff
5932; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5933; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5934; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
5935; GFX9-NEXT:    s_endpgm
5936  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
5937  store <2 x i32> %r, ptr addrspace(1) %out
5938  ret void
5939}
5940
; Unsigned <2 x i32> remainder of %x by (<4096, 4096> << %y), stored to %out.
; The IR-level checks expect each lane to be expanded into the full
; reciprocal-based sequence: uitofp of the divisor, llvm.amdgcn.rcp.f32, scale
; and fptoui, a refinement step using 64-bit multiplies, then a remainder
; estimate followed by two conditional subtractions of the divisor. The GFX6
; and GFX9 checks expect the matching v_rcp_iflag_f32-based code for both lanes.
5941define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
5942; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
5943; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]]
5944; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5945; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5946; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5947; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5948; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5949; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5950; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5951; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5952; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5953; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5954; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5955; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5956; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5957; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5958; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5959; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5960; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5961; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5962; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5963; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5964; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5965; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5966; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5967; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5968; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5969; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
5970; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
5971; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
5972; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
5973; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i64 0
5974; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
5975; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5976; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
5977; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
5978; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
5979; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
5980; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
5981; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
5982; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
5983; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
5984; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
5985; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
5986; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
5987; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5988; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
5989; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
5990; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
5991; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
5992; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
5993; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
5994; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5995; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
5996; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
5997; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
5998; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
5999; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
6000; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
6001; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
6002; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
6003; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
6004; CHECK-NEXT:    store <2 x i32> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8
6005; CHECK-NEXT:    ret void
6006;
6007; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
6008; GFX6:       ; %bb.0:
6009; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
6010; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
6011; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6012; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6013; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s2
6014; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
6015; GFX6-NEXT:    s_sub_i32 s6, 0, s2
6016; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6017; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
6018; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6019; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
6020; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6021; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6022; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
6023; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6024; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6025; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
6026; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
6027; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6028; GFX6-NEXT:    v_readfirstlane_b32 s6, v0
6029; GFX6-NEXT:    s_mul_i32 s6, s6, s2
6030; GFX6-NEXT:    s_sub_i32 s0, s0, s6
6031; GFX6-NEXT:    s_sub_i32 s6, s0, s2
6032; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
6033; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
6034; GFX6-NEXT:    s_sub_i32 s6, s0, s2
6035; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
6036; GFX6-NEXT:    s_cselect_b32 s0, s6, s0
6037; GFX6-NEXT:    s_sub_i32 s2, 0, s3
6038; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
6039; GFX6-NEXT:    s_mov_b32 s6, -1
6040; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
6041; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6042; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
6043; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
6044; GFX6-NEXT:    s_mul_i32 s2, s2, s3
6045; GFX6-NEXT:    s_sub_i32 s1, s1, s2
6046; GFX6-NEXT:    s_sub_i32 s2, s1, s3
6047; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
6048; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
6049; GFX6-NEXT:    s_sub_i32 s2, s1, s3
6050; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
6051; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
6052; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6053; GFX6-NEXT:    v_mov_b32_e32 v1, s1
6054; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6055; GFX6-NEXT:    s_endpgm
6056;
6057; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
6058; GFX9:       ; %bb.0:
6059; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6060; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6061; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6062; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s2
6063; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
6064; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s3
6065; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
6066; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
6067; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6068; GFX9-NEXT:    s_sub_i32 s4, 0, s7
6069; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6070; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6071; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6072; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6073; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6074; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
6075; GFX9-NEXT:    s_mul_i32 s4, s4, s5
6076; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
6077; GFX9-NEXT:    s_add_i32 s5, s5, s4
6078; GFX9-NEXT:    s_mul_hi_u32 s4, s0, s5
6079; GFX9-NEXT:    s_mul_i32 s4, s4, s7
6080; GFX9-NEXT:    s_sub_i32 s0, s0, s4
6081; GFX9-NEXT:    s_sub_i32 s4, s0, s7
6082; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
6083; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
6084; GFX9-NEXT:    s_sub_i32 s4, s0, s7
6085; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
6086; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
6087; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
6088; GFX9-NEXT:    s_sub_i32 s4, 0, s6
6089; GFX9-NEXT:    s_mul_i32 s4, s4, s8
6090; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
6091; GFX9-NEXT:    s_add_i32 s8, s8, s4
6092; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s8
6093; GFX9-NEXT:    s_mul_i32 s4, s4, s6
6094; GFX9-NEXT:    s_sub_i32 s1, s1, s4
6095; GFX9-NEXT:    s_sub_i32 s4, s1, s6
6096; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
6097; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
6098; GFX9-NEXT:    s_sub_i32 s4, s1, s6
6099; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
6100; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
6101; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6102; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6103; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6104; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6105; GFX9-NEXT:    s_endpgm
6106  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6107  %r = urem <2 x i32> %x, %shl.y
6108  store <2 x i32> %r, ptr addrspace(1) %out
6109  ret void
6110}
6111
; Signed i32 division of %x by the odd constant 1235195, stored to %out.
; The IR-level checks expect the sdiv to be left untouched. The GFX6 and GFX9
; checks expect a signed high multiply by 0xd9528441, an add of %x, and then
; the sum's top bit (logical shift right by 31) added to the sum arithmetically
; shifted right by 20.
6112define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
6113; CHECK-LABEL: @sdiv_i32_oddk_denom(
6114; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
6115; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6116; CHECK-NEXT:    ret void
6117;
6118; GFX6-LABEL: sdiv_i32_oddk_denom:
6119; GFX6:       ; %bb.0:
6120; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
6121; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6122; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6123; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6124; GFX6-NEXT:    s_mov_b32 s2, -1
6125; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6126; GFX6-NEXT:    v_mul_hi_i32 v0, s6, v0
6127; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
6128; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6129; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
6130; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6131; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6132; GFX6-NEXT:    s_endpgm
6133;
6134; GFX9-LABEL: sdiv_i32_oddk_denom:
6135; GFX9:       ; %bb.0:
6136; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
6137; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6138; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6139; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6140; GFX9-NEXT:    s_mul_hi_i32 s3, s2, 0xd9528441
6141; GFX9-NEXT:    s_add_i32 s3, s3, s2
6142; GFX9-NEXT:    s_lshr_b32 s2, s3, 31
6143; GFX9-NEXT:    s_ashr_i32 s3, s3, 20
6144; GFX9-NEXT:    s_add_i32 s2, s3, s2
6145; GFX9-NEXT:    v_mov_b32_e32 v1, s2
6146; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
6147; GFX9-NEXT:    s_endpgm
6148  %r = sdiv i32 %x, 1235195
6149  store i32 %r, ptr addrspace(1) %out
6150  ret void
6151}
6152
; Signed i32 division by the power-of-two constant 4096.
; The opt run is expected to leave the sdiv unchanged. The llc runs expect a
; shift-only lowering: (x >> 31) >>u 20 yields 4095 for negative x and 0
; otherwise, which is added to x before the arithmetic shift right by 12.
6153define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
6154; CHECK-LABEL: @sdiv_i32_pow2k_denom(
6155; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
6156; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6157; CHECK-NEXT:    ret void
6158;
6159; GFX6-LABEL: sdiv_i32_pow2k_denom:
6160; GFX6:       ; %bb.0:
6161; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
6162; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6163; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6164; GFX6-NEXT:    s_mov_b32 s2, -1
6165; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6166; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
6167; GFX6-NEXT:    s_lshr_b32 s4, s4, 20
6168; GFX6-NEXT:    s_add_i32 s6, s6, s4
6169; GFX6-NEXT:    s_ashr_i32 s4, s6, 12
6170; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6171; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6172; GFX6-NEXT:    s_endpgm
6173;
6174; GFX9-LABEL: sdiv_i32_pow2k_denom:
6175; GFX9:       ; %bb.0:
6176; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
6177; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6178; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6179; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6180; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
6181; GFX9-NEXT:    s_lshr_b32 s3, s3, 20
6182; GFX9-NEXT:    s_add_i32 s2, s2, s3
6183; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
6184; GFX9-NEXT:    v_mov_b32_e32 v1, s2
6185; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
6186; GFX9-NEXT:    s_endpgm
6187  %r = sdiv i32 %x, 4096
6188  store i32 %r, ptr addrspace(1) %out
6189  ret void
6190}
6191
; Signed i32 division where the divisor is 4096 << %y, i.e. not a compile-time
; constant. The opt run expects the scalar sdiv to remain in the IR. The llc
; runs expect a reciprocal-based expansion: absolute values via ashr/add/xor,
; v_rcp_iflag_f32 scaled by 0x4f7ffffe as the initial estimate, one mul_hi
; refinement, two compare-and-select correction steps, and a final xor/sub
; with the xor of the two operand sign masks.
6192define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
6193; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
6194; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6195; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
6196; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6197; CHECK-NEXT:    ret void
6198;
6199; GFX6-LABEL: sdiv_i32_pow2_shl_denom:
6200; GFX6:       ; %bb.0:
6201; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6202; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6203; GFX6-NEXT:    s_mov_b32 s6, -1
6204; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6205; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6206; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
6207; GFX6-NEXT:    s_add_i32 s3, s3, s8
6208; GFX6-NEXT:    s_xor_b32 s3, s3, s8
6209; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
6210; GFX6-NEXT:    s_sub_i32 s4, 0, s3
6211; GFX6-NEXT:    s_ashr_i32 s9, s2, 31
6212; GFX6-NEXT:    s_add_i32 s2, s2, s9
6213; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6214; GFX6-NEXT:    s_xor_b32 s2, s2, s9
6215; GFX6-NEXT:    s_mov_b32 s5, s1
6216; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6217; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6218; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
6219; GFX6-NEXT:    s_mov_b32 s4, s0
6220; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6221; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6222; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
6223; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
6224; GFX6-NEXT:    s_mul_i32 s0, s0, s3
6225; GFX6-NEXT:    s_sub_i32 s0, s2, s0
6226; GFX6-NEXT:    s_sub_i32 s1, s0, s3
6227; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
6228; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
6229; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
6230; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
6231; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6232; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
6233; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
6234; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
6235; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6236; GFX6-NEXT:    s_xor_b32 s0, s9, s8
6237; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
6238; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
6239; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6240; GFX6-NEXT:    s_endpgm
6241;
6242; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
6243; GFX9:       ; %bb.0:
6244; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6245; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6246; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6247; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6248; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6249; GFX9-NEXT:    s_add_i32 s3, s3, s4
6250; GFX9-NEXT:    s_xor_b32 s3, s3, s4
6251; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6252; GFX9-NEXT:    s_sub_i32 s6, 0, s3
6253; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
6254; GFX9-NEXT:    s_add_i32 s2, s2, s5
6255; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6256; GFX9-NEXT:    s_xor_b32 s2, s2, s5
6257; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6258; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6259; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
6260; GFX9-NEXT:    s_mul_i32 s6, s6, s7
6261; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
6262; GFX9-NEXT:    s_add_i32 s7, s7, s6
6263; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s7
6264; GFX9-NEXT:    s_mul_i32 s8, s6, s3
6265; GFX9-NEXT:    s_sub_i32 s2, s2, s8
6266; GFX9-NEXT:    s_add_i32 s7, s6, 1
6267; GFX9-NEXT:    s_sub_i32 s8, s2, s3
6268; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
6269; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
6270; GFX9-NEXT:    s_cselect_b32 s2, s8, s2
6271; GFX9-NEXT:    s_add_i32 s7, s6, 1
6272; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
6273; GFX9-NEXT:    s_cselect_b32 s2, s7, s6
6274; GFX9-NEXT:    s_xor_b32 s3, s5, s4
6275; GFX9-NEXT:    s_xor_b32 s2, s2, s3
6276; GFX9-NEXT:    s_sub_i32 s2, s2, s3
6277; GFX9-NEXT:    v_mov_b32_e32 v0, s2
6278; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
6279; GFX9-NEXT:    s_endpgm
6280  %shl.y = shl i32 4096, %y
6281  %r = sdiv i32 %x, %shl.y
6282  store i32 %r, ptr addrspace(1) %out
6283  ret void
6284}
6285
; Signed <2 x i32> division by a splat of the power-of-two constant 4096.
; The opt run expects the vector sdiv to be split into one scalar sdiv per
; element (extractelement / sdiv / insertelement). The llc runs expect each
; element to use the same ashr 31, lshr 20, add, ashr 12 shift sequence.
6286define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
6287; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
6288; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6289; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6290; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
6291; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6292; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
6293; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6294; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
6295; CHECK-NEXT:    ret void
6296;
6297; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
6298; GFX6:       ; %bb.0:
6299; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6300; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6301; GFX6-NEXT:    s_mov_b32 s6, -1
6302; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6303; GFX6-NEXT:    s_mov_b32 s4, s0
6304; GFX6-NEXT:    s_mov_b32 s5, s1
6305; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
6306; GFX6-NEXT:    s_ashr_i32 s1, s3, 31
6307; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
6308; GFX6-NEXT:    s_lshr_b32 s1, s1, 20
6309; GFX6-NEXT:    s_add_i32 s0, s2, s0
6310; GFX6-NEXT:    s_add_i32 s1, s3, s1
6311; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
6312; GFX6-NEXT:    s_ashr_i32 s1, s1, 12
6313; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6314; GFX6-NEXT:    v_mov_b32_e32 v1, s1
6315; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6316; GFX6-NEXT:    s_endpgm
6317;
6318; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
6319; GFX9:       ; %bb.0:
6320; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6321; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6323; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
6324; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
6325; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
6326; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
6327; GFX9-NEXT:    s_add_i32 s2, s2, s4
6328; GFX9-NEXT:    s_add_i32 s3, s3, s5
6329; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
6330; GFX9-NEXT:    s_ashr_i32 s3, s3, 12
6331; GFX9-NEXT:    v_mov_b32_e32 v0, s2
6332; GFX9-NEXT:    v_mov_b32_e32 v1, s3
6333; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
6334; GFX9-NEXT:    s_endpgm
6335  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
6336  store <2 x i32> %r, ptr addrspace(1) %out
6337  ret void
6338}
6339
; Signed <2 x i32> division by <4096, 4095>: element 0 has a power-of-two
; divisor, element 1 does not. The opt run expects one scalar sdiv per element.
; The llc runs expect the shift sequence for element 0 and, for element 1, a
; multiply-high by 0x80080081, an add of the dividend, an arithmetic shift
; right by 11, and an add of the sign bit.
; NOTE(review): the doubled "s" in the function name looks like a typo, but the
; label checks below match it, so it is kept as-is.
6340define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
6341; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
6342; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6343; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6344; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
6345; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6346; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
6347; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6348; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
6349; CHECK-NEXT:    ret void
6350;
6351; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6352; GFX6:       ; %bb.0:
6353; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6354; GFX6-NEXT:    v_mov_b32_e32 v0, 0x80080081
6355; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6356; GFX6-NEXT:    s_mov_b32 s6, -1
6357; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6358; GFX6-NEXT:    v_mul_hi_i32 v0, s3, v0
6359; GFX6-NEXT:    s_mov_b32 s4, s0
6360; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
6361; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
6362; GFX6-NEXT:    s_add_i32 s0, s2, s0
6363; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s3, v0
6364; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
6365; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6366; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
6367; GFX6-NEXT:    s_mov_b32 s5, s1
6368; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
6369; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6370; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6371; GFX6-NEXT:    s_endpgm
6372;
6373; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6374; GFX9:       ; %bb.0:
6375; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6376; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6377; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6378; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
6379; GFX9-NEXT:    s_mul_hi_i32 s5, s3, 0x80080081
6380; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
6381; GFX9-NEXT:    s_add_i32 s5, s5, s3
6382; GFX9-NEXT:    s_add_i32 s2, s2, s4
6383; GFX9-NEXT:    s_lshr_b32 s3, s5, 31
6384; GFX9-NEXT:    s_ashr_i32 s4, s5, 11
6385; GFX9-NEXT:    s_ashr_i32 s2, s2, 12
6386; GFX9-NEXT:    s_add_i32 s4, s4, s3
6387; GFX9-NEXT:    v_mov_b32_e32 v0, s2
6388; GFX9-NEXT:    v_mov_b32_e32 v1, s4
6389; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
6390; GFX9-NEXT:    s_endpgm
6391  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
6392  store <2 x i32> %r, ptr addrspace(1) %out
6393  ret void
6394}
6395
; Signed <2 x i32> division where each divisor element is 4096 shifted left by
; the matching element of %y. The opt run expects each element to be expanded
; in IR: sign masks from ashr 31, absolute values via add/xor, a float
; reciprocal estimate (uitofp, llvm.amdgcn.rcp.f32, fmul by
; 0x41EFFFFFC0000000, fptoui), one refinement using the high half of a 64-bit
; multiply, a quotient estimate, two compare/select corrections, and a final
; xor/sub with the xor of the two sign masks. The llc runs expect the matching
; machine sequence using s_abs_i32 and v_rcp_iflag_f32 for each element.
6396define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
6397; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
6398; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]]
6399; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6400; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6401; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6402; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6403; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
6404; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
6405; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
6406; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
6407; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
6408; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
6409; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
6410; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
6411; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
6412; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
6413; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
6414; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
6415; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
6416; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
6417; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
6418; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
6419; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
6420; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
6421; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
6422; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
6423; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
6424; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
6425; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
6426; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
6427; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
6428; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
6429; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
6430; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
6431; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
6432; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
6433; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
6434; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
6435; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
6436; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
6437; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
6438; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
6439; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> poison, i32 [[TMP40]], i64 0
6440; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
6441; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6442; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
6443; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
6444; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
6445; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
6446; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
6447; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
6448; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
6449; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
6450; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
6451; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
6452; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
6453; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
6454; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
6455; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
6456; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
6457; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
6458; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
6459; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
6460; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
6461; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
6462; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
6463; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
6464; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
6465; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
6466; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
6467; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
6468; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
6469; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
6470; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
6471; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
6472; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
6473; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
6474; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
6475; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
6476; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
6477; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
6478; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
6479; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
6480; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
6481; CHECK-NEXT:    store <2 x i32> [[TMP82]], ptr addrspace(1) [[OUT:%.*]], align 8
6482; CHECK-NEXT:    ret void
6483;
6484; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
6485; GFX6:       ; %bb.0:
6486; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
6487; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
6488; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6489; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s2
6490; GFX6-NEXT:    s_abs_i32 s6, s2
6491; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
6492; GFX6-NEXT:    s_sub_i32 s7, 0, s6
6493; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6494; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6495; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6496; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6497; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v0
6498; GFX6-NEXT:    s_abs_i32 s7, s0
6499; GFX6-NEXT:    s_xor_b32 s0, s0, s2
6500; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
6501; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6502; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6503; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
6504; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
6505; GFX6-NEXT:    s_mul_i32 s2, s2, s6
6506; GFX6-NEXT:    s_sub_i32 s2, s7, s2
6507; GFX6-NEXT:    s_sub_i32 s7, s2, s6
6508; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
6509; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
6510; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
6511; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6512; GFX6-NEXT:    s_cselect_b32 s2, s7, s2
6513; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
6514; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
6515; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
6516; GFX6-NEXT:    s_abs_i32 s2, s3
6517; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
6518; GFX6-NEXT:    s_sub_i32 s6, 0, s2
6519; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6520; GFX6-NEXT:    s_xor_b32 s3, s1, s3
6521; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
6522; GFX6-NEXT:    s_abs_i32 s1, s1
6523; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
6524; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
6525; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
6526; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
6527; GFX6-NEXT:    s_ashr_i32 s3, s3, 31
6528; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6529; GFX6-NEXT:    v_mul_lo_u32 v3, s6, v2
6530; GFX6-NEXT:    s_mov_b32 s6, -1
6531; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v3
6532; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6533; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
6534; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
6535; GFX6-NEXT:    s_mul_i32 s0, s0, s2
6536; GFX6-NEXT:    s_sub_i32 s0, s1, s0
6537; GFX6-NEXT:    s_sub_i32 s1, s0, s2
6538; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
6539; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
6540; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
6541; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6542; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
6543; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
6544; GFX6-NEXT:    s_cmp_ge_u32 s0, s2
6545; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
6546; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6547; GFX6-NEXT:    v_xor_b32_e32 v1, s3, v1
6548; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s3, v1
6549; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6550; GFX6-NEXT:    s_endpgm
6551;
6552; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
6553; GFX9:       ; %bb.0:
6554; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6555; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6556; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6557; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s2
6558; GFX9-NEXT:    s_abs_i32 s6, s2
6559; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
6560; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s3
6561; GFX9-NEXT:    s_abs_i32 s3, s0
6562; GFX9-NEXT:    s_xor_b32 s0, s0, s2
6563; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6564; GFX9-NEXT:    s_sub_i32 s2, 0, s6
6565; GFX9-NEXT:    s_ashr_i32 s0, s0, 31
6566; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6567; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6568; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
6569; GFX9-NEXT:    s_mul_i32 s2, s2, s8
6570; GFX9-NEXT:    s_mul_hi_u32 s2, s8, s2
6571; GFX9-NEXT:    s_add_i32 s8, s8, s2
6572; GFX9-NEXT:    s_mul_hi_u32 s2, s3, s8
6573; GFX9-NEXT:    s_mul_i32 s8, s2, s6
6574; GFX9-NEXT:    s_sub_i32 s3, s3, s8
6575; GFX9-NEXT:    s_add_i32 s9, s2, 1
6576; GFX9-NEXT:    s_sub_i32 s8, s3, s6
6577; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
6578; GFX9-NEXT:    s_cselect_b32 s2, s9, s2
6579; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
6580; GFX9-NEXT:    s_add_i32 s8, s2, 1
6581; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
6582; GFX9-NEXT:    s_cselect_b32 s6, s8, s2
6583; GFX9-NEXT:    s_abs_i32 s8, s7
6584; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
6585; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
6586; GFX9-NEXT:    s_xor_b32 s5, s6, s0
6587; GFX9-NEXT:    s_sub_i32 s6, 0, s8
6588; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6589; GFX9-NEXT:    s_sub_i32 s0, s5, s0
6590; GFX9-NEXT:    s_xor_b32 s4, s1, s7
6591; GFX9-NEXT:    s_abs_i32 s1, s1
6592; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6593; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6594; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
6595; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
6596; GFX9-NEXT:    s_mul_i32 s6, s6, s5
6597; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s6
6598; GFX9-NEXT:    s_add_i32 s5, s5, s6
6599; GFX9-NEXT:    s_mul_hi_u32 s5, s1, s5
6600; GFX9-NEXT:    s_mul_i32 s6, s5, s8
6601; GFX9-NEXT:    s_sub_i32 s1, s1, s6
6602; GFX9-NEXT:    s_add_i32 s7, s5, 1
6603; GFX9-NEXT:    s_sub_i32 s6, s1, s8
6604; GFX9-NEXT:    s_cmp_ge_u32 s1, s8
6605; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
6606; GFX9-NEXT:    s_cselect_b32 s1, s6, s1
6607; GFX9-NEXT:    s_add_i32 s6, s5, 1
6608; GFX9-NEXT:    s_cmp_ge_u32 s1, s8
6609; GFX9-NEXT:    s_cselect_b32 s1, s6, s5
6610; GFX9-NEXT:    s_xor_b32 s1, s1, s4
6611; GFX9-NEXT:    s_sub_i32 s1, s1, s4
6612; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6613; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6614; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6615; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6616; GFX9-NEXT:    s_endpgm
6617  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6618  %r = sdiv <2 x i32> %x, %shl.y
6619  store <2 x i32> %r, ptr addrspace(1) %out
6620  ret void
6621}
6622
; Signed i32 remainder by the odd constant 1235195 (0x12d8fb).
; The opt run expects the srem to stay unchanged. The llc runs expect the
; quotient to be formed with a multiply-high by 0xd9528441 (plus an add of the
; dividend, a shift right by 20, and a sign-bit add), then multiplied by
; 0x12d8fb and subtracted from the dividend.
6623define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
6624; CHECK-LABEL: @srem_i32_oddk_denom(
6625; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
6626; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6627; CHECK-NEXT:    ret void
6628;
6629; GFX6-LABEL: srem_i32_oddk_denom:
6630; GFX6:       ; %bb.0:
6631; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
6632; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6633; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6634; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6635; GFX6-NEXT:    s_mov_b32 s2, -1
6636; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6637; GFX6-NEXT:    v_mul_hi_i32 v0, s6, v0
6638; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
6639; GFX6-NEXT:    s_add_i32 s4, s4, s6
6640; GFX6-NEXT:    s_lshr_b32 s5, s4, 31
6641; GFX6-NEXT:    s_ashr_i32 s4, s4, 20
6642; GFX6-NEXT:    s_add_i32 s4, s4, s5
6643; GFX6-NEXT:    s_mul_i32 s4, s4, 0x12d8fb
6644; GFX6-NEXT:    s_sub_i32 s4, s6, s4
6645; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6646; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6647; GFX6-NEXT:    s_endpgm
6648;
6649; GFX9-LABEL: srem_i32_oddk_denom:
6650; GFX9:       ; %bb.0:
6651; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
6652; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6653; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6654; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6655; GFX9-NEXT:    s_mul_hi_i32 s3, s2, 0xd9528441
6656; GFX9-NEXT:    s_add_i32 s3, s3, s2
6657; GFX9-NEXT:    s_lshr_b32 s4, s3, 31
6658; GFX9-NEXT:    s_ashr_i32 s3, s3, 20
6659; GFX9-NEXT:    s_add_i32 s3, s3, s4
6660; GFX9-NEXT:    s_mul_i32 s3, s3, 0x12d8fb
6661; GFX9-NEXT:    s_sub_i32 s2, s2, s3
6662; GFX9-NEXT:    v_mov_b32_e32 v1, s2
6663; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
6664; GFX9-NEXT:    s_endpgm
6665  %r = srem i32 %x, 1235195
6666  store i32 %r, ptr addrspace(1) %out
6667  ret void
6668}
6669
; Signed i32 remainder by the power-of-two constant 4096.
; The opt run expects the srem to stay unchanged. The llc runs expect the
; result to be computed as x - ((x + ((x >> 31) >>u 20)) & 0xfffff000).
6670define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
6671; CHECK-LABEL: @srem_i32_pow2k_denom(
6672; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
6673; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6674; CHECK-NEXT:    ret void
6675;
6676; GFX6-LABEL: srem_i32_pow2k_denom:
6677; GFX6:       ; %bb.0:
6678; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
6679; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
6680; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6681; GFX6-NEXT:    s_mov_b32 s2, -1
6682; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6683; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
6684; GFX6-NEXT:    s_lshr_b32 s4, s4, 20
6685; GFX6-NEXT:    s_add_i32 s4, s6, s4
6686; GFX6-NEXT:    s_and_b32 s4, s4, 0xfffff000
6687; GFX6-NEXT:    s_sub_i32 s4, s6, s4
6688; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6689; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6690; GFX6-NEXT:    s_endpgm
6691;
6692; GFX9-LABEL: srem_i32_pow2k_denom:
6693; GFX9:       ; %bb.0:
6694; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
6695; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6696; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6697; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6698; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
6699; GFX9-NEXT:    s_lshr_b32 s3, s3, 20
6700; GFX9-NEXT:    s_add_i32 s3, s2, s3
6701; GFX9-NEXT:    s_and_b32 s3, s3, 0xfffff000
6702; GFX9-NEXT:    s_sub_i32 s2, s2, s3
6703; GFX9-NEXT:    v_mov_b32_e32 v1, s2
6704; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
6705; GFX9-NEXT:    s_endpgm
6706  %r = srem i32 %x, 4096
6707  store i32 %r, ptr addrspace(1) %out
6708  ret void
6709}
6710
; Signed i32 remainder where the divisor is 4096 << %y.
; The opt run expects the scalar srem to remain in the IR. The llc runs expect
; the reciprocal-based expansion on absolute values (ashr/add/xor,
; v_rcp_iflag_f32 scaled by 0x4f7ffffe, mul_hi refinement), two conditional
; subtractions of the divisor to correct the remainder, and a final xor/sub
; with the sign mask of the dividend.
6711define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
6712; CHECK-LABEL: @srem_i32_pow2_shl_denom(
6713; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6714; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
6715; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6716; CHECK-NEXT:    ret void
6717;
6718; GFX6-LABEL: srem_i32_pow2_shl_denom:
6719; GFX6:       ; %bb.0:
6720; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6721; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6722; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6723; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
6724; GFX6-NEXT:    s_add_i32 s3, s3, s4
6725; GFX6-NEXT:    s_xor_b32 s4, s3, s4
6726; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
6727; GFX6-NEXT:    s_sub_i32 s3, 0, s4
6728; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
6729; GFX6-NEXT:    s_add_i32 s2, s2, s5
6730; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6731; GFX6-NEXT:    s_xor_b32 s6, s2, s5
6732; GFX6-NEXT:    s_mov_b32 s2, -1
6733; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6734; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6735; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
6736; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6737; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6738; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6739; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
6740; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
6741; GFX6-NEXT:    s_mul_i32 s7, s7, s4
6742; GFX6-NEXT:    s_sub_i32 s6, s6, s7
6743; GFX6-NEXT:    s_sub_i32 s7, s6, s4
6744; GFX6-NEXT:    s_cmp_ge_u32 s6, s4
6745; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
6746; GFX6-NEXT:    s_sub_i32 s7, s6, s4
6747; GFX6-NEXT:    s_cmp_ge_u32 s6, s4
6748; GFX6-NEXT:    s_cselect_b32 s4, s7, s6
6749; GFX6-NEXT:    s_xor_b32 s4, s4, s5
6750; GFX6-NEXT:    s_sub_i32 s4, s4, s5
6751; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6752; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6753; GFX6-NEXT:    s_endpgm
6754;
6755; GFX9-LABEL: srem_i32_pow2_shl_denom:
6756; GFX9:       ; %bb.0:
6757; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6758; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6759; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6760; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6761; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6762; GFX9-NEXT:    s_add_i32 s3, s3, s4
6763; GFX9-NEXT:    s_xor_b32 s3, s3, s4
6764; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6765; GFX9-NEXT:    s_sub_i32 s5, 0, s3
6766; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
6767; GFX9-NEXT:    s_add_i32 s2, s2, s4
6768; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6769; GFX9-NEXT:    s_xor_b32 s2, s2, s4
6770; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6771; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6772; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
6773; GFX9-NEXT:    s_mul_i32 s5, s5, s6
6774; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
6775; GFX9-NEXT:    s_add_i32 s6, s6, s5
6776; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
6777; GFX9-NEXT:    s_mul_i32 s5, s5, s3
6778; GFX9-NEXT:    s_sub_i32 s2, s2, s5
6779; GFX9-NEXT:    s_sub_i32 s5, s2, s3
6780; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
6781; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
6782; GFX9-NEXT:    s_sub_i32 s5, s2, s3
6783; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
6784; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
6785; GFX9-NEXT:    s_xor_b32 s2, s2, s4
6786; GFX9-NEXT:    s_sub_i32 s2, s2, s4
6787; GFX9-NEXT:    v_mov_b32_e32 v0, s2
6788; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
6789; GFX9-NEXT:    s_endpgm
6790  %shl.y = shl i32 4096, %y
6791  %r = srem i32 %x, %shl.y
6792  store i32 %r, ptr addrspace(1) %out
6793  ret void
6794}
6795
; Signed <2 x i32> remainder by a splat of the power-of-two constant 4096.
; The opt run expects one scalar srem per element (extractelement / srem /
; insertelement). The llc runs expect each element to be computed as
; x - ((x + ((x >> 31) >>u 20)) & 0xfffff000).
6796define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
6797; CHECK-LABEL: @srem_v2i32_pow2k_denom(
6798; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6799; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
6800; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
6801; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6802; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
6803; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6804; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
6805; CHECK-NEXT:    ret void
6806;
6807; GFX6-LABEL: srem_v2i32_pow2k_denom:
6808; GFX6:       ; %bb.0:
6809; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
6810; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6811; GFX6-NEXT:    s_mov_b32 s6, -1
6812; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6813; GFX6-NEXT:    s_mov_b32 s4, s0
6814; GFX6-NEXT:    s_mov_b32 s5, s1
6815; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
6816; GFX6-NEXT:    s_ashr_i32 s1, s3, 31
6817; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
6818; GFX6-NEXT:    s_lshr_b32 s1, s1, 20
6819; GFX6-NEXT:    s_add_i32 s0, s2, s0
6820; GFX6-NEXT:    s_add_i32 s1, s3, s1
6821; GFX6-NEXT:    s_and_b32 s0, s0, 0xfffff000
6822; GFX6-NEXT:    s_and_b32 s1, s1, 0xfffff000
6823; GFX6-NEXT:    s_sub_i32 s0, s2, s0
6824; GFX6-NEXT:    s_sub_i32 s1, s3, s1
6825; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6826; GFX6-NEXT:    v_mov_b32_e32 v1, s1
6827; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6828; GFX6-NEXT:    s_endpgm
6829;
6830; GFX9-LABEL: srem_v2i32_pow2k_denom:
6831; GFX9:       ; %bb.0:
6832; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
6833; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6834; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6835; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
6836; GFX9-NEXT:    s_ashr_i32 s5, s3, 31
6837; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
6838; GFX9-NEXT:    s_lshr_b32 s5, s5, 20
6839; GFX9-NEXT:    s_add_i32 s4, s2, s4
6840; GFX9-NEXT:    s_add_i32 s5, s3, s5
6841; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
6842; GFX9-NEXT:    s_sub_i32 s2, s2, s4
6843; GFX9-NEXT:    s_and_b32 s4, s5, 0xfffff000
6844; GFX9-NEXT:    s_sub_i32 s3, s3, s4
6845; GFX9-NEXT:    v_mov_b32_e32 v0, s2
6846; GFX9-NEXT:    v_mov_b32_e32 v1, s3
6847; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
6848; GFX9-NEXT:    s_endpgm
6849  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
6850  store <2 x i32> %r, ptr addrspace(1) %out
6851  ret void
6852}
6853
; Signed remainder of a <2 x i32> value by a per-lane variable denominator
; computed as `4096 << %y`.
; Expected opt output: each lane is expanded inline. Sign masks are taken with
; ashr 31, numerator and denominator are made non-negative with add/xor, a
; reciprocal estimate is formed through llvm.amdgcn.rcp.f32 and refined with a
; 64-bit multiply, the remainder is corrected by two compare/subtract/select
; steps, and finally the numerator's sign mask is re-applied with xor/sub.
; Expected llc output (tahiti and gfx900): the matching s_abs_i32 /
; v_rcp_iflag_f32 / mul_hi sequence per lane, ending in a single dwordx2 store.
6854define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
6855; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
6856; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]]
6857; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6858; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6859; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6860; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6861; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
6862; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
6863; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
6864; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
6865; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
6866; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
6867; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
6868; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
6869; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
6870; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
6871; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
6872; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
6873; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
6874; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
6875; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
6876; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
6877; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
6878; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
6879; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
6880; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
6881; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
6882; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
6883; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
6884; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
6885; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
6886; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
6887; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
6888; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
6889; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
6890; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
6891; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
6892; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
6893; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
6894; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> poison, i32 [[TMP37]], i64 0
6895; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
6896; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6897; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
6898; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
6899; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
6900; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
6901; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
6902; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
6903; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
6904; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
6905; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
6906; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
6907; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
6908; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
6909; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
6910; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
6911; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
6912; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
6913; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
6914; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
6915; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
6916; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
6917; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
6918; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
6919; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
6920; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
6921; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
6922; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
6923; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
6924; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
6925; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
6926; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
6927; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
6928; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
6929; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
6930; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
6931; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
6932; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
6933; CHECK-NEXT:    store <2 x i32> [[TMP76]], ptr addrspace(1) [[OUT:%.*]], align 8
6934; CHECK-NEXT:    ret void
6935;
6936; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
6937; GFX6:       ; %bb.0:
6938; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
6939; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
6940; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6941; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s2
6942; GFX6-NEXT:    s_abs_i32 s2, s2
6943; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
6944; GFX6-NEXT:    s_sub_i32 s6, 0, s2
6945; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6946; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6947; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6948; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6949; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
6950; GFX6-NEXT:    s_abs_i32 s6, s0
6951; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
6952; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6953; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6954; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
6955; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
6956; GFX6-NEXT:    s_mul_i32 s7, s7, s2
6957; GFX6-NEXT:    s_sub_i32 s6, s6, s7
6958; GFX6-NEXT:    s_sub_i32 s7, s6, s2
6959; GFX6-NEXT:    s_cmp_ge_u32 s6, s2
6960; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
6961; GFX6-NEXT:    s_sub_i32 s7, s6, s2
6962; GFX6-NEXT:    s_cmp_ge_u32 s6, s2
6963; GFX6-NEXT:    s_cselect_b32 s2, s7, s6
6964; GFX6-NEXT:    s_abs_i32 s3, s3
6965; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
6966; GFX6-NEXT:    s_sub_i32 s6, 0, s3
6967; GFX6-NEXT:    s_abs_i32 s8, s1
6968; GFX6-NEXT:    s_xor_b32 s2, s2, s0
6969; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6970; GFX6-NEXT:    s_sub_i32 s0, s2, s0
6971; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
6972; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6973; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6974; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6975; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
6976; GFX6-NEXT:    s_mov_b32 s6, -1
6977; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6978; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6979; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
6980; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
6981; GFX6-NEXT:    s_mul_i32 s2, s2, s3
6982; GFX6-NEXT:    s_sub_i32 s2, s8, s2
6983; GFX6-NEXT:    s_sub_i32 s8, s2, s3
6984; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
6985; GFX6-NEXT:    s_cselect_b32 s2, s8, s2
6986; GFX6-NEXT:    s_sub_i32 s8, s2, s3
6987; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
6988; GFX6-NEXT:    s_cselect_b32 s2, s8, s2
6989; GFX6-NEXT:    s_xor_b32 s2, s2, s1
6990; GFX6-NEXT:    s_sub_i32 s1, s2, s1
6991; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6992; GFX6-NEXT:    v_mov_b32_e32 v1, s1
6993; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6994; GFX6-NEXT:    s_endpgm
6995;
6996; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
6997; GFX9:       ; %bb.0:
6998; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
6999; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7000; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7001; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s2
7002; GFX9-NEXT:    s_abs_i32 s2, s2
7003; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
7004; GFX9-NEXT:    s_sub_i32 s7, 0, s2
7005; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
7006; GFX9-NEXT:    s_abs_i32 s0, s0
7007; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7008; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
7009; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
7010; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7011; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
7012; GFX9-NEXT:    s_mul_i32 s7, s7, s8
7013; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
7014; GFX9-NEXT:    s_add_i32 s8, s8, s7
7015; GFX9-NEXT:    s_mul_hi_u32 s7, s0, s8
7016; GFX9-NEXT:    s_mul_i32 s7, s7, s2
7017; GFX9-NEXT:    s_sub_i32 s0, s0, s7
7018; GFX9-NEXT:    s_sub_i32 s7, s0, s2
7019; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
7020; GFX9-NEXT:    s_cselect_b32 s0, s7, s0
7021; GFX9-NEXT:    s_sub_i32 s7, s0, s2
7022; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
7023; GFX9-NEXT:    s_cselect_b32 s0, s7, s0
7024; GFX9-NEXT:    s_abs_i32 s7, s3
7025; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
7026; GFX9-NEXT:    s_xor_b32 s0, s0, s6
7027; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
7028; GFX9-NEXT:    s_sub_i32 s5, 0, s7
7029; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7030; GFX9-NEXT:    s_sub_i32 s0, s0, s6
7031; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
7032; GFX9-NEXT:    s_abs_i32 s1, s1
7033; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
7034; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7035; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
7036; GFX9-NEXT:    s_mul_i32 s5, s5, s6
7037; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
7038; GFX9-NEXT:    s_add_i32 s6, s6, s5
7039; GFX9-NEXT:    s_mul_hi_u32 s5, s1, s6
7040; GFX9-NEXT:    s_mul_i32 s5, s5, s7
7041; GFX9-NEXT:    s_sub_i32 s1, s1, s5
7042; GFX9-NEXT:    s_sub_i32 s5, s1, s7
7043; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
7044; GFX9-NEXT:    s_cselect_b32 s1, s5, s1
7045; GFX9-NEXT:    s_sub_i32 s5, s1, s7
7046; GFX9-NEXT:    s_cmp_ge_u32 s1, s7
7047; GFX9-NEXT:    s_cselect_b32 s1, s5, s1
7048; GFX9-NEXT:    s_xor_b32 s1, s1, s4
7049; GFX9-NEXT:    s_sub_i32 s1, s1, s4
7050; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7051; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7052; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7053; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
7054; GFX9-NEXT:    s_endpgm
7055  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
7056  %r = srem <2 x i32> %x, %shl.y
7057  store <2 x i32> %r, ptr addrspace(1) %out
7058  ret void
7059}
7060
; Unsigned 64-bit division by the odd constant 1235195949943.
; Expected opt output: the `udiv i64` is left untouched by the IR pass.
; Expected llc output (tahiti and gfx900): no division loop; the quotient is
; built from 32-bit mul / mul_hi pieces against the constants 0x64c139ef and
; 0x38f83e5, followed by a logical right shift by 2 of the top dword. The
; upper dword of the stored result is the constant 0.
7061define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
7062; CHECK-LABEL: @udiv_i64_oddk_denom(
7063; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
7064; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7065; CHECK-NEXT:    ret void
7066;
7067; GFX6-LABEL: udiv_i64_oddk_denom:
7068; GFX6:       ; %bb.0:
7069; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7070; GFX6-NEXT:    v_mov_b32_e32 v1, 0x64c139ef
7071; GFX6-NEXT:    v_mov_b32_e32 v0, 0x38f83e5
7072; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7073; GFX6-NEXT:    s_mov_b32 s6, -1
7074; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7075; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
7076; GFX6-NEXT:    v_mul_hi_u32 v3, s3, v1
7077; GFX6-NEXT:    s_mov_b32 s5, s1
7078; GFX6-NEXT:    v_mul_hi_u32 v2, s2, v0
7079; GFX6-NEXT:    s_mul_i32 s1, s3, 0x64c139ef
7080; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s1, v4
7081; GFX6-NEXT:    s_mov_b32 s4, s0
7082; GFX6-NEXT:    s_mul_i32 s0, s2, 0x38f83e5
7083; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
7084; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
7085; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
7086; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7087; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
7088; GFX6-NEXT:    v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
7089; GFX6-NEXT:    s_mul_i32 s0, s3, 0x38f83e5
7090; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
7091; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v0, v3, vcc
7092; GFX6-NEXT:    v_mov_b32_e32 v1, 0
7093; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 2, v0
7094; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7095; GFX6-NEXT:    s_endpgm
7096;
7097; GFX9-LABEL: udiv_i64_oddk_denom:
7098; GFX9:       ; %bb.0:
7099; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7100; GFX9-NEXT:    v_mov_b32_e32 v1, 0
7101; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7102; GFX9-NEXT:    s_mul_hi_u32 s4, s2, 0x38f83e5
7103; GFX9-NEXT:    s_mul_i32 s5, s2, 0x38f83e5
7104; GFX9-NEXT:    s_mul_i32 s7, s3, 0x64c139ef
7105; GFX9-NEXT:    s_mul_hi_u32 s2, s2, 0x64c139ef
7106; GFX9-NEXT:    s_mul_hi_u32 s6, s3, 0x64c139ef
7107; GFX9-NEXT:    s_add_u32 s2, s7, s2
7108; GFX9-NEXT:    s_addc_u32 s6, s6, 0
7109; GFX9-NEXT:    s_add_u32 s2, s5, s2
7110; GFX9-NEXT:    s_addc_u32 s2, s4, 0
7111; GFX9-NEXT:    s_add_u32 s2, s6, s2
7112; GFX9-NEXT:    s_addc_u32 s4, 0, 0
7113; GFX9-NEXT:    s_mul_hi_u32 s5, s3, 0x38f83e5
7114; GFX9-NEXT:    s_mul_i32 s3, s3, 0x38f83e5
7115; GFX9-NEXT:    s_add_u32 s2, s3, s2
7116; GFX9-NEXT:    s_addc_u32 s2, s5, s4
7117; GFX9-NEXT:    s_lshr_b32 s2, s2, 2
7118; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7119; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
7120; GFX9-NEXT:    s_endpgm
7121  %r = udiv i64 %x, 1235195949943
7122  store i64 %r, ptr addrspace(1) %out
7123  ret void
7124}
7125
; Unsigned 64-bit division by the constant power of two 4096.
; Expected opt output: the `udiv i64` is left untouched by the IR pass.
; Expected llc output (tahiti and gfx900): a single 64-bit logical right
; shift by 12 (s_lshr_b64) followed by the store.
7126define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
7127; CHECK-LABEL: @udiv_i64_pow2k_denom(
7128; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
7129; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7130; CHECK-NEXT:    ret void
7131;
7132; GFX6-LABEL: udiv_i64_pow2k_denom:
7133; GFX6:       ; %bb.0:
7134; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7135; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7136; GFX6-NEXT:    s_mov_b32 s6, -1
7137; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7138; GFX6-NEXT:    s_mov_b32 s4, s0
7139; GFX6-NEXT:    s_mov_b32 s5, s1
7140; GFX6-NEXT:    s_lshr_b64 s[0:1], s[2:3], 12
7141; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7142; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7143; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7144; GFX6-NEXT:    s_endpgm
7145;
7146; GFX9-LABEL: udiv_i64_pow2k_denom:
7147; GFX9:       ; %bb.0:
7148; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7149; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7150; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7151; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
7152; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7153; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7154; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7155; GFX9-NEXT:    s_endpgm
7156  %r = udiv i64 %x, 4096
7157  store i64 %r, ptr addrspace(1) %out
7158  ret void
7159}
7160
; Unsigned 64-bit division by a variable denominator computed as
; `4096 << %y`.
; Expected opt output: the shl and the `udiv i64` are left untouched by the
; IR pass.
; Expected llc output (tahiti and gfx900): a single dword is loaded for the
; shift amount, 12 is added to it, and the numerator is shifted right with
; s_lshr_b64 by that sum; no division sequence is emitted.
7161define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
7162; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
7163; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7164; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
7165; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7166; CHECK-NEXT:    ret void
7167;
7168; GFX6-LABEL: udiv_i64_pow2_shl_denom:
7169; GFX6:       ; %bb.0:
7170; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7171; GFX6-NEXT:    s_load_dword s8, s[4:5], 0xd
7172; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7173; GFX6-NEXT:    s_mov_b32 s6, -1
7174; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7175; GFX6-NEXT:    s_mov_b32 s4, s0
7176; GFX6-NEXT:    s_add_i32 s8, s8, 12
7177; GFX6-NEXT:    s_mov_b32 s5, s1
7178; GFX6-NEXT:    s_lshr_b64 s[0:1], s[2:3], s8
7179; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7180; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7181; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7182; GFX6-NEXT:    s_endpgm
7183;
7184; GFX9-LABEL: udiv_i64_pow2_shl_denom:
7185; GFX9:       ; %bb.0:
7186; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
7187; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7188; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7189; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7190; GFX9-NEXT:    s_add_i32 s6, s6, 12
7191; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s6
7192; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7193; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7194; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7195; GFX9-NEXT:    s_endpgm
7196  %shl.y = shl i64 4096, %y
7197  %r = udiv i64 %x, %shl.y
7198  store i64 %r, ptr addrspace(1) %out
7199  ret void
7200}
7201
; Unsigned division of a <2 x i64> value by the constant splat 4096.
; Expected opt output: the vector op is split into two scalar
; `udiv i64 ..., 4096` instructions (one per lane) that are left unexpanded.
; Expected llc output (tahiti and gfx900): one s_lshr_b64 by 12 per lane and
; a single dwordx4 store.
7202define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
7203; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
7204; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7205; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7206; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
7207; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7208; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
7209; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7210; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
7211; CHECK-NEXT:    ret void
7212;
7213; GFX6-LABEL: udiv_v2i64_pow2k_denom:
7214; GFX6:       ; %bb.0:
7215; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
7216; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
7217; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7218; GFX6-NEXT:    s_mov_b32 s6, -1
7219; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7220; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
7221; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
7222; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7223; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7224; GFX6-NEXT:    v_mov_b32_e32 v2, s2
7225; GFX6-NEXT:    v_mov_b32_e32 v3, s3
7226; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7227; GFX6-NEXT:    s_endpgm
7228;
7229; GFX9-LABEL: udiv_v2i64_pow2k_denom:
7230; GFX9:       ; %bb.0:
7231; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
7232; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
7233; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7234; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7235; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
7236; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
7237; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7238; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7239; GFX9-NEXT:    v_mov_b32_e32 v2, s2
7240; GFX9-NEXT:    v_mov_b32_e32 v3, s3
7241; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
7242; GFX9-NEXT:    s_endpgm
7243  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
7244  store <2 x i64> %r, ptr addrspace(1) %out
7245  ret void
7246}
7247
; Unsigned division of a <2 x i64> value by a constant vector whose lanes
; differ: lane 0 divides by 4096 (a power of two), lane 1 by 4095 (not one).
; Expected opt output: two scalar `udiv i64` instructions, one per lane, each
; keeping its own constant divisor.
; Expected llc output (tahiti and gfx900): lane 0 is a plain s_lshr_b64 by 12;
; lane 1 is a multiply sequence against the constants 0x10010011 and 0x100100
; followed by subtract, shift right by 1, add, and shift right by 11.
7248define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
7249; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
7250; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7251; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7252; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
7253; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7254; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
7255; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7256; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
7257; CHECK-NEXT:    ret void
7258;
7259; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom:
7260; GFX6:       ; %bb.0:
7261; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
7262; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7263; GFX6-NEXT:    v_mov_b32_e32 v2, 0x10010011
7264; GFX6-NEXT:    v_mov_b32_e32 v0, 0x100100
7265; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7266; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7267; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v2
7268; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
7269; GFX6-NEXT:    v_mul_hi_u32 v1, s10, v0
7270; GFX6-NEXT:    s_mul_i32 s7, s11, 0x10010011
7271; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s7, v3
7272; GFX6-NEXT:    s_mul_i32 s6, s10, 0x100100
7273; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
7274; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s6, v3
7275; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7276; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
7277; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
7278; GFX6-NEXT:    v_addc_u32_e64 v2, s[6:7], 0, 0, vcc
7279; GFX6-NEXT:    s_mul_i32 s6, s11, 0x100100
7280; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s6, v1
7281; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v0, v2, vcc
7282; GFX6-NEXT:    v_mov_b32_e32 v1, s11
7283; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s10, v3
7284; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
7285; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 1
7286; GFX6-NEXT:    s_lshr_b64 s[4:5], s[8:9], 12
7287; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
7288; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
7289; GFX6-NEXT:    v_lshr_b64 v[2:3], v[0:1], 11
7290; GFX6-NEXT:    s_mov_b32 s2, -1
7291; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7292; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7293; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7294; GFX6-NEXT:    s_endpgm
7295;
7296; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
7297; GFX9:       ; %bb.0:
7298; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
7299; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
7300; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7301; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7302; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
7303; GFX9-NEXT:    s_mul_i32 s9, s3, 0x10010011
7304; GFX9-NEXT:    s_mul_hi_u32 s10, s2, 0x10010011
7305; GFX9-NEXT:    s_mul_hi_u32 s8, s3, 0x10010011
7306; GFX9-NEXT:    s_add_u32 s9, s9, s10
7307; GFX9-NEXT:    s_mul_i32 s5, s2, 0x100100
7308; GFX9-NEXT:    s_addc_u32 s8, s8, 0
7309; GFX9-NEXT:    s_mul_hi_u32 s4, s2, 0x100100
7310; GFX9-NEXT:    s_add_u32 s5, s5, s9
7311; GFX9-NEXT:    s_addc_u32 s4, s4, 0
7312; GFX9-NEXT:    s_add_u32 s4, s8, s4
7313; GFX9-NEXT:    s_addc_u32 s5, 0, 0
7314; GFX9-NEXT:    s_mul_i32 s9, s3, 0x100100
7315; GFX9-NEXT:    s_mul_hi_u32 s8, s3, 0x100100
7316; GFX9-NEXT:    s_add_u32 s4, s9, s4
7317; GFX9-NEXT:    s_addc_u32 s5, s8, s5
7318; GFX9-NEXT:    s_sub_u32 s2, s2, s4
7319; GFX9-NEXT:    s_subb_u32 s3, s3, s5
7320; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
7321; GFX9-NEXT:    s_add_u32 s2, s2, s4
7322; GFX9-NEXT:    s_addc_u32 s3, s3, s5
7323; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 11
7324; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7325; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7326; GFX9-NEXT:    v_mov_b32_e32 v2, s2
7327; GFX9-NEXT:    v_mov_b32_e32 v3, s3
7328; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
7329; GFX9-NEXT:    s_endpgm
7330  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
7331  store <2 x i64> %r, ptr addrspace(1) %out
7332  ret void
7333}
7334
; Unsigned division of a <2 x i64> value by a per-lane variable denominator
; computed as `4096 << %y`.
; Expected opt output: the vector shl is kept, and the division is split into
; two scalar `udiv i64` instructions fed by the extracted lanes.
; Expected llc output (tahiti and gfx900): per lane, 12 is added to one dword
; of the loaded %y element and the numerator lane is shifted right with
; s_lshr_b64 by that sum; no division sequence is emitted.
7335define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
7336; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
7337; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]]
7338; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7339; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
7340; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
7341; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
7342; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
7343; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
7344; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
7345; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
7346; CHECK-NEXT:    store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
7347; CHECK-NEXT:    ret void
7348;
7349; GFX6-LABEL: udiv_v2i64_pow2_shl_denom:
7350; GFX6:       ; %bb.0:
7351; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
7352; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7353; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7354; GFX6-NEXT:    s_mov_b32 s2, -1
7355; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7356; GFX6-NEXT:    s_add_i32 s4, s12, 12
7357; GFX6-NEXT:    s_add_i32 s6, s14, 12
7358; GFX6-NEXT:    s_lshr_b64 s[4:5], s[8:9], s4
7359; GFX6-NEXT:    s_lshr_b64 s[6:7], s[10:11], s6
7360; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7361; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7362; GFX6-NEXT:    v_mov_b32_e32 v2, s6
7363; GFX6-NEXT:    v_mov_b32_e32 v3, s7
7364; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7365; GFX6-NEXT:    s_endpgm
7366;
7367; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
7368; GFX9:       ; %bb.0:
7369; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
7370; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7371; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7372; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7373; GFX9-NEXT:    s_add_i32 s2, s12, 12
7374; GFX9-NEXT:    s_add_i32 s4, s14, 12
7375; GFX9-NEXT:    s_lshr_b64 s[2:3], s[8:9], s2
7376; GFX9-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
7377; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7378; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7379; GFX9-NEXT:    v_mov_b32_e32 v2, s4
7380; GFX9-NEXT:    v_mov_b32_e32 v3, s5
7381; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
7382; GFX9-NEXT:    s_endpgm
7383  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
7384  %r = udiv <2 x i64> %x, %shl.y
7385  store <2 x i64> %r, ptr addrspace(1) %out
7386  ret void
7387}
7388
; Unsigned 64-bit remainder by the odd constant 1235195393993.
; Expected opt output: the `urem i64` is left untouched by the IR pass.
; Expected llc output (tahiti and gfx900): the quotient is first formed by a
; multiply sequence against the constants 0xf6841139 and 0xe3e10011 and a
; right shift by 8; that quotient is then multiplied by the divisor (0x11f and
; 0x9761f7c9 are the divisor's high and low dwords) and the product is
; subtracted from the numerator with a 64-bit sub/subb pair.
7389define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
7390; CHECK-LABEL: @urem_i64_oddk_denom(
7391; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
7392; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7393; CHECK-NEXT:    ret void
7394;
7395; GFX6-LABEL: urem_i64_oddk_denom:
7396; GFX6:       ; %bb.0:
7397; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7398; GFX6-NEXT:    v_mov_b32_e32 v2, 0xf6841139
7399; GFX6-NEXT:    v_mov_b32_e32 v0, 0xe3e10011
7400; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7401; GFX6-NEXT:    s_mov_b32 s6, -1
7402; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7403; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v2
7404; GFX6-NEXT:    v_mul_hi_u32 v2, s3, v2
7405; GFX6-NEXT:    v_mul_hi_u32 v1, s2, v0
7406; GFX6-NEXT:    s_mul_i32 s5, s3, 0xf6841139
7407; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s5, v3
7408; GFX6-NEXT:    s_mov_b32 s4, s0
7409; GFX6-NEXT:    s_mul_i32 s0, s2, 0xe3e10011
7410; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
7411; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s0, v3
7412; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
7413; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7414; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
7415; GFX6-NEXT:    s_mul_i32 s0, s3, 0xe3e10011
7416; GFX6-NEXT:    v_addc_u32_e64 v2, s[8:9], 0, 0, vcc
7417; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
7418; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v0, v2, vcc
7419; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
7420; GFX6-NEXT:    s_movk_i32 s0, 0x11f
7421; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s0
7422; GFX6-NEXT:    s_mov_b32 s0, 0x9761f7c9
7423; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s0
7424; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
7425; GFX6-NEXT:    s_mov_b32 s5, s1
7426; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
7427; GFX6-NEXT:    v_mov_b32_e32 v2, s3
7428; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
7429; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
7430; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7431; GFX6-NEXT:    s_endpgm
7432;
7433; GFX9-LABEL: urem_i64_oddk_denom:
7434; GFX9:       ; %bb.0:
7435; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7436; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7437; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7438; GFX9-NEXT:    s_mul_i32 s7, s3, 0xf6841139
7439; GFX9-NEXT:    s_mul_hi_u32 s8, s2, 0xf6841139
7440; GFX9-NEXT:    s_mul_hi_u32 s6, s3, 0xf6841139
7441; GFX9-NEXT:    s_add_u32 s7, s7, s8
7442; GFX9-NEXT:    s_mul_i32 s5, s2, 0xe3e10011
7443; GFX9-NEXT:    s_addc_u32 s6, s6, 0
7444; GFX9-NEXT:    s_mul_hi_u32 s4, s2, 0xe3e10011
7445; GFX9-NEXT:    s_add_u32 s5, s5, s7
7446; GFX9-NEXT:    s_addc_u32 s4, s4, 0
7447; GFX9-NEXT:    s_add_u32 s4, s6, s4
7448; GFX9-NEXT:    s_addc_u32 s5, 0, 0
7449; GFX9-NEXT:    s_mul_i32 s7, s3, 0xe3e10011
7450; GFX9-NEXT:    s_mul_hi_u32 s6, s3, 0xe3e10011
7451; GFX9-NEXT:    s_add_u32 s4, s7, s4
7452; GFX9-NEXT:    s_addc_u32 s4, s6, s5
7453; GFX9-NEXT:    s_lshr_b32 s4, s4, 8
7454; GFX9-NEXT:    s_mul_i32 s5, s4, 0x11f
7455; GFX9-NEXT:    s_mul_hi_u32 s6, s4, 0x9761f7c9
7456; GFX9-NEXT:    s_add_i32 s6, s6, s5
7457; GFX9-NEXT:    s_mul_i32 s4, s4, 0x9761f7c9
7458; GFX9-NEXT:    s_sub_u32 s2, s2, s4
7459; GFX9-NEXT:    s_subb_u32 s3, s3, s6
7460; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7461; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7462; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7463; GFX9-NEXT:    s_endpgm
7464  %r = urem i64 %x, 1235195393993
7465  store i64 %r, ptr addrspace(1) %out
7466  ret void
7467}
7468
; Unsigned 64-bit remainder by the constant power of two 4096.
; Expected opt output: the `urem i64` is left untouched by the IR pass.
; Expected llc output (tahiti and gfx900): the low dword of the numerator is
; masked with 0xfff (s_and_b32) and the high dword of the stored result is the
; constant 0.
7469define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
7470; CHECK-LABEL: @urem_i64_pow2k_denom(
7471; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
7472; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7473; CHECK-NEXT:    ret void
7474;
7475; GFX6-LABEL: urem_i64_pow2k_denom:
7476; GFX6:       ; %bb.0:
7477; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7478; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7479; GFX6-NEXT:    s_mov_b32 s6, -1
7480; GFX6-NEXT:    v_mov_b32_e32 v1, 0
7481; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7482; GFX6-NEXT:    s_mov_b32 s4, s0
7483; GFX6-NEXT:    s_and_b32 s0, s2, 0xfff
7484; GFX6-NEXT:    s_mov_b32 s5, s1
7485; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7486; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7487; GFX6-NEXT:    s_endpgm
7488;
7489; GFX9-LABEL: urem_i64_pow2k_denom:
7490; GFX9:       ; %bb.0:
7491; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7492; GFX9-NEXT:    v_mov_b32_e32 v1, 0
7493; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7494; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
7495; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7496; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
7497; GFX9-NEXT:    s_endpgm
7498  %r = urem i64 %x, 4096
7499  store i64 %r, ptr addrspace(1) %out
7500  ret void
7501}
7502
; Unsigned 64-bit remainder by a variable denominator computed as
; `4096 << %y`.
; Expected opt output: the shl and the `urem i64` are left untouched by the
; IR pass.
; Expected llc output (tahiti and gfx900): the denominator is rebuilt with
; s_lshl_b64 of 0x1000, decremented by one through a 64-bit add of -1
; (s_add_u32 / s_addc_u32), and ANDed with the numerator (s_and_b64); no
; division sequence is emitted.
7503define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
7504; CHECK-LABEL: @urem_i64_pow2_shl_denom(
7505; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7506; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
7507; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7508; CHECK-NEXT:    ret void
7509;
7510; GFX6-LABEL: urem_i64_pow2_shl_denom:
7511; GFX6:       ; %bb.0:
7512; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7513; GFX6-NEXT:    s_load_dword s8, s[4:5], 0xd
7514; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7515; GFX6-NEXT:    s_mov_b32 s6, -1
7516; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7517; GFX6-NEXT:    s_mov_b32 s4, s0
7518; GFX6-NEXT:    s_mov_b32 s5, s1
7519; GFX6-NEXT:    s_lshl_b64 s[0:1], 0x1000, s8
7520; GFX6-NEXT:    s_add_u32 s0, s0, -1
7521; GFX6-NEXT:    s_addc_u32 s1, s1, -1
7522; GFX6-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
7523; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7524; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7525; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7526; GFX6-NEXT:    s_endpgm
7527;
7528; GFX9-LABEL: urem_i64_pow2_shl_denom:
7529; GFX9:       ; %bb.0:
7530; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
7531; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7532; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7533; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7534; GFX9-NEXT:    s_lshl_b64 s[4:5], 0x1000, s6
7535; GFX9-NEXT:    s_add_u32 s4, s4, -1
7536; GFX9-NEXT:    s_addc_u32 s5, s5, -1
7537; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
7538; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7539; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7540; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7541; GFX9-NEXT:    s_endpgm
7542  %shl.y = shl i64 4096, %y
7543  %r = urem i64 %x, %shl.y
7544  store i64 %r, ptr addrspace(1) %out
7545  ret void
7546}
7547
; Test: <2 x i64> urem by a constant splat of 4096. The IR-level checks expect
; the vector urem to be split into two scalar i64 urem-by-4096 operations that
; are reassembled with insertelement. The GFX6/GFX9 checks expect each element
; to reduce to a 32-bit and of its low dword with 0xfff, with both high dwords
; stored as zero (v1 = 0, v3 = v1).
7548define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
7549; CHECK-LABEL: @urem_v2i64_pow2k_denom(
7550; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7551; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
7552; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
7553; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7554; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
7555; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7556; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
7557; CHECK-NEXT:    ret void
7558;
7559; GFX6-LABEL: urem_v2i64_pow2k_denom:
7560; GFX6:       ; %bb.0:
7561; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
7562; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
7563; GFX6-NEXT:    v_mov_b32_e32 v1, 0
7564; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7565; GFX6-NEXT:    s_mov_b32 s6, -1
7566; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7567; GFX6-NEXT:    s_and_b32 s0, s0, 0xfff
7568; GFX6-NEXT:    s_and_b32 s1, s2, 0xfff
7569; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7570; GFX6-NEXT:    v_mov_b32_e32 v2, s1
7571; GFX6-NEXT:    v_mov_b32_e32 v3, v1
7572; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7573; GFX6-NEXT:    s_endpgm
7574;
7575; GFX9-LABEL: urem_v2i64_pow2k_denom:
7576; GFX9:       ; %bb.0:
7577; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
7578; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
7579; GFX9-NEXT:    v_mov_b32_e32 v1, 0
7580; GFX9-NEXT:    v_mov_b32_e32 v3, v1
7581; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7582; GFX9-NEXT:    s_and_b32 s0, s0, 0xfff
7583; GFX9-NEXT:    s_and_b32 s1, s2, 0xfff
7584; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7585; GFX9-NEXT:    v_mov_b32_e32 v2, s1
7586; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[6:7]
7587; GFX9-NEXT:    s_endpgm
7588  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
7589  store <2 x i64> %r, ptr addrspace(1) %out
7590  ret void
7591}
7592
; Test: <2 x i64> urem whose divisor is (splat 4096) << %y. The IR-level checks
; expect the vector shl to stay as one instruction while the urem is split into
; two scalar i64 urem operations on the extracted elements. The GFX6/GFX9 checks
; expect each element to be computed as x & ((4096 << y) - 1) using 64-bit
; scalar shift, add/addc of -1, and and, with no division expansion.
7593define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
7594; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
7595; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]]
7596; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7597; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
7598; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
7599; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
7600; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
7601; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
7602; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
7603; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
7604; CHECK-NEXT:    store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
7605; CHECK-NEXT:    ret void
7606;
7607; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
7608; GFX6:       ; %bb.0:
7609; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
7610; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
7611; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7612; GFX6-NEXT:    s_mov_b32 s2, -1
7613; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7614; GFX6-NEXT:    s_lshl_b64 s[4:5], 0x1000, s14
7615; GFX6-NEXT:    s_lshl_b64 s[6:7], 0x1000, s12
7616; GFX6-NEXT:    s_add_u32 s6, s6, -1
7617; GFX6-NEXT:    s_addc_u32 s7, s7, -1
7618; GFX6-NEXT:    s_and_b64 s[6:7], s[8:9], s[6:7]
7619; GFX6-NEXT:    s_add_u32 s4, s4, -1
7620; GFX6-NEXT:    s_addc_u32 s5, s5, -1
7621; GFX6-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
7622; GFX6-NEXT:    v_mov_b32_e32 v0, s6
7623; GFX6-NEXT:    v_mov_b32_e32 v1, s7
7624; GFX6-NEXT:    v_mov_b32_e32 v2, s4
7625; GFX6-NEXT:    v_mov_b32_e32 v3, s5
7626; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7627; GFX6-NEXT:    s_endpgm
7628;
7629; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
7630; GFX9:       ; %bb.0:
7631; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
7632; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
7633; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7634; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7635; GFX9-NEXT:    s_lshl_b64 s[2:3], 0x1000, s14
7636; GFX9-NEXT:    s_lshl_b64 s[4:5], 0x1000, s12
7637; GFX9-NEXT:    s_add_u32 s4, s4, -1
7638; GFX9-NEXT:    s_addc_u32 s5, s5, -1
7639; GFX9-NEXT:    s_and_b64 s[4:5], s[8:9], s[4:5]
7640; GFX9-NEXT:    s_add_u32 s2, s2, -1
7641; GFX9-NEXT:    s_addc_u32 s3, s3, -1
7642; GFX9-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
7643; GFX9-NEXT:    v_mov_b32_e32 v0, s4
7644; GFX9-NEXT:    v_mov_b32_e32 v1, s5
7645; GFX9-NEXT:    v_mov_b32_e32 v2, s2
7646; GFX9-NEXT:    v_mov_b32_e32 v3, s3
7647; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
7648; GFX9-NEXT:    s_endpgm
7649  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
7650  %r = urem <2 x i64> %x, %shl.y
7651  store <2 x i64> %r, ptr addrspace(1) %out
7652  ret void
7653}
7654
; Test: i64 sdiv by the odd constant 1235195. The IR-level checks expect the
; opt run to leave the sdiv untouched. The GFX6/GFX9 checks expect a
; multiply-based expansion: 32-bit mul / mul_hi pieces against the constants
; 0xfd81e19 and 0x6ca94220 combined with carry adds, then an arithmetic shift
; right by 19 and an add of the sign bit (lshr by 31). GFX6 uses VALU
; v_mul_hi_u32 for the high products where GFX9 uses scalar s_mul_hi_u32.
7655define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
7656; CHECK-LABEL: @sdiv_i64_oddk_denom(
7657; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
7658; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7659; CHECK-NEXT:    ret void
7660;
7661; GFX6-LABEL: sdiv_i64_oddk_denom:
7662; GFX6:       ; %bb.0:
7663; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7664; GFX6-NEXT:    v_mov_b32_e32 v2, 0xfd81e19
7665; GFX6-NEXT:    v_mov_b32_e32 v0, 0x6ca94220
7666; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7667; GFX6-NEXT:    s_mov_b32 s6, -1
7668; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7669; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v2
7670; GFX6-NEXT:    v_mul_hi_u32 v4, s3, v2
7671; GFX6-NEXT:    s_mov_b32 s5, s1
7672; GFX6-NEXT:    v_mul_hi_u32 v1, s2, v0
7673; GFX6-NEXT:    s_mul_i32 s1, s3, 0xfd81e19
7674; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s1, v3
7675; GFX6-NEXT:    s_mov_b32 s4, s0
7676; GFX6-NEXT:    s_mul_i32 s0, s2, 0x6ca94220
7677; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
7678; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s0, v3
7679; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7680; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
7681; GFX6-NEXT:    v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
7682; GFX6-NEXT:    s_ashr_i32 s1, s3, 31
7683; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
7684; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
7685; GFX6-NEXT:    s_mul_i32 s0, s3, 0x6ca94220
7686; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
7687; GFX6-NEXT:    s_mul_i32 s0, s1, 0x6ca94220
7688; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v0, v3, vcc
7689; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
7690; GFX6-NEXT:    s_mul_i32 s1, s1, 0xfd81e19
7691; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s1, v0
7692; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s1, v1
7693; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
7694; GFX6-NEXT:    v_ashr_i64 v[2:3], v[0:1], 19
7695; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
7696; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
7697; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
7698; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7699; GFX6-NEXT:    s_endpgm
7700;
7701; GFX9-LABEL: sdiv_i64_oddk_denom:
7702; GFX9:       ; %bb.0:
7703; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7704; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7705; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7706; GFX9-NEXT:    s_mul_hi_u32 s4, s2, 0x6ca94220
7707; GFX9-NEXT:    s_mul_i32 s5, s2, 0x6ca94220
7708; GFX9-NEXT:    s_mul_i32 s7, s3, 0xfd81e19
7709; GFX9-NEXT:    s_mul_hi_u32 s2, s2, 0xfd81e19
7710; GFX9-NEXT:    s_mul_hi_u32 s6, s3, 0xfd81e19
7711; GFX9-NEXT:    s_add_u32 s2, s7, s2
7712; GFX9-NEXT:    s_addc_u32 s6, s6, 0
7713; GFX9-NEXT:    s_add_u32 s2, s5, s2
7714; GFX9-NEXT:    s_addc_u32 s2, s4, 0
7715; GFX9-NEXT:    s_add_u32 s2, s6, s2
7716; GFX9-NEXT:    s_addc_u32 s4, 0, 0
7717; GFX9-NEXT:    s_mul_i32 s6, s3, 0x6ca94220
7718; GFX9-NEXT:    s_mul_hi_u32 s5, s3, 0x6ca94220
7719; GFX9-NEXT:    s_add_u32 s2, s6, s2
7720; GFX9-NEXT:    s_addc_u32 s4, s5, s4
7721; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
7722; GFX9-NEXT:    s_mul_i32 s5, s3, 0x6ca94220
7723; GFX9-NEXT:    s_mul_hi_u32 s6, s3, 0xfd81e19
7724; GFX9-NEXT:    s_add_i32 s5, s6, s5
7725; GFX9-NEXT:    s_mul_i32 s3, s3, 0xfd81e19
7726; GFX9-NEXT:    s_add_i32 s5, s5, s3
7727; GFX9-NEXT:    s_add_u32 s2, s2, s3
7728; GFX9-NEXT:    s_addc_u32 s3, s4, s5
7729; GFX9-NEXT:    s_ashr_i64 s[4:5], s[2:3], 19
7730; GFX9-NEXT:    s_lshr_b32 s2, s3, 31
7731; GFX9-NEXT:    s_add_u32 s2, s4, s2
7732; GFX9-NEXT:    s_addc_u32 s3, s5, 0
7733; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7734; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7735; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7736; GFX9-NEXT:    s_endpgm
7737  %r = sdiv i64 %x, 1235195
7738  store i64 %r, ptr addrspace(1) %out
7739  ret void
7740}
7741
; Test: i64 sdiv by the constant power of two 4096. The IR-level checks expect
; the opt run to leave the sdiv untouched. The GFX6/GFX9 checks expect the
; shift-based form: the sign word (high dword ashr 31) is logically shifted
; right by 20 to give a bias of 0xfff for negative x and 0 otherwise, the bias
; is added to x as a 64-bit add (add + addc), and the sum is arithmetically
; shifted right by 12.
7742define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
7743; CHECK-LABEL: @sdiv_i64_pow2k_denom(
7744; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
7745; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7746; CHECK-NEXT:    ret void
7747;
7748; GFX6-LABEL: sdiv_i64_pow2k_denom:
7749; GFX6:       ; %bb.0:
7750; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7751; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7752; GFX6-NEXT:    s_mov_b32 s6, -1
7753; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7754; GFX6-NEXT:    s_mov_b32 s4, s0
7755; GFX6-NEXT:    s_ashr_i32 s0, s3, 31
7756; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
7757; GFX6-NEXT:    s_add_u32 s0, s2, s0
7758; GFX6-NEXT:    s_mov_b32 s5, s1
7759; GFX6-NEXT:    s_addc_u32 s1, s3, 0
7760; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
7761; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7762; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7763; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7764; GFX6-NEXT:    s_endpgm
7765;
7766; GFX9-LABEL: sdiv_i64_pow2k_denom:
7767; GFX9:       ; %bb.0:
7768; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
7769; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7770; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7771; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
7772; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
7773; GFX9-NEXT:    s_add_u32 s2, s2, s4
7774; GFX9-NEXT:    s_addc_u32 s3, s3, 0
7775; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
7776; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7777; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7778; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7779; GFX9-NEXT:    s_endpgm
7780  %r = sdiv i64 %x, 4096
7781  store i64 %r, ptr addrspace(1) %out
7782  ret void
7783}
7784
; Test: i64 sdiv whose divisor is (4096 << %y). The IR-level checks expect the
; opt run to leave the shl and the sdiv untouched. Unlike the urem variant of
; this test, the GFX6/GFX9 checks expect a full 64-bit division expansion:
;   - both operands are made non-negative via sign word (ashr 31), 64-bit add
;     and xor;
;   - a reciprocal estimate is built from v_cvt_f32_u32 / v_rcp_f32 and the
;     float constants 0x4f800000, 0x5f7ffffc, 0x2f800000, 0xcf800000;
;   - the estimate is refined with two rounds of mul_lo / mul_hi and carry adds;
;   - the quotient candidate is multiplied back, subtracted from the numerator,
;     and corrected by selecting between q, q + 1 and q + 2;
;   - the sign is restored with an xor of the two sign words followed by a
;     64-bit subtract.
; GFX6 keeps the refinement in VALU registers; GFX9 moves values to SGPRs with
; v_readfirstlane_b32 and uses scalar multiplies.
7785define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
7786; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
7787; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7788; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
7789; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7790; CHECK-NEXT:    ret void
7791;
7792; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
7793; GFX6:       ; %bb.0:
7794; GFX6-NEXT:    s_load_dword s0, s[4:5], 0xd
7795; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7796; GFX6-NEXT:    s_mov_b32 s6, -1
7797; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7798; GFX6-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
7799; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
7800; GFX6-NEXT:    s_add_u32 s0, s0, s8
7801; GFX6-NEXT:    s_mov_b32 s9, s8
7802; GFX6-NEXT:    s_addc_u32 s1, s1, s8
7803; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
7804; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
7805; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
7806; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
7807; GFX6-NEXT:    s_sub_u32 s4, 0, s10
7808; GFX6-NEXT:    s_subb_u32 s5, 0, s11
7809; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
7810; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7811; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7812; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
7813; GFX6-NEXT:    s_add_u32 s2, s2, s12
7814; GFX6-NEXT:    s_mov_b32 s13, s12
7815; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7816; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7817; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7818; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
7819; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7820; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7821; GFX6-NEXT:    s_addc_u32 s3, s3, s12
7822; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
7823; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
7824; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
7825; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
7826; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
7827; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7828; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
7829; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
7830; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
7831; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
7832; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
7833; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
7834; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
7835; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
7836; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
7837; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7838; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
7839; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
7840; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
7841; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7842; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7843; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7844; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7845; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
7846; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
7847; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
7848; GFX6-NEXT:    s_mov_b32 s5, s1
7849; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7850; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
7851; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7852; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
7853; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
7854; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
7855; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
7856; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
7857; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
7858; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
7859; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
7860; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7861; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
7862; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
7863; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
7864; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7865; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7866; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7867; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7868; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
7869; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
7870; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
7871; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
7872; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
7873; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7874; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7875; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
7876; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
7877; GFX6-NEXT:    s_mov_b32 s4, s0
7878; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7879; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7880; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
7881; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7882; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
7883; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
7884; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
7885; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
7886; GFX6-NEXT:    v_mov_b32_e32 v5, s11
7887; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7888; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
7889; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
7890; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
7891; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
7892; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
7893; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s10, v3
7894; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
7895; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
7896; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7897; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v5
7898; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
7899; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
7900; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
7901; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v0
7902; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
7903; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
7904; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
7905; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7906; GFX6-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
7907; GFX6-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
7908; GFX6-NEXT:    v_mov_b32_e32 v6, s3
7909; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
7910; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
7911; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7912; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
7913; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7914; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
7915; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
7916; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
7917; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
7918; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
7919; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
7920; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
7921; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
7922; GFX6-NEXT:    v_mov_b32_e32 v2, s1
7923; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
7924; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
7925; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7926; GFX6-NEXT:    s_endpgm
7927;
7928; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
7929; GFX9:       ; %bb.0:
7930; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x34
7931; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
7932; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7933; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
7934; GFX9-NEXT:    s_ashr_i32 s2, s1, 31
7935; GFX9-NEXT:    s_add_u32 s0, s0, s2
7936; GFX9-NEXT:    s_mov_b32 s3, s2
7937; GFX9-NEXT:    s_addc_u32 s1, s1, s2
7938; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
7939; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
7940; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
7941; GFX9-NEXT:    s_sub_u32 s0, 0, s6
7942; GFX9-NEXT:    s_subb_u32 s1, 0, s7
7943; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
7944; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
7945; GFX9-NEXT:    v_mov_b32_e32 v0, 0
7946; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
7947; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
7948; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
7949; GFX9-NEXT:    v_madmk_f32 v1, v2, 0xcf800000, v1
7950; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
7951; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7952; GFX9-NEXT:    v_readfirstlane_b32 s4, v2
7953; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
7954; GFX9-NEXT:    s_mul_i32 s12, s0, s4
7955; GFX9-NEXT:    s_mul_hi_u32 s14, s0, s5
7956; GFX9-NEXT:    s_mul_i32 s13, s1, s5
7957; GFX9-NEXT:    s_add_i32 s12, s14, s12
7958; GFX9-NEXT:    s_mul_i32 s15, s0, s5
7959; GFX9-NEXT:    s_add_i32 s12, s12, s13
7960; GFX9-NEXT:    s_mul_hi_u32 s14, s5, s15
7961; GFX9-NEXT:    s_mul_hi_u32 s13, s5, s12
7962; GFX9-NEXT:    s_mul_i32 s5, s5, s12
7963; GFX9-NEXT:    s_add_u32 s5, s14, s5
7964; GFX9-NEXT:    s_addc_u32 s13, 0, s13
7965; GFX9-NEXT:    s_mul_hi_u32 s16, s4, s15
7966; GFX9-NEXT:    s_mul_i32 s15, s4, s15
7967; GFX9-NEXT:    s_add_u32 s5, s5, s15
7968; GFX9-NEXT:    s_mul_hi_u32 s14, s4, s12
7969; GFX9-NEXT:    s_addc_u32 s5, s13, s16
7970; GFX9-NEXT:    s_addc_u32 s13, s14, 0
7971; GFX9-NEXT:    s_mul_i32 s12, s4, s12
7972; GFX9-NEXT:    s_add_u32 s5, s5, s12
7973; GFX9-NEXT:    s_addc_u32 s12, 0, s13
7974; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s5, v1
7975; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
7976; GFX9-NEXT:    s_addc_u32 s4, s4, s12
7977; GFX9-NEXT:    v_readfirstlane_b32 s12, v1
7978; GFX9-NEXT:    s_mul_i32 s5, s0, s4
7979; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s12
7980; GFX9-NEXT:    s_add_i32 s5, s13, s5
7981; GFX9-NEXT:    s_mul_i32 s1, s1, s12
7982; GFX9-NEXT:    s_add_i32 s5, s5, s1
7983; GFX9-NEXT:    s_mul_i32 s0, s0, s12
7984; GFX9-NEXT:    s_mul_hi_u32 s13, s4, s0
7985; GFX9-NEXT:    s_mul_i32 s14, s4, s0
7986; GFX9-NEXT:    s_mul_i32 s16, s12, s5
7987; GFX9-NEXT:    s_mul_hi_u32 s0, s12, s0
7988; GFX9-NEXT:    s_mul_hi_u32 s15, s12, s5
7989; GFX9-NEXT:    s_add_u32 s0, s0, s16
7990; GFX9-NEXT:    s_addc_u32 s12, 0, s15
7991; GFX9-NEXT:    s_add_u32 s0, s0, s14
7992; GFX9-NEXT:    s_mul_hi_u32 s1, s4, s5
7993; GFX9-NEXT:    s_addc_u32 s0, s12, s13
7994; GFX9-NEXT:    s_addc_u32 s1, s1, 0
7995; GFX9-NEXT:    s_mul_i32 s5, s4, s5
7996; GFX9-NEXT:    s_add_u32 s0, s0, s5
7997; GFX9-NEXT:    s_addc_u32 s1, 0, s1
7998; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
7999; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8000; GFX9-NEXT:    s_addc_u32 s12, s4, s1
8001; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
8002; GFX9-NEXT:    s_add_u32 s0, s10, s4
8003; GFX9-NEXT:    s_mov_b32 s5, s4
8004; GFX9-NEXT:    s_addc_u32 s1, s11, s4
8005; GFX9-NEXT:    s_xor_b64 s[10:11], s[0:1], s[4:5]
8006; GFX9-NEXT:    v_readfirstlane_b32 s13, v1
8007; GFX9-NEXT:    s_mul_i32 s1, s10, s12
8008; GFX9-NEXT:    s_mul_hi_u32 s14, s10, s13
8009; GFX9-NEXT:    s_mul_hi_u32 s0, s10, s12
8010; GFX9-NEXT:    s_add_u32 s1, s14, s1
8011; GFX9-NEXT:    s_addc_u32 s0, 0, s0
8012; GFX9-NEXT:    s_mul_hi_u32 s15, s11, s13
8013; GFX9-NEXT:    s_mul_i32 s13, s11, s13
8014; GFX9-NEXT:    s_add_u32 s1, s1, s13
8015; GFX9-NEXT:    s_mul_hi_u32 s14, s11, s12
8016; GFX9-NEXT:    s_addc_u32 s0, s0, s15
8017; GFX9-NEXT:    s_addc_u32 s1, s14, 0
8018; GFX9-NEXT:    s_mul_i32 s12, s11, s12
8019; GFX9-NEXT:    s_add_u32 s12, s0, s12
8020; GFX9-NEXT:    s_addc_u32 s13, 0, s1
8021; GFX9-NEXT:    s_mul_i32 s0, s6, s13
8022; GFX9-NEXT:    s_mul_hi_u32 s1, s6, s12
8023; GFX9-NEXT:    s_add_i32 s0, s1, s0
8024; GFX9-NEXT:    s_mul_i32 s1, s7, s12
8025; GFX9-NEXT:    s_add_i32 s14, s0, s1
8026; GFX9-NEXT:    s_mul_i32 s1, s6, s12
8027; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8028; GFX9-NEXT:    s_sub_i32 s0, s11, s14
8029; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s10, v1
8030; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8031; GFX9-NEXT:    s_subb_u32 s10, s0, s7
8032; GFX9-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s6, v1
8033; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
8034; GFX9-NEXT:    s_subb_u32 s10, s10, 0
8035; GFX9-NEXT:    s_cmp_ge_u32 s10, s7
8036; GFX9-NEXT:    s_cselect_b32 s15, -1, 0
8037; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v2
8038; GFX9-NEXT:    s_cmp_eq_u32 s10, s7
8039; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
8040; GFX9-NEXT:    v_mov_b32_e32 v3, s15
8041; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
8042; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
8043; GFX9-NEXT:    s_add_u32 s0, s12, 1
8044; GFX9-NEXT:    s_addc_u32 s10, s13, 0
8045; GFX9-NEXT:    s_add_u32 s1, s12, 2
8046; GFX9-NEXT:    s_addc_u32 s15, s13, 0
8047; GFX9-NEXT:    v_mov_b32_e32 v3, s0
8048; GFX9-NEXT:    v_mov_b32_e32 v4, s1
8049; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
8050; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
8051; GFX9-NEXT:    v_mov_b32_e32 v3, s10
8052; GFX9-NEXT:    v_mov_b32_e32 v4, s15
8053; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8054; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
8055; GFX9-NEXT:    s_subb_u32 s0, s11, s14
8056; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
8057; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
8058; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
8059; GFX9-NEXT:    s_cmp_eq_u32 s0, s7
8060; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
8061; GFX9-NEXT:    v_mov_b32_e32 v4, s1
8062; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
8063; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
8064; GFX9-NEXT:    v_mov_b32_e32 v4, s13
8065; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
8066; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
8067; GFX9-NEXT:    v_mov_b32_e32 v3, s12
8068; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
8069; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[2:3]
8070; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
8071; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v1
8072; GFX9-NEXT:    v_mov_b32_e32 v4, s1
8073; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s0, v2
8074; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v4, vcc
8075; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[8:9]
8076; GFX9-NEXT:    s_endpgm
8077  %shl.y = shl i64 4096, %y
8078  %r = sdiv i64 %x, %shl.y
8079  store i64 %r, ptr addrspace(1) %out
8080  ret void
8081}
8082
; Test: <2 x i64> sdiv by a constant splat of 4096. The IR-level checks expect
; the vector sdiv to be split into two scalar i64 sdiv-by-4096 operations that
; are reassembled with insertelement. The GFX6/GFX9 checks expect each element
; to use the shift-based form: sign word (ashr 31) logically shifted right by
; 20 as a bias, 64-bit add of the bias (add + addc), then a 64-bit arithmetic
; shift right by 12.
8083define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
8084; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
8085; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8086; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8087; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
8088; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8089; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
8090; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8091; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
8092; CHECK-NEXT:    ret void
8093;
8094; GFX6-LABEL: sdiv_v2i64_pow2k_denom:
8095; GFX6:       ; %bb.0:
8096; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
8097; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
8098; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8099; GFX6-NEXT:    s_mov_b32 s6, -1
8100; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8101; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
8102; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8103; GFX6-NEXT:    s_add_u32 s0, s0, s8
8104; GFX6-NEXT:    s_addc_u32 s1, s1, 0
8105; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
8106; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8107; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8108; GFX6-NEXT:    s_add_u32 s2, s2, s8
8109; GFX6-NEXT:    s_addc_u32 s3, s3, 0
8110; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8111; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8112; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8113; GFX6-NEXT:    v_mov_b32_e32 v2, s2
8114; GFX6-NEXT:    v_mov_b32_e32 v3, s3
8115; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8116; GFX6-NEXT:    s_endpgm
8117;
8118; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
8119; GFX9:       ; %bb.0:
8120; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
8121; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
8122; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8123; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8124; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
8125; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8126; GFX9-NEXT:    s_add_u32 s0, s0, s4
8127; GFX9-NEXT:    s_addc_u32 s1, s1, 0
8128; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
8129; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8130; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8131; GFX9-NEXT:    s_add_u32 s2, s2, s4
8132; GFX9-NEXT:    s_addc_u32 s3, s3, 0
8133; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8134; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8135; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8136; GFX9-NEXT:    v_mov_b32_e32 v2, s2
8137; GFX9-NEXT:    v_mov_b32_e32 v3, s3
8138; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
8139; GFX9-NEXT:    s_endpgm
8140  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
8141  store <2 x i64> %r, ptr addrspace(1) %out
8142  ret void
8143}
8144
; Test: <2 x i64> sdiv by the mixed constant vector <4096, 4095>, i.e. one
; power-of-two lane and one non-power-of-two lane. The IR-level checks expect
; the vector sdiv to be split into scalar sdiv-by-4096 and sdiv-by-4095
; operations. The GFX6/GFX9 checks expect element 0 to use the shift-based
; form (sign-derived bias, 64-bit add, ashr by 12) and element 1 to use a
; multiply-based expansion against the constants 0x8008009 and 0x80080080,
; including a subtract and re-add of the numerator, followed by an arithmetic
; shift right by 11 and an add of the sign bit (lshr by 31).
; Review note - the "ssdiv" prefix in the function name looks like a typo for
; "sdiv" (sibling tests use "sdiv_"); left as is because the name is the test
; label.
8145define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
8146; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
8147; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8148; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8149; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
8150; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8151; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
8152; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8153; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
8154; CHECK-NEXT:    ret void
8155;
8156; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8157; GFX6:       ; %bb.0:
8158; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
8159; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
8160; GFX6-NEXT:    v_mov_b32_e32 v2, 0x8008009
8161; GFX6-NEXT:    v_mov_b32_e32 v0, 0x80080080
8162; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8163; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8164; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v2
8165; GFX6-NEXT:    v_mul_hi_u32 v4, s11, v2
8166; GFX6-NEXT:    v_mul_hi_u32 v1, s10, v0
8167; GFX6-NEXT:    s_mul_i32 s7, s11, 0x8008009
8168; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s7, v3
8169; GFX6-NEXT:    s_mul_i32 s6, s10, 0x80080080
8170; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
8171; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s6, v3
8172; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
8173; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8174; GFX6-NEXT:    s_lshr_b32 s4, s4, 20
8175; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
8176; GFX6-NEXT:    s_add_u32 s4, s8, s4
8177; GFX6-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
8178; GFX6-NEXT:    s_addc_u32 s5, s9, 0
8179; GFX6-NEXT:    s_ashr_i32 s7, s11, 31
8180; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
8181; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
8182; GFX6-NEXT:    s_mul_i32 s6, s11, 0x80080080
8183; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
8184; GFX6-NEXT:    s_mul_i32 s6, s7, 0x80080080
8185; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v0, v3, vcc
8186; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
8187; GFX6-NEXT:    s_mul_i32 s6, s7, 0x8008009
8188; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
8189; GFX6-NEXT:    v_mov_b32_e32 v3, s6
8190; GFX6-NEXT:    v_mov_b32_e32 v4, s11
8191; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v3
8192; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v4, vcc
8193; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
8194; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v0, v2, vcc
8195; GFX6-NEXT:    v_mov_b32_e32 v3, s11
8196; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s10, v1
8197; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
8198; GFX6-NEXT:    v_ashr_i64 v[2:3], v[0:1], 11
8199; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
8200; GFX6-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
8201; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v0
8202; GFX6-NEXT:    s_mov_b32 s2, -1
8203; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
8204; GFX6-NEXT:    v_mov_b32_e32 v0, s4
8205; GFX6-NEXT:    v_mov_b32_e32 v1, s5
8206; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
8207; GFX6-NEXT:    s_endpgm
8208;
8209; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8210; GFX9:       ; %bb.0:
8211; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
8212; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
8213; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8215; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
8216; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8217; GFX9-NEXT:    s_add_u32 s0, s0, s4
8218; GFX9-NEXT:    s_addc_u32 s1, s1, 0
8219; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8220; GFX9-NEXT:    s_mul_i32 s9, s3, 0x8008009
8221; GFX9-NEXT:    s_mul_hi_u32 s10, s2, 0x8008009
8222; GFX9-NEXT:    s_mul_hi_u32 s4, s3, 0x8008009
8223; GFX9-NEXT:    s_add_u32 s9, s9, s10
8224; GFX9-NEXT:    s_mul_i32 s8, s2, 0x80080080
8225; GFX9-NEXT:    s_addc_u32 s4, s4, 0
8226; GFX9-NEXT:    s_mul_hi_u32 s5, s2, 0x80080080
8227; GFX9-NEXT:    s_add_u32 s8, s8, s9
8228; GFX9-NEXT:    s_addc_u32 s5, s5, 0
8229; GFX9-NEXT:    s_add_u32 s4, s4, s5
8230; GFX9-NEXT:    s_addc_u32 s5, 0, 0
8231; GFX9-NEXT:    s_mul_i32 s9, s3, 0x80080080
8232; GFX9-NEXT:    s_mul_hi_u32 s8, s3, 0x80080080
8233; GFX9-NEXT:    s_add_u32 s4, s9, s4
8234; GFX9-NEXT:    s_addc_u32 s5, s8, s5
8235; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
8236; GFX9-NEXT:    s_mul_i32 s9, s8, 0x80080080
8237; GFX9-NEXT:    s_mul_hi_u32 s10, s8, 0x8008009
8238; GFX9-NEXT:    s_add_i32 s9, s10, s9
8239; GFX9-NEXT:    s_mul_i32 s8, s8, 0x8008009
8240; GFX9-NEXT:    s_add_i32 s9, s9, s8
8241; GFX9-NEXT:    s_sub_u32 s8, s8, s2
8242; GFX9-NEXT:    s_subb_u32 s9, s9, s3
8243; GFX9-NEXT:    s_add_u32 s4, s4, s8
8244; GFX9-NEXT:    s_addc_u32 s5, s5, s9
8245; GFX9-NEXT:    s_add_u32 s2, s4, s2
8246; GFX9-NEXT:    s_addc_u32 s3, s5, s3
8247; GFX9-NEXT:    s_ashr_i64 s[4:5], s[2:3], 11
8248; GFX9-NEXT:    s_lshr_b32 s2, s3, 31
8249; GFX9-NEXT:    s_add_u32 s2, s4, s2
8250; GFX9-NEXT:    s_addc_u32 s3, s5, 0
8251; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8252; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8253; GFX9-NEXT:    v_mov_b32_e32 v2, s2
8254; GFX9-NEXT:    v_mov_b32_e32 v3, s3
8255; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
8256; GFX9-NEXT:    s_endpgm
8257  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
8258  store <2 x i64> %r, ptr addrspace(1) %out
8259  ret void
8260}
8261
; Signed division of <2 x i64> %x by a non-constant divisor, where each lane
; of the divisor is 4096 shifted left by the matching lane of %y.
; The opt assertions (default prefix) expect the vector sdiv to be split into
; two scalar sdiv i64 operations whose results are re-inserted into a
; <2 x i64> before the store.
; The llc assertions give the expected ISA for -mcpu=tahiti and -mcpu=gfx900.
; All assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8262define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
8263; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
8264; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]]
8265; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8266; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
8267; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
8268; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
8269; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
8270; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
8271; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
8272; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
8273; CHECK-NEXT:    store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
8274; CHECK-NEXT:    ret void
8275;
8276; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
8277; GFX6:       ; %bb.0:
8278; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
8279; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
8280; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8281; GFX6-NEXT:    s_mov_b32 s6, -1
8282; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8283; GFX6-NEXT:    s_lshl_b64 s[0:1], 0x1000, s12
8284; GFX6-NEXT:    s_lshl_b64 s[14:15], 0x1000, s14
8285; GFX6-NEXT:    s_ashr_i32 s12, s1, 31
8286; GFX6-NEXT:    s_add_u32 s0, s0, s12
8287; GFX6-NEXT:    s_mov_b32 s13, s12
8288; GFX6-NEXT:    s_addc_u32 s1, s1, s12
8289; GFX6-NEXT:    s_xor_b64 s[2:3], s[0:1], s[12:13]
8290; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
8291; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
8292; GFX6-NEXT:    s_sub_u32 s0, 0, s2
8293; GFX6-NEXT:    s_subb_u32 s1, 0, s3
8294; GFX6-NEXT:    s_ashr_i32 s16, s9, 31
8295; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8296; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8297; GFX6-NEXT:    s_mov_b32 s17, s16
8298; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8299; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8300; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8301; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8302; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8303; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8304; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
8305; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
8306; GFX6-NEXT:    v_mul_lo_u32 v5, s1, v0
8307; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v0
8308; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8309; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
8310; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
8311; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
8312; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
8313; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v4
8314; GFX6-NEXT:    v_mul_lo_u32 v4, v1, v4
8315; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
8316; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
8317; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
8318; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8319; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
8320; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
8321; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
8322; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8323; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8324; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8325; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8326; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
8327; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
8328; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
8329; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8330; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v0
8331; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8332; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
8333; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
8334; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
8335; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
8336; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
8337; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
8338; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
8339; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
8340; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8341; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
8342; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
8343; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
8344; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8345; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8346; GFX6-NEXT:    s_add_u32 s0, s8, s16
8347; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8348; GFX6-NEXT:    s_addc_u32 s1, s9, s16
8349; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8350; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[16:17]
8351; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v1
8352; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v0
8353; GFX6-NEXT:    v_mul_hi_u32 v4, s8, v1
8354; GFX6-NEXT:    v_mul_hi_u32 v5, s9, v1
8355; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v1
8356; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8357; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8358; GFX6-NEXT:    v_mul_lo_u32 v4, s9, v0
8359; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
8360; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8361; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8362; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
8363; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8364; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
8365; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
8366; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
8367; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
8368; GFX6-NEXT:    v_mov_b32_e32 v5, s3
8369; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8370; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
8371; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
8372; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s9, v2
8373; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
8374; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
8375; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
8376; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
8377; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
8378; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8379; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
8380; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8381; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
8382; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
8383; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v0
8384; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
8385; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
8386; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
8387; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8388; GFX6-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
8389; GFX6-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
8390; GFX6-NEXT:    s_xor_b64 s[0:1], s[16:17], s[12:13]
8391; GFX6-NEXT:    s_ashr_i32 s8, s15, 31
8392; GFX6-NEXT:    s_add_u32 s12, s14, s8
8393; GFX6-NEXT:    v_mov_b32_e32 v6, s9
8394; GFX6-NEXT:    s_mov_b32 s9, s8
8395; GFX6-NEXT:    s_addc_u32 s13, s15, s8
8396; GFX6-NEXT:    s_xor_b64 s[12:13], s[12:13], s[8:9]
8397; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
8398; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s12
8399; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s13
8400; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
8401; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
8402; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
8403; GFX6-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
8404; GFX6-NEXT:    v_rcp_f32_e32 v6, v6
8405; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
8406; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
8407; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
8408; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
8409; GFX6-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v6
8410; GFX6-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
8411; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
8412; GFX6-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
8413; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
8414; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
8415; GFX6-NEXT:    s_sub_u32 s2, 0, s12
8416; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
8417; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
8418; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v2
8419; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v3
8420; GFX6-NEXT:    s_subb_u32 s3, 0, s13
8421; GFX6-NEXT:    v_mul_lo_u32 v6, s3, v2
8422; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
8423; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8424; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
8425; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
8426; GFX6-NEXT:    v_mul_lo_u32 v6, v2, v4
8427; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v5
8428; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v4
8429; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v4
8430; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
8431; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
8432; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
8433; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v5
8434; GFX6-NEXT:    v_mul_hi_u32 v5, v3, v5
8435; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
8436; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
8437; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
8438; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
8439; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8440; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8441; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8442; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
8443; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v3
8444; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v2
8445; GFX6-NEXT:    v_mul_lo_u32 v6, s3, v2
8446; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
8447; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
8448; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
8449; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
8450; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
8451; GFX6-NEXT:    v_mul_hi_u32 v10, v2, v4
8452; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
8453; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v5
8454; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v4
8455; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
8456; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
8457; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
8458; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
8459; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
8460; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
8461; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8462; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
8463; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8464; GFX6-NEXT:    s_add_u32 s10, s10, s2
8465; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8466; GFX6-NEXT:    s_mov_b32 s3, s2
8467; GFX6-NEXT:    s_addc_u32 s11, s11, s2
8468; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
8469; GFX6-NEXT:    s_xor_b64 s[10:11], s[10:11], s[2:3]
8470; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v3
8471; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v2
8472; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v3
8473; GFX6-NEXT:    v_mul_hi_u32 v8, s11, v3
8474; GFX6-NEXT:    v_mul_lo_u32 v3, s11, v3
8475; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8476; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
8477; GFX6-NEXT:    v_mul_lo_u32 v7, s11, v2
8478; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
8479; GFX6-NEXT:    v_mov_b32_e32 v6, s1
8480; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
8481; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
8482; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
8483; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8484; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8485; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v3
8486; GFX6-NEXT:    v_mul_hi_u32 v5, s12, v2
8487; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
8488; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
8489; GFX6-NEXT:    v_mul_lo_u32 v6, s13, v2
8490; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
8491; GFX6-NEXT:    v_mul_lo_u32 v5, s12, v2
8492; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
8493; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s11, v4
8494; GFX6-NEXT:    v_mov_b32_e32 v7, s13
8495; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s10, v5
8496; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
8497; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s12, v5
8498; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
8499; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
8500; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
8501; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
8502; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
8503; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
8504; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
8505; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v2
8506; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
8507; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 2, v2
8508; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
8509; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
8510; GFX6-NEXT:    v_cndmask_b32_e64 v6, v7, v9, s[0:1]
8511; GFX6-NEXT:    v_cndmask_b32_e64 v7, v8, v10, s[0:1]
8512; GFX6-NEXT:    v_mov_b32_e32 v8, s11
8513; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
8514; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
8515; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
8516; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
8517; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
8518; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
8519; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
8520; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
8521; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
8522; GFX6-NEXT:    s_xor_b64 s[0:1], s[2:3], s[8:9]
8523; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
8524; GFX6-NEXT:    v_xor_b32_e32 v2, s0, v2
8525; GFX6-NEXT:    v_xor_b32_e32 v3, s1, v3
8526; GFX6-NEXT:    v_mov_b32_e32 v4, s1
8527; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
8528; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
8529; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8530; GFX6-NEXT:    s_endpgm
8531;
8532; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
8533; GFX9:       ; %bb.0:
8534; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
8535; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
8536; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8537; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8538; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s12
8539; GFX9-NEXT:    s_lshl_b64 s[6:7], 0x1000, s14
8540; GFX9-NEXT:    s_ashr_i32 s12, s1, 31
8541; GFX9-NEXT:    s_add_u32 s0, s0, s12
8542; GFX9-NEXT:    s_mov_b32 s13, s12
8543; GFX9-NEXT:    s_addc_u32 s1, s1, s12
8544; GFX9-NEXT:    s_xor_b64 s[14:15], s[0:1], s[12:13]
8545; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
8546; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
8547; GFX9-NEXT:    s_sub_u32 s0, 0, s14
8548; GFX9-NEXT:    s_subb_u32 s1, 0, s15
8549; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8550; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8551; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8552; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8553; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8554; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8555; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8556; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8557; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
8558; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
8559; GFX9-NEXT:    s_mul_i32 s16, s0, s4
8560; GFX9-NEXT:    s_mul_hi_u32 s18, s0, s5
8561; GFX9-NEXT:    s_mul_i32 s17, s1, s5
8562; GFX9-NEXT:    s_add_i32 s16, s18, s16
8563; GFX9-NEXT:    s_mul_i32 s19, s0, s5
8564; GFX9-NEXT:    s_add_i32 s16, s16, s17
8565; GFX9-NEXT:    s_mul_hi_u32 s17, s5, s16
8566; GFX9-NEXT:    s_mul_i32 s18, s5, s16
8567; GFX9-NEXT:    s_mul_hi_u32 s5, s5, s19
8568; GFX9-NEXT:    s_add_u32 s5, s5, s18
8569; GFX9-NEXT:    s_addc_u32 s17, 0, s17
8570; GFX9-NEXT:    s_mul_hi_u32 s20, s4, s19
8571; GFX9-NEXT:    s_mul_i32 s19, s4, s19
8572; GFX9-NEXT:    s_add_u32 s5, s5, s19
8573; GFX9-NEXT:    s_mul_hi_u32 s18, s4, s16
8574; GFX9-NEXT:    s_addc_u32 s5, s17, s20
8575; GFX9-NEXT:    s_addc_u32 s17, s18, 0
8576; GFX9-NEXT:    s_mul_i32 s16, s4, s16
8577; GFX9-NEXT:    s_add_u32 s5, s5, s16
8578; GFX9-NEXT:    s_addc_u32 s16, 0, s17
8579; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
8580; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8581; GFX9-NEXT:    s_addc_u32 s4, s4, s16
8582; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
8583; GFX9-NEXT:    s_mul_i32 s5, s0, s4
8584; GFX9-NEXT:    s_mul_hi_u32 s17, s0, s16
8585; GFX9-NEXT:    s_add_i32 s5, s17, s5
8586; GFX9-NEXT:    s_mul_i32 s1, s1, s16
8587; GFX9-NEXT:    s_add_i32 s5, s5, s1
8588; GFX9-NEXT:    s_mul_i32 s0, s0, s16
8589; GFX9-NEXT:    s_mul_hi_u32 s17, s4, s0
8590; GFX9-NEXT:    s_mul_i32 s18, s4, s0
8591; GFX9-NEXT:    s_mul_i32 s20, s16, s5
8592; GFX9-NEXT:    s_mul_hi_u32 s0, s16, s0
8593; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s5
8594; GFX9-NEXT:    s_add_u32 s0, s0, s20
8595; GFX9-NEXT:    s_addc_u32 s16, 0, s19
8596; GFX9-NEXT:    s_add_u32 s0, s0, s18
8597; GFX9-NEXT:    s_mul_hi_u32 s1, s4, s5
8598; GFX9-NEXT:    s_addc_u32 s0, s16, s17
8599; GFX9-NEXT:    s_addc_u32 s1, s1, 0
8600; GFX9-NEXT:    s_mul_i32 s5, s4, s5
8601; GFX9-NEXT:    s_add_u32 s0, s0, s5
8602; GFX9-NEXT:    s_addc_u32 s1, 0, s1
8603; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
8604; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8605; GFX9-NEXT:    s_addc_u32 s16, s4, s1
8606; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
8607; GFX9-NEXT:    s_add_u32 s0, s8, s4
8608; GFX9-NEXT:    s_mov_b32 s5, s4
8609; GFX9-NEXT:    s_addc_u32 s1, s9, s4
8610; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[4:5]
8611; GFX9-NEXT:    v_readfirstlane_b32 s17, v0
8612; GFX9-NEXT:    s_mul_i32 s1, s8, s16
8613; GFX9-NEXT:    s_mul_hi_u32 s18, s8, s17
8614; GFX9-NEXT:    s_mul_hi_u32 s0, s8, s16
8615; GFX9-NEXT:    s_add_u32 s1, s18, s1
8616; GFX9-NEXT:    s_addc_u32 s0, 0, s0
8617; GFX9-NEXT:    s_mul_hi_u32 s19, s9, s17
8618; GFX9-NEXT:    s_mul_i32 s17, s9, s17
8619; GFX9-NEXT:    s_add_u32 s1, s1, s17
8620; GFX9-NEXT:    s_mul_hi_u32 s18, s9, s16
8621; GFX9-NEXT:    s_addc_u32 s0, s0, s19
8622; GFX9-NEXT:    s_addc_u32 s1, s18, 0
8623; GFX9-NEXT:    s_mul_i32 s16, s9, s16
8624; GFX9-NEXT:    s_add_u32 s16, s0, s16
8625; GFX9-NEXT:    s_addc_u32 s17, 0, s1
8626; GFX9-NEXT:    s_mul_i32 s0, s14, s17
8627; GFX9-NEXT:    s_mul_hi_u32 s1, s14, s16
8628; GFX9-NEXT:    s_add_i32 s0, s1, s0
8629; GFX9-NEXT:    s_mul_i32 s1, s15, s16
8630; GFX9-NEXT:    s_add_i32 s18, s0, s1
8631; GFX9-NEXT:    s_mul_i32 s1, s14, s16
8632; GFX9-NEXT:    v_mov_b32_e32 v0, s1
8633; GFX9-NEXT:    s_sub_i32 s0, s9, s18
8634; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s8, v0
8635; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8636; GFX9-NEXT:    s_subb_u32 s8, s0, s15
8637; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s14, v0
8638; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
8639; GFX9-NEXT:    s_subb_u32 s8, s8, 0
8640; GFX9-NEXT:    s_cmp_ge_u32 s8, s15
8641; GFX9-NEXT:    s_cselect_b32 s19, -1, 0
8642; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v1
8643; GFX9-NEXT:    s_cmp_eq_u32 s8, s15
8644; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
8645; GFX9-NEXT:    v_mov_b32_e32 v2, s19
8646; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
8647; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
8648; GFX9-NEXT:    s_add_u32 s0, s16, 1
8649; GFX9-NEXT:    s_addc_u32 s8, s17, 0
8650; GFX9-NEXT:    s_add_u32 s1, s16, 2
8651; GFX9-NEXT:    s_addc_u32 s19, s17, 0
8652; GFX9-NEXT:    v_mov_b32_e32 v2, s0
8653; GFX9-NEXT:    v_mov_b32_e32 v3, s1
8654; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
8655; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[0:1]
8656; GFX9-NEXT:    v_mov_b32_e32 v2, s8
8657; GFX9-NEXT:    v_mov_b32_e32 v3, s19
8658; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8659; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
8660; GFX9-NEXT:    s_subb_u32 s0, s9, s18
8661; GFX9-NEXT:    s_cmp_ge_u32 s0, s15
8662; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
8663; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v0
8664; GFX9-NEXT:    s_cmp_eq_u32 s0, s15
8665; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
8666; GFX9-NEXT:    v_mov_b32_e32 v3, s1
8667; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
8668; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[12:13]
8669; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
8670; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
8671; GFX9-NEXT:    s_add_u32 s6, s6, s4
8672; GFX9-NEXT:    v_mov_b32_e32 v3, s17
8673; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
8674; GFX9-NEXT:    s_mov_b32 s5, s4
8675; GFX9-NEXT:    s_addc_u32 s7, s7, s4
8676; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
8677; GFX9-NEXT:    v_mov_b32_e32 v2, s16
8678; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
8679; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
8680; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
8681; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s7
8682; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
8683; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v0
8684; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v1
8685; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
8686; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
8687; GFX9-NEXT:    s_sub_u32 s0, 0, s6
8688; GFX9-NEXT:    v_mov_b32_e32 v6, s1
8689; GFX9-NEXT:    s_subb_u32 s1, 0, s7
8690; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
8691; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
8692; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
8693; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
8694; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
8695; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
8696; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v6, vcc
8697; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
8698; GFX9-NEXT:    v_readfirstlane_b32 s13, v3
8699; GFX9-NEXT:    s_mul_hi_u32 s12, s0, s8
8700; GFX9-NEXT:    s_mul_i32 s14, s0, s13
8701; GFX9-NEXT:    s_mul_i32 s9, s1, s8
8702; GFX9-NEXT:    s_add_i32 s12, s12, s14
8703; GFX9-NEXT:    s_add_i32 s12, s12, s9
8704; GFX9-NEXT:    s_mul_i32 s15, s0, s8
8705; GFX9-NEXT:    s_mul_hi_u32 s9, s8, s12
8706; GFX9-NEXT:    s_mul_i32 s14, s8, s12
8707; GFX9-NEXT:    s_mul_hi_u32 s8, s8, s15
8708; GFX9-NEXT:    s_add_u32 s8, s8, s14
8709; GFX9-NEXT:    s_addc_u32 s9, 0, s9
8710; GFX9-NEXT:    s_mul_hi_u32 s16, s13, s15
8711; GFX9-NEXT:    s_mul_i32 s15, s13, s15
8712; GFX9-NEXT:    s_add_u32 s8, s8, s15
8713; GFX9-NEXT:    s_mul_hi_u32 s14, s13, s12
8714; GFX9-NEXT:    s_addc_u32 s8, s9, s16
8715; GFX9-NEXT:    s_addc_u32 s9, s14, 0
8716; GFX9-NEXT:    s_mul_i32 s12, s13, s12
8717; GFX9-NEXT:    s_add_u32 s8, s8, s12
8718; GFX9-NEXT:    s_addc_u32 s9, 0, s9
8719; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
8720; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8721; GFX9-NEXT:    s_addc_u32 s8, s13, s9
8722; GFX9-NEXT:    v_readfirstlane_b32 s12, v2
8723; GFX9-NEXT:    s_mul_i32 s9, s0, s8
8724; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s12
8725; GFX9-NEXT:    s_add_i32 s9, s13, s9
8726; GFX9-NEXT:    s_mul_i32 s1, s1, s12
8727; GFX9-NEXT:    s_add_i32 s9, s9, s1
8728; GFX9-NEXT:    s_mul_i32 s0, s0, s12
8729; GFX9-NEXT:    s_mul_hi_u32 s13, s8, s0
8730; GFX9-NEXT:    s_mul_i32 s14, s8, s0
8731; GFX9-NEXT:    s_mul_i32 s16, s12, s9
8732; GFX9-NEXT:    s_mul_hi_u32 s0, s12, s0
8733; GFX9-NEXT:    s_mul_hi_u32 s15, s12, s9
8734; GFX9-NEXT:    s_add_u32 s0, s0, s16
8735; GFX9-NEXT:    s_addc_u32 s12, 0, s15
8736; GFX9-NEXT:    s_add_u32 s0, s0, s14
8737; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s9
8738; GFX9-NEXT:    s_addc_u32 s0, s12, s13
8739; GFX9-NEXT:    s_addc_u32 s1, s1, 0
8740; GFX9-NEXT:    s_mul_i32 s9, s8, s9
8741; GFX9-NEXT:    s_add_u32 s0, s0, s9
8742; GFX9-NEXT:    s_addc_u32 s1, 0, s1
8743; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
8744; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8745; GFX9-NEXT:    s_addc_u32 s12, s8, s1
8746; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
8747; GFX9-NEXT:    s_add_u32 s0, s10, s8
8748; GFX9-NEXT:    s_mov_b32 s9, s8
8749; GFX9-NEXT:    s_addc_u32 s1, s11, s8
8750; GFX9-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
8751; GFX9-NEXT:    v_readfirstlane_b32 s13, v2
8752; GFX9-NEXT:    s_mul_i32 s1, s10, s12
8753; GFX9-NEXT:    s_mul_hi_u32 s14, s10, s13
8754; GFX9-NEXT:    s_mul_hi_u32 s0, s10, s12
8755; GFX9-NEXT:    s_add_u32 s1, s14, s1
8756; GFX9-NEXT:    s_addc_u32 s0, 0, s0
8757; GFX9-NEXT:    s_mul_hi_u32 s15, s11, s13
8758; GFX9-NEXT:    s_mul_i32 s13, s11, s13
8759; GFX9-NEXT:    s_add_u32 s1, s1, s13
8760; GFX9-NEXT:    s_mul_hi_u32 s14, s11, s12
8761; GFX9-NEXT:    s_addc_u32 s0, s0, s15
8762; GFX9-NEXT:    s_addc_u32 s1, s14, 0
8763; GFX9-NEXT:    s_mul_i32 s12, s11, s12
8764; GFX9-NEXT:    s_add_u32 s12, s0, s12
8765; GFX9-NEXT:    s_addc_u32 s13, 0, s1
8766; GFX9-NEXT:    s_mul_i32 s0, s6, s13
8767; GFX9-NEXT:    s_mul_hi_u32 s1, s6, s12
8768; GFX9-NEXT:    s_add_i32 s0, s1, s0
8769; GFX9-NEXT:    s_mul_i32 s1, s7, s12
8770; GFX9-NEXT:    s_add_i32 s14, s0, s1
8771; GFX9-NEXT:    s_mul_i32 s1, s6, s12
8772; GFX9-NEXT:    v_mov_b32_e32 v2, s1
8773; GFX9-NEXT:    s_sub_i32 s0, s11, s14
8774; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s10, v2
8775; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8776; GFX9-NEXT:    s_subb_u32 s10, s0, s7
8777; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s6, v2
8778; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
8779; GFX9-NEXT:    s_subb_u32 s10, s10, 0
8780; GFX9-NEXT:    s_cmp_ge_u32 s10, s7
8781; GFX9-NEXT:    s_cselect_b32 s15, -1, 0
8782; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v3
8783; GFX9-NEXT:    s_cmp_eq_u32 s10, s7
8784; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
8785; GFX9-NEXT:    v_mov_b32_e32 v5, s15
8786; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
8787; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
8788; GFX9-NEXT:    s_add_u32 s0, s12, 1
8789; GFX9-NEXT:    s_addc_u32 s10, s13, 0
8790; GFX9-NEXT:    s_add_u32 s1, s12, 2
8791; GFX9-NEXT:    s_addc_u32 s15, s13, 0
8792; GFX9-NEXT:    v_mov_b32_e32 v5, s0
8793; GFX9-NEXT:    v_mov_b32_e32 v6, s1
8794; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
8795; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
8796; GFX9-NEXT:    v_mov_b32_e32 v5, s10
8797; GFX9-NEXT:    v_mov_b32_e32 v6, s15
8798; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
8799; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
8800; GFX9-NEXT:    s_subb_u32 s0, s11, s14
8801; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
8802; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
8803; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
8804; GFX9-NEXT:    s_cmp_eq_u32 s0, s7
8805; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
8806; GFX9-NEXT:    v_mov_b32_e32 v6, s1
8807; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
8808; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
8809; GFX9-NEXT:    v_mov_b32_e32 v6, s13
8810; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
8811; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc
8812; GFX9-NEXT:    v_mov_b32_e32 v5, s12
8813; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
8814; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], s[4:5]
8815; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
8816; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v2
8817; GFX9-NEXT:    v_mov_b32_e32 v6, s1
8818; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v3
8819; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
8820; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
8821; GFX9-NEXT:    s_endpgm
; The divisor is built at run time from %y, so its value is not a
; compile-time constant in this test.
8822  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
8823  %r = sdiv <2 x i64> %x, %shl.y
8824  store <2 x i64> %r, ptr addrspace(1) %out
8825  ret void
8826}
8827
; Signed 64-bit remainder of %x by the odd constant 1235195 (0x12d8fb).
; The opt assertions (default prefix) expect the srem to pass through
; -amdgpu-codegenprepare unchanged.
; The llc assertions for -mcpu=tahiti and -mcpu=gfx900 expect a sequence of
; multiplies by the constants 0xfd81e19 and 0x6ca94220, an arithmetic shift
; right by 19 plus the sign bit, then a multiply by 0x12d8fb and a subtract
; from %x. NOTE(review) this looks like the usual multiply-by-reciprocal
; lowering of a constant divisor, confirm against the backend if it matters.
; All assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8828define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
8829; CHECK-LABEL: @srem_i64_oddk_denom(
8830; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
8831; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
8832; CHECK-NEXT:    ret void
8833;
8834; GFX6-LABEL: srem_i64_oddk_denom:
8835; GFX6:       ; %bb.0:
8836; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
8837; GFX6-NEXT:    v_mov_b32_e32 v2, 0xfd81e19
8838; GFX6-NEXT:    v_mov_b32_e32 v0, 0x6ca94220
8839; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8840; GFX6-NEXT:    s_mov_b32 s2, -1
8841; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8842; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v2
8843; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v2
8844; GFX6-NEXT:    s_mov_b32 s0, s4
8845; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v0
8846; GFX6-NEXT:    s_mul_i32 s4, s7, 0xfd81e19
8847; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
8848; GFX6-NEXT:    s_mul_i32 s1, s6, 0x6ca94220
8849; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
8850; GFX6-NEXT:    s_ashr_i32 s4, s7, 31
8851; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s1, v3
8852; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
8853; GFX6-NEXT:    v_mul_hi_u32 v2, s4, v2
8854; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8855; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
8856; GFX6-NEXT:    s_mul_i32 s1, s7, 0x6ca94220
8857; GFX6-NEXT:    v_addc_u32_e64 v3, s[8:9], 0, 0, vcc
8858; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s1, v1
8859; GFX6-NEXT:    s_mul_i32 s1, s4, 0x6ca94220
8860; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v0, v3, vcc
8861; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
8862; GFX6-NEXT:    s_mul_i32 s4, s4, 0xfd81e19
8863; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v0
8864; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
8865; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
8866; GFX6-NEXT:    v_ashr_i64 v[2:3], v[0:1], 19
8867; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
8868; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
8869; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
8870; GFX6-NEXT:    s_mov_b32 s4, 0x12d8fb
8871; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
8872; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s4
8873; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
8874; GFX6-NEXT:    s_mov_b32 s1, s5
8875; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
8876; GFX6-NEXT:    v_mov_b32_e32 v2, s7
8877; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
8878; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
8879; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
8880; GFX6-NEXT:    s_endpgm
8881;
8882; GFX9-LABEL: srem_i64_oddk_denom:
8883; GFX9:       ; %bb.0:
8884; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8885; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8886; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8887; GFX9-NEXT:    s_mul_i32 s7, s3, 0xfd81e19
8888; GFX9-NEXT:    s_mul_hi_u32 s8, s2, 0xfd81e19
8889; GFX9-NEXT:    s_mul_hi_u32 s6, s3, 0xfd81e19
8890; GFX9-NEXT:    s_add_u32 s7, s7, s8
8891; GFX9-NEXT:    s_mul_i32 s5, s2, 0x6ca94220
8892; GFX9-NEXT:    s_addc_u32 s6, s6, 0
8893; GFX9-NEXT:    s_mul_hi_u32 s4, s2, 0x6ca94220
8894; GFX9-NEXT:    s_add_u32 s5, s5, s7
8895; GFX9-NEXT:    s_addc_u32 s4, s4, 0
8896; GFX9-NEXT:    s_add_u32 s4, s6, s4
8897; GFX9-NEXT:    s_addc_u32 s5, 0, 0
8898; GFX9-NEXT:    s_mul_i32 s7, s3, 0x6ca94220
8899; GFX9-NEXT:    s_mul_hi_u32 s6, s3, 0x6ca94220
8900; GFX9-NEXT:    s_add_u32 s4, s7, s4
8901; GFX9-NEXT:    s_addc_u32 s5, s6, s5
8902; GFX9-NEXT:    s_ashr_i32 s6, s3, 31
8903; GFX9-NEXT:    s_mul_i32 s7, s6, 0x6ca94220
8904; GFX9-NEXT:    s_mul_hi_u32 s8, s6, 0xfd81e19
8905; GFX9-NEXT:    s_add_i32 s7, s8, s7
8906; GFX9-NEXT:    s_mul_i32 s6, s6, 0xfd81e19
8907; GFX9-NEXT:    s_add_i32 s7, s7, s6
8908; GFX9-NEXT:    s_add_u32 s4, s4, s6
8909; GFX9-NEXT:    s_addc_u32 s5, s5, s7
8910; GFX9-NEXT:    s_ashr_i64 s[6:7], s[4:5], 19
8911; GFX9-NEXT:    s_lshr_b32 s4, s5, 31
8912; GFX9-NEXT:    s_add_u32 s4, s6, s4
8913; GFX9-NEXT:    s_addc_u32 s5, s7, 0
8914; GFX9-NEXT:    s_mul_i32 s5, s5, 0x12d8fb
8915; GFX9-NEXT:    s_mul_hi_u32 s6, s4, 0x12d8fb
8916; GFX9-NEXT:    s_add_i32 s6, s6, s5
8917; GFX9-NEXT:    s_mul_i32 s4, s4, 0x12d8fb
8918; GFX9-NEXT:    s_sub_u32 s2, s2, s4
8919; GFX9-NEXT:    s_subb_u32 s3, s3, s6
8920; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8921; GFX9-NEXT:    v_mov_b32_e32 v1, s3
8922; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
8923; GFX9-NEXT:    s_endpgm
8924  %r = srem i64 %x, 1235195
8925  store i64 %r, ptr addrspace(1) %out
8926  ret void
8927}
8928
; Test: signed remainder of an i64 kernel argument by the constant 4096.
; The opt run (default CHECK prefix) expects the srem instruction to be left
; exactly as written. The two llc runs (GFX6 and GFX9 prefixes) expect a short
; scalar-only sequence: s_ashr_i32 by 31, s_lshr_b32 by 20, an add with carry,
; an s_and_b32 with 0xfffff000, and a subtract with borrow before the store.
8929define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
8930; CHECK-LABEL: @srem_i64_pow2k_denom(
8931; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
8932; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
8933; CHECK-NEXT:    ret void
8934;
8935; GFX6-LABEL: srem_i64_pow2k_denom:
8936; GFX6:       ; %bb.0:
8937; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
8938; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8939; GFX6-NEXT:    s_mov_b32 s6, -1
8940; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8941; GFX6-NEXT:    s_mov_b32 s4, s0
8942; GFX6-NEXT:    s_ashr_i32 s0, s3, 31
8943; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
8944; GFX6-NEXT:    s_add_u32 s0, s2, s0
8945; GFX6-NEXT:    s_mov_b32 s5, s1
8946; GFX6-NEXT:    s_addc_u32 s1, s3, 0
8947; GFX6-NEXT:    s_and_b32 s0, s0, 0xfffff000
8948; GFX6-NEXT:    s_sub_u32 s0, s2, s0
8949; GFX6-NEXT:    s_subb_u32 s1, s3, s1
8950; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8951; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8952; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8953; GFX6-NEXT:    s_endpgm
8954;
8955; GFX9-LABEL: srem_i64_pow2k_denom:
8956; GFX9:       ; %bb.0:
8957; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
8958; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8959; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8960; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
8961; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8962; GFX9-NEXT:    s_add_u32 s4, s2, s4
8963; GFX9-NEXT:    s_addc_u32 s5, s3, 0
8964; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
8965; GFX9-NEXT:    s_sub_u32 s2, s2, s4
8966; GFX9-NEXT:    s_subb_u32 s3, s3, s5
8967; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8968; GFX9-NEXT:    v_mov_b32_e32 v1, s3
8969; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
8970; GFX9-NEXT:    s_endpgm
; Input IR under test: the divisor is the immediate 4096.
8971  %r = srem i64 %x, 4096
8972  store i64 %r, ptr addrspace(1) %out
8973  ret void
8974}
8975
; Test: signed remainder of an i64 kernel argument by (4096 << %y), where %y
; is a second i64 kernel argument, so the divisor is not a compile-time
; constant. The opt run (default CHECK prefix) expects both the shl and the
; srem to be left as written. The llc runs expect a long inline sequence: the
; divisor is formed with s_lshl_b64 of 0x1000, then v_cvt_f32_u32 / v_rcp_f32
; and chains of mul_hi / mul_lo with carry adds follow, and the tail selects
; between candidate values with v_cndmask before the final xor / subtract.
; NOTE(review): this looks like the generic 64-bit signed division expansion
; (absolute values, reciprocal estimate, refinement, correction, sign fix-up);
; confirm against the AMDGPU lowering code before relying on that reading.
8976define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
8977; CHECK-LABEL: @srem_i64_pow2_shl_denom(
8978; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
8979; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
8980; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
8981; CHECK-NEXT:    ret void
8982;
8983; GFX6-LABEL: srem_i64_pow2_shl_denom:
8984; GFX6:       ; %bb.0:
8985; GFX6-NEXT:    s_load_dword s0, s[4:5], 0xd
8986; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8987; GFX6-NEXT:    s_mov_b32 s6, -1
8988; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8989; GFX6-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
8990; GFX6-NEXT:    s_ashr_i32 s2, s1, 31
8991; GFX6-NEXT:    s_add_u32 s0, s0, s2
8992; GFX6-NEXT:    s_mov_b32 s3, s2
8993; GFX6-NEXT:    s_addc_u32 s1, s1, s2
8994; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
8995; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
8996; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
8997; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
8998; GFX6-NEXT:    s_sub_u32 s4, 0, s8
8999; GFX6-NEXT:    s_subb_u32 s5, 0, s9
9000; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
9001; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9002; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9003; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
9004; GFX6-NEXT:    s_add_u32 s2, s2, s10
9005; GFX6-NEXT:    s_mov_b32 s11, s10
9006; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9007; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9008; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9009; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
9010; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9011; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9012; GFX6-NEXT:    s_addc_u32 s3, s3, s10
9013; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
9014; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
9015; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
9016; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
9017; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
9018; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9019; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9020; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
9021; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9022; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
9023; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9024; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9025; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
9026; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
9027; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
9028; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9029; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9030; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
9031; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
9032; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9033; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9034; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9035; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9036; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
9037; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
9038; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
9039; GFX6-NEXT:    s_mov_b32 s5, s1
9040; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9041; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
9042; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9043; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
9044; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
9045; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
9046; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
9047; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
9048; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
9049; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9050; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9051; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9052; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
9053; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
9054; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
9055; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9056; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9057; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9058; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9059; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
9060; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
9061; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v1
9062; GFX6-NEXT:    v_mul_hi_u32 v5, s13, v1
9063; GFX6-NEXT:    v_mul_lo_u32 v1, s13, v1
9064; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9065; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9066; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
9067; GFX6-NEXT:    v_mul_hi_u32 v0, s13, v0
9068; GFX6-NEXT:    s_mov_b32 s4, s0
9069; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9070; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9071; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
9072; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9073; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
9074; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v1
9075; GFX6-NEXT:    v_mul_hi_u32 v2, s8, v0
9076; GFX6-NEXT:    v_mul_lo_u32 v3, s9, v0
9077; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
9078; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
9079; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
9080; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s13, v1
9081; GFX6-NEXT:    v_mov_b32_e32 v3, s9
9082; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s12, v0
9083; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
9084; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s8, v0
9085; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
9086; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v5
9087; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
9088; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
9089; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
9090; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
9091; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v5
9092; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s8, v4
9093; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
9094; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
9095; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
9096; GFX6-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
9097; GFX6-NEXT:    v_mov_b32_e32 v4, s13
9098; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
9099; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
9100; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
9101; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
9102; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
9103; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9104; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
9105; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
9106; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
9107; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
9108; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
9109; GFX6-NEXT:    v_xor_b32_e32 v0, s10, v0
9110; GFX6-NEXT:    v_xor_b32_e32 v1, s10, v1
9111; GFX6-NEXT:    v_mov_b32_e32 v2, s10
9112; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s10, v0
9113; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
9114; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9115; GFX6-NEXT:    s_endpgm
9116;
9117; GFX9-LABEL: srem_i64_pow2_shl_denom:
9118; GFX9:       ; %bb.0:
9119; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x34
9120; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
9121; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9122; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
9123; GFX9-NEXT:    s_ashr_i32 s2, s1, 31
9124; GFX9-NEXT:    s_add_u32 s0, s0, s2
9125; GFX9-NEXT:    s_mov_b32 s3, s2
9126; GFX9-NEXT:    s_addc_u32 s1, s1, s2
9127; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
9128; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
9129; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
9130; GFX9-NEXT:    s_sub_u32 s0, 0, s6
9131; GFX9-NEXT:    s_subb_u32 s1, 0, s7
9132; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
9133; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
9134; GFX9-NEXT:    v_mov_b32_e32 v0, 0
9135; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
9136; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
9137; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
9138; GFX9-NEXT:    v_madmk_f32 v1, v2, 0xcf800000, v1
9139; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
9140; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9141; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
9142; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
9143; GFX9-NEXT:    s_mul_i32 s4, s0, s2
9144; GFX9-NEXT:    s_mul_hi_u32 s12, s0, s3
9145; GFX9-NEXT:    s_mul_i32 s5, s1, s3
9146; GFX9-NEXT:    s_add_i32 s4, s12, s4
9147; GFX9-NEXT:    s_mul_i32 s13, s0, s3
9148; GFX9-NEXT:    s_add_i32 s4, s4, s5
9149; GFX9-NEXT:    s_mul_hi_u32 s12, s3, s13
9150; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s4
9151; GFX9-NEXT:    s_mul_i32 s3, s3, s4
9152; GFX9-NEXT:    s_add_u32 s3, s12, s3
9153; GFX9-NEXT:    s_addc_u32 s5, 0, s5
9154; GFX9-NEXT:    s_mul_hi_u32 s14, s2, s13
9155; GFX9-NEXT:    s_mul_i32 s13, s2, s13
9156; GFX9-NEXT:    s_add_u32 s3, s3, s13
9157; GFX9-NEXT:    s_mul_hi_u32 s12, s2, s4
9158; GFX9-NEXT:    s_addc_u32 s3, s5, s14
9159; GFX9-NEXT:    s_addc_u32 s5, s12, 0
9160; GFX9-NEXT:    s_mul_i32 s4, s2, s4
9161; GFX9-NEXT:    s_add_u32 s3, s3, s4
9162; GFX9-NEXT:    s_addc_u32 s4, 0, s5
9163; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s3, v1
9164; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9165; GFX9-NEXT:    s_addc_u32 s2, s2, s4
9166; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
9167; GFX9-NEXT:    s_mul_i32 s3, s0, s2
9168; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s4
9169; GFX9-NEXT:    s_add_i32 s3, s5, s3
9170; GFX9-NEXT:    s_mul_i32 s1, s1, s4
9171; GFX9-NEXT:    s_add_i32 s3, s3, s1
9172; GFX9-NEXT:    s_mul_i32 s0, s0, s4
9173; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s0
9174; GFX9-NEXT:    s_mul_i32 s12, s2, s0
9175; GFX9-NEXT:    s_mul_i32 s14, s4, s3
9176; GFX9-NEXT:    s_mul_hi_u32 s0, s4, s0
9177; GFX9-NEXT:    s_mul_hi_u32 s13, s4, s3
9178; GFX9-NEXT:    s_add_u32 s0, s0, s14
9179; GFX9-NEXT:    s_addc_u32 s4, 0, s13
9180; GFX9-NEXT:    s_add_u32 s0, s0, s12
9181; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
9182; GFX9-NEXT:    s_addc_u32 s0, s4, s5
9183; GFX9-NEXT:    s_addc_u32 s1, s1, 0
9184; GFX9-NEXT:    s_mul_i32 s3, s2, s3
9185; GFX9-NEXT:    s_add_u32 s0, s0, s3
9186; GFX9-NEXT:    s_addc_u32 s1, 0, s1
9187; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
9188; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9189; GFX9-NEXT:    s_addc_u32 s2, s2, s1
9190; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
9191; GFX9-NEXT:    s_add_u32 s0, s10, s4
9192; GFX9-NEXT:    s_mov_b32 s5, s4
9193; GFX9-NEXT:    s_addc_u32 s1, s11, s4
9194; GFX9-NEXT:    s_xor_b64 s[10:11], s[0:1], s[4:5]
9195; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
9196; GFX9-NEXT:    s_mul_i32 s1, s10, s2
9197; GFX9-NEXT:    s_mul_hi_u32 s5, s10, s3
9198; GFX9-NEXT:    s_mul_hi_u32 s0, s10, s2
9199; GFX9-NEXT:    s_add_u32 s1, s5, s1
9200; GFX9-NEXT:    s_addc_u32 s0, 0, s0
9201; GFX9-NEXT:    s_mul_hi_u32 s12, s11, s3
9202; GFX9-NEXT:    s_mul_i32 s3, s11, s3
9203; GFX9-NEXT:    s_add_u32 s1, s1, s3
9204; GFX9-NEXT:    s_mul_hi_u32 s5, s11, s2
9205; GFX9-NEXT:    s_addc_u32 s0, s0, s12
9206; GFX9-NEXT:    s_addc_u32 s1, s5, 0
9207; GFX9-NEXT:    s_mul_i32 s2, s11, s2
9208; GFX9-NEXT:    s_add_u32 s0, s0, s2
9209; GFX9-NEXT:    s_addc_u32 s1, 0, s1
9210; GFX9-NEXT:    s_mul_i32 s1, s6, s1
9211; GFX9-NEXT:    s_mul_hi_u32 s2, s6, s0
9212; GFX9-NEXT:    s_add_i32 s1, s2, s1
9213; GFX9-NEXT:    s_mul_i32 s2, s7, s0
9214; GFX9-NEXT:    s_mul_i32 s0, s6, s0
9215; GFX9-NEXT:    s_add_i32 s5, s1, s2
9216; GFX9-NEXT:    v_mov_b32_e32 v1, s0
9217; GFX9-NEXT:    s_sub_i32 s1, s11, s5
9218; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s10, v1
9219; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9220; GFX9-NEXT:    s_subb_u32 s10, s1, s7
9221; GFX9-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s6, v1
9222; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9223; GFX9-NEXT:    s_subb_u32 s12, s10, 0
9224; GFX9-NEXT:    s_cmp_ge_u32 s12, s7
9225; GFX9-NEXT:    s_cselect_b32 s13, -1, 0
9226; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s6, v2
9227; GFX9-NEXT:    s_cmp_eq_u32 s12, s7
9228; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[2:3]
9229; GFX9-NEXT:    v_mov_b32_e32 v4, s13
9230; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
9231; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9232; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
9233; GFX9-NEXT:    s_subb_u32 s2, s10, s7
9234; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s6, v2
9235; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9236; GFX9-NEXT:    s_subb_u32 s2, s2, 0
9237; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
9238; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
9239; GFX9-NEXT:    v_mov_b32_e32 v3, s12
9240; GFX9-NEXT:    v_mov_b32_e32 v4, s2
9241; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9242; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
9243; GFX9-NEXT:    s_subb_u32 s0, s11, s5
9244; GFX9-NEXT:    s_cmp_ge_u32 s0, s7
9245; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
9246; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
9247; GFX9-NEXT:    s_cmp_eq_u32 s0, s7
9248; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
9249; GFX9-NEXT:    v_mov_b32_e32 v5, s1
9250; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
9251; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
9252; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
9253; GFX9-NEXT:    v_mov_b32_e32 v5, s0
9254; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
9255; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
9256; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
9257; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v3
9258; GFX9-NEXT:    v_mov_b32_e32 v3, s4
9259; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s4, v1
9260; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
9261; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[8:9]
9262; GFX9-NEXT:    s_endpgm
; Input IR under test: the divisor %shl.y is 4096 shifted left by the runtime
; argument %y, then used as the right-hand operand of srem.
9263  %shl.y = shl i64 4096, %y
9264  %r = srem i64 %x, %shl.y
9265  store i64 %r, ptr addrspace(1) %out
9266  ret void
9267}
9268
; Test: signed remainder of a <2 x i64> kernel argument by a vector whose two
; elements are both the constant 4096. The opt run (default CHECK prefix)
; expects the vector srem to be split into two scalar i64 srem-by-4096
; operations via extractelement / insertelement. The llc runs (GFX6 and GFX9
; prefixes) expect the same short scalar sequence twice (s_ashr_i32 by 31,
; s_lshr_b32 by 20, add with carry, s_and_b32 with 0xfffff000, subtract with
; borrow), followed by a single dwordx4 store of both results.
9269define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
9270; CHECK-LABEL: @srem_v2i64_pow2k_denom(
9271; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9272; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
9273; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
9274; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
9275; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
9276; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
9277; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
9278; CHECK-NEXT:    ret void
9279;
9280; GFX6-LABEL: srem_v2i64_pow2k_denom:
9281; GFX6:       ; %bb.0:
9282; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
9283; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
9284; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9285; GFX6-NEXT:    s_mov_b32 s6, -1
9286; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9287; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
9288; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
9289; GFX6-NEXT:    s_add_u32 s8, s0, s8
9290; GFX6-NEXT:    s_addc_u32 s9, s1, 0
9291; GFX6-NEXT:    s_and_b32 s8, s8, 0xfffff000
9292; GFX6-NEXT:    s_sub_u32 s0, s0, s8
9293; GFX6-NEXT:    s_subb_u32 s1, s1, s9
9294; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
9295; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
9296; GFX6-NEXT:    s_add_u32 s8, s2, s8
9297; GFX6-NEXT:    s_addc_u32 s9, s3, 0
9298; GFX6-NEXT:    s_and_b32 s8, s8, 0xfffff000
9299; GFX6-NEXT:    s_sub_u32 s2, s2, s8
9300; GFX6-NEXT:    s_subb_u32 s3, s3, s9
9301; GFX6-NEXT:    v_mov_b32_e32 v0, s0
9302; GFX6-NEXT:    v_mov_b32_e32 v1, s1
9303; GFX6-NEXT:    v_mov_b32_e32 v2, s2
9304; GFX6-NEXT:    v_mov_b32_e32 v3, s3
9305; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
9306; GFX6-NEXT:    s_endpgm
9307;
9308; GFX9-LABEL: srem_v2i64_pow2k_denom:
9309; GFX9:       ; %bb.0:
9310; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
9311; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
9312; GFX9-NEXT:    v_mov_b32_e32 v4, 0
9313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9314; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
9315; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
9316; GFX9-NEXT:    s_add_u32 s4, s0, s4
9317; GFX9-NEXT:    s_addc_u32 s5, s1, 0
9318; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
9319; GFX9-NEXT:    s_sub_u32 s0, s0, s4
9320; GFX9-NEXT:    s_subb_u32 s1, s1, s5
9321; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
9322; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
9323; GFX9-NEXT:    s_add_u32 s4, s2, s4
9324; GFX9-NEXT:    s_addc_u32 s5, s3, 0
9325; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
9326; GFX9-NEXT:    s_sub_u32 s2, s2, s4
9327; GFX9-NEXT:    s_subb_u32 s3, s3, s5
9328; GFX9-NEXT:    v_mov_b32_e32 v0, s0
9329; GFX9-NEXT:    v_mov_b32_e32 v1, s1
9330; GFX9-NEXT:    v_mov_b32_e32 v2, s2
9331; GFX9-NEXT:    v_mov_b32_e32 v3, s3
9332; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
9333; GFX9-NEXT:    s_endpgm
; Input IR under test: both divisor lanes are the immediate 4096.
9334  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
9335  store <2 x i64> %r, ptr addrspace(1) %out
9336  ret void
9337}
9338
9339define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
9340; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
9341; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]]
9342; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9343; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
9344; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
9345; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
9346; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
9347; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
9348; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
9349; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
9350; CHECK-NEXT:    store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
9351; CHECK-NEXT:    ret void
9352;
9353; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
9354; GFX6:       ; %bb.0:
9355; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
9356; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
9357; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9358; GFX6-NEXT:    s_mov_b32 s6, -1
9359; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9360; GFX6-NEXT:    s_lshl_b64 s[0:1], 0x1000, s12
9361; GFX6-NEXT:    s_lshl_b64 s[16:17], 0x1000, s14
9362; GFX6-NEXT:    s_ashr_i32 s2, s1, 31
9363; GFX6-NEXT:    s_add_u32 s0, s0, s2
9364; GFX6-NEXT:    s_mov_b32 s3, s2
9365; GFX6-NEXT:    s_addc_u32 s1, s1, s2
9366; GFX6-NEXT:    s_xor_b64 s[14:15], s[0:1], s[2:3]
9367; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s14
9368; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s15
9369; GFX6-NEXT:    s_sub_u32 s0, 0, s14
9370; GFX6-NEXT:    s_subb_u32 s1, 0, s15
9371; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
9372; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9373; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9374; GFX6-NEXT:    s_mov_b32 s13, s12
9375; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9376; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9377; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9378; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9379; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9380; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9381; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
9382; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
9383; GFX6-NEXT:    v_mul_lo_u32 v5, s1, v0
9384; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v0
9385; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9386; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9387; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
9388; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9389; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
9390; GFX6-NEXT:    v_mul_hi_u32 v6, v1, v4
9391; GFX6-NEXT:    v_mul_lo_u32 v4, v1, v4
9392; GFX6-NEXT:    v_mul_hi_u32 v8, v1, v2
9393; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
9394; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
9395; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9396; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
9397; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
9398; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
9399; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9400; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9401; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9402; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9403; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
9404; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
9405; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
9406; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9407; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v0
9408; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9409; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
9410; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
9411; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
9412; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
9413; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
9414; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
9415; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9416; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9417; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9418; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
9419; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
9420; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
9421; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9422; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9423; GFX6-NEXT:    s_add_u32 s0, s8, s12
9424; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9425; GFX6-NEXT:    s_addc_u32 s1, s9, s12
9426; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9427; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
9428; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v1
9429; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v0
9430; GFX6-NEXT:    v_mul_hi_u32 v4, s8, v1
9431; GFX6-NEXT:    v_mul_hi_u32 v5, s9, v1
9432; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v1
9433; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9434; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9435; GFX6-NEXT:    v_mul_lo_u32 v4, s9, v0
9436; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
9437; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9438; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9439; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
9440; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9441; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
9442; GFX6-NEXT:    v_mul_lo_u32 v1, s14, v1
9443; GFX6-NEXT:    v_mul_hi_u32 v2, s14, v0
9444; GFX6-NEXT:    v_mul_lo_u32 v3, s15, v0
9445; GFX6-NEXT:    v_mul_lo_u32 v0, s14, v0
9446; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
9447; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
9448; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
9449; GFX6-NEXT:    v_mov_b32_e32 v3, s15
9450; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
9451; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
9452; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s14, v0
9453; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
9454; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s15, v5
9455; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
9456; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s14, v4
9457; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
9458; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
9459; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s15, v5
9460; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s14, v4
9461; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
9462; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
9463; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
9464; GFX6-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
9465; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
9466; GFX6-NEXT:    s_ashr_i32 s0, s17, 31
9467; GFX6-NEXT:    s_add_u32 s2, s16, s0
9468; GFX6-NEXT:    s_mov_b32 s1, s0
9469; GFX6-NEXT:    s_addc_u32 s3, s17, s0
9470; GFX6-NEXT:    v_mov_b32_e32 v4, s9
9471; GFX6-NEXT:    s_xor_b64 s[8:9], s[2:3], s[0:1]
9472; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
9473; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s8
9474; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s9
9475; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s15, v1
9476; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9477; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v0
9478; GFX6-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
9479; GFX6-NEXT:    v_rcp_f32_e32 v4, v4
9480; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
9481; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v1
9482; GFX6-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
9483; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
9484; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
9485; GFX6-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v4
9486; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
9487; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
9488; GFX6-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
9489; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
9490; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
9491; GFX6-NEXT:    s_sub_u32 s0, 0, s8
9492; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
9493; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v2
9494; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
9495; GFX6-NEXT:    s_subb_u32 s1, 0, s9
9496; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
9497; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
9498; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
9499; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
9500; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9501; GFX6-NEXT:    v_mul_lo_u32 v6, v2, v3
9502; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v5
9503; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v3
9504; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v3
9505; GFX6-NEXT:    v_mul_lo_u32 v3, v4, v3
9506; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9507; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9508; GFX6-NEXT:    v_mul_lo_u32 v8, v4, v5
9509; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
9510; GFX6-NEXT:    s_mov_b32 s15, s14
9511; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
9512; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
9513; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
9514; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
9515; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
9516; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9517; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9518; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
9519; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v3
9520; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
9521; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
9522; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
9523; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
9524; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
9525; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
9526; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
9527; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
9528; GFX6-NEXT:    v_mul_hi_u32 v10, v2, v4
9529; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
9530; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v5
9531; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v4
9532; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
9533; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
9534; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
9535; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
9536; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
9537; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
9538; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9539; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9540; GFX6-NEXT:    s_add_u32 s0, s10, s14
9541; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9542; GFX6-NEXT:    s_addc_u32 s1, s11, s14
9543; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
9544; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
9545; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v3
9546; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v2
9547; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v3
9548; GFX6-NEXT:    v_mul_hi_u32 v8, s11, v3
9549; GFX6-NEXT:    v_mul_lo_u32 v3, s11, v3
9550; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9551; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
9552; GFX6-NEXT:    v_mul_lo_u32 v7, s11, v2
9553; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
9554; GFX6-NEXT:    v_mov_b32_e32 v6, s12
9555; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
9556; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
9557; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
9558; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9559; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9560; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v3
9561; GFX6-NEXT:    v_mul_hi_u32 v4, s8, v2
9562; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v2
9563; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
9564; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v2
9565; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
9566; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
9567; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
9568; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
9569; GFX6-NEXT:    v_mov_b32_e32 v5, s9
9570; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
9571; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
9572; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
9573; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
9574; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
9575; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
9576; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
9577; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
9578; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
9579; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
9580; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
9581; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
9582; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
9583; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
9584; GFX6-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[0:1]
9585; GFX6-NEXT:    v_mov_b32_e32 v6, s11
9586; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v6, v3, vcc
9587; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
9588; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9589; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
9590; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
9591; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
9592; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
9593; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
9594; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
9595; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
9596; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
9597; GFX6-NEXT:    v_xor_b32_e32 v2, s14, v2
9598; GFX6-NEXT:    v_xor_b32_e32 v3, s14, v3
9599; GFX6-NEXT:    v_mov_b32_e32 v4, s14
9600; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
9601; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
9602; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
9603; GFX6-NEXT:    s_endpgm
9604;
9605; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
9606; GFX9:       ; %bb.0:
9607; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
9608; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
9609; GFX9-NEXT:    v_mov_b32_e32 v4, 0
9610; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9611; GFX9-NEXT:    s_lshl_b64 s[0:1], 0x1000, s12
9612; GFX9-NEXT:    s_lshl_b64 s[14:15], 0x1000, s14
9613; GFX9-NEXT:    s_ashr_i32 s2, s1, 31
9614; GFX9-NEXT:    s_add_u32 s0, s0, s2
9615; GFX9-NEXT:    s_mov_b32 s3, s2
9616; GFX9-NEXT:    s_addc_u32 s1, s1, s2
9617; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
9618; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
9619; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
9620; GFX9-NEXT:    s_sub_u32 s0, 0, s12
9621; GFX9-NEXT:    s_subb_u32 s1, 0, s13
9622; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9623; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9624; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9625; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9626; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9627; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9628; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9629; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9630; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
9631; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
9632; GFX9-NEXT:    s_mul_i32 s4, s0, s2
9633; GFX9-NEXT:    s_mul_hi_u32 s16, s0, s3
9634; GFX9-NEXT:    s_mul_i32 s5, s1, s3
9635; GFX9-NEXT:    s_add_i32 s4, s16, s4
9636; GFX9-NEXT:    s_mul_i32 s17, s0, s3
9637; GFX9-NEXT:    s_add_i32 s4, s4, s5
9638; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s4
9639; GFX9-NEXT:    s_mul_i32 s16, s3, s4
9640; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s17
9641; GFX9-NEXT:    s_add_u32 s3, s3, s16
9642; GFX9-NEXT:    s_addc_u32 s5, 0, s5
9643; GFX9-NEXT:    s_mul_hi_u32 s18, s2, s17
9644; GFX9-NEXT:    s_mul_i32 s17, s2, s17
9645; GFX9-NEXT:    s_add_u32 s3, s3, s17
9646; GFX9-NEXT:    s_mul_hi_u32 s16, s2, s4
9647; GFX9-NEXT:    s_addc_u32 s3, s5, s18
9648; GFX9-NEXT:    s_addc_u32 s5, s16, 0
9649; GFX9-NEXT:    s_mul_i32 s4, s2, s4
9650; GFX9-NEXT:    s_add_u32 s3, s3, s4
9651; GFX9-NEXT:    s_addc_u32 s4, 0, s5
9652; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s3, v0
9653; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9654; GFX9-NEXT:    s_addc_u32 s2, s2, s4
9655; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
9656; GFX9-NEXT:    s_mul_i32 s3, s0, s2
9657; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s4
9658; GFX9-NEXT:    s_add_i32 s3, s5, s3
9659; GFX9-NEXT:    s_mul_i32 s1, s1, s4
9660; GFX9-NEXT:    s_add_i32 s3, s3, s1
9661; GFX9-NEXT:    s_mul_i32 s0, s0, s4
9662; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s0
9663; GFX9-NEXT:    s_mul_i32 s16, s2, s0
9664; GFX9-NEXT:    s_mul_i32 s18, s4, s3
9665; GFX9-NEXT:    s_mul_hi_u32 s0, s4, s0
9666; GFX9-NEXT:    s_mul_hi_u32 s17, s4, s3
9667; GFX9-NEXT:    s_add_u32 s0, s0, s18
9668; GFX9-NEXT:    s_addc_u32 s4, 0, s17
9669; GFX9-NEXT:    s_add_u32 s0, s0, s16
9670; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
9671; GFX9-NEXT:    s_addc_u32 s0, s4, s5
9672; GFX9-NEXT:    s_addc_u32 s1, s1, 0
9673; GFX9-NEXT:    s_mul_i32 s3, s2, s3
9674; GFX9-NEXT:    s_add_u32 s0, s0, s3
9675; GFX9-NEXT:    s_addc_u32 s1, 0, s1
9676; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
9677; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9678; GFX9-NEXT:    s_addc_u32 s2, s2, s1
9679; GFX9-NEXT:    s_ashr_i32 s16, s9, 31
9680; GFX9-NEXT:    s_add_u32 s0, s8, s16
9681; GFX9-NEXT:    s_mov_b32 s17, s16
9682; GFX9-NEXT:    s_addc_u32 s1, s9, s16
9683; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], s[16:17]
9684; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
9685; GFX9-NEXT:    s_mul_i32 s1, s4, s2
9686; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s3
9687; GFX9-NEXT:    s_mul_hi_u32 s0, s4, s2
9688; GFX9-NEXT:    s_add_u32 s1, s8, s1
9689; GFX9-NEXT:    s_addc_u32 s0, 0, s0
9690; GFX9-NEXT:    s_mul_hi_u32 s9, s5, s3
9691; GFX9-NEXT:    s_mul_i32 s3, s5, s3
9692; GFX9-NEXT:    s_add_u32 s1, s1, s3
9693; GFX9-NEXT:    s_mul_hi_u32 s8, s5, s2
9694; GFX9-NEXT:    s_addc_u32 s0, s0, s9
9695; GFX9-NEXT:    s_addc_u32 s1, s8, 0
9696; GFX9-NEXT:    s_mul_i32 s2, s5, s2
9697; GFX9-NEXT:    s_add_u32 s0, s0, s2
9698; GFX9-NEXT:    s_addc_u32 s1, 0, s1
9699; GFX9-NEXT:    s_mul_i32 s1, s12, s1
9700; GFX9-NEXT:    s_mul_hi_u32 s2, s12, s0
9701; GFX9-NEXT:    s_add_i32 s1, s2, s1
9702; GFX9-NEXT:    s_mul_i32 s2, s13, s0
9703; GFX9-NEXT:    s_mul_i32 s0, s12, s0
9704; GFX9-NEXT:    s_add_i32 s8, s1, s2
9705; GFX9-NEXT:    v_mov_b32_e32 v0, s0
9706; GFX9-NEXT:    s_sub_i32 s1, s5, s8
9707; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
9708; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9709; GFX9-NEXT:    s_subb_u32 s4, s1, s13
9710; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s12, v0
9711; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9712; GFX9-NEXT:    s_subb_u32 s9, s4, 0
9713; GFX9-NEXT:    s_cmp_ge_u32 s9, s13
9714; GFX9-NEXT:    s_cselect_b32 s17, -1, 0
9715; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v1
9716; GFX9-NEXT:    s_cmp_eq_u32 s9, s13
9717; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[2:3]
9718; GFX9-NEXT:    v_mov_b32_e32 v3, s17
9719; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
9720; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9721; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
9722; GFX9-NEXT:    s_subb_u32 s2, s4, s13
9723; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v1
9724; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9725; GFX9-NEXT:    s_subb_u32 s2, s2, 0
9726; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
9727; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
9728; GFX9-NEXT:    v_mov_b32_e32 v2, s9
9729; GFX9-NEXT:    v_mov_b32_e32 v3, s2
9730; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9731; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
9732; GFX9-NEXT:    s_subb_u32 s0, s5, s8
9733; GFX9-NEXT:    s_cmp_ge_u32 s0, s13
9734; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
9735; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
9736; GFX9-NEXT:    s_cmp_eq_u32 s0, s13
9737; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9738; GFX9-NEXT:    v_mov_b32_e32 v5, s1
9739; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
9740; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
9741; GFX9-NEXT:    v_mov_b32_e32 v5, s0
9742; GFX9-NEXT:    s_ashr_i32 s0, s15, 31
9743; GFX9-NEXT:    s_add_u32 s2, s14, s0
9744; GFX9-NEXT:    s_mov_b32 s1, s0
9745; GFX9-NEXT:    s_addc_u32 s3, s15, s0
9746; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
9747; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
9748; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
9749; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
9750; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s5
9751; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
9752; GFX9-NEXT:    v_xor_b32_e32 v0, s16, v0
9753; GFX9-NEXT:    v_xor_b32_e32 v2, s16, v2
9754; GFX9-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v3
9755; GFX9-NEXT:    v_rcp_f32_e32 v3, v1
9756; GFX9-NEXT:    v_mov_b32_e32 v5, s16
9757; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s16, v0
9758; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v5, vcc
9759; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v3
9760; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
9761; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
9762; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
9763; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
9764; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
9765; GFX9-NEXT:    s_sub_u32 s0, 0, s4
9766; GFX9-NEXT:    s_subb_u32 s1, 0, s5
9767; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
9768; GFX9-NEXT:    v_readfirstlane_b32 s9, v3
9769; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
9770; GFX9-NEXT:    s_mul_i32 s12, s0, s9
9771; GFX9-NEXT:    s_mul_i32 s3, s1, s2
9772; GFX9-NEXT:    s_add_i32 s8, s8, s12
9773; GFX9-NEXT:    s_add_i32 s8, s8, s3
9774; GFX9-NEXT:    s_mul_i32 s13, s0, s2
9775; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s8
9776; GFX9-NEXT:    s_mul_i32 s12, s2, s8
9777; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s13
9778; GFX9-NEXT:    s_add_u32 s2, s2, s12
9779; GFX9-NEXT:    s_addc_u32 s3, 0, s3
9780; GFX9-NEXT:    s_mul_hi_u32 s14, s9, s13
9781; GFX9-NEXT:    s_mul_i32 s13, s9, s13
9782; GFX9-NEXT:    s_add_u32 s2, s2, s13
9783; GFX9-NEXT:    s_mul_hi_u32 s12, s9, s8
9784; GFX9-NEXT:    s_addc_u32 s2, s3, s14
9785; GFX9-NEXT:    s_addc_u32 s3, s12, 0
9786; GFX9-NEXT:    s_mul_i32 s8, s9, s8
9787; GFX9-NEXT:    s_add_u32 s2, s2, s8
9788; GFX9-NEXT:    s_addc_u32 s3, 0, s3
9789; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
9790; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9791; GFX9-NEXT:    s_addc_u32 s2, s9, s3
9792; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
9793; GFX9-NEXT:    s_mul_i32 s3, s0, s2
9794; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s8
9795; GFX9-NEXT:    s_add_i32 s3, s9, s3
9796; GFX9-NEXT:    s_mul_i32 s1, s1, s8
9797; GFX9-NEXT:    s_add_i32 s3, s3, s1
9798; GFX9-NEXT:    s_mul_i32 s0, s0, s8
9799; GFX9-NEXT:    s_mul_hi_u32 s9, s2, s0
9800; GFX9-NEXT:    s_mul_i32 s12, s2, s0
9801; GFX9-NEXT:    s_mul_i32 s14, s8, s3
9802; GFX9-NEXT:    s_mul_hi_u32 s0, s8, s0
9803; GFX9-NEXT:    s_mul_hi_u32 s13, s8, s3
9804; GFX9-NEXT:    s_add_u32 s0, s0, s14
9805; GFX9-NEXT:    s_addc_u32 s8, 0, s13
9806; GFX9-NEXT:    s_add_u32 s0, s0, s12
9807; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
9808; GFX9-NEXT:    s_addc_u32 s0, s8, s9
9809; GFX9-NEXT:    s_addc_u32 s1, s1, 0
9810; GFX9-NEXT:    s_mul_i32 s3, s2, s3
9811; GFX9-NEXT:    s_add_u32 s0, s0, s3
9812; GFX9-NEXT:    s_addc_u32 s1, 0, s1
9813; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
9814; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9815; GFX9-NEXT:    s_addc_u32 s2, s2, s1
9816; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
9817; GFX9-NEXT:    s_add_u32 s0, s10, s8
9818; GFX9-NEXT:    s_mov_b32 s9, s8
9819; GFX9-NEXT:    s_addc_u32 s1, s11, s8
9820; GFX9-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
9821; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
9822; GFX9-NEXT:    s_mul_i32 s1, s10, s2
9823; GFX9-NEXT:    s_mul_hi_u32 s9, s10, s3
9824; GFX9-NEXT:    s_mul_hi_u32 s0, s10, s2
9825; GFX9-NEXT:    s_add_u32 s1, s9, s1
9826; GFX9-NEXT:    s_addc_u32 s0, 0, s0
9827; GFX9-NEXT:    s_mul_hi_u32 s12, s11, s3
9828; GFX9-NEXT:    s_mul_i32 s3, s11, s3
9829; GFX9-NEXT:    s_add_u32 s1, s1, s3
9830; GFX9-NEXT:    s_mul_hi_u32 s9, s11, s2
9831; GFX9-NEXT:    s_addc_u32 s0, s0, s12
9832; GFX9-NEXT:    s_addc_u32 s1, s9, 0
9833; GFX9-NEXT:    s_mul_i32 s2, s11, s2
9834; GFX9-NEXT:    s_add_u32 s0, s0, s2
9835; GFX9-NEXT:    s_addc_u32 s1, 0, s1
9836; GFX9-NEXT:    s_mul_i32 s1, s4, s1
9837; GFX9-NEXT:    s_mul_hi_u32 s2, s4, s0
9838; GFX9-NEXT:    s_add_i32 s1, s2, s1
9839; GFX9-NEXT:    s_mul_i32 s2, s5, s0
9840; GFX9-NEXT:    s_mul_i32 s0, s4, s0
9841; GFX9-NEXT:    s_add_i32 s9, s1, s2
9842; GFX9-NEXT:    v_mov_b32_e32 v2, s0
9843; GFX9-NEXT:    s_sub_i32 s1, s11, s9
9844; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s10, v2
9845; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9846; GFX9-NEXT:    s_subb_u32 s10, s1, s5
9847; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s4, v2
9848; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9849; GFX9-NEXT:    s_subb_u32 s12, s10, 0
9850; GFX9-NEXT:    s_cmp_ge_u32 s12, s5
9851; GFX9-NEXT:    s_cselect_b32 s13, -1, 0
9852; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v3
9853; GFX9-NEXT:    s_cmp_eq_u32 s12, s5
9854; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[2:3]
9855; GFX9-NEXT:    v_mov_b32_e32 v6, s13
9856; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
9857; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9858; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[2:3]
9859; GFX9-NEXT:    s_subb_u32 s2, s10, s5
9860; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s4, v3
9861; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
9862; GFX9-NEXT:    s_subb_u32 s2, s2, 0
9863; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9864; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
9865; GFX9-NEXT:    v_mov_b32_e32 v5, s12
9866; GFX9-NEXT:    v_mov_b32_e32 v6, s2
9867; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
9868; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
9869; GFX9-NEXT:    s_subb_u32 s0, s11, s9
9870; GFX9-NEXT:    s_cmp_ge_u32 s0, s5
9871; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
9872; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
9873; GFX9-NEXT:    s_cmp_eq_u32 s0, s5
9874; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9875; GFX9-NEXT:    v_mov_b32_e32 v7, s1
9876; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
9877; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
9878; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
9879; GFX9-NEXT:    v_mov_b32_e32 v7, s0
9880; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
9881; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
9882; GFX9-NEXT:    v_xor_b32_e32 v2, s8, v2
9883; GFX9-NEXT:    v_xor_b32_e32 v3, s8, v5
9884; GFX9-NEXT:    v_mov_b32_e32 v5, s8
9885; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s8, v2
9886; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
9887; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
9888; GFX9-NEXT:    s_endpgm
9889  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
9890  %r = srem <2 x i64> %x, %shl.y
9891  store <2 x i64> %r, ptr addrspace(1) %out
9892  ret void
9893}
9894
; Signed exact division of a <2 x i32> by the power-of-two constants 4096
; (2^12) and 1024 (2^10).
; Expected opt output: the vector division is split into one scalar
; 'sdiv exact' per element and reassembled with insertelement.
; Expected GFX6 and GFX9 code: one arithmetic right shift per lane
; (by 12 for element 0, by 10 for element 1), with no division expansion.
9895define <2 x i32> @v_sdiv_i32_exact(<2 x i32> %num) {
9896; CHECK-LABEL:  @v_sdiv_i32_exact(
9897; CHECK:        %1 = extractelement <2 x i32> %num, i64 0
9898; CHECK-NEXT:   %2 = sdiv exact i32 %1, 4096
9899; CHECK-NEXT:   %3 = insertelement <2 x i32> poison, i32 %2, i64 0
9900; CHECK-NEXT:   %4 = extractelement <2 x i32> %num, i64 1
9901; CHECK-NEXT:   %5 = sdiv exact i32 %4, 1024
9902; CHECK-NEXT:   %6 = insertelement <2 x i32> %3, i32 %5, i64 1
9903; CHECK-NEXT:   ret <2 x i32> %6
9904;
9905; GFX6-LABEL: v_sdiv_i32_exact:
9906; GFX6:       ; %bb.0:
9907; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9908; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
9909; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 10, v1
9910; GFX6-NEXT:    s_setpc_b64 s[30:31]
9911;
9912; GFX9-LABEL: v_sdiv_i32_exact:
9913; GFX9:       ; %bb.0:
9914; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9915; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
9916; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 10, v1
9917; GFX9-NEXT:    s_setpc_b64 s[30:31]
9918   %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024>
9919   ret <2 x i32> %result
9920}
9921
; Signed exact division of a <2 x i64> by the power-of-two constants 4096
; (2^12) and 1024 (2^10).
; Expected opt output: one scalar 'sdiv exact i64' per element, rebuilt into
; the result vector with insertelement.
; Expected machine code: one 64-bit arithmetic right shift per element
; (by 12 and by 10). GFX6 uses v_ashr_i64, GFX9 uses the operand-reversed
; v_ashrrev_i64 form.
9922define <2 x i64> @v_sdiv_i64_exact(<2 x i64> %num) {
9923; CHECK-LABEL:  @v_sdiv_i64_exact(
9924; CHECK:        %1 = extractelement <2 x i64> %num, i64 0
9925; CHECK-NEXT:   %2 = sdiv exact i64 %1, 4096
9926; CHECK-NEXT:   %3 = insertelement <2 x i64> poison, i64 %2, i64 0
9927; CHECK-NEXT:   %4 = extractelement <2 x i64> %num, i64 1
9928; CHECK-NEXT:   %5 = sdiv exact i64 %4, 1024
9929; CHECK-NEXT:   %6 = insertelement <2 x i64> %3, i64 %5, i64 1
9930; CHECK-NEXT:   ret <2 x i64> %6
9931;
9932; GFX6-LABEL: v_sdiv_i64_exact:
9933; GFX6:       ; %bb.0:
9934; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9935; GFX6-NEXT:    v_ashr_i64 v[0:1], v[0:1], 12
9936; GFX6-NEXT:    v_ashr_i64 v[2:3], v[2:3], 10
9937; GFX6-NEXT:    s_setpc_b64 s[30:31]
9938;
9939; GFX9-LABEL: v_sdiv_i64_exact:
9940; GFX9:       ; %bb.0:
9941; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9942; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 12, v[0:1]
9943; GFX9-NEXT:    v_ashrrev_i64 v[2:3], 10, v[2:3]
9944; GFX9-NEXT:    s_setpc_b64 s[30:31]
9945   %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024>
9946   ret <2 x i64> %result
9947}
9948
; Unsigned exact division of a <2 x i32> by the power-of-two constants 4096
; (2^12) and 1024 (2^10).
; Expected opt output: one scalar 'udiv exact i32' per element, rebuilt into
; the result vector with insertelement.
; Expected GFX6 and GFX9 code: one logical right shift per lane
; (by 12 for element 0, by 10 for element 1).
9949define <2 x i32> @v_udiv_i32_exact(<2 x i32> %num) {
9950; CHECK-LABEL:  @v_udiv_i32_exact(
9951; CHECK:        %1 = extractelement <2 x i32> %num, i64 0
9952; CHECK-NEXT:   %2 = udiv exact i32 %1, 4096
9953; CHECK-NEXT:   %3 = insertelement <2 x i32> poison, i32 %2, i64 0
9954; CHECK-NEXT:   %4 = extractelement <2 x i32> %num, i64 1
9955; CHECK-NEXT:   %5 = udiv exact i32 %4, 1024
9956; CHECK-NEXT:   %6 = insertelement <2 x i32> %3, i32 %5, i64 1
9957; CHECK-NEXT:   ret <2 x i32> %6
9958;
9959; GFX6-LABEL: v_udiv_i32_exact:
9960; GFX6:       ; %bb.0:
9961; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9962; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
9963; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
9964; GFX6-NEXT:    s_setpc_b64 s[30:31]
9965;
9966; GFX9-LABEL: v_udiv_i32_exact:
9967; GFX9:       ; %bb.0:
9968; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9969; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
9970; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
9971; GFX9-NEXT:    s_setpc_b64 s[30:31]
9972   %result = udiv exact <2 x i32> %num, <i32 4096, i32 1024>
9973   ret <2 x i32> %result
9974}
9975
; Unsigned exact division of a <2 x i64> by the power-of-two constants 4096
; (2^12) and 1024 (2^10).
; Expected opt output: one scalar 'udiv exact i64' per element, rebuilt into
; the result vector with insertelement.
; Expected machine code: one 64-bit logical right shift per element
; (by 12 and by 10). GFX6 uses v_lshr_b64, GFX9 uses the operand-reversed
; v_lshrrev_b64 form.
9976define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) {
9977; CHECK-LABEL:  @v_udiv_i64_exact(
9978; CHECK:        %1 = extractelement <2 x i64> %num, i64 0
9979; CHECK-NEXT:   %2 = udiv exact i64 %1, 4096
9980; CHECK-NEXT:   %3 = insertelement <2 x i64> poison, i64 %2, i64 0
9981; CHECK-NEXT:   %4 = extractelement <2 x i64> %num, i64 1
9982; CHECK-NEXT:   %5 = udiv exact i64 %4, 1024
9983; CHECK-NEXT:   %6 = insertelement <2 x i64> %3, i64 %5, i64 1
9984; CHECK-NEXT:   ret <2 x i64> %6
9985;
9986; GFX6-LABEL: v_udiv_i64_exact:
9987; GFX6:       ; %bb.0:
9988; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9989; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 12
9990; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], 10
9991; GFX6-NEXT:    s_setpc_b64 s[30:31]
9992;
9993; GFX9-LABEL: v_udiv_i64_exact:
9994; GFX9:       ; %bb.0:
9995; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9996; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 12, v[0:1]
9997; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 10, v[2:3]
9998; GFX9-NEXT:    s_setpc_b64 s[30:31]
9999   %result = udiv exact <2 x i64> %num, <i64 4096, i64 1024>
10000   ret <2 x i64> %result
10001}
10002
; 64-bit unsigned division by the constant 10 where the dividend is
; -1 - sext(%size), i.e. the bitwise NOT of the sign-extended i8 argument
; (the expected code materializes it with v_not). For a non-negative %size the
; dividend is negative when read as signed, so as an unsigned value it is
; larger than the i64 signed maximum.
; Only the llc output (GFX6 and GFX9 prefixes) is checked for this function.
; The expected code is a full 64-bit sequence: multiplies against the constant
; halves 0xcccccccd and 0xcccccccc, carry propagation, then a right shift of
; the upper result by 3 (v_alignbit_b32 / v_lshrrev_b32).
10003define i64 @udiv_i64_gt_smax(i8 %size) {
10004; GFX6-LABEL: udiv_i64_gt_smax:
10005; GFX6:       ; %bb.0:
10006; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10007; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
10008; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
10009; GFX6-NEXT:    v_not_b32_e32 v1, v1
10010; GFX6-NEXT:    v_not_b32_e32 v0, v0
10011; GFX6-NEXT:    s_mov_b32 s4, 0xcccccccd
10012; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s4
10013; GFX6-NEXT:    v_mul_hi_u32 v4, v0, s4
10014; GFX6-NEXT:    s_mov_b32 s6, 0xcccccccc
10015; GFX6-NEXT:    v_mul_hi_u32 v5, v1, s4
10016; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
10017; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
10018; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
10019; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
10020; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
10021; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
10022; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s6
10023; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s6
10024; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
10025; GFX6-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
10026; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
10027; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
10028; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 3
10029; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 3, v1
10030; GFX6-NEXT:    s_setpc_b64 s[30:31]
10031;
10032; GFX9-LABEL: udiv_i64_gt_smax:
10033; GFX9:       ; %bb.0:
10034; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10035; GFX9-NEXT:    v_mov_b32_e32 v1, 31
10036; GFX9-NEXT:    v_not_b32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
10037; GFX9-NEXT:    s_mov_b32 s4, 0xcccccccd
10038; GFX9-NEXT:    v_ashrrev_i32_sdwa v1, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
10039; GFX9-NEXT:    v_mul_hi_u32 v0, v4, s4
10040; GFX9-NEXT:    v_not_b32_e32 v5, v1
10041; GFX9-NEXT:    v_mov_b32_e32 v1, 0
10042; GFX9-NEXT:    s_mov_b32 s6, 0xcccccccc
10043; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1]
10044; GFX9-NEXT:    v_mov_b32_e32 v6, v3
10045; GFX9-NEXT:    v_mov_b32_e32 v3, v1
10046; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[2:3]
10047; GFX9-NEXT:    v_mov_b32_e32 v0, v1
10048; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
10049; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
10050; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1]
10051; GFX9-NEXT:    v_alignbit_b32 v0, v1, v0, 3
10052; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 3, v1
10053; GFX9-NEXT:    s_setpc_b64 s[30:31]
10054  %esize = sext i8 %size to i64
10055  %minus = sub nuw nsw i64 -1, %esize
10056  %div = udiv i64 %minus, 10
10057  ret i64 %div
10058}
10059
; 64-bit unsigned division by the constant 10 where the dividend is
; zext(%size) + 1. With an i8 input the dividend lies in [1, 256], so it needs
; at most 9 significant bits.
; Only the llc output (GFX6 and GFX9 prefixes) is checked for this function.
; The expected code does the division in f32 instead of a 64-bit expansion:
; convert to float, multiply by 0x3dcccccd (0.1f), truncate, use v_mad_f32
; against 0x41200000 (10.0f) to form the remainder for a +1 correction, then
; mask the quotient with 0x1ff and set the high result dword to 0.
10060define i64 @udiv_i64_9divbits(i8 %size) {
10061; GFX6-LABEL: udiv_i64_9divbits:
10062; GFX6:       ; %bb.0:
10063; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10064; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
10065; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
10066; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
10067; GFX6-NEXT:    s_mov_b32 s4, 0x41200000
10068; GFX6-NEXT:    v_mul_f32_e32 v1, 0x3dcccccd, v0
10069; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
10070; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
10071; GFX6-NEXT:    v_mad_f32 v0, -v1, s4, v0
10072; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s4
10073; GFX6-NEXT:    v_mov_b32_e32 v1, 0
10074; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
10075; GFX6-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
10076; GFX6-NEXT:    s_setpc_b64 s[30:31]
10077;
10078; GFX9-LABEL: udiv_i64_9divbits:
10079; GFX9:       ; %bb.0:
10080; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10081; GFX9-NEXT:    v_mov_b32_e32 v1, 1
10082; GFX9-NEXT:    v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
10083; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
10084; GFX9-NEXT:    s_mov_b32 s4, 0x41200000
10085; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3dcccccd, v0
10086; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
10087; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v1
10088; GFX9-NEXT:    v_mad_f32 v0, -v1, s4, v0
10089; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s4
10090; GFX9-NEXT:    v_mov_b32_e32 v1, 0
10091; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
10092; GFX9-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
10093; GFX9-NEXT:    s_setpc_b64 s[30:31]
10094  %zextend = zext i8 %size to i64
10095  %num = add nuw nsw i64 1, %zextend
10096  %div = udiv i64 %num, 10
10097  ret i64 %div
10098}
10099