xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll (revision e5638c5a00682243b1ee012d7dd8292aa221dff8)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,GISEL %s
3; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,CGP %s
4
5; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
6
; udiv of two i32 function arguments using the default calling convention.
; Both check blocks expect the same instruction sequence except for the
; reciprocal: v_rcp_iflag_f32 in the GISEL block, v_rcp_f32 in the CGP block.
7define i32 @v_udiv_i32(i32 %num, i32 %den) {
8; GISEL-LABEL: v_udiv_i32:
9; GISEL:       ; %bb.0:
10; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
12; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
13; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
14; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
15; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
16; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
17; GISEL-NEXT:    v_mul_hi_u32 v3, v2, v3
18; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
19; GISEL-NEXT:    v_mul_hi_u32 v2, v0, v2
20; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
21; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
22; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
23; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
24; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
25; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
26; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
27; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
28; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
29; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
30; GISEL-NEXT:    s_setpc_b64 s[30:31]
31;
32; CGP-LABEL: v_udiv_i32:
33; CGP:       ; %bb.0:
34; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
36; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
37; CGP-NEXT:    v_rcp_f32_e32 v2, v2
38; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
39; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
40; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
41; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
42; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
43; CGP-NEXT:    v_mul_hi_u32 v2, v0, v2
44; CGP-NEXT:    v_mul_lo_u32 v3, v2, v1
45; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
46; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
47; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
48; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
49; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
50; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
51; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
52; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
53; CGP-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
54; CGP-NEXT:    s_setpc_b64 s[30:31]
55  %result = udiv i32 %num, %den
56  ret i32 %result
57}
58
59; FIXME: The readfirstlane call is a workaround for the uniform VGPR case not being handled.
60declare i32 @llvm.amdgcn.readfirstlane(i32)
61
; udiv of two inreg i32 arguments in an amdgpu_ps function; the quotient is
; passed through llvm.amdgcn.readfirstlane before being returned. The checks
; read the operands from s0/s1 and finish with v_readfirstlane_b32 into s0.
; The two check blocks differ only in v_rcp_iflag_f32 versus v_rcp_f32.
62define amdgpu_ps i32 @s_udiv_i32(i32 inreg %num, i32 inreg %den) {
63; GISEL-LABEL: s_udiv_i32:
64; GISEL:       ; %bb.0:
65; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, s1
66; GISEL-NEXT:    s_sub_i32 s2, 0, s1
67; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
68; GISEL-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
69; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
70; GISEL-NEXT:    v_mul_lo_u32 v1, s2, v0
71; GISEL-NEXT:    v_mul_hi_u32 v1, v0, v1
72; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
73; GISEL-NEXT:    v_mul_hi_u32 v0, s0, v0
74; GISEL-NEXT:    v_mul_lo_u32 v1, v0, s1
75; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
76; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
77; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
78; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
79; GISEL-NEXT:    v_subrev_i32_e64 v2, s[2:3], s1, v1
80; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
81; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
82; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
83; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
84; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
85; GISEL-NEXT:    ; return to shader part epilog
86;
87; CGP-LABEL: s_udiv_i32:
88; CGP:       ; %bb.0:
89; CGP-NEXT:    v_cvt_f32_u32_e32 v0, s1
90; CGP-NEXT:    s_sub_i32 s2, 0, s1
91; CGP-NEXT:    v_rcp_f32_e32 v0, v0
92; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
93; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
94; CGP-NEXT:    v_mul_lo_u32 v1, s2, v0
95; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
96; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
97; CGP-NEXT:    v_mul_hi_u32 v0, s0, v0
98; CGP-NEXT:    v_mul_lo_u32 v1, v0, s1
99; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
100; CGP-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
101; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
102; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
103; CGP-NEXT:    v_subrev_i32_e64 v2, s[2:3], s1, v1
104; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
105; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
106; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
107; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
108; CGP-NEXT:    v_readfirstlane_b32 s0, v0
109; CGP-NEXT:    ; return to shader part epilog
110  %result = udiv i32 %num, %den
111  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %result)
112  ret i32 %readlane
113}
114
; <2 x i32> udiv of two function arguments. The checks expect the expansion
; once per element, with the two copies interleaved (element 0 divides v0 by
; v2, element 1 divides v1 by v3). The two check blocks differ only in
; v_rcp_iflag_f32 versus v_rcp_f32.
115define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
116; GISEL-LABEL: v_udiv_v2i32:
117; GISEL:       ; %bb.0:
118; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
120; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
121; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
122; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
123; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
124; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
125; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
126; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
127; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
128; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
129; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
130; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v6
131; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
132; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
133; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
134; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
135; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v4
136; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
137; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
138; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
139; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
140; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
141; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
142; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
143; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
144; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
145; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
146; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
147; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
148; GISEL-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
149; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
150; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
151; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
152; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
153; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
154; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
155; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
156; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
157; GISEL-NEXT:    s_setpc_b64 s[30:31]
158;
159; CGP-LABEL: v_udiv_v2i32:
160; CGP:       ; %bb.0:
161; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
163; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
164; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
165; CGP-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
166; CGP-NEXT:    v_rcp_f32_e32 v4, v4
167; CGP-NEXT:    v_rcp_f32_e32 v6, v6
168; CGP-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
169; CGP-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
170; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
171; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
172; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
173; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
174; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
175; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
176; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
177; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
178; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
179; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
180; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
181; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
182; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
183; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
184; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
185; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
186; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
187; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
188; CGP-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
189; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
190; CGP-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
191; CGP-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
192; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
193; CGP-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
194; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
195; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
196; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
197; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
198; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
199; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
200; CGP-NEXT:    s_setpc_b64 s[30:31]
201  %result = udiv <2 x i32> %num, %den
202  ret <2 x i32> %result
203}
204
; udiv of an i32 argument by the constant 4096. Both RUN lines share one
; check prefix here and expect a single logical right shift by 12.
205define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
206; CHECK-LABEL: v_udiv_i32_pow2k_denom:
207; CHECK:       ; %bb.0:
208; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
209; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
210; CHECK-NEXT:    s_setpc_b64 s[30:31]
211  %result = udiv i32 %num, 4096
212  ret i32 %result
213}
214
; <2 x i32> udiv by a splat of the constant 4096. Both RUN lines share one
; check prefix here and expect one logical right shift by 12 per element.
215define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
216; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
217; CHECK:       ; %bb.0:
218; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
220; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 12, v1
221; CHECK-NEXT:    s_setpc_b64 s[30:31]
222  %result = udiv <2 x i32> %num, <i32 4096, i32 4096>
223  ret <2 x i32> %result
224}
225
; udiv of an i32 argument by the constant 1235195. Both RUN lines share one
; check prefix here and expect a multiply-high by 0xb2a50881 followed by a
; subtract / shift-by-1 / add / shift-by-20 sequence, with no reciprocal
; instruction.
226define i32 @v_udiv_i32_oddk_denom(i32 %num) {
227; CHECK-LABEL: v_udiv_i32_oddk_denom:
228; CHECK:       ; %bb.0:
229; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230; CHECK-NEXT:    v_mov_b32_e32 v1, 0xb2a50881
231; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
232; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
233; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
234; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
235; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
236; CHECK-NEXT:    s_setpc_b64 s[30:31]
237  %result = udiv i32 %num, 1235195
238  ret i32 %result
239}
240
; <2 x i32> udiv by a splat of the constant 1235195. Both RUN lines share one
; check prefix here. The constant 0xb2a50881 is materialized once in v2 and
; feeds the multiply-high of both elements; each element then gets its own
; subtract / shift-by-1 / add / shift-by-20 sequence.
241define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
242; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
243; CHECK:       ; %bb.0:
244; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245; CHECK-NEXT:    v_mov_b32_e32 v2, 0xb2a50881
246; CHECK-NEXT:    v_mul_hi_u32 v3, v0, v2
247; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
248; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
249; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
250; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
251; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
252; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
253; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
254; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
255; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 20, v1
256; CHECK-NEXT:    s_setpc_b64 s[30:31]
257  %result = udiv <2 x i32> %num, <i32 1235195, i32 1235195>
258  ret <2 x i32> %result
259}
260
; udiv of %x by (4096 << %y). Both RUN lines share one check prefix here, and
; the expected output is the full reciprocal-based expansion using
; v_rcp_iflag_f32 rather than a shift of the numerator.
261define i32 @v_udiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
262; CHECK-LABEL: v_udiv_i32_pow2_shl_denom:
263; CHECK:       ; %bb.0:
264; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; CHECK-NEXT:    v_lshl_b32_e32 v1, 0x1000, v1
266; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
267; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
268; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
269; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
270; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
271; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v2
272; CHECK-NEXT:    v_mul_hi_u32 v3, v2, v3
273; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
274; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
275; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v1
276; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
277; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
278; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
279; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
280; CHECK-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
281; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
282; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
283; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
284; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
285; CHECK-NEXT:    s_setpc_b64 s[30:31]
286  %shl.y = shl i32 4096, %y
287  %r = udiv i32 %x, %shl.y
288  ret i32 %r
289}
290
; <2 x i32> udiv of %x by (splat 4096 << %y). The checks expect both shifts
; first, then the reciprocal-based expansion once per element with the two
; copies interleaved. The two check blocks differ only in v_rcp_iflag_f32
; versus v_rcp_f32.
291define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
292; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom:
293; GISEL:       ; %bb.0:
294; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
296; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
297; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
298; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
299; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
300; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
301; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
302; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
303; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
304; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
305; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
306; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
307; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
308; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v6
309; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
310; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
311; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
312; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
313; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v4
314; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
315; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
316; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
317; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
318; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
319; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
320; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
321; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
322; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
323; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
324; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
325; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
326; GISEL-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
327; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
328; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
329; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
330; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
331; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
332; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
333; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
334; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
335; GISEL-NEXT:    s_setpc_b64 s[30:31]
336;
337; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom:
338; CGP:       ; %bb.0:
339; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
340; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
341; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
342; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
343; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
344; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
345; CGP-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
346; CGP-NEXT:    v_rcp_f32_e32 v4, v4
347; CGP-NEXT:    v_rcp_f32_e32 v6, v6
348; CGP-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
349; CGP-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
350; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
351; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
352; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
353; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
354; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
355; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
356; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
357; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
358; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
359; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
360; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
361; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
362; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
363; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
364; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
365; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
366; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
367; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
368; CGP-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
369; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
370; CGP-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
371; CGP-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
372; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
373; CGP-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
374; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
375; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
376; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
377; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
378; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
379; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
380; CGP-NEXT:    s_setpc_b64 s[30:31]
381  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
382  %r = udiv <2 x i32> %x, %shl.y
383  ret <2 x i32> %r
384}
385
; udiv of two i32 arguments that are first masked to their low 24 bits
; (and with 16777215). The GISEL checks expect the integer reciprocal
; expansion after the two masks. The CGP checks instead expect a float-based
; sequence (cvt, rcp, mul, trunc, fma, compare, select, add) whose result is
; masked with 0xffffff again.
386define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
387; GISEL-LABEL: v_udiv_i32_24bit:
388; GISEL:       ; %bb.0:
389; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
391; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
392; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
393; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
394; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
395; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
396; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
397; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
398; GISEL-NEXT:    v_mul_hi_u32 v3, v2, v3
399; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
400; GISEL-NEXT:    v_mul_hi_u32 v2, v0, v2
401; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
402; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
403; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
404; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
405; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
406; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
407; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
408; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
409; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
410; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
411; GISEL-NEXT:    s_setpc_b64 s[30:31]
412;
413; CGP-LABEL: v_udiv_i32_24bit:
414; CGP:       ; %bb.0:
415; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
416; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
417; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
418; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v0
419; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v1
420; CGP-NEXT:    v_rcp_f32_e32 v2, v1
421; CGP-NEXT:    v_mul_f32_e32 v2, v0, v2
422; CGP-NEXT:    v_trunc_f32_e32 v2, v2
423; CGP-NEXT:    v_fma_f32 v0, -v2, v1, v0
424; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
425; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v0|, v1
426; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
427; CGP-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
428; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
429; CGP-NEXT:    s_setpc_b64 s[30:31]
430  %num.mask = and i32 %num, 16777215
431  %den.mask = and i32 %den, 16777215
432  %result = udiv i32 %num.mask, %den.mask
433  ret i32 %result
434}
435
; <2 x i32> udiv of arguments whose elements are first masked to their low
; 24 bits (and with a splat of 16777215). The GISEL checks expect the integer
; reciprocal expansion once per element after the four masks. The CGP checks
; instead expect a float-based sequence per element (cvt, rcp, mul, trunc,
; fma, compare, select, add) with each result masked with 0xffffff again.
436define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
437; GISEL-LABEL: v_udiv_v2i32_24bit:
438; GISEL:       ; %bb.0:
439; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
440; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
441; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
442; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
443; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
444; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
445; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
446; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
447; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
448; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
449; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
450; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
451; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
452; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
453; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
454; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
455; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v6
456; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
457; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
458; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
459; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
460; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v4
461; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
462; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
463; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
464; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
465; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
466; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
467; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
468; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
469; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
470; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
471; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
472; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
473; GISEL-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
474; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
475; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
476; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
477; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
478; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
479; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
480; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
481; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
482; GISEL-NEXT:    s_setpc_b64 s[30:31]
483;
484; CGP-LABEL: v_udiv_v2i32_24bit:
485; CGP:       ; %bb.0:
486; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
487; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
488; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
489; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
490; CGP-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
491; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v0
492; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v2
493; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v1
494; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v3
495; CGP-NEXT:    v_rcp_f32_e32 v4, v2
496; CGP-NEXT:    v_rcp_f32_e32 v5, v3
497; CGP-NEXT:    v_mul_f32_e32 v4, v0, v4
498; CGP-NEXT:    v_mul_f32_e32 v5, v1, v5
499; CGP-NEXT:    v_trunc_f32_e32 v4, v4
500; CGP-NEXT:    v_trunc_f32_e32 v5, v5
501; CGP-NEXT:    v_fma_f32 v0, -v4, v2, v0
502; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
503; CGP-NEXT:    v_fma_f32 v1, -v5, v3, v1
504; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
505; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v0|, v2
506; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
507; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, v3
508; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
509; CGP-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
510; CGP-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
511; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
512; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
513; CGP-NEXT:    s_setpc_b64 s[30:31]
514  %num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
515  %den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
516  %result = udiv <2 x i32> %num.mask, %den.mask
517  ret <2 x i32> %result
518}
519