xref: /llvm-project/llvm/test/CodeGen/AMDGPU/bypass-div.ll (revision 463e93b95f0887145b51edb81b770eeb4463abc5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3
4; 64-bit divides and remainders should be split into a fast path and a slow
5; path, where the fast path uses a 32-bit operation.
6
; sdiv64: signed 64-bit division of %a by %b.
; The expected code ORs v1 with v3 and compares the result against zero
; (presumably the high dwords of %a and %b -- confirm against the calling
; convention). Lanes where it is nonzero run the long expansion in %bb.1,
; which strips the operand signs with ashr/add/xor and re-applies the
; combined sign (v4 xor v9) to the quotient. The remaining lanes run the
; short v_rcp_iflag_f32-based sequence in %bb.3, which writes 0 to the high
; result dword (v5).
7define i64 @sdiv64(i64 %a, i64 %b) {
8; GFX9-LABEL: sdiv64:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
12; GFX9-NEXT:    v_mov_b32_e32 v4, 0
13; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
14; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
15; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
16; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
17; GFX9-NEXT:    s_cbranch_execz .LBB0_2
18; GFX9-NEXT:  ; %bb.1:
19; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
20; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9
21; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
22; GFX9-NEXT:    v_xor_b32_e32 v10, v3, v9
23; GFX9-NEXT:    v_xor_b32_e32 v11, v2, v9
24; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v11
25; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v10
26; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v11
27; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
28; GFX9-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
29; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
30; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
31; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
32; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
33; GFX9-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
34; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
35; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v3
36; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
37; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
38; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v12
39; GFX9-NEXT:    v_mul_hi_u32 v13, v6, v2
40; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
41; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
42; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v3
43; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
44; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
45; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
46; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
47; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
48; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
49; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
50; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
51; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v6, v2
52; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v3, vcc
53; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v12
54; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v13
55; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0
56; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
57; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
58; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
59; GFX9-NEXT:    v_mul_hi_u32 v14, v13, v2
60; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
61; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v5
62; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
63; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
64; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
65; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
66; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
67; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
68; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
69; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
70; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
71; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
72; GFX9-NEXT:    v_xor_b32_e32 v6, v0, v4
73; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v4, vcc
74; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
75; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v2
76; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v4
77; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v0
78; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
79; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
80; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v3, 0
81; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
82; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
83; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
84; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
85; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
86; GFX9-NEXT:    v_mul_lo_u32 v7, v10, v2
87; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
88; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0
89; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
90; GFX9-NEXT:    v_sub_u32_e32 v7, v5, v1
91; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v6, v0
92; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v7, v10, vcc
93; GFX9-NEXT:    v_sub_co_u32_e64 v7, s[4:5], v0, v11
94; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
95; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v10
96; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
97; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v11
98; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
99; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v10
100; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[4:5]
101; GFX9-NEXT:    v_add_co_u32_e64 v7, s[4:5], 2, v2
102; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
103; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[4:5], 0, v3, s[4:5]
104; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
105; GFX9-NEXT:    v_add_co_u32_e64 v12, s[4:5], 1, v2
106; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
107; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
108; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[4:5], 0, v3, s[4:5]
109; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
110; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v10
111; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
112; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
113; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
114; GFX9-NEXT:    v_cndmask_b32_e64 v1, v12, v7, s[4:5]
115; GFX9-NEXT:    v_cndmask_b32_e64 v6, v13, v8, s[4:5]
116; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
117; GFX9-NEXT:    v_xor_b32_e32 v2, v4, v9
118; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
119; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
120; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
121; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v1, v2
122; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v0, v2, vcc
123; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
124; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
125; GFX9-NEXT:  .LBB0_2: ; %Flow
126; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
127; GFX9-NEXT:    s_cbranch_execz .LBB0_4
128; GFX9-NEXT:  ; %bb.3:
129; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
130; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
131; GFX9-NEXT:    v_mov_b32_e32 v5, 0
132; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
133; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
134; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
135; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
136; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
137; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
138; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
139; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
140; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
141; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
142; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
143; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
144; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
145; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
146; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
147; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
148; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
149; GFX9-NEXT:  .LBB0_4:
150; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
151; GFX9-NEXT:    v_mov_b32_e32 v0, v4
152; GFX9-NEXT:    v_mov_b32_e32 v1, v5
153; GFX9-NEXT:    s_setpc_b64 s[30:31]
154  %d = sdiv i64 %a, %b
155  ret i64 %d
156}
157
; udiv64: unsigned 64-bit division of %a by %b.
; The expected code ORs v1 with v3 and compares the result against zero
; (presumably the high dwords of %a and %b -- confirm against the calling
; convention). Lanes where it is nonzero run the long expansion in %bb.1,
; which operates directly on the operands with no sign handling. The
; remaining lanes run the short v_rcp_iflag_f32-based sequence in %bb.3,
; which writes 0 to the high result dword (v5).
158define i64 @udiv64(i64 %a, i64 %b) {
159; GFX9-LABEL: udiv64:
160; GFX9:       ; %bb.0:
161; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
163; GFX9-NEXT:    v_mov_b32_e32 v4, 0
164; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
165; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
166; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
167; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
168; GFX9-NEXT:    s_cbranch_execz .LBB1_2
169; GFX9-NEXT:  ; %bb.1:
170; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
171; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
172; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
173; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
174; GFX9-NEXT:    v_madmk_f32 v4, v5, 0x4f800000, v4
175; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
176; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
177; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
178; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
179; GFX9-NEXT:    v_madmk_f32 v4, v5, 0xcf800000, v4
180; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
181; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
182; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
183; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v9
184; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
185; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
186; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v4
187; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
188; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
189; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
190; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
191; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
192; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
193; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
194; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
195; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
196; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
197; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
198; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
199; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v13
200; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
201; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
202; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
203; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
204; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
205; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
206; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
207; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
208; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
209; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
210; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
211; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
212; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
213; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
214; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
215; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
216; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
217; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
218; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
219; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
220; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
221; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
222; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
223; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
224; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
225; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
226; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
227; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v6
228; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v7
229; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
230; GFX9-NEXT:    v_add3_u32 v5, v5, v9, v8
231; GFX9-NEXT:    v_sub_u32_e32 v8, v1, v5
232; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
233; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v8, v3, vcc
234; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v0, v2
235; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
236; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
237; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
238; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
239; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
240; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v3
241; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v8, s[4:5]
242; GFX9-NEXT:    v_add_co_u32_e64 v8, s[4:5], 2, v6
243; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
244; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, v7, s[4:5]
245; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
246; GFX9-NEXT:    v_add_co_u32_e64 v10, s[4:5], 1, v6
247; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
248; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
249; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, v7, s[4:5]
250; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
251; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
252; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v4
253; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
254; GFX9-NEXT:    v_cndmask_b32_e64 v4, v11, v9, s[4:5]
255; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
256; GFX9-NEXT:    v_cndmask_b32_e64 v0, v10, v8, s[4:5]
257; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v4, vcc
258; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v0, vcc
259; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
260; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
261; GFX9-NEXT:  .LBB1_2: ; %Flow
262; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
263; GFX9-NEXT:    s_cbranch_execz .LBB1_4
264; GFX9-NEXT:  ; %bb.3:
265; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
266; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
267; GFX9-NEXT:    v_mov_b32_e32 v5, 0
268; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
269; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
270; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
271; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
272; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
273; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
274; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
275; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
276; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
277; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
278; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
279; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
280; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
281; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
282; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
283; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
284; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
285; GFX9-NEXT:  .LBB1_4:
286; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
287; GFX9-NEXT:    v_mov_b32_e32 v0, v4
288; GFX9-NEXT:    v_mov_b32_e32 v1, v5
289; GFX9-NEXT:    s_setpc_b64 s[30:31]
290  %d = udiv i64 %a, %b
291  ret i64 %d
292}
293
; srem64: signed 64-bit remainder of %a divided by %b.
; The expected code ORs v1 with v3 and compares the result against zero
; (presumably the high dwords of %a and %b -- confirm against the calling
; convention). Lanes where it is nonzero run the long expansion in %bb.1,
; which strips the operand signs with ashr/add/xor and applies the sign mask
; derived from v1 (v5) to the final remainder. The remaining lanes run the
; short v_rcp_iflag_f32-based sequence in %bb.3, which writes 0 to the high
; result dword (v5). The saved exec mask lives in s[8:9] here because
; s[6:7] is used as a scratch condition register inside %bb.1.
294define i64 @srem64(i64 %a, i64 %b) {
295; GFX9-LABEL: srem64:
296; GFX9:       ; %bb.0:
297; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
299; GFX9-NEXT:    v_mov_b32_e32 v4, 0
300; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
301; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
302; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
303; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
304; GFX9-NEXT:    s_cbranch_execz .LBB2_2
305; GFX9-NEXT:  ; %bb.1:
306; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
307; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
308; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
309; GFX9-NEXT:    v_xor_b32_e32 v9, v3, v4
310; GFX9-NEXT:    v_xor_b32_e32 v10, v2, v4
311; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v10
312; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v9
313; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v10
314; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
315; GFX9-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
316; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
317; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
318; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
319; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
320; GFX9-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
321; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
322; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v3
323; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
324; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
325; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v11
326; GFX9-NEXT:    v_mul_hi_u32 v12, v6, v2
327; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
328; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
329; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v3
330; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0
331; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v4, vcc
332; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v5, 0
333; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
334; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v13, v3, vcc
335; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
336; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
337; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
338; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v6, v2
339; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v3, vcc
340; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v11
341; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v12
342; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
343; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
344; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v5, 0
345; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0
346; GFX9-NEXT:    v_mul_hi_u32 v13, v12, v2
347; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v2, 0
348; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v5
349; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
350; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
351; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
352; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
353; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
354; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
355; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v12, v2
356; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
357; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
358; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v5
359; GFX9-NEXT:    v_xor_b32_e32 v6, v0, v5
360; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
361; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
362; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v2
363; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v5
364; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v0
365; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
366; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
367; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
368; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
369; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
370; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
371; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
372; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
373; GFX9-NEXT:    v_mul_lo_u32 v2, v9, v0
374; GFX9-NEXT:    v_mul_lo_u32 v3, v10, v1
375; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v10, v0, 0
376; GFX9-NEXT:    v_add3_u32 v1, v1, v3, v2
377; GFX9-NEXT:    v_sub_u32_e32 v2, v4, v1
378; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v6, v0
379; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v2, v9, vcc
380; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[4:5], v0, v10
381; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[6:7], 0, v2, s[4:5]
382; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v9
383; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
384; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v3, v10
385; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
386; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v9
387; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v2, v9, s[4:5]
388; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
389; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v3, v10
390; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
391; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5]
392; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v9
393; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
394; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
395; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v10
396; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
397; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
398; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v9
399; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
400; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
401; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
402; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v8, s[4:5]
403; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
404; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
405; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
406; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v5
407; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
408; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
409; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
410; GFX9-NEXT:  .LBB2_2: ; %Flow
411; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
412; GFX9-NEXT:    s_cbranch_execz .LBB2_4
413; GFX9-NEXT:  ; %bb.3:
414; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
415; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
416; GFX9-NEXT:    v_mov_b32_e32 v5, 0
417; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
418; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
419; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
420; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
421; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
422; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
423; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
424; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
425; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
426; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
427; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
428; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
429; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
430; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
431; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
432; GFX9-NEXT:  .LBB2_4:
433; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
434; GFX9-NEXT:    v_mov_b32_e32 v0, v4
435; GFX9-NEXT:    v_mov_b32_e32 v1, v5
436; GFX9-NEXT:    s_setpc_b64 s[30:31]
437  %d = srem i64 %a, %b
438  ret i64 %d
439}
440
; urem64: unsigned 64-bit remainder of %a divided by %b.
; The expected code ORs v1 with v3 and compares the result against zero
; (presumably the high dwords of %a and %b -- confirm against the calling
; convention). Lanes where it is nonzero run the long expansion in %bb.1,
; which operates directly on the operands with no sign handling. The
; remaining lanes run the short v_rcp_iflag_f32-based sequence in %bb.3,
; which writes 0 to the high result dword (v5). The saved exec mask lives in
; s[8:9] here because s[6:7] is used as a scratch condition register inside
; %bb.1.
441define i64 @urem64(i64 %a, i64 %b) {
442; GFX9-LABEL: urem64:
443; GFX9:       ; %bb.0:
444; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
446; GFX9-NEXT:    v_mov_b32_e32 v4, 0
447; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
448; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
449; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
450; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
451; GFX9-NEXT:    s_cbranch_execz .LBB3_2
452; GFX9-NEXT:  ; %bb.1:
453; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
454; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
455; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
456; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
457; GFX9-NEXT:    v_madmk_f32 v4, v5, 0x4f800000, v4
458; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
459; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
460; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
461; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
462; GFX9-NEXT:    v_madmk_f32 v4, v5, 0xcf800000, v4
463; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
464; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
465; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
466; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v9
467; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
468; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
469; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v4
470; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
471; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
472; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
473; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
474; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
475; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
476; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
477; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
478; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
479; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
480; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
481; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
482; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v13
483; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
484; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
485; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
486; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
487; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
488; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
489; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
490; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
491; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
492; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
493; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
494; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
495; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
496; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
497; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
498; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
499; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
500; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
501; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
502; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
503; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
504; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
505; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
506; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
507; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
508; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
509; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
510; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v4
511; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v5
512; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v4, 0
513; GFX9-NEXT:    v_add3_u32 v5, v5, v7, v6
514; GFX9-NEXT:    v_sub_u32_e32 v6, v1, v5
515; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
516; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v6, v3, vcc
517; GFX9-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v0, v2
518; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[6:7], 0, v4, s[4:5]
519; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
520; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
521; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v2
522; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
523; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
524; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v7, v3
525; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
526; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
527; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[6:7]
528; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v6, v2
529; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
530; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
531; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
532; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
533; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
534; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
535; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
536; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
537; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
538; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
539; GFX9-NEXT:    v_cndmask_b32_e64 v1, v6, v9, s[4:5]
540; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
541; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
542; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
543; GFX9-NEXT:  .LBB3_2: ; %Flow
544; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
545; GFX9-NEXT:    s_cbranch_execz .LBB3_4
546; GFX9-NEXT:  ; %bb.3:
547; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
548; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
549; GFX9-NEXT:    v_mov_b32_e32 v5, 0
550; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
551; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
552; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
553; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
554; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
555; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
556; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
557; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
558; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
559; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
560; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
561; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
562; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
563; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
564; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
565; GFX9-NEXT:  .LBB3_4:
566; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
567; GFX9-NEXT:    v_mov_b32_e32 v0, v4
568; GFX9-NEXT:    v_mov_b32_e32 v1, v5
569; GFX9-NEXT:    s_setpc_b64 s[30:31]
570  %d = urem i64 %a, %b
571  ret i64 %d
572}
573
; sdiv32: signed 32-bit division of %a by %b.
; The expected code is a single straight-line block with no exec-mask
; branching. It forms max(x, 0 - x) of each operand, runs the
; v_rcp_iflag_f32-based unsigned divide with two quotient corrections, and
; then applies the sign mask ashr(v0 xor v1, 31) to the quotient via xor/sub.
574define i32 @sdiv32(i32 %a, i32 %b) {
575; GFX9-LABEL: sdiv32:
576; GFX9:       ; %bb.0:
577; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
578; GFX9-NEXT:    v_sub_u32_e32 v2, 0, v1
579; GFX9-NEXT:    v_max_i32_e32 v2, v1, v2
580; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v2
581; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v2
582; GFX9-NEXT:    v_sub_u32_e32 v5, 0, v0
583; GFX9-NEXT:    v_max_i32_e32 v5, v0, v5
584; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
585; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
586; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
587; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
588; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
589; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
590; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
591; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
592; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
593; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v2
594; GFX9-NEXT:    v_add_u32_e32 v1, 1, v3
595; GFX9-NEXT:    v_sub_u32_e32 v4, v5, v4
596; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
597; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
598; GFX9-NEXT:    v_sub_u32_e32 v3, v4, v2
599; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
600; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
601; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v2
602; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
603; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v0
604; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
605; GFX9-NEXT:    s_setpc_b64 s[30:31]
606  %d = sdiv i32 %a, %b
607  ret i32 %d
608}
609
; udiv32: unsigned 32-bit division of %a by %b.
; The expected code is a single straight-line block with no exec-mask
; branching: a v_rcp_iflag_f32-based reciprocal estimate, one refinement
; step (mul_lo / mul_hi / add), and two compare-and-select corrections of
; the quotient.
610define i32 @udiv32(i32 %a, i32 %b) {
611; GFX9-LABEL: udiv32:
612; GFX9:       ; %bb.0:
613; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
614; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
615; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
616; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
617; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
618; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
619; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
620; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
621; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
622; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
623; GFX9-NEXT:    v_mul_lo_u32 v3, v2, v1
624; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
625; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
626; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
627; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v1
628; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
629; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
630; GFX9-NEXT:    v_add_u32_e32 v3, 1, v2
631; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
632; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
633; GFX9-NEXT:    s_setpc_b64 s[30:31]
634  %d = udiv i32 %a, %b
635  ret i32 %d
636}
637
; srem32: signed 32-bit remainder of %a divided by %b.
; The expected code is a single straight-line block with no exec-mask
; branching. It forms max(x, 0 - x) of each operand, computes the unsigned
; remainder with the v_rcp_iflag_f32-based sequence and two conditional
; subtractions, and then applies the sign mask ashr(v0, 31) to the result
; via xor/sub.
638define i32 @srem32(i32 %a, i32 %b) {
639; GFX9-LABEL: srem32:
640; GFX9:       ; %bb.0:
641; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
642; GFX9-NEXT:    v_sub_u32_e32 v2, 0, v1
643; GFX9-NEXT:    v_max_i32_e32 v1, v1, v2
644; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
645; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
646; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v0
647; GFX9-NEXT:    v_max_i32_e32 v4, v0, v4
648; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
649; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
650; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
651; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
652; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
653; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
654; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
655; GFX9-NEXT:    v_mul_hi_u32 v2, v4, v2
656; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v1
657; GFX9-NEXT:    v_sub_u32_e32 v2, v4, v2
658; GFX9-NEXT:    v_sub_u32_e32 v3, v2, v1
659; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v1
660; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
661; GFX9-NEXT:    v_sub_u32_e32 v3, v2, v1
662; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v1
663; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
664; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v0
665; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
666; GFX9-NEXT:    s_setpc_b64 s[30:31]
667  %d = srem i32 %a, %b
668  ret i32 %d
669}
670
; urem32: unsigned 32-bit remainder of %a divided by %b.
; The expected code is a single straight-line block with no exec-mask
; branching: a v_rcp_iflag_f32-based reciprocal estimate, one refinement
; step (mul_lo / mul_hi / add), a multiply-and-subtract to form the
; candidate remainder, and two compare-and-select conditional subtractions.
671define i32 @urem32(i32 %a, i32 %b) {
672; GFX9-LABEL: urem32:
673; GFX9:       ; %bb.0:
674; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
675; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
676; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
677; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
678; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
679; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
680; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
681; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
682; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
683; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
684; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v1
685; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
686; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
687; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
688; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
689; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
690; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
691; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
692; GFX9-NEXT:    s_setpc_b64 s[30:31]
693  %d = urem i32 %a, %b
694  ret i32 %d
695}
696
; sdivrem64: signed 64-bit quotient and remainder of the same operands,
; returned together as <2 x i64> (element 0 = quotient, element 1 = remainder).
; The expected code tests (v1 | v3) != 0 and splits on the result:
;   %bb.1 - full 64-bit expansion (taken when the test is true);
;   %bb.3 - 32-bit expansion on v0 / v2 that produces quotient and remainder
;           in one pass and writes 0 to the upper result dwords (v5, v7).
; v1 / v3 are presumably the high dwords of %a / %b (arguments appear to
; arrive in v[0:1] and v[2:3]) - confirm against the calling convention.
; The result vector is seeded with poison rather than undef; both lanes are
; overwritten before the return, so the placeholder value is never observed.
697define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
698; GFX9-LABEL: sdivrem64:
699; GFX9:       ; %bb.0:
700; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
701; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
702; GFX9-NEXT:    v_mov_b32_e32 v4, 0
703; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
704; GFX9-NEXT:    ; implicit-def: $vgpr6_vgpr7
705; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
706; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
707; GFX9-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
708; GFX9-NEXT:    s_cbranch_execz .LBB8_2
709; GFX9-NEXT:  ; %bb.1:
710; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
711; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9
712; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
713; GFX9-NEXT:    v_xor_b32_e32 v10, v3, v9
714; GFX9-NEXT:    v_xor_b32_e32 v11, v2, v9
715; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v11
716; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v10
717; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v11
718; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
719; GFX9-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
720; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
721; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
722; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
723; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
724; GFX9-NEXT:    v_madmk_f32 v2, v3, 0xcf800000, v2
725; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v2
726; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v3
727; GFX9-NEXT:    v_mul_lo_u32 v4, v8, v6
728; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0
729; GFX9-NEXT:    v_mul_lo_u32 v5, v7, v12
730; GFX9-NEXT:    v_mul_hi_u32 v13, v6, v2
731; GFX9-NEXT:    v_add3_u32 v5, v3, v5, v4
732; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
733; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v13, v3
734; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
735; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
736; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
737; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
738; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
739; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
740; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
741; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
742; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v6, v2
743; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v3, vcc
744; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v12
745; GFX9-NEXT:    v_mul_lo_u32 v5, v8, v13
746; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0
747; GFX9-NEXT:    v_add3_u32 v5, v3, v4, v5
748; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
749; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
750; GFX9-NEXT:    v_mul_hi_u32 v14, v13, v2
751; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
752; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v5
753; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
754; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
755; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
756; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
757; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
758; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
759; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
760; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v12, v3, vcc
761; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
762; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v7
763; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v7
764; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v7, vcc
765; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0
766; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v2
767; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v7
768; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v0
769; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
770; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
771; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
772; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v0
773; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
774; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
775; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
776; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
777; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v2
778; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v3
779; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0
780; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v6
781; GFX9-NEXT:    v_sub_u32_e32 v6, v4, v1
782; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v5, v0
783; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v6, v10, vcc
784; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v0, v11
785; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[6:7], 0, v6, s[4:5]
786; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v10
787; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[6:7]
788; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v11
789; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
790; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v12, v10
791; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[6:7]
792; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 2, v2
793; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v3, s[6:7]
794; GFX9-NEXT:    v_add_co_u32_e64 v15, s[6:7], 1, v2
795; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
796; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[6:7], 0, v3, s[6:7]
797; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
798; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v5
799; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
800; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
801; GFX9-NEXT:    v_cndmask_b32_e64 v5, v16, v14, s[6:7]
802; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
803; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v10
804; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc
805; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
806; GFX9-NEXT:    v_cndmask_b32_e64 v4, v15, v13, s[6:7]
807; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
808; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
809; GFX9-NEXT:    v_xor_b32_e32 v5, v7, v9
810; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v5
811; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v5
812; GFX9-NEXT:    v_sub_co_u32_e64 v4, s[8:9], v2, v5
813; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[4:5], v6, v10, s[4:5]
814; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[8:9], v3, v5, s[8:9]
815; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[4:5], v8, v11
816; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5]
817; GFX9-NEXT:    v_cndmask_b32_e64 v2, v12, v2, s[6:7]
818; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
819; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v3, s[6:7]
820; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
821; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
822; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
823; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v0, v7
824; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v1, v7, vcc
825; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
826; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
827; GFX9-NEXT:  .LBB8_2: ; %Flow
828; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
829; GFX9-NEXT:    s_cbranch_execz .LBB8_4
830; GFX9-NEXT:  ; %bb.3:
831; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
832; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
833; GFX9-NEXT:    v_mov_b32_e32 v5, 0
834; GFX9-NEXT:    v_mov_b32_e32 v7, v5
835; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
836; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
837; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
838; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
839; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
840; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
841; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
842; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
843; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
844; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
845; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
846; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
847; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
848; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
849; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
850; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
851; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
852; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
853; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
854; GFX9-NEXT:  .LBB8_4:
855; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
856; GFX9-NEXT:    v_mov_b32_e32 v0, v4
857; GFX9-NEXT:    v_mov_b32_e32 v1, v5
858; GFX9-NEXT:    v_mov_b32_e32 v2, v6
859; GFX9-NEXT:    v_mov_b32_e32 v3, v7
860; GFX9-NEXT:    s_setpc_b64 s[30:31]
861  %d = sdiv i64 %a, %b
862  %r = srem i64 %a, %b
863  %ins.0 = insertelement <2 x i64> poison, i64 %d, i32 0
864  %ins.1 = insertelement <2 x i64> %ins.0, i64 %r, i32 1
865  ret <2 x i64> %ins.1
866}
867
; udivrem64: unsigned 64-bit quotient and remainder of the same operands,
; returned together as <2 x i64> (element 0 = quotient, element 1 = remainder).
; The expected code tests (v1 | v3) != 0 and splits on the result:
;   %bb.1 - full 64-bit expansion (taken when the test is true);
;   %bb.3 - 32-bit expansion on v0 / v2 that produces quotient and remainder
;           in one pass and writes 0 to the upper result dwords (v5, v7).
; v1 / v3 are presumably the high dwords of %a / %b (arguments appear to
; arrive in v[0:1] and v[2:3]) - confirm against the calling convention.
; The result vector is seeded with poison rather than undef; both lanes are
; overwritten before the return, so the placeholder value is never observed.
868define <2 x i64> @udivrem64(i64 %a, i64 %b) {
869; GFX9-LABEL: udivrem64:
870; GFX9:       ; %bb.0:
871; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
872; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
873; GFX9-NEXT:    v_mov_b32_e32 v4, 0
874; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
875; GFX9-NEXT:    ; implicit-def: $vgpr6_vgpr7
876; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
877; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
878; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
879; GFX9-NEXT:    s_cbranch_execz .LBB9_2
880; GFX9-NEXT:  ; %bb.1:
881; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
882; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
883; GFX9-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v2
884; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
885; GFX9-NEXT:    v_madmk_f32 v4, v5, 0x4f800000, v4
886; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
887; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
888; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
889; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
890; GFX9-NEXT:    v_madmk_f32 v4, v5, 0xcf800000, v4
891; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v5
892; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v4
893; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v8
894; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v9
895; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
896; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
897; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v4
898; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
899; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v5
900; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
901; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
902; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
903; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
904; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
905; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
906; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
907; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
908; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v9, v4
909; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
910; GFX9-NEXT:    v_mul_lo_u32 v6, v10, v13
911; GFX9-NEXT:    v_mul_lo_u32 v7, v11, v12
912; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
913; GFX9-NEXT:    v_add3_u32 v7, v5, v6, v7
914; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
915; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
916; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v4
917; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
918; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v11, v7
919; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
920; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
921; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
922; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
923; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
924; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
925; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v4
926; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
927; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
928; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v6
929; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v4
930; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
931; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v6, 0
932; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
933; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
934; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
935; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
936; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
937; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
938; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v6
939; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v7
940; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0
941; GFX9-NEXT:    v_add3_u32 v5, v5, v9, v8
942; GFX9-NEXT:    v_sub_u32_e32 v8, v1, v5
943; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
944; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v8, v3, vcc
945; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v0, v2
946; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5]
947; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v3
948; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[6:7]
949; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v2
950; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
951; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v10, v3
952; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v11, s[6:7]
953; GFX9-NEXT:    v_add_co_u32_e64 v11, s[6:7], 2, v6
954; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[6:7], 0, v7, s[6:7]
955; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 1, v6
956; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
957; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v7, s[6:7]
958; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
959; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
960; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
961; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
962; GFX9-NEXT:    v_cndmask_b32_e64 v4, v14, v12, s[6:7]
963; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
964; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
965; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[4:5], v8, v3, s[4:5]
966; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[4:5], v9, v2
967; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
968; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
969; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
970; GFX9-NEXT:    v_cndmask_b32_e64 v3, v10, v3, s[6:7]
971; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v4, vcc
972; GFX9-NEXT:    v_cndmask_b32_e64 v4, v13, v11, s[6:7]
973; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
974; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v2, s[6:7]
975; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
976; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
977; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
978; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
979; GFX9-NEXT:  .LBB9_2: ; %Flow
980; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[8:9]
981; GFX9-NEXT:    s_cbranch_execz .LBB9_4
982; GFX9-NEXT:  ; %bb.3:
983; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
984; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
985; GFX9-NEXT:    v_mov_b32_e32 v5, 0
986; GFX9-NEXT:    v_mov_b32_e32 v7, v5
987; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
988; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
989; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
990; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
991; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
992; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
993; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
994; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
995; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
996; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
997; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
998; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
999; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1000; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1001; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
1002; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
1003; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
1004; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
1005; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
1006; GFX9-NEXT:  .LBB9_4:
1007; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1008; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1009; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1010; GFX9-NEXT:    v_mov_b32_e32 v2, v6
1011; GFX9-NEXT:    v_mov_b32_e32 v3, v7
1012; GFX9-NEXT:    s_setpc_b64 s[30:31]
1013  %d = udiv i64 %a, %b
1014  %r = urem i64 %a, %b
1015  %ins.0 = insertelement <2 x i64> poison, i64 %d, i32 0
1016  %ins.1 = insertelement <2 x i64> %ins.0, i64 %r, i32 1
1017  ret <2 x i64> %ins.1
1018}
1019
; sdiv64_known32: 64-bit divide whose operands are both produced by
; `ashr i64 ..., 32`, so each one is the sign-extended upper half of an
; argument.
; The expected code still contains the fast/slow split: it ORs the two
; sign words (v1 >> 31, v3 >> 31), compares the result against zero and
; branches between the 64-bit expansion (%bb.1) and the 32-bit one (%bb.3).
; NOTE(review): the function name says "sdiv", but the instruction under test
; is `udiv`. The assertions below were generated from the `udiv`; confirm
; which operation is intended. Switching the opcode requires regenerating the
; assertions with utils/update_llc_test_checks.py.
1020define i64 @sdiv64_known32(i64 %a, i64 %b) {
1021; GFX9-LABEL: sdiv64_known32:
1022; GFX9:       ; %bb.0:
1023; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1024; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
1025; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
1026; GFX9-NEXT:    v_or_b32_e32 v5, v2, v0
1027; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1028; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1029; GFX9-NEXT:    v_mov_b32_e32 v7, v1
1030; GFX9-NEXT:    v_mov_b32_e32 v6, v3
1031; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
1032; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1033; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
1034; GFX9-NEXT:    s_cbranch_execz .LBB10_2
1035; GFX9-NEXT:  ; %bb.1:
1036; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v6
1037; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v0
1038; GFX9-NEXT:    v_sub_co_u32_e32 v11, vcc, 0, v6
1039; GFX9-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v0, vcc
1040; GFX9-NEXT:    v_madmk_f32 v1, v3, 0x4f800000, v1
1041; GFX9-NEXT:    v_rcp_f32_e32 v1, v1
1042; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
1043; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v1
1044; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
1045; GFX9-NEXT:    v_madmk_f32 v1, v3, 0xcf800000, v1
1046; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v3
1047; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1048; GFX9-NEXT:    v_mul_lo_u32 v5, v11, v10
1049; GFX9-NEXT:    v_mul_lo_u32 v8, v12, v1
1050; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0
1051; GFX9-NEXT:    v_add3_u32 v8, v4, v5, v8
1052; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
1053; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v8, 0
1054; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v9, v4
1055; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v10, v3, 0
1056; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v10, v8, 0
1057; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
1058; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v13, v3
1059; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
1060; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v9, vcc
1061; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
1062; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
1063; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
1064; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v10, v4, vcc
1065; GFX9-NEXT:    v_mul_lo_u32 v5, v11, v13
1066; GFX9-NEXT:    v_mul_lo_u32 v8, v12, v1
1067; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v11, v1, 0
1068; GFX9-NEXT:    v_add3_u32 v8, v4, v5, v8
1069; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v13, v8, 0
1070; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v1, v8, 0
1071; GFX9-NEXT:    v_mul_hi_u32 v12, v1, v3
1072; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v13, v3, 0
1073; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v12, v8
1074; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
1075; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v10
1076; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v11, vcc
1077; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
1078; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
1079; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
1080; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
1081; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v13, v4, vcc
1082; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v7, v5, 0
1083; GFX9-NEXT:    v_mul_hi_u32 v8, v7, v1
1084; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v8, v3
1085; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v4, vcc
1086; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v2, v1, 0
1087; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v2, v5, 0
1088; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v10, v3
1089; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v4, vcc
1090; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v9, vcc
1091; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v8
1092; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
1093; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v1
1094; GFX9-NEXT:    v_mul_lo_u32 v9, v6, v5
1095; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v1, 0
1096; GFX9-NEXT:    v_add3_u32 v4, v4, v9, v8
1097; GFX9-NEXT:    v_sub_u32_e32 v8, v2, v4
1098; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, v7, v3
1099; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[4:5], v8, v0, vcc
1100; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v3, v6
1101; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5]
1102; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v0
1103; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
1104; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v6
1105; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
1106; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v0
1107; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[4:5]
1108; GFX9-NEXT:    v_add_co_u32_e64 v8, s[4:5], 2, v1
1109; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
1110; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5]
1111; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
1112; GFX9-NEXT:    v_add_co_u32_e64 v10, s[4:5], 1, v1
1113; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
1114; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
1115; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5]
1116; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
1117; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v0
1118; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
1119; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
1120; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[4:5]
1121; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1122; GFX9-NEXT:    v_cndmask_b32_e64 v0, v10, v8, s[4:5]
1123; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1124; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
1125; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
1126; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1127; GFX9-NEXT:  .LBB10_2: ; %Flow
1128; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[6:7]
1129; GFX9-NEXT:    s_cbranch_execz .LBB10_4
1130; GFX9-NEXT:  ; %bb.3:
1131; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v3
1132; GFX9-NEXT:    v_sub_u32_e32 v2, 0, v3
1133; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1134; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1135; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1136; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1137; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v0
1138; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1139; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1140; GFX9-NEXT:    v_mul_hi_u32 v0, v1, v0
1141; GFX9-NEXT:    v_mul_lo_u32 v2, v0, v3
1142; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
1143; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v2
1144; GFX9-NEXT:    v_sub_u32_e32 v2, v1, v3
1145; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
1146; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1147; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1148; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
1149; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
1150; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
1151; GFX9-NEXT:  .LBB10_4:
1152; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1153; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1154; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1155; GFX9-NEXT:    s_setpc_b64 s[30:31]
1156  %a.ext = ashr i64 %a, 32
1157  %b.ext = ashr i64 %b, 32
1158  %d = udiv i64 %a.ext, %b.ext
1159  ret i64 %d
1160}
1161
; udiv64_known32: unsigned 64-bit divide whose operands are both masked with
; 4294967295 (0xffffffff), so their upper 32 bits are known to be zero.
; The expected code below contains no exec-mask save/restore and no branch:
; only the 32-bit expansion on v0 / v2 is emitted and the upper result dword
; is written as a constant 0 (v_mov_b32 v1, 0).
1162define i64 @udiv64_known32(i64 %a, i64 %b) {
1163; GFX9-LABEL: udiv64_known32:
1164; GFX9:       ; %bb.0:
1165; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
1167; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
1168; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1169; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1170; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1171; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v1
1172; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1173; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1174; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
1175; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
1176; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
1177; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
1178; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
1179; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
1180; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1181; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1182; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
1183; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
1184; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
1185; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1186; GFX9-NEXT:    s_setpc_b64 s[30:31]
1187  %a.mask = and i64 %a, 4294967295
1188  %b.mask = and i64 %b, 4294967295
1189  %d = udiv i64 %a.mask, %b.mask
1190  ret i64 %d
1191}
1192