xref: /llvm-project/llvm/test/CodeGen/AMDGPU/carryout-selection.ll (revision 7c58d6363a40fc6d1cdf6a147da8f3bb0d4f96ec)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN-ISEL                %s
3; RUN: llc -mtriple=amdgcn -stop-after=amdgpu-isel -enable-new-pm < %s | FileCheck -enable-var-scope -check-prefixes=GCN-ISEL                %s
4
5; RUN: llc -mtriple=amdgcn -mcpu=verde   -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CISI    %s
6; RUN: llc -mtriple=amdgcn -mcpu=fiji    -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI      %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx900  -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9    %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1010 %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W32 %s
10; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W64 %s
11; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
12
13; GCN-ISEL-LABEL: name:   sadd64rr
14; GCN-ISEL-LABEL: body:
15; GCN-ISEL-LABEL: bb.0.entry:
16; GCN-ISEL: S_ADD_U64_PSEUDO
17
; Adds the two i64 kernel arguments %a and %b and stores the sum to %out.
; Every expected sequence below performs the add as a scalar
; s_add_u32 / s_addc_u32 pair and then copies the result to VGPRs for the store.
18define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
19; CISI-LABEL: sadd64rr:
20; CISI:       ; %bb.0: ; %entry
21; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
22; CISI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
23; CISI-NEXT:    s_mov_b32 s7, 0xf000
24; CISI-NEXT:    s_mov_b32 s6, -1
25; CISI-NEXT:    s_waitcnt lgkmcnt(0)
26; CISI-NEXT:    s_mov_b32 s4, s0
27; CISI-NEXT:    s_add_u32 s0, s2, s8
28; CISI-NEXT:    s_mov_b32 s5, s1
29; CISI-NEXT:    s_addc_u32 s1, s3, s9
30; CISI-NEXT:    v_mov_b32_e32 v0, s0
31; CISI-NEXT:    v_mov_b32_e32 v1, s1
32; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
33; CISI-NEXT:    s_endpgm
34;
35; VI-LABEL: sadd64rr:
36; VI:       ; %bb.0: ; %entry
37; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
38; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
39; VI-NEXT:    s_waitcnt lgkmcnt(0)
40; VI-NEXT:    v_mov_b32_e32 v0, s0
41; VI-NEXT:    s_add_u32 s0, s2, s4
42; VI-NEXT:    v_mov_b32_e32 v1, s1
43; VI-NEXT:    s_addc_u32 s1, s3, s5
44; VI-NEXT:    v_mov_b32_e32 v3, s1
45; VI-NEXT:    v_mov_b32_e32 v2, s0
46; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
47; VI-NEXT:    s_endpgm
48;
49; GFX9-LABEL: sadd64rr:
50; GFX9:       ; %bb.0: ; %entry
51; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
52; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
53; GFX9-NEXT:    v_mov_b32_e32 v2, 0
54; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
55; GFX9-NEXT:    s_add_u32 s2, s2, s6
56; GFX9-NEXT:    s_addc_u32 s3, s3, s7
57; GFX9-NEXT:    v_mov_b32_e32 v0, s2
58; GFX9-NEXT:    v_mov_b32_e32 v1, s3
59; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
60; GFX9-NEXT:    s_endpgm
61;
62; GFX1010-LABEL: sadd64rr:
63; GFX1010:       ; %bb.0: ; %entry
64; GFX1010-NEXT:    s_clause 0x1
65; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
66; GFX1010-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
67; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
68; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX1010-NEXT:    s_add_u32 s2, s2, s6
70; GFX1010-NEXT:    s_addc_u32 s3, s3, s7
71; GFX1010-NEXT:    v_mov_b32_e32 v0, s2
72; GFX1010-NEXT:    v_mov_b32_e32 v1, s3
73; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
74; GFX1010-NEXT:    s_endpgm
75;
76; GFX1030W32-LABEL: sadd64rr:
77; GFX1030W32:       ; %bb.0: ; %entry
78; GFX1030W32-NEXT:    s_clause 0x1
79; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
80; GFX1030W32-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
81; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
82; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX1030W32-NEXT:    s_add_u32 s2, s2, s4
84; GFX1030W32-NEXT:    s_addc_u32 s3, s3, s5
85; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s2
86; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s3
87; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
88; GFX1030W32-NEXT:    s_endpgm
89;
90; GFX1030W64-LABEL: sadd64rr:
91; GFX1030W64:       ; %bb.0: ; %entry
92; GFX1030W64-NEXT:    s_clause 0x1
93; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
94; GFX1030W64-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
95; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
96; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX1030W64-NEXT:    s_add_u32 s2, s2, s4
98; GFX1030W64-NEXT:    s_addc_u32 s3, s3, s5
99; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s2
100; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s3
101; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
102; GFX1030W64-NEXT:    s_endpgm
103;
104; GFX11-LABEL: sadd64rr:
105; GFX11:       ; %bb.0: ; %entry
106; GFX11-NEXT:    s_clause 0x1
107; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
108; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
109; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
110; GFX11-NEXT:    s_add_u32 s2, s2, s4
111; GFX11-NEXT:    s_addc_u32 s3, s3, s5
112; GFX11-NEXT:    v_mov_b32_e32 v0, s2
113; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
114; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
115; GFX11-NEXT:    s_endpgm
116entry:
  ; Both add operands are kernel arguments; the carry of the add has no user.
117  %add = add i64 %a, %b
118  store i64 %add, ptr addrspace(1) %out
119  ret void
120}
121
122; GCN-ISEL-LABEL: name:   sadd64ri
123; GCN-ISEL-LABEL: body:
124; GCN-ISEL-LABEL: bb.0.entry:
125; GCN-ISEL: S_ADD_U64_PSEUDO
126
; Adds a 64-bit immediate to the i64 kernel argument %a and stores the sum to
; %out. Every expected sequence below performs the add as a scalar
; s_add_u32 / s_addc_u32 pair with literal operands.
127define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
128; CISI-LABEL: sadd64ri:
129; CISI:       ; %bb.0: ; %entry
130; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
131; CISI-NEXT:    s_mov_b32 s7, 0xf000
132; CISI-NEXT:    s_mov_b32 s6, -1
133; CISI-NEXT:    s_waitcnt lgkmcnt(0)
134; CISI-NEXT:    s_mov_b32 s4, s0
135; CISI-NEXT:    s_add_u32 s0, s2, 0x56789876
136; CISI-NEXT:    s_mov_b32 s5, s1
137; CISI-NEXT:    s_addc_u32 s1, s3, 0x1234
138; CISI-NEXT:    v_mov_b32_e32 v0, s0
139; CISI-NEXT:    v_mov_b32_e32 v1, s1
140; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
141; CISI-NEXT:    s_endpgm
142;
143; VI-LABEL: sadd64ri:
144; VI:       ; %bb.0: ; %entry
145; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
146; VI-NEXT:    s_waitcnt lgkmcnt(0)
147; VI-NEXT:    v_mov_b32_e32 v0, s0
148; VI-NEXT:    s_add_u32 s0, s2, 0x56789876
149; VI-NEXT:    v_mov_b32_e32 v1, s1
150; VI-NEXT:    s_addc_u32 s1, s3, 0x1234
151; VI-NEXT:    v_mov_b32_e32 v3, s1
152; VI-NEXT:    v_mov_b32_e32 v2, s0
153; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
154; VI-NEXT:    s_endpgm
155;
156; GFX9-LABEL: sadd64ri:
157; GFX9:       ; %bb.0: ; %entry
158; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
159; GFX9-NEXT:    v_mov_b32_e32 v2, 0
160; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
161; GFX9-NEXT:    s_add_u32 s2, s2, 0x56789876
162; GFX9-NEXT:    s_addc_u32 s3, s3, 0x1234
163; GFX9-NEXT:    v_mov_b32_e32 v0, s2
164; GFX9-NEXT:    v_mov_b32_e32 v1, s3
165; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
166; GFX9-NEXT:    s_endpgm
167;
168; GFX1010-LABEL: sadd64ri:
169; GFX1010:       ; %bb.0: ; %entry
170; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
171; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
172; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
173; GFX1010-NEXT:    s_add_u32 s2, s2, 0x56789876
174; GFX1010-NEXT:    s_addc_u32 s3, s3, 0x1234
175; GFX1010-NEXT:    v_mov_b32_e32 v0, s2
176; GFX1010-NEXT:    v_mov_b32_e32 v1, s3
177; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
178; GFX1010-NEXT:    s_endpgm
179;
180; GFX1030W32-LABEL: sadd64ri:
181; GFX1030W32:       ; %bb.0: ; %entry
182; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
183; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
184; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX1030W32-NEXT:    s_add_u32 s2, s2, 0x56789876
186; GFX1030W32-NEXT:    s_addc_u32 s3, s3, 0x1234
187; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s2
188; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s3
189; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
190; GFX1030W32-NEXT:    s_endpgm
191;
192; GFX1030W64-LABEL: sadd64ri:
193; GFX1030W64:       ; %bb.0: ; %entry
194; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
195; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
196; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
197; GFX1030W64-NEXT:    s_add_u32 s2, s2, 0x56789876
198; GFX1030W64-NEXT:    s_addc_u32 s3, s3, 0x1234
199; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s2
200; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s3
201; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
202; GFX1030W64-NEXT:    s_endpgm
203;
204; GFX11-LABEL: sadd64ri:
205; GFX11:       ; %bb.0: ; %entry
206; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
207; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
208; GFX11-NEXT:    s_add_u32 s2, s2, 0x56789876
209; GFX11-NEXT:    s_addc_u32 s3, s3, 0x1234
210; GFX11-NEXT:    v_mov_b32_e32 v0, s2
211; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
212; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
213; GFX11-NEXT:    s_endpgm
214entry:
  ; 20015998343286 is 0x0000123456789876, so the expected code adds the
  ; literal 0x56789876 to the low half and 0x1234 (with carry) to the high half.
215  %add = add i64 20015998343286, %a
216  store i64 %add, ptr addrspace(1) %out
217  ret void
218}
219
220; GCN-ISEL-LABEL: name:   vadd64rr
221; GCN-ISEL-LABEL: body:
222; GCN-ISEL-LABEL: bb.0.entry:
223; GCN-ISEL: V_ADD_U64_PSEUDO
224
; Adds the sign-extended workitem id (x) to the i64 kernel argument %a and
; stores the sum to %out. Because one operand comes from the workitem id, every
; expected sequence below uses a vector add followed by a vector add-with-carry.
225define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
226; CISI-LABEL: vadd64rr:
227; CISI:       ; %bb.0: ; %entry
228; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
229; CISI-NEXT:    s_mov_b32 s7, 0xf000
230; CISI-NEXT:    s_mov_b32 s6, -1
231; CISI-NEXT:    s_waitcnt lgkmcnt(0)
232; CISI-NEXT:    v_mov_b32_e32 v1, s3
233; CISI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
234; CISI-NEXT:    s_mov_b32 s4, s0
235; CISI-NEXT:    s_mov_b32 s5, s1
236; CISI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
237; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
238; CISI-NEXT:    s_endpgm
239;
240; VI-LABEL: vadd64rr:
241; VI:       ; %bb.0: ; %entry
242; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
243; VI-NEXT:    s_waitcnt lgkmcnt(0)
244; VI-NEXT:    v_mov_b32_e32 v4, s3
245; VI-NEXT:    v_add_u32_e32 v3, vcc, s2, v0
246; VI-NEXT:    v_mov_b32_e32 v1, s0
247; VI-NEXT:    v_mov_b32_e32 v2, s1
248; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
249; VI-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
250; VI-NEXT:    s_endpgm
251;
252; GFX9-LABEL: vadd64rr:
253; GFX9:       ; %bb.0: ; %entry
254; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
255; GFX9-NEXT:    v_mov_b32_e32 v2, 0
256; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX9-NEXT:    v_mov_b32_e32 v1, s3
258; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
259; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
260; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
261; GFX9-NEXT:    s_endpgm
262;
263; GFX1010-LABEL: vadd64rr:
264; GFX1010:       ; %bb.0: ; %entry
265; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
266; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
267; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX1010-NEXT:    v_add_co_u32 v0, s2, s2, v0
269; GFX1010-NEXT:    v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
270; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
271; GFX1010-NEXT:    s_endpgm
272;
273; GFX1030W32-LABEL: vadd64rr:
274; GFX1030W32:       ; %bb.0: ; %entry
275; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
276; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
277; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX1030W32-NEXT:    v_add_co_u32 v0, s2, s2, v0
279; GFX1030W32-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s2
280; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
281; GFX1030W32-NEXT:    s_endpgm
282;
283; GFX1030W64-LABEL: vadd64rr:
284; GFX1030W64:       ; %bb.0: ; %entry
285; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
286; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
287; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX1030W64-NEXT:    v_add_co_u32 v0, s[4:5], s2, v0
289; GFX1030W64-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
290; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
291; GFX1030W64-NEXT:    s_endpgm
292;
293; GFX11-LABEL: vadd64rr:
294; GFX11:       ; %bb.0: ; %entry
295; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
296; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
297; GFX11-NEXT:    v_mov_b32_e32 v2, 0
298; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
300; GFX11-NEXT:    v_add_co_u32 v0, s2, s2, v0
301; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s2
302; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
303; GFX11-NEXT:    s_endpgm
304entry:
305  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): the expected code adds a literal 0 as the high half of the
  ; extended id even though this is a sext - presumably the workitem id is
  ; known to be non-negative; confirm.
306  %tid.ext = sext i32 %tid to i64
307  %add = add i64 %a, %tid.ext
308  store i64 %add, ptr addrspace(1) %out
309  ret void
310}
311
312; GCN-ISEL-LABEL: name:   vadd64ri
313; GCN-ISEL-LABEL: body:
314; GCN-ISEL-LABEL: bb.0.entry:
315; GCN-ISEL: V_ADD_U64_PSEUDO
316
; Adds a 64-bit immediate to the sign-extended workitem id (x) and stores the
; sum to %out. Every expected sequence below uses a vector add followed by a
; vector add-with-carry, with the immediate split into two 32-bit literals.
317define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
318; CISI-LABEL: vadd64ri:
319; CISI:       ; %bb.0: ; %entry
320; CISI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
321; CISI-NEXT:    v_add_i32_e32 v0, vcc, 0x56789876, v0
322; CISI-NEXT:    v_mov_b32_e32 v1, 0x1234
323; CISI-NEXT:    s_mov_b32 s3, 0xf000
324; CISI-NEXT:    s_mov_b32 s2, -1
325; CISI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
326; CISI-NEXT:    s_waitcnt lgkmcnt(0)
327; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
328; CISI-NEXT:    s_endpgm
329;
330; VI-LABEL: vadd64ri:
331; VI:       ; %bb.0: ; %entry
332; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
333; VI-NEXT:    v_add_u32_e32 v0, vcc, 0x56789876, v0
334; VI-NEXT:    v_mov_b32_e32 v1, 0x1234
335; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
336; VI-NEXT:    s_waitcnt lgkmcnt(0)
337; VI-NEXT:    v_mov_b32_e32 v3, s1
338; VI-NEXT:    v_mov_b32_e32 v2, s0
339; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
340; VI-NEXT:    s_endpgm
341;
342; GFX9-LABEL: vadd64ri:
343; GFX9:       ; %bb.0: ; %entry
344; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
345; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x56789876, v0
346; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1234
347; GFX9-NEXT:    v_mov_b32_e32 v2, 0
348; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
349; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
350; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
351; GFX9-NEXT:    s_endpgm
352;
353; GFX1010-LABEL: vadd64ri:
354; GFX1010:       ; %bb.0: ; %entry
355; GFX1010-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
356; GFX1010-NEXT:    v_add_co_u32 v0, s2, 0x56789876, v0
357; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
358; GFX1010-NEXT:    v_add_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
359; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
361; GFX1010-NEXT:    s_endpgm
362;
363; GFX1030W32-LABEL: vadd64ri:
364; GFX1030W32:       ; %bb.0: ; %entry
365; GFX1030W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
366; GFX1030W32-NEXT:    v_add_co_u32 v0, s2, 0x56789876, v0
367; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
368; GFX1030W32-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2
369; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
371; GFX1030W32-NEXT:    s_endpgm
372;
373; GFX1030W64-LABEL: vadd64ri:
374; GFX1030W64:       ; %bb.0: ; %entry
375; GFX1030W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
376; GFX1030W64-NEXT:    v_add_co_u32 v0, s[2:3], 0x56789876, v0
377; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
378; GFX1030W64-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3]
379; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
380; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
381; GFX1030W64-NEXT:    s_endpgm
382;
383; GFX11-LABEL: vadd64ri:
384; GFX11:       ; %bb.0: ; %entry
385; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
386; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
387; GFX11-NEXT:    v_mov_b32_e32 v2, 0
388; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
389; GFX11-NEXT:    v_add_co_u32 v0, s2, 0x56789876, v0
390; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2
391; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
393; GFX11-NEXT:    s_endpgm
394entry:
395  %tid = call i32 @llvm.amdgcn.workitem.id.x()
396  %tid.ext = sext i32 %tid to i64
  ; 20015998343286 is 0x0000123456789876; the expected code uses 0x56789876
  ; for the low half and 0x1234 for the high half.
397  %add = add i64 20015998343286, %tid.ext
398  store i64 %add, ptr addrspace(1) %out
399  ret void
400}
401
402; GCN-ISEL-LABEL: name:   suaddo32
403; GCN-ISEL-LABEL: body:
404; GCN-ISEL-LABEL: bb.0
405; GCN-ISEL: S_ADD_I32
; Calls the 32-bit unsigned add-with-overflow intrinsic on the kernel arguments
; %a and %b but stores only the sum to %out; %carryout is never written. Every
; expected sequence below is a plain s_add_i32 with no carry materialization.
406define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
407; CISI-LABEL: suaddo32:
408; CISI:       ; %bb.0:
409; CISI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
410; CISI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
411; CISI-NEXT:    s_mov_b32 s3, 0xf000
412; CISI-NEXT:    s_mov_b32 s2, -1
413; CISI-NEXT:    s_waitcnt lgkmcnt(0)
414; CISI-NEXT:    s_add_i32 s4, s6, s7
415; CISI-NEXT:    v_mov_b32_e32 v0, s4
416; CISI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
417; CISI-NEXT:    s_endpgm
418;
419; VI-LABEL: suaddo32:
420; VI:       ; %bb.0:
421; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
422; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
423; VI-NEXT:    s_waitcnt lgkmcnt(0)
424; VI-NEXT:    s_add_i32 s0, s0, s1
425; VI-NEXT:    v_mov_b32_e32 v0, s2
426; VI-NEXT:    v_mov_b32_e32 v1, s3
427; VI-NEXT:    v_mov_b32_e32 v2, s0
428; VI-NEXT:    flat_store_dword v[0:1], v2
429; VI-NEXT:    s_endpgm
430;
431; GFX9-LABEL: suaddo32:
432; GFX9:       ; %bb.0:
433; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
434; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
435; GFX9-NEXT:    v_mov_b32_e32 v0, 0
436; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
437; GFX9-NEXT:    s_add_i32 s0, s0, s1
438; GFX9-NEXT:    v_mov_b32_e32 v1, s0
439; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
440; GFX9-NEXT:    s_endpgm
441;
442; GFX1010-LABEL: suaddo32:
443; GFX1010:       ; %bb.0:
444; GFX1010-NEXT:    s_clause 0x1
445; GFX1010-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
446; GFX1010-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
447; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
448; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX1010-NEXT:    s_add_i32 s0, s0, s1
450; GFX1010-NEXT:    v_mov_b32_e32 v1, s0
451; GFX1010-NEXT:    global_store_dword v0, v1, s[2:3]
452; GFX1010-NEXT:    s_endpgm
453;
454; GFX1030W32-LABEL: suaddo32:
455; GFX1030W32:       ; %bb.0:
456; GFX1030W32-NEXT:    s_clause 0x1
457; GFX1030W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
458; GFX1030W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
459; GFX1030W32-NEXT:    v_mov_b32_e32 v0, 0
460; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
461; GFX1030W32-NEXT:    s_add_i32 s0, s0, s1
462; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s0
463; GFX1030W32-NEXT:    global_store_dword v0, v1, s[2:3]
464; GFX1030W32-NEXT:    s_endpgm
465;
466; GFX1030W64-LABEL: suaddo32:
467; GFX1030W64:       ; %bb.0:
468; GFX1030W64-NEXT:    s_clause 0x1
469; GFX1030W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
470; GFX1030W64-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
471; GFX1030W64-NEXT:    v_mov_b32_e32 v0, 0
472; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX1030W64-NEXT:    s_add_i32 s0, s0, s1
474; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s0
475; GFX1030W64-NEXT:    global_store_dword v0, v1, s[2:3]
476; GFX1030W64-NEXT:    s_endpgm
477;
478; GFX11-LABEL: suaddo32:
479; GFX11:       ; %bb.0:
480; GFX11-NEXT:    s_clause 0x1
481; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
482; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
483; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX11-NEXT:    s_add_i32 s0, s0, s1
485; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
486; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
487; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
488; GFX11-NEXT:    s_endpgm
489  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
490  %val = extractvalue { i32, i1 } %uadd, 0
  ; %carry is extracted but has no user: only %val is stored below.
491  %carry = extractvalue { i32, i1 } %uadd, 1
492  store i32 %val, ptr addrspace(1) %out, align 4
493  ret void
494}
495
496
497; GCN-ISEL-LABEL: name:   uaddo32_vcc_user
498; GCN-ISEL-LABEL: body:
499; GCN-ISEL-LABEL: bb.0
500; GCN-ISEL: V_ADD_CO_U32_e64
501
502; Below we check selection to v_add/addc,
503; because the only user of the VCC produced by the UADDO is v_cndmask.
504; We select the VALU form to avoid an unnecessary s_cselect to copy SCC to VCC.
505
; Calls the 32-bit unsigned add-with-overflow intrinsic on the kernel arguments
; %a and %b and stores both results: the sum to %out and the i1 carry to
; %carryout. Every expected sequence below uses a vector add whose carry-out
; feeds v_cndmask_b32 to produce the byte that is stored.
506define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
507; CISI-LABEL: uaddo32_vcc_user:
508; CISI:       ; %bb.0:
509; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
510; CISI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
511; CISI-NEXT:    s_mov_b32 s7, 0xf000
512; CISI-NEXT:    s_mov_b32 s6, -1
513; CISI-NEXT:    s_waitcnt lgkmcnt(0)
514; CISI-NEXT:    s_mov_b32 s4, s0
515; CISI-NEXT:    v_mov_b32_e32 v0, s9
516; CISI-NEXT:    s_mov_b32 s5, s1
517; CISI-NEXT:    v_add_i32_e32 v0, vcc, s8, v0
518; CISI-NEXT:    s_mov_b32 s0, s2
519; CISI-NEXT:    s_mov_b32 s1, s3
520; CISI-NEXT:    s_mov_b32 s2, s6
521; CISI-NEXT:    s_mov_b32 s3, s7
522; CISI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
523; CISI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
524; CISI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
525; CISI-NEXT:    s_endpgm
526;
527; VI-LABEL: uaddo32_vcc_user:
528; VI:       ; %bb.0:
529; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
530; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
531; VI-NEXT:    s_waitcnt lgkmcnt(0)
532; VI-NEXT:    v_mov_b32_e32 v0, s0
533; VI-NEXT:    v_mov_b32_e32 v4, s5
534; VI-NEXT:    v_mov_b32_e32 v1, s1
535; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
536; VI-NEXT:    v_mov_b32_e32 v2, s2
537; VI-NEXT:    v_mov_b32_e32 v3, s3
538; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
539; VI-NEXT:    flat_store_dword v[0:1], v4
540; VI-NEXT:    flat_store_byte v[2:3], v5
541; VI-NEXT:    s_endpgm
542;
543; GFX9-LABEL: uaddo32_vcc_user:
544; GFX9:       ; %bb.0:
545; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
546; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
547; GFX9-NEXT:    v_mov_b32_e32 v0, 0
548; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
549; GFX9-NEXT:    v_mov_b32_e32 v1, s7
550; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s6, v1
551; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
552; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
553; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
554; GFX9-NEXT:    s_endpgm
555;
556; GFX1010-LABEL: uaddo32_vcc_user:
557; GFX1010:       ; %bb.0:
558; GFX1010-NEXT:    s_clause 0x1
559; GFX1010-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
560; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
561; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
562; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
563; GFX1010-NEXT:    v_add_co_u32 v1, s4, s6, s7
564; GFX1010-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
565; GFX1010-NEXT:    global_store_dword v0, v1, s[0:1]
566; GFX1010-NEXT:    global_store_byte v0, v2, s[2:3]
567; GFX1010-NEXT:    s_endpgm
568;
569; GFX1030W32-LABEL: uaddo32_vcc_user:
570; GFX1030W32:       ; %bb.0:
571; GFX1030W32-NEXT:    s_clause 0x1
572; GFX1030W32-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
573; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
574; GFX1030W32-NEXT:    v_mov_b32_e32 v0, 0
575; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
576; GFX1030W32-NEXT:    v_add_co_u32 v1, s4, s6, s7
577; GFX1030W32-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
578; GFX1030W32-NEXT:    global_store_dword v0, v1, s[0:1]
579; GFX1030W32-NEXT:    global_store_byte v0, v2, s[2:3]
580; GFX1030W32-NEXT:    s_endpgm
581;
582; GFX1030W64-LABEL: uaddo32_vcc_user:
583; GFX1030W64:       ; %bb.0:
584; GFX1030W64-NEXT:    s_clause 0x1
585; GFX1030W64-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
586; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
587; GFX1030W64-NEXT:    v_mov_b32_e32 v0, 0
588; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
589; GFX1030W64-NEXT:    v_add_co_u32 v1, s[4:5], s6, s7
590; GFX1030W64-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
591; GFX1030W64-NEXT:    global_store_dword v0, v1, s[0:1]
592; GFX1030W64-NEXT:    global_store_byte v0, v2, s[2:3]
593; GFX1030W64-NEXT:    s_endpgm
594;
595; GFX11-LABEL: uaddo32_vcc_user:
596; GFX11:       ; %bb.0:
597; GFX11-NEXT:    s_clause 0x1
598; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
599; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
600; GFX11-NEXT:    v_mov_b32_e32 v0, 0
601; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
602; GFX11-NEXT:    v_add_co_u32 v1, s4, s6, s7
603; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
604; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
605; GFX11-NEXT:    s_clause 0x1
606; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
607; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
608; GFX11-NEXT:    s_endpgm
609  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
610  %val = extractvalue { i32, i1 } %uadd, 0
611  %carry = extractvalue { i32, i1 } %uadd, 1
612  store i32 %val, ptr addrspace(1) %out, align 4
  ; Storing the carry gives the overflow bit a user, unlike in suaddo32.
613  store i1 %carry, ptr addrspace(1) %carryout
614  ret void
615}
616
617; GCN-ISEL-LABEL: name:   suaddo64
618; GCN-ISEL-LABEL: body:
619; GCN-ISEL-LABEL: bb.0
620; GCN-ISEL: S_ADD_U64_PSEUDO
621
; Calls the 64-bit unsigned add-with-overflow intrinsic on the kernel arguments
; %a and %b and stores both results: the sum to %out and the i1 carry to
; %carryout. Every expected sequence below computes the sum with a scalar
; s_add_u32 / s_addc_u32 pair, then derives the carry with an unsigned 64-bit
; less-than compare (v_cmp_lt_u64) of the sum against one of the original
; operands, and turns it into the stored byte with v_cndmask_b32.
622define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
623; CISI-LABEL: suaddo64:
624; CISI:       ; %bb.0:
625; CISI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
626; CISI-NEXT:    s_mov_b32 s11, 0xf000
627; CISI-NEXT:    s_mov_b32 s10, -1
628; CISI-NEXT:    s_waitcnt lgkmcnt(0)
629; CISI-NEXT:    s_add_u32 s6, s4, s6
630; CISI-NEXT:    v_mov_b32_e32 v0, s4
631; CISI-NEXT:    s_addc_u32 s7, s5, s7
632; CISI-NEXT:    v_mov_b32_e32 v1, s5
633; CISI-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
634; CISI-NEXT:    v_mov_b32_e32 v2, s6
635; CISI-NEXT:    s_mov_b32 s8, s0
636; CISI-NEXT:    s_mov_b32 s9, s1
637; CISI-NEXT:    s_mov_b32 s0, s2
638; CISI-NEXT:    s_mov_b32 s1, s3
639; CISI-NEXT:    s_mov_b32 s2, s10
640; CISI-NEXT:    s_mov_b32 s3, s11
641; CISI-NEXT:    v_mov_b32_e32 v3, s7
642; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
643; CISI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
644; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
645; CISI-NEXT:    s_endpgm
646;
647; VI-LABEL: suaddo64:
648; VI:       ; %bb.0:
649; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
650; VI-NEXT:    s_waitcnt lgkmcnt(0)
651; VI-NEXT:    v_mov_b32_e32 v0, s0
652; VI-NEXT:    s_add_u32 s0, s4, s6
653; VI-NEXT:    v_mov_b32_e32 v4, s4
654; VI-NEXT:    v_mov_b32_e32 v1, s1
655; VI-NEXT:    s_addc_u32 s1, s5, s7
656; VI-NEXT:    v_mov_b32_e32 v5, s5
657; VI-NEXT:    v_mov_b32_e32 v7, s1
658; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
659; VI-NEXT:    v_mov_b32_e32 v6, s0
660; VI-NEXT:    v_mov_b32_e32 v2, s2
661; VI-NEXT:    v_mov_b32_e32 v3, s3
662; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
663; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
664; VI-NEXT:    flat_store_byte v[2:3], v0
665; VI-NEXT:    s_endpgm
666;
667; GFX9-LABEL: suaddo64:
668; GFX9:       ; %bb.0:
669; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
670; GFX9-NEXT:    v_mov_b32_e32 v4, 0
671; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX9-NEXT:    s_add_u32 s0, s12, s14
673; GFX9-NEXT:    v_mov_b32_e32 v0, s12
674; GFX9-NEXT:    v_mov_b32_e32 v1, s13
675; GFX9-NEXT:    s_addc_u32 s1, s13, s15
676; GFX9-NEXT:    v_mov_b32_e32 v3, s1
677; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
678; GFX9-NEXT:    v_mov_b32_e32 v2, s0
679; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
680; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[8:9]
681; GFX9-NEXT:    global_store_byte v4, v0, s[10:11]
682; GFX9-NEXT:    s_endpgm
683;
684; GFX1010-LABEL: suaddo64:
685; GFX1010:       ; %bb.0:
686; GFX1010-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
687; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
688; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX1010-NEXT:    s_add_u32 s0, s12, s14
690; GFX1010-NEXT:    s_addc_u32 s1, s13, s15
691; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
692; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
693; GFX1010-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], s[12:13]
694; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
695; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
696; GFX1010-NEXT:    global_store_byte v2, v3, s[10:11]
697; GFX1010-NEXT:    s_endpgm
698;
699; GFX1030W32-LABEL: suaddo64:
700; GFX1030W32:       ; %bb.0:
701; GFX1030W32-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
702; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
703; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
704; GFX1030W32-NEXT:    s_add_u32 s6, s4, s6
705; GFX1030W32-NEXT:    s_addc_u32 s7, s5, s7
706; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s6
707; GFX1030W32-NEXT:    v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
708; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s7
709; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
710; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
711; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
712; GFX1030W32-NEXT:    s_endpgm
713;
714; GFX1030W64-LABEL: suaddo64:
715; GFX1030W64:       ; %bb.0:
716; GFX1030W64-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
717; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
718; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
719; GFX1030W64-NEXT:    s_add_u32 s6, s4, s6
720; GFX1030W64-NEXT:    s_addc_u32 s7, s5, s7
721; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s6
722; GFX1030W64-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[6:7], s[4:5]
723; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s7
724; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
725; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
726; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
727; GFX1030W64-NEXT:    s_endpgm
728;
729; GFX11-LABEL: suaddo64:
730; GFX11:       ; %bb.0:
731; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
732; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX11-NEXT:    s_add_u32 s6, s4, s6
734; GFX11-NEXT:    s_addc_u32 s7, s5, s7
735; GFX11-NEXT:    v_mov_b32_e32 v0, s6
736; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
737; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
738; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
739; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
740; GFX11-NEXT:    s_clause 0x1
741; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
742; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
743; GFX11-NEXT:    s_endpgm
744  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
745  %val = extractvalue { i64, i1 } %uadd, 0
746  %carry = extractvalue { i64, i1 } %uadd, 1
747  store i64 %val, ptr addrspace(1) %out, align 8
  ; Both the sum and the overflow bit are stored, so both results are used.
748  store i1 %carry, ptr addrspace(1) %carryout
749  ret void
750}
751
752; GCN-ISEL-LABEL: name:   vuaddo64
753; GCN-ISEL-LABEL: body:
754; GCN-ISEL-LABEL: bb.0
755; GCN-ISEL: V_ADD_U64_PSEUDO
756
; Kernel @vuaddo64 calls llvm.uadd.with.overflow.i64 on the kernel argument %a
; and the sign-extended workitem id, then stores the 64-bit sum to %out and the
; i1 overflow flag to %carryout. The per-target assertions inside the body are
; autogenerated (see the NOTE on the first line of the file); do not edit them
; by hand.
757define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
758; CISI-LABEL: vuaddo64:
759; CISI:       ; %bb.0:
760; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
761; CISI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
762; CISI-NEXT:    s_mov_b32 s7, 0xf000
763; CISI-NEXT:    s_mov_b32 s6, -1
764; CISI-NEXT:    s_waitcnt lgkmcnt(0)
765; CISI-NEXT:    s_mov_b32 s4, s0
766; CISI-NEXT:    v_mov_b32_e32 v1, s9
767; CISI-NEXT:    v_add_i32_e32 v0, vcc, s8, v0
768; CISI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
769; CISI-NEXT:    v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1]
770; CISI-NEXT:    s_mov_b32 s5, s1
771; CISI-NEXT:    s_mov_b32 s0, s2
772; CISI-NEXT:    s_mov_b32 s1, s3
773; CISI-NEXT:    s_mov_b32 s2, s6
774; CISI-NEXT:    s_mov_b32 s3, s7
775; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
776; CISI-NEXT:    s_waitcnt expcnt(0)
777; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
778; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
779; CISI-NEXT:    s_endpgm
780;
781; VI-LABEL: vuaddo64:
782; VI:       ; %bb.0:
783; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
784; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
785; VI-NEXT:    s_waitcnt lgkmcnt(0)
786; VI-NEXT:    v_mov_b32_e32 v1, s0
787; VI-NEXT:    v_mov_b32_e32 v6, s5
788; VI-NEXT:    v_add_u32_e32 v5, vcc, s4, v0
789; VI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
790; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
791; VI-NEXT:    v_mov_b32_e32 v2, s1
792; VI-NEXT:    v_mov_b32_e32 v3, s2
793; VI-NEXT:    v_mov_b32_e32 v4, s3
794; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
795; VI-NEXT:    flat_store_dwordx2 v[1:2], v[5:6]
796; VI-NEXT:    flat_store_byte v[3:4], v0
797; VI-NEXT:    s_endpgm
798;
799; GFX9-LABEL: vuaddo64:
800; GFX9:       ; %bb.0:
801; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
802; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
803; GFX9-NEXT:    v_mov_b32_e32 v2, 0
804; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
805; GFX9-NEXT:    v_mov_b32_e32 v1, s7
806; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v0
807; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
808; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
809; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
810; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
811; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
812; GFX9-NEXT:    s_endpgm
813;
814; GFX1010-LABEL: vuaddo64:
815; GFX1010:       ; %bb.0:
816; GFX1010-NEXT:    s_clause 0x1
817; GFX1010-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
818; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
819; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
820; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX1010-NEXT:    v_add_co_u32 v0, s4, s6, v0
822; GFX1010-NEXT:    v_add_co_ci_u32_e64 v1, s4, s7, 0, s4
823; GFX1010-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
824; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
825; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
826; GFX1010-NEXT:    global_store_byte v2, v3, s[2:3]
827; GFX1010-NEXT:    s_endpgm
828;
829; GFX1030W32-LABEL: vuaddo64:
830; GFX1030W32:       ; %bb.0:
831; GFX1030W32-NEXT:    s_clause 0x1
832; GFX1030W32-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
833; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
834; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
835; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
836; GFX1030W32-NEXT:    v_add_co_u32 v0, s4, s6, v0
837; GFX1030W32-NEXT:    v_add_co_ci_u32_e64 v1, null, s7, 0, s4
838; GFX1030W32-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
839; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
840; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
841; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
842; GFX1030W32-NEXT:    s_endpgm
843;
844; GFX1030W64-LABEL: vuaddo64:
845; GFX1030W64:       ; %bb.0:
846; GFX1030W64-NEXT:    s_clause 0x1
847; GFX1030W64-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
848; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
849; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
850; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
851; GFX1030W64-NEXT:    v_add_co_u32 v0, s[4:5], s6, v0
852; GFX1030W64-NEXT:    v_add_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
853; GFX1030W64-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
854; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
855; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
856; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
857; GFX1030W64-NEXT:    s_endpgm
858;
859; GFX11-LABEL: vuaddo64:
860; GFX11:       ; %bb.0:
861; GFX11-NEXT:    s_clause 0x1
862; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
863; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
864; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
865; GFX11-NEXT:    v_mov_b32_e32 v2, 0
866; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
868; GFX11-NEXT:    v_add_co_u32 v0, s4, s6, v0
869; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s7, 0, s4
870; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
871; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1]
872; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
873; GFX11-NEXT:    s_clause 0x1
874; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
875; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
876; GFX11-NEXT:    s_endpgm
  ; The second addend is the workitem id, sign-extended from i32 to i64.
877  %tid = call i32 @llvm.amdgcn.workitem.id.x()
878  %tid.ext = sext i32 %tid to i64
879  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext)
  ; Field 0 of the result is the sum, field 1 is the overflow bit; both are stored.
880  %val = extractvalue { i64, i1 } %uadd, 0
881  %carry = extractvalue { i64, i1 } %uadd, 1
882  store i64 %val, ptr addrspace(1) %out, align 8
883  store i1 %carry, ptr addrspace(1) %carryout
884  ret void
885}
886
887; GCN-ISEL-LABEL: name:   ssub64rr
888; GCN-ISEL-LABEL: body:
889; GCN-ISEL-LABEL: bb.0.entry:
890; GCN-ISEL: S_SUB_U64_PSEUDO
891
; Kernel @ssub64rr stores the 64-bit difference of its two kernel arguments
; (%a - %b) to %out. The per-target assertions inside the body are
; autogenerated (see the NOTE on the first line of the file); do not edit them
; by hand.
892define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
893; CISI-LABEL: ssub64rr:
894; CISI:       ; %bb.0: ; %entry
895; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
896; CISI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
897; CISI-NEXT:    s_mov_b32 s7, 0xf000
898; CISI-NEXT:    s_mov_b32 s6, -1
899; CISI-NEXT:    s_waitcnt lgkmcnt(0)
900; CISI-NEXT:    s_mov_b32 s4, s0
901; CISI-NEXT:    s_sub_u32 s0, s2, s8
902; CISI-NEXT:    s_mov_b32 s5, s1
903; CISI-NEXT:    s_subb_u32 s1, s3, s9
904; CISI-NEXT:    v_mov_b32_e32 v0, s0
905; CISI-NEXT:    v_mov_b32_e32 v1, s1
906; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
907; CISI-NEXT:    s_endpgm
908;
909; VI-LABEL: ssub64rr:
910; VI:       ; %bb.0: ; %entry
911; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
912; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
913; VI-NEXT:    s_waitcnt lgkmcnt(0)
914; VI-NEXT:    v_mov_b32_e32 v0, s0
915; VI-NEXT:    s_sub_u32 s0, s2, s4
916; VI-NEXT:    v_mov_b32_e32 v1, s1
917; VI-NEXT:    s_subb_u32 s1, s3, s5
918; VI-NEXT:    v_mov_b32_e32 v3, s1
919; VI-NEXT:    v_mov_b32_e32 v2, s0
920; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
921; VI-NEXT:    s_endpgm
922;
923; GFX9-LABEL: ssub64rr:
924; GFX9:       ; %bb.0: ; %entry
925; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
926; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
927; GFX9-NEXT:    v_mov_b32_e32 v2, 0
928; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
929; GFX9-NEXT:    s_sub_u32 s2, s2, s6
930; GFX9-NEXT:    s_subb_u32 s3, s3, s7
931; GFX9-NEXT:    v_mov_b32_e32 v0, s2
932; GFX9-NEXT:    v_mov_b32_e32 v1, s3
933; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
934; GFX9-NEXT:    s_endpgm
935;
936; GFX1010-LABEL: ssub64rr:
937; GFX1010:       ; %bb.0: ; %entry
938; GFX1010-NEXT:    s_clause 0x1
939; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
940; GFX1010-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
941; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
942; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
943; GFX1010-NEXT:    s_sub_u32 s2, s2, s6
944; GFX1010-NEXT:    s_subb_u32 s3, s3, s7
945; GFX1010-NEXT:    v_mov_b32_e32 v0, s2
946; GFX1010-NEXT:    v_mov_b32_e32 v1, s3
947; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
948; GFX1010-NEXT:    s_endpgm
949;
950; GFX1030W32-LABEL: ssub64rr:
951; GFX1030W32:       ; %bb.0: ; %entry
952; GFX1030W32-NEXT:    s_clause 0x1
953; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
954; GFX1030W32-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
955; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
956; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
957; GFX1030W32-NEXT:    s_sub_u32 s2, s2, s4
958; GFX1030W32-NEXT:    s_subb_u32 s3, s3, s5
959; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s2
960; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s3
961; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
962; GFX1030W32-NEXT:    s_endpgm
963;
964; GFX1030W64-LABEL: ssub64rr:
965; GFX1030W64:       ; %bb.0: ; %entry
966; GFX1030W64-NEXT:    s_clause 0x1
967; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
968; GFX1030W64-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
969; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
970; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
971; GFX1030W64-NEXT:    s_sub_u32 s2, s2, s4
972; GFX1030W64-NEXT:    s_subb_u32 s3, s3, s5
973; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s2
974; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s3
975; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
976; GFX1030W64-NEXT:    s_endpgm
977;
978; GFX11-LABEL: ssub64rr:
979; GFX11:       ; %bb.0: ; %entry
980; GFX11-NEXT:    s_clause 0x1
981; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
982; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
983; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
984; GFX11-NEXT:    s_sub_u32 s2, s2, s4
985; GFX11-NEXT:    s_subb_u32 s3, s3, s5
986; GFX11-NEXT:    v_mov_b32_e32 v0, s2
987; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
988; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
989; GFX11-NEXT:    s_endpgm
990entry:
  ; Both operands are kernel arguments; only the difference is stored.
991  %sub = sub i64 %a, %b
992  store i64 %sub, ptr addrspace(1) %out
993  ret void
994}
995
996; GCN-ISEL-LABEL: name:   ssub64ri
997; GCN-ISEL-LABEL: body:
998; GCN-ISEL-LABEL: bb.0.entry:
999; GCN-ISEL: S_SUB_U64_PSEUDO
1000
; Kernel @ssub64ri subtracts the kernel argument %a from the 64-bit constant
; 20015998343286 (0x123456789876) and stores the result to %out. In the
; assertions below the constant appears split into its low dword 0x56789876
; and high dword 0x1234. The assertions are autogenerated (see the NOTE on the
; first line of the file); do not edit them by hand.
1001define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
1002; CISI-LABEL: ssub64ri:
1003; CISI:       ; %bb.0: ; %entry
1004; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1005; CISI-NEXT:    s_mov_b32 s7, 0xf000
1006; CISI-NEXT:    s_mov_b32 s6, -1
1007; CISI-NEXT:    s_waitcnt lgkmcnt(0)
1008; CISI-NEXT:    s_mov_b32 s4, s0
1009; CISI-NEXT:    s_sub_u32 s0, 0x56789876, s2
1010; CISI-NEXT:    s_mov_b32 s5, s1
1011; CISI-NEXT:    s_subb_u32 s1, 0x1234, s3
1012; CISI-NEXT:    v_mov_b32_e32 v0, s0
1013; CISI-NEXT:    v_mov_b32_e32 v1, s1
1014; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1015; CISI-NEXT:    s_endpgm
1016;
1017; VI-LABEL: ssub64ri:
1018; VI:       ; %bb.0: ; %entry
1019; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1020; VI-NEXT:    s_waitcnt lgkmcnt(0)
1021; VI-NEXT:    v_mov_b32_e32 v0, s0
1022; VI-NEXT:    s_sub_u32 s0, 0x56789876, s2
1023; VI-NEXT:    v_mov_b32_e32 v1, s1
1024; VI-NEXT:    s_subb_u32 s1, 0x1234, s3
1025; VI-NEXT:    v_mov_b32_e32 v3, s1
1026; VI-NEXT:    v_mov_b32_e32 v2, s0
1027; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1028; VI-NEXT:    s_endpgm
1029;
1030; GFX9-LABEL: ssub64ri:
1031; GFX9:       ; %bb.0: ; %entry
1032; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1033; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1034; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1035; GFX9-NEXT:    s_sub_u32 s2, 0x56789876, s2
1036; GFX9-NEXT:    s_subb_u32 s3, 0x1234, s3
1037; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1038; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1039; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1040; GFX9-NEXT:    s_endpgm
1041;
1042; GFX1010-LABEL: ssub64ri:
1043; GFX1010:       ; %bb.0: ; %entry
1044; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1045; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
1046; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
1047; GFX1010-NEXT:    s_sub_u32 s2, 0x56789876, s2
1048; GFX1010-NEXT:    s_subb_u32 s3, 0x1234, s3
1049; GFX1010-NEXT:    v_mov_b32_e32 v0, s2
1050; GFX1010-NEXT:    v_mov_b32_e32 v1, s3
1051; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1052; GFX1010-NEXT:    s_endpgm
1053;
1054; GFX1030W32-LABEL: ssub64ri:
1055; GFX1030W32:       ; %bb.0: ; %entry
1056; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1057; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
1058; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
1059; GFX1030W32-NEXT:    s_sub_u32 s2, 0x56789876, s2
1060; GFX1030W32-NEXT:    s_subb_u32 s3, 0x1234, s3
1061; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s2
1062; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s3
1063; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1064; GFX1030W32-NEXT:    s_endpgm
1065;
1066; GFX1030W64-LABEL: ssub64ri:
1067; GFX1030W64:       ; %bb.0: ; %entry
1068; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1069; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
1070; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
1071; GFX1030W64-NEXT:    s_sub_u32 s2, 0x56789876, s2
1072; GFX1030W64-NEXT:    s_subb_u32 s3, 0x1234, s3
1073; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s2
1074; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s3
1075; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1076; GFX1030W64-NEXT:    s_endpgm
1077;
1078; GFX11-LABEL: ssub64ri:
1079; GFX11:       ; %bb.0: ; %entry
1080; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1081; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX11-NEXT:    s_sub_u32 s2, 0x56789876, s2
1083; GFX11-NEXT:    s_subb_u32 s3, 0x1234, s3
1084; GFX11-NEXT:    v_mov_b32_e32 v0, s2
1085; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1086; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1087; GFX11-NEXT:    s_endpgm
1088entry:
  ; The constant is the minuend (left operand); %a is subtracted from it.
1089  %sub = sub i64 20015998343286, %a
1090  store i64 %sub, ptr addrspace(1) %out
1091  ret void
1092}
1093
1094; GCN-ISEL-LABEL: name:   vsub64rr
1095; GCN-ISEL-LABEL: body:
1096; GCN-ISEL-LABEL: bb.0.entry:
1097; GCN-ISEL: V_SUB_U64_PSEUDO
1098
; Kernel @vsub64rr subtracts the sign-extended workitem id from the kernel
; argument %a and stores the 64-bit result to %out. The per-target assertions
; inside the body are autogenerated (see the NOTE on the first line of the
; file); do not edit them by hand.
1099define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
1100; CISI-LABEL: vsub64rr:
1101; CISI:       ; %bb.0: ; %entry
1102; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1103; CISI-NEXT:    s_mov_b32 s7, 0xf000
1104; CISI-NEXT:    s_mov_b32 s6, -1
1105; CISI-NEXT:    s_waitcnt lgkmcnt(0)
1106; CISI-NEXT:    v_mov_b32_e32 v1, s3
1107; CISI-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1108; CISI-NEXT:    s_mov_b32 s4, s0
1109; CISI-NEXT:    s_mov_b32 s5, s1
1110; CISI-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1111; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1112; CISI-NEXT:    s_endpgm
1113;
1114; VI-LABEL: vsub64rr:
1115; VI:       ; %bb.0: ; %entry
1116; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1117; VI-NEXT:    s_waitcnt lgkmcnt(0)
1118; VI-NEXT:    v_mov_b32_e32 v4, s3
1119; VI-NEXT:    v_sub_u32_e32 v3, vcc, s2, v0
1120; VI-NEXT:    v_mov_b32_e32 v1, s0
1121; VI-NEXT:    v_mov_b32_e32 v2, s1
1122; VI-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
1123; VI-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
1124; VI-NEXT:    s_endpgm
1125;
1126; GFX9-LABEL: vsub64rr:
1127; GFX9:       ; %bb.0: ; %entry
1128; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1129; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1130; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1131; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1132; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
1133; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
1134; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1135; GFX9-NEXT:    s_endpgm
1136;
1137; GFX1010-LABEL: vsub64rr:
1138; GFX1010:       ; %bb.0: ; %entry
1139; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1140; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
1141; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
1142; GFX1010-NEXT:    v_sub_co_u32 v0, s2, s2, v0
1143; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2
1144; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1145; GFX1010-NEXT:    s_endpgm
1146;
1147; GFX1030W32-LABEL: vsub64rr:
1148; GFX1030W32:       ; %bb.0: ; %entry
1149; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1150; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
1151; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
1152; GFX1030W32-NEXT:    v_sub_co_u32 v0, s2, s2, v0
1153; GFX1030W32-NEXT:    v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
1154; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1155; GFX1030W32-NEXT:    s_endpgm
1156;
1157; GFX1030W64-LABEL: vsub64rr:
1158; GFX1030W64:       ; %bb.0: ; %entry
1159; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1160; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
1161; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
1162; GFX1030W64-NEXT:    v_sub_co_u32 v0, s[4:5], s2, v0
1163; GFX1030W64-NEXT:    v_sub_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
1164; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1165; GFX1030W64-NEXT:    s_endpgm
1166;
1167; GFX11-LABEL: vsub64rr:
1168; GFX11:       ; %bb.0: ; %entry
1169; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1170; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1171; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1172; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1173; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1174; GFX11-NEXT:    v_sub_co_u32 v0, s2, s2, v0
1175; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
1176; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1177; GFX11-NEXT:    s_endpgm
1178entry:
  ; The subtrahend is the workitem id, sign-extended from i32 to i64.
1179  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1180  %tid.ext = sext i32 %tid to i64
1181  %sub = sub i64 %a, %tid.ext
1182  store i64 %sub, ptr addrspace(1) %out
1183  ret void
1184}
1185
1186; GCN-ISEL-LABEL: name:   vsub64ri
1187; GCN-ISEL-LABEL: body:
1188; GCN-ISEL-LABEL: bb.0.entry:
1189; GCN-ISEL: V_SUB_U64_PSEUDO
1190
; Kernel @vsub64ri subtracts the sign-extended workitem id from the 64-bit
; constant 20015998343286 (0x123456789876) and stores the result to %out. In
; the assertions below the constant appears split into its low dword
; 0x56789876 and high dword 0x1234. The assertions are autogenerated (see the
; NOTE on the first line of the file); do not edit them by hand.
1191define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
1192; CISI-LABEL: vsub64ri:
1193; CISI:       ; %bb.0: ; %entry
1194; CISI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1195; CISI-NEXT:    v_sub_i32_e32 v0, vcc, 0x56789876, v0
1196; CISI-NEXT:    v_mov_b32_e32 v1, 0x1234
1197; CISI-NEXT:    s_mov_b32 s3, 0xf000
1198; CISI-NEXT:    s_mov_b32 s2, -1
1199; CISI-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1200; CISI-NEXT:    s_waitcnt lgkmcnt(0)
1201; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1202; CISI-NEXT:    s_endpgm
1203;
1204; VI-LABEL: vsub64ri:
1205; VI:       ; %bb.0: ; %entry
1206; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1207; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0x56789876, v0
1208; VI-NEXT:    v_mov_b32_e32 v1, 0x1234
1209; VI-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1210; VI-NEXT:    s_waitcnt lgkmcnt(0)
1211; VI-NEXT:    v_mov_b32_e32 v3, s1
1212; VI-NEXT:    v_mov_b32_e32 v2, s0
1213; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1214; VI-NEXT:    s_endpgm
1215;
1216; GFX9-LABEL: vsub64ri:
1217; GFX9:       ; %bb.0: ; %entry
1218; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1219; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, 0x56789876, v0
1220; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1234
1221; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1222; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
1223; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1224; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1225; GFX9-NEXT:    s_endpgm
1226;
1227; GFX1010-LABEL: vsub64ri:
1228; GFX1010:       ; %bb.0: ; %entry
1229; GFX1010-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1230; GFX1010-NEXT:    v_sub_co_u32 v0, s2, 0x56789876, v0
1231; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
1232; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
1233; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
1234; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1235; GFX1010-NEXT:    s_endpgm
1236;
1237; GFX1030W32-LABEL: vsub64ri:
1238; GFX1030W32:       ; %bb.0: ; %entry
1239; GFX1030W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1240; GFX1030W32-NEXT:    v_sub_co_u32 v0, s2, 0x56789876, v0
1241; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
1242; GFX1030W32-NEXT:    v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
1243; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
1244; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1245; GFX1030W32-NEXT:    s_endpgm
1246;
1247; GFX1030W64-LABEL: vsub64ri:
1248; GFX1030W64:       ; %bb.0: ; %entry
1249; GFX1030W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1250; GFX1030W64-NEXT:    v_sub_co_u32 v0, s[2:3], 0x56789876, v0
1251; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
1252; GFX1030W64-NEXT:    v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3]
1253; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
1254; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1255; GFX1030W64-NEXT:    s_endpgm
1256;
1257; GFX11-LABEL: vsub64ri:
1258; GFX11:       ; %bb.0: ; %entry
1259; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1260; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1261; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1262; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1263; GFX11-NEXT:    v_sub_co_u32 v0, s2, 0x56789876, v0
1264; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
1265; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1266; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1267; GFX11-NEXT:    s_endpgm
1268entry:
  ; The constant is the minuend; the sign-extended workitem id is subtracted from it.
1269  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1270  %tid.ext = sext i32 %tid to i64
1271  %sub = sub i64 20015998343286, %tid.ext
1272  store i64 %sub, ptr addrspace(1) %out
1273  ret void
1274}
1275
1276; GCN-ISEL-LABEL: name:   susubo32
1277; GCN-ISEL-LABEL: body:
1278; GCN-ISEL-LABEL: bb.0
1279; GCN-ISEL: S_SUB_I32
1280
; Kernel @susubo32 calls llvm.usub.with.overflow.i32 on the kernel arguments
; %a and %b but stores only the 32-bit difference to %out. The overflow bit is
; extracted and never used, and nothing is written to %carryout. The
; per-target assertions inside the body are autogenerated (see the NOTE on the
; first line of the file); do not edit them by hand.
1281define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
1282; CISI-LABEL: susubo32:
1283; CISI:       ; %bb.0:
1284; CISI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
1285; CISI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1286; CISI-NEXT:    s_mov_b32 s3, 0xf000
1287; CISI-NEXT:    s_mov_b32 s2, -1
1288; CISI-NEXT:    s_waitcnt lgkmcnt(0)
1289; CISI-NEXT:    s_sub_i32 s4, s6, s7
1290; CISI-NEXT:    v_mov_b32_e32 v0, s4
1291; CISI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1292; CISI-NEXT:    s_endpgm
1293;
1294; VI-LABEL: susubo32:
1295; VI:       ; %bb.0:
1296; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1297; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1298; VI-NEXT:    s_waitcnt lgkmcnt(0)
1299; VI-NEXT:    s_sub_i32 s0, s0, s1
1300; VI-NEXT:    v_mov_b32_e32 v0, s2
1301; VI-NEXT:    v_mov_b32_e32 v1, s3
1302; VI-NEXT:    v_mov_b32_e32 v2, s0
1303; VI-NEXT:    flat_store_dword v[0:1], v2
1304; VI-NEXT:    s_endpgm
1305;
1306; GFX9-LABEL: susubo32:
1307; GFX9:       ; %bb.0:
1308; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1309; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1310; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1311; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1312; GFX9-NEXT:    s_sub_i32 s0, s0, s1
1313; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1314; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
1315; GFX9-NEXT:    s_endpgm
1316;
1317; GFX1010-LABEL: susubo32:
1318; GFX1010:       ; %bb.0:
1319; GFX1010-NEXT:    s_clause 0x1
1320; GFX1010-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1321; GFX1010-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1322; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
1323; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
1324; GFX1010-NEXT:    s_sub_i32 s0, s0, s1
1325; GFX1010-NEXT:    v_mov_b32_e32 v1, s0
1326; GFX1010-NEXT:    global_store_dword v0, v1, s[2:3]
1327; GFX1010-NEXT:    s_endpgm
1328;
1329; GFX1030W32-LABEL: susubo32:
1330; GFX1030W32:       ; %bb.0:
1331; GFX1030W32-NEXT:    s_clause 0x1
1332; GFX1030W32-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1333; GFX1030W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1334; GFX1030W32-NEXT:    v_mov_b32_e32 v0, 0
1335; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
1336; GFX1030W32-NEXT:    s_sub_i32 s0, s0, s1
1337; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s0
1338; GFX1030W32-NEXT:    global_store_dword v0, v1, s[2:3]
1339; GFX1030W32-NEXT:    s_endpgm
1340;
1341; GFX1030W64-LABEL: susubo32:
1342; GFX1030W64:       ; %bb.0:
1343; GFX1030W64-NEXT:    s_clause 0x1
1344; GFX1030W64-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1345; GFX1030W64-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1346; GFX1030W64-NEXT:    v_mov_b32_e32 v0, 0
1347; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
1348; GFX1030W64-NEXT:    s_sub_i32 s0, s0, s1
1349; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s0
1350; GFX1030W64-NEXT:    global_store_dword v0, v1, s[2:3]
1351; GFX1030W64-NEXT:    s_endpgm
1352;
1353; GFX11-LABEL: susubo32:
1354; GFX11:       ; %bb.0:
1355; GFX11-NEXT:    s_clause 0x1
1356; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
1357; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
1358; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1359; GFX11-NEXT:    s_sub_i32 s0, s0, s1
1360; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1361; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
1362; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
1363; GFX11-NEXT:    s_endpgm
1364  %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
1365  %val = extractvalue { i32, i1 } %usub, 0
  ; %carry has no user below; only %val is stored.
1366  %carry = extractvalue { i32, i1 } %usub, 1
1367  store i32 %val, ptr addrspace(1) %out, align 4
1368  ret void
1369}
1370
1371
1372; GCN-ISEL-LABEL: name:   usubo32_vcc_user
1373; GCN-ISEL-LABEL: body:
1374; GCN-ISEL-LABEL: bb.0
1375; GCN-ISEL: V_SUB_CO_U32_e64
1376
1377; Below we check selection to v_sub/subb
1378; because the only user of the VCC produced by the USUBO is v_cndmask.
1379; We select the VALU form to avoid an unnecessary s_cselect to copy SCC to VCC.
1380
; Kernel @usubo32_vcc_user calls llvm.usub.with.overflow.i32 on the kernel
; arguments %a and %b and stores both results: the 32-bit difference to %out
; and the i1 overflow flag to %carryout. The per-target assertions inside the
; body are autogenerated (see the NOTE on the first line of the file); do not
; edit them by hand.
1381define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
1382; CISI-LABEL: usubo32_vcc_user:
1383; CISI:       ; %bb.0:
1384; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1385; CISI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1386; CISI-NEXT:    s_mov_b32 s7, 0xf000
1387; CISI-NEXT:    s_mov_b32 s6, -1
1388; CISI-NEXT:    s_waitcnt lgkmcnt(0)
1389; CISI-NEXT:    s_mov_b32 s4, s0
1390; CISI-NEXT:    v_mov_b32_e32 v0, s9
1391; CISI-NEXT:    s_mov_b32 s5, s1
1392; CISI-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
1393; CISI-NEXT:    s_mov_b32 s0, s2
1394; CISI-NEXT:    s_mov_b32 s1, s3
1395; CISI-NEXT:    s_mov_b32 s2, s6
1396; CISI-NEXT:    s_mov_b32 s3, s7
1397; CISI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
1398; CISI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1399; CISI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
1400; CISI-NEXT:    s_endpgm
1401;
1402; VI-LABEL: usubo32_vcc_user:
1403; VI:       ; %bb.0:
1404; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1405; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1406; VI-NEXT:    s_waitcnt lgkmcnt(0)
1407; VI-NEXT:    v_mov_b32_e32 v0, s0
1408; VI-NEXT:    v_mov_b32_e32 v4, s5
1409; VI-NEXT:    v_mov_b32_e32 v1, s1
1410; VI-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
1411; VI-NEXT:    v_mov_b32_e32 v2, s2
1412; VI-NEXT:    v_mov_b32_e32 v3, s3
1413; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1414; VI-NEXT:    flat_store_dword v[0:1], v4
1415; VI-NEXT:    flat_store_byte v[2:3], v5
1416; VI-NEXT:    s_endpgm
1417;
1418; GFX9-LABEL: usubo32_vcc_user:
1419; GFX9:       ; %bb.0:
1420; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1421; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1422; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1423; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1424; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1425; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s6, v1
1426; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1427; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1428; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
1429; GFX9-NEXT:    s_endpgm
1430;
1431; GFX1010-LABEL: usubo32_vcc_user:
1432; GFX1010:       ; %bb.0:
1433; GFX1010-NEXT:    s_clause 0x1
1434; GFX1010-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1435; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1436; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
1437; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
1438; GFX1010-NEXT:    v_sub_co_u32 v1, s4, s6, s7
1439; GFX1010-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
1440; GFX1010-NEXT:    global_store_dword v0, v1, s[0:1]
1441; GFX1010-NEXT:    global_store_byte v0, v2, s[2:3]
1442; GFX1010-NEXT:    s_endpgm
1443;
1444; GFX1030W32-LABEL: usubo32_vcc_user:
1445; GFX1030W32:       ; %bb.0:
1446; GFX1030W32-NEXT:    s_clause 0x1
1447; GFX1030W32-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1448; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1449; GFX1030W32-NEXT:    v_mov_b32_e32 v0, 0
1450; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX1030W32-NEXT:    v_sub_co_u32 v1, s4, s6, s7
1452; GFX1030W32-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
1453; GFX1030W32-NEXT:    global_store_dword v0, v1, s[0:1]
1454; GFX1030W32-NEXT:    global_store_byte v0, v2, s[2:3]
1455; GFX1030W32-NEXT:    s_endpgm
1456;
1457; GFX1030W64-LABEL: usubo32_vcc_user:
1458; GFX1030W64:       ; %bb.0:
1459; GFX1030W64-NEXT:    s_clause 0x1
1460; GFX1030W64-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1461; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1462; GFX1030W64-NEXT:    v_mov_b32_e32 v0, 0
1463; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
1464; GFX1030W64-NEXT:    v_sub_co_u32 v1, s[4:5], s6, s7
1465; GFX1030W64-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
1466; GFX1030W64-NEXT:    global_store_dword v0, v1, s[0:1]
1467; GFX1030W64-NEXT:    global_store_byte v0, v2, s[2:3]
1468; GFX1030W64-NEXT:    s_endpgm
1469;
1470; GFX11-LABEL: usubo32_vcc_user:
1471; GFX11:       ; %bb.0:
1472; GFX11-NEXT:    s_clause 0x1
1473; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
1474; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1475; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1476; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1477; GFX11-NEXT:    v_sub_co_u32 v1, s4, s6, s7
1478; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1479; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
1480; GFX11-NEXT:    s_clause 0x1
1481; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1482; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
1483; GFX11-NEXT:    s_endpgm
1484  %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  ; Field 0 of the result is the difference, field 1 is the overflow bit; both are stored.
1485  %val = extractvalue { i32, i1 } %usub, 0
1486  %carry = extractvalue { i32, i1 } %usub, 1
1487  store i32 %val, ptr addrspace(1) %out, align 4
1488  store i1 %carry, ptr addrspace(1) %carryout
1489  ret void
1490}
1491
1492; GCN-ISEL-LABEL: name:   susubo64
1493; GCN-ISEL-LABEL: body:
1494; GCN-ISEL-LABEL: bb.0
1495; GCN-ISEL: S_SUB_U64_PSEUDO
1496
1497define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
1498; CISI-LABEL: susubo64:
1499; CISI:       ; %bb.0:
1500; CISI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
1501; CISI-NEXT:    s_mov_b32 s11, 0xf000
1502; CISI-NEXT:    s_mov_b32 s10, -1
1503; CISI-NEXT:    s_waitcnt lgkmcnt(0)
1504; CISI-NEXT:    s_sub_u32 s6, s4, s6
1505; CISI-NEXT:    v_mov_b32_e32 v0, s4
1506; CISI-NEXT:    s_subb_u32 s7, s5, s7
1507; CISI-NEXT:    v_mov_b32_e32 v1, s5
1508; CISI-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
1509; CISI-NEXT:    v_mov_b32_e32 v2, s6
1510; CISI-NEXT:    s_mov_b32 s8, s0
1511; CISI-NEXT:    s_mov_b32 s9, s1
1512; CISI-NEXT:    s_mov_b32 s0, s2
1513; CISI-NEXT:    s_mov_b32 s1, s3
1514; CISI-NEXT:    s_mov_b32 s2, s10
1515; CISI-NEXT:    s_mov_b32 s3, s11
1516; CISI-NEXT:    v_mov_b32_e32 v3, s7
1517; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
1518; CISI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
1519; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1520; CISI-NEXT:    s_endpgm
1521;
1522; VI-LABEL: susubo64:
1523; VI:       ; %bb.0:
1524; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
1525; VI-NEXT:    s_waitcnt lgkmcnt(0)
1526; VI-NEXT:    v_mov_b32_e32 v0, s0
1527; VI-NEXT:    s_sub_u32 s0, s4, s6
1528; VI-NEXT:    v_mov_b32_e32 v4, s4
1529; VI-NEXT:    v_mov_b32_e32 v1, s1
1530; VI-NEXT:    s_subb_u32 s1, s5, s7
1531; VI-NEXT:    v_mov_b32_e32 v5, s5
1532; VI-NEXT:    v_mov_b32_e32 v7, s1
1533; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
1534; VI-NEXT:    v_mov_b32_e32 v6, s0
1535; VI-NEXT:    v_mov_b32_e32 v2, s2
1536; VI-NEXT:    v_mov_b32_e32 v3, s3
1537; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
1538; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
1539; VI-NEXT:    flat_store_byte v[2:3], v0
1540; VI-NEXT:    s_endpgm
1541;
1542; GFX9-LABEL: susubo64:
1543; GFX9:       ; %bb.0:
1544; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
1545; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1546; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1547; GFX9-NEXT:    s_sub_u32 s0, s12, s14
1548; GFX9-NEXT:    v_mov_b32_e32 v0, s12
1549; GFX9-NEXT:    v_mov_b32_e32 v1, s13
1550; GFX9-NEXT:    s_subb_u32 s1, s13, s15
1551; GFX9-NEXT:    v_mov_b32_e32 v3, s1
1552; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
1553; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1554; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
1555; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[8:9]
1556; GFX9-NEXT:    global_store_byte v4, v0, s[10:11]
1557; GFX9-NEXT:    s_endpgm
1558;
1559; GFX1010-LABEL: susubo64:
1560; GFX1010:       ; %bb.0:
1561; GFX1010-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
1562; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
1563; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
1564; GFX1010-NEXT:    s_sub_u32 s0, s12, s14
1565; GFX1010-NEXT:    s_subb_u32 s1, s13, s15
1566; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
1567; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
1568; GFX1010-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], s[12:13]
1569; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
1570; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
1571; GFX1010-NEXT:    global_store_byte v2, v3, s[10:11]
1572; GFX1010-NEXT:    s_endpgm
1573;
1574; GFX1030W32-LABEL: susubo64:
1575; GFX1030W32:       ; %bb.0:
1576; GFX1030W32-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
1577; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
1578; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
1579; GFX1030W32-NEXT:    s_sub_u32 s6, s4, s6
1580; GFX1030W32-NEXT:    s_subb_u32 s7, s5, s7
1581; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s6
1582; GFX1030W32-NEXT:    v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
1583; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s7
1584; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
1585; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1586; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
1587; GFX1030W32-NEXT:    s_endpgm
1588;
1589; GFX1030W64-LABEL: susubo64:
1590; GFX1030W64:       ; %bb.0:
1591; GFX1030W64-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
1592; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
1593; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
1594; GFX1030W64-NEXT:    s_sub_u32 s6, s4, s6
1595; GFX1030W64-NEXT:    s_subb_u32 s7, s5, s7
1596; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s6
1597; GFX1030W64-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[6:7], s[4:5]
1598; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s7
1599; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
1600; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1601; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
1602; GFX1030W64-NEXT:    s_endpgm
1603;
1604; GFX11-LABEL: susubo64:
1605; GFX11:       ; %bb.0:
1606; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
1607; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1608; GFX11-NEXT:    s_sub_u32 s6, s4, s6
1609; GFX11-NEXT:    s_subb_u32 s7, s5, s7
1610; GFX11-NEXT:    v_mov_b32_e32 v0, s6
1611; GFX11-NEXT:    v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
1612; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
1613; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1614; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
1615; GFX11-NEXT:    s_clause 0x1
1616; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1617; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
1618; GFX11-NEXT:    s_endpgm
1619  %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
1620  %val = extractvalue { i64, i1 } %usub, 0
1621  %carry = extractvalue { i64, i1 } %usub, 1
1622  store i64 %val, ptr addrspace(1) %out, align 8
1623  store i1 %carry, ptr addrspace(1) %carryout
1624  ret void
1625}
1626
1627; GCN-ISEL-LABEL: name:   vusubo64
1628; GCN-ISEL-LABEL: body:
1629; GCN-ISEL-LABEL: bb.0
1630; GCN-ISEL: V_SUB_U64_PSEUDO
1631
; vusubo64: 64-bit unsigned subtract-with-overflow whose subtrahend comes from
; the workitem id. The kernel computes %a - sext(workitem.id.x), stores the i64
; difference through %out and the i1 overflow flag through %carryout.
;
; The ISel-stage check lines directly above this function expect a
; V_SUB_U64_PSEUDO to be selected for it.
; NOTE(review): presumably the vector form is chosen because the workitem id
; operand is divergent - confirm against the selection code.
;
; The per-target assertion lines inside the function are autogenerated (see the
; note on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand. Each check
; prefix corresponds to one RUN line at the top of the file.
; NOTE(review): attribute group #0 is defined outside this function - confirm
; its contents before relying on them.
1632define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 {
1633; CISI-LABEL: vusubo64:
1634; CISI:       ; %bb.0:
1635; CISI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1636; CISI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1637; CISI-NEXT:    s_mov_b32 s7, 0xf000
1638; CISI-NEXT:    s_mov_b32 s6, -1
1639; CISI-NEXT:    s_waitcnt lgkmcnt(0)
1640; CISI-NEXT:    s_mov_b32 s4, s0
1641; CISI-NEXT:    v_mov_b32_e32 v1, s9
1642; CISI-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
1643; CISI-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
1644; CISI-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
1645; CISI-NEXT:    s_mov_b32 s5, s1
1646; CISI-NEXT:    s_mov_b32 s0, s2
1647; CISI-NEXT:    s_mov_b32 s1, s3
1648; CISI-NEXT:    s_mov_b32 s2, s6
1649; CISI-NEXT:    s_mov_b32 s3, s7
1650; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1651; CISI-NEXT:    s_waitcnt expcnt(0)
1652; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
1653; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1654; CISI-NEXT:    s_endpgm
1655;
1656; VI-LABEL: vusubo64:
1657; VI:       ; %bb.0:
1658; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1659; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1660; VI-NEXT:    s_waitcnt lgkmcnt(0)
1661; VI-NEXT:    v_mov_b32_e32 v1, s0
1662; VI-NEXT:    v_mov_b32_e32 v6, s5
1663; VI-NEXT:    v_sub_u32_e32 v5, vcc, s4, v0
1664; VI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
1665; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6]
1666; VI-NEXT:    v_mov_b32_e32 v2, s1
1667; VI-NEXT:    v_mov_b32_e32 v3, s2
1668; VI-NEXT:    v_mov_b32_e32 v4, s3
1669; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
1670; VI-NEXT:    flat_store_dwordx2 v[1:2], v[5:6]
1671; VI-NEXT:    flat_store_byte v[3:4], v0
1672; VI-NEXT:    s_endpgm
1673;
1674; GFX9-LABEL: vusubo64:
1675; GFX9:       ; %bb.0:
1676; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1677; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1678; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1679; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1680; GFX9-NEXT:    v_mov_b32_e32 v1, s7
1681; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
1682; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
1683; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
1684; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1685; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
1686; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
1687; GFX9-NEXT:    s_endpgm
1688;
1689; GFX1010-LABEL: vusubo64:
1690; GFX1010:       ; %bb.0:
1691; GFX1010-NEXT:    s_clause 0x1
1692; GFX1010-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1693; GFX1010-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1694; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
1695; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
1696; GFX1010-NEXT:    v_sub_co_u32 v0, s4, s6, v0
1697; GFX1010-NEXT:    v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4
1698; GFX1010-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
1699; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1700; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1701; GFX1010-NEXT:    global_store_byte v2, v3, s[2:3]
1702; GFX1010-NEXT:    s_endpgm
1703;
1704; GFX1030W32-LABEL: vusubo64:
1705; GFX1030W32:       ; %bb.0:
1706; GFX1030W32-NEXT:    s_clause 0x1
1707; GFX1030W32-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1708; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1709; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
1710; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
1711; GFX1030W32-NEXT:    v_sub_co_u32 v0, s4, s6, v0
1712; GFX1030W32-NEXT:    v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
1713; GFX1030W32-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
1714; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1715; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1716; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
1717; GFX1030W32-NEXT:    s_endpgm
1718;
1719; GFX1030W64-LABEL: vusubo64:
1720; GFX1030W64:       ; %bb.0:
1721; GFX1030W64-NEXT:    s_clause 0x1
1722; GFX1030W64-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1723; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1724; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
1725; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
1726; GFX1030W64-NEXT:    v_sub_co_u32 v0, s[4:5], s6, v0
1727; GFX1030W64-NEXT:    v_sub_co_ci_u32_e64 v1, null, s7, 0, s[4:5]
1728; GFX1030W64-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
1729; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1730; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1731; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
1732; GFX1030W64-NEXT:    s_endpgm
1733;
1734; GFX11-LABEL: vusubo64:
1735; GFX11:       ; %bb.0:
1736; GFX11-NEXT:    s_clause 0x1
1737; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
1738; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1739; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1740; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1741; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1742; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1743; GFX11-NEXT:    v_sub_co_u32 v0, s4, s6, v0
1744; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, null, s7, 0, s4
1745; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1746; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1]
1747; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
1748; GFX11-NEXT:    s_clause 0x1
1749; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1750; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
1751; GFX11-NEXT:    s_endpgm
  ; Subtrahend: the workitem id in x, widened from i32 to i64.
  ; NOTE(review): the expected code for every target above subtracts 0 (plus
  ; the borrow) in the high half, i.e. the sign extension is lowered as if the
  ; high word were zero - presumably because the workitem id is known to be
  ; non-negative; confirm.
1752  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1753  %tid.ext = sext i32 %tid to i64
  ; Unsigned %a - %tid.ext; the intrinsic returns a { result, overflow } pair.
1754  %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext)
  ; Field 0 is the 64-bit difference, field 1 the overflow bit.
1755  %val = extractvalue { i64, i1 } %usub, 0
1756  %carry = extractvalue { i64, i1 } %usub, 1
  ; Write both results out through the two global-memory pointer arguments.
1757  store i64 %val, ptr addrspace(1) %out, align 8
1758  store i1 %carry, ptr addrspace(1) %carryout
1759  ret void
1760}
1761
1762; GCN-ISEL-LABEL: name:   sudiv64
1763; GCN-ISEL-LABEL: body:
1764; GCN-ISEL-LABEL: bb.3
1765; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64
1766; GCN-ISEL: S_ADD_CO_PSEUDO %{{[0-9]+}}, killed %{{[0-9]+}}, killed %[[CARRY]]
1767; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64
1768; GCN-ISEL: S_SUB_CO_PSEUDO killed %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]]
1769
1770define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
1771; CISI-LABEL: sudiv64:
1772; CISI:       ; %bb.0:
1773; CISI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1774; CISI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
1775; CISI-NEXT:    s_waitcnt lgkmcnt(0)
1776; CISI-NEXT:    s_or_b64 s[0:1], s[10:11], s[2:3]
1777; CISI-NEXT:    s_mov_b32 s0, 0
1778; CISI-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[0:1], 0
1779; CISI-NEXT:    s_and_b64 vcc, exec, s[0:1]
1780; CISI-NEXT:    s_cbranch_vccz .LBB16_4
1781; CISI-NEXT:  ; %bb.1:
1782; CISI-NEXT:    v_cvt_f32_u32_e32 v0, s2
1783; CISI-NEXT:    v_cvt_f32_u32_e32 v1, s3
1784; CISI-NEXT:    s_sub_u32 s0, 0, s2
1785; CISI-NEXT:    s_subb_u32 s1, 0, s3
1786; CISI-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
1787; CISI-NEXT:    v_rcp_f32_e32 v0, v0
1788; CISI-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1789; CISI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1790; CISI-NEXT:    v_trunc_f32_e32 v1, v1
1791; CISI-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
1792; CISI-NEXT:    v_cvt_u32_f32_e32 v1, v1
1793; CISI-NEXT:    v_cvt_u32_f32_e32 v0, v0
1794; CISI-NEXT:    v_mul_lo_u32 v2, s0, v1
1795; CISI-NEXT:    v_mul_hi_u32 v3, s0, v0
1796; CISI-NEXT:    v_mul_lo_u32 v5, s1, v0
1797; CISI-NEXT:    v_mul_lo_u32 v4, s0, v0
1798; CISI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1799; CISI-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
1800; CISI-NEXT:    v_mul_hi_u32 v3, v0, v4
1801; CISI-NEXT:    v_mul_lo_u32 v5, v0, v2
1802; CISI-NEXT:    v_mul_hi_u32 v7, v0, v2
1803; CISI-NEXT:    v_mul_lo_u32 v6, v1, v4
1804; CISI-NEXT:    v_mul_hi_u32 v4, v1, v4
1805; CISI-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
1806; CISI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
1807; CISI-NEXT:    v_mul_hi_u32 v7, v1, v2
1808; CISI-NEXT:    v_mul_lo_u32 v2, v1, v2
1809; CISI-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
1810; CISI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
1811; CISI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
1812; CISI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1813; CISI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1814; CISI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1815; CISI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1816; CISI-NEXT:    v_mul_lo_u32 v2, s0, v1
1817; CISI-NEXT:    v_mul_hi_u32 v3, s0, v0
1818; CISI-NEXT:    v_mul_lo_u32 v4, s1, v0
1819; CISI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1820; CISI-NEXT:    v_mul_lo_u32 v3, s0, v0
1821; CISI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1822; CISI-NEXT:    v_mul_lo_u32 v6, v0, v2
1823; CISI-NEXT:    v_mul_hi_u32 v7, v0, v3
1824; CISI-NEXT:    v_mul_hi_u32 v8, v0, v2
1825; CISI-NEXT:    v_mul_hi_u32 v5, v1, v3
1826; CISI-NEXT:    v_mul_lo_u32 v3, v1, v3
1827; CISI-NEXT:    v_mul_hi_u32 v4, v1, v2
1828; CISI-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
1829; CISI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
1830; CISI-NEXT:    v_mul_lo_u32 v2, v1, v2
1831; CISI-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
1832; CISI-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
1833; CISI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
1834; CISI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1835; CISI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1836; CISI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1837; CISI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1838; CISI-NEXT:    v_mul_lo_u32 v2, s10, v1
1839; CISI-NEXT:    v_mul_hi_u32 v3, s10, v0
1840; CISI-NEXT:    v_mul_hi_u32 v4, s10, v1
1841; CISI-NEXT:    v_mul_hi_u32 v5, s11, v1
1842; CISI-NEXT:    v_mul_lo_u32 v1, s11, v1
1843; CISI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1844; CISI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
1845; CISI-NEXT:    v_mul_lo_u32 v4, s11, v0
1846; CISI-NEXT:    v_mul_hi_u32 v0, s11, v0
1847; CISI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1848; CISI-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
1849; CISI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
1850; CISI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1851; CISI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
1852; CISI-NEXT:    v_mul_lo_u32 v2, s2, v1
1853; CISI-NEXT:    v_mul_hi_u32 v3, s2, v0
1854; CISI-NEXT:    v_mul_lo_u32 v4, s3, v0
1855; CISI-NEXT:    v_mov_b32_e32 v5, s3
1856; CISI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1857; CISI-NEXT:    v_mul_lo_u32 v3, s2, v0
1858; CISI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1859; CISI-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
1860; CISI-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
1861; CISI-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
1862; CISI-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
1863; CISI-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
1864; CISI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
1865; CISI-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
1866; CISI-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
1867; CISI-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
1868; CISI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
1869; CISI-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
1870; CISI-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v0
1871; CISI-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
1872; CISI-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
1873; CISI-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
1874; CISI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
1875; CISI-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
1876; CISI-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
1877; CISI-NEXT:    v_mov_b32_e32 v6, s11
1878; CISI-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
1879; CISI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
1880; CISI-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
1881; CISI-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
1882; CISI-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
1883; CISI-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
1884; CISI-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
1885; CISI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
1886; CISI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1887; CISI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1888; CISI-NEXT:    s_cbranch_execnz .LBB16_3
1889; CISI-NEXT:  .LBB16_2:
1890; CISI-NEXT:    v_cvt_f32_u32_e32 v0, s2
1891; CISI-NEXT:    s_sub_i32 s0, 0, s2
1892; CISI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1893; CISI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1894; CISI-NEXT:    v_cvt_u32_f32_e32 v0, v0
1895; CISI-NEXT:    v_mul_lo_u32 v1, s0, v0
1896; CISI-NEXT:    v_mul_hi_u32 v1, v0, v1
1897; CISI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1898; CISI-NEXT:    v_mul_hi_u32 v0, s10, v0
1899; CISI-NEXT:    v_readfirstlane_b32 s0, v0
1900; CISI-NEXT:    s_mul_i32 s0, s0, s2
1901; CISI-NEXT:    s_sub_i32 s0, s10, s0
1902; CISI-NEXT:    s_sub_i32 s1, s0, s2
1903; CISI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
1904; CISI-NEXT:    s_cmp_ge_u32 s0, s2
1905; CISI-NEXT:    s_cselect_b64 vcc, -1, 0
1906; CISI-NEXT:    s_cselect_b32 s0, s1, s0
1907; CISI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1908; CISI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
1909; CISI-NEXT:    s_cmp_ge_u32 s0, s2
1910; CISI-NEXT:    s_cselect_b64 vcc, -1, 0
1911; CISI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1912; CISI-NEXT:    v_mov_b32_e32 v1, 0
1913; CISI-NEXT:  .LBB16_3:
1914; CISI-NEXT:    s_mov_b32 s11, 0xf000
1915; CISI-NEXT:    s_mov_b32 s10, -1
1916; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1917; CISI-NEXT:    s_endpgm
1918; CISI-NEXT:  .LBB16_4:
1919; CISI-NEXT:    ; implicit-def: $vgpr0_vgpr1
1920; CISI-NEXT:    s_branch .LBB16_2
1921;
1922; VI-LABEL: sudiv64:
1923; VI:       ; %bb.0:
1924; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
1925; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
1926; VI-NEXT:    s_waitcnt lgkmcnt(0)
1927; VI-NEXT:    s_or_b64 s[0:1], s[10:11], s[2:3]
1928; VI-NEXT:    s_mov_b32 s0, 0
1929; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
1930; VI-NEXT:    s_cbranch_scc0 .LBB16_4
1931; VI-NEXT:  ; %bb.1:
1932; VI-NEXT:    v_cvt_f32_u32_e32 v0, s2
1933; VI-NEXT:    v_cvt_f32_u32_e32 v1, s3
1934; VI-NEXT:    s_sub_u32 s4, 0, s2
1935; VI-NEXT:    s_subb_u32 s5, 0, s3
1936; VI-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
1937; VI-NEXT:    v_rcp_f32_e32 v0, v0
1938; VI-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1939; VI-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1940; VI-NEXT:    v_trunc_f32_e32 v1, v1
1941; VI-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
1942; VI-NEXT:    v_cvt_u32_f32_e32 v4, v1
1943; VI-NEXT:    v_cvt_u32_f32_e32 v5, v0
1944; VI-NEXT:    v_mul_lo_u32 v2, s4, v4
1945; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s4, v5, 0
1946; VI-NEXT:    v_mul_lo_u32 v3, s5, v5
1947; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
1948; VI-NEXT:    v_add_u32_e32 v3, vcc, v1, v3
1949; VI-NEXT:    v_mul_hi_u32 v6, v5, v0
1950; VI-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0
1951; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v1
1952; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v4, v0, 0
1953; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v2, vcc
1954; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v4, v3, 0
1955; VI-NEXT:    v_add_u32_e32 v0, vcc, v6, v0
1956; VI-NEXT:    v_addc_u32_e32 v0, vcc, v7, v1, vcc
1957; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1958; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1959; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1960; VI-NEXT:    v_add_u32_e32 v6, vcc, v5, v0
1961; VI-NEXT:    v_addc_u32_e32 v7, vcc, v4, v1, vcc
1962; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s4, v6, 0
1963; VI-NEXT:    v_mul_lo_u32 v4, s4, v7
1964; VI-NEXT:    v_mul_lo_u32 v5, s5, v6
1965; VI-NEXT:    v_mul_hi_u32 v8, v6, v0
1966; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0
1967; VI-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
1968; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
1969; VI-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0
1970; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0
1971; VI-NEXT:    v_add_u32_e32 v4, vcc, v8, v4
1972; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1973; VI-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
1974; VI-NEXT:    v_addc_u32_e32 v2, vcc, v5, v3, vcc
1975; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1976; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
1977; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1978; VI-NEXT:    v_add_u32_e32 v2, vcc, v6, v0
1979; VI-NEXT:    v_addc_u32_e32 v3, vcc, v7, v1, vcc
1980; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
1981; VI-NEXT:    v_mul_hi_u32 v4, s10, v2
1982; VI-NEXT:    v_readfirstlane_b32 s4, v1
1983; VI-NEXT:    v_readfirstlane_b32 s5, v0
1984; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s11, v3, 0
1985; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s11, v2, 0
1986; VI-NEXT:    v_readfirstlane_b32 s6, v4
1987; VI-NEXT:    s_add_u32 s0, s6, s5
1988; VI-NEXT:    s_addc_u32 s1, 0, s4
1989; VI-NEXT:    v_readfirstlane_b32 s6, v2
1990; VI-NEXT:    v_readfirstlane_b32 s5, v3
1991; VI-NEXT:    s_add_u32 s0, s0, s6
1992; VI-NEXT:    v_readfirstlane_b32 s4, v1
1993; VI-NEXT:    s_addc_u32 s0, s1, s5
1994; VI-NEXT:    s_addc_u32 s6, s4, 0
1995; VI-NEXT:    v_readfirstlane_b32 s1, v0
1996; VI-NEXT:    s_add_u32 s7, s0, s1
1997; VI-NEXT:    v_mov_b32_e32 v2, s7
1998; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, 0
1999; VI-NEXT:    s_addc_u32 s6, 0, s6
2000; VI-NEXT:    s_mul_i32 s0, s2, s6
2001; VI-NEXT:    v_readfirstlane_b32 s1, v1
2002; VI-NEXT:    s_add_i32 s0, s1, s0
2003; VI-NEXT:    s_mul_i32 s1, s3, s7
2004; VI-NEXT:    s_add_i32 s12, s0, s1
2005; VI-NEXT:    s_sub_i32 s0, s11, s12
2006; VI-NEXT:    v_sub_u32_e32 v0, vcc, s10, v0
2007; VI-NEXT:    s_cmp_lg_u64 vcc, 0
2008; VI-NEXT:    s_subb_u32 s13, s0, s3
2009; VI-NEXT:    v_subrev_u32_e64 v1, s[0:1], s2, v0
2010; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
2011; VI-NEXT:    s_subb_u32 s13, s13, 0
2012; VI-NEXT:    s_cmp_ge_u32 s13, s3
2013; VI-NEXT:    s_cselect_b32 s14, -1, 0
2014; VI-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v1
2015; VI-NEXT:    s_cmp_eq_u32 s13, s3
2016; VI-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
2017; VI-NEXT:    v_mov_b32_e32 v3, s14
2018; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
2019; VI-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
2020; VI-NEXT:    s_add_u32 s0, s7, 1
2021; VI-NEXT:    s_addc_u32 s13, s6, 0
2022; VI-NEXT:    s_add_u32 s1, s7, 2
2023; VI-NEXT:    s_addc_u32 s7, s6, 0
2024; VI-NEXT:    v_mov_b32_e32 v3, s0
2025; VI-NEXT:    v_mov_b32_e32 v4, s1
2026; VI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
2027; VI-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
2028; VI-NEXT:    v_mov_b32_e32 v1, s13
2029; VI-NEXT:    v_mov_b32_e32 v4, s7
2030; VI-NEXT:    s_cmp_lg_u64 vcc, 0
2031; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
2032; VI-NEXT:    s_subb_u32 s0, s11, s12
2033; VI-NEXT:    s_cmp_ge_u32 s0, s3
2034; VI-NEXT:    s_cselect_b32 s1, -1, 0
2035; VI-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2036; VI-NEXT:    s_cmp_eq_u32 s0, s3
2037; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2038; VI-NEXT:    v_mov_b32_e32 v4, s1
2039; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2040; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
2041; VI-NEXT:    v_mov_b32_e32 v4, s6
2042; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
2043; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
2044; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
2045; VI-NEXT:    s_cbranch_execnz .LBB16_3
2046; VI-NEXT:  .LBB16_2:
2047; VI-NEXT:    v_cvt_f32_u32_e32 v0, s2
2048; VI-NEXT:    s_sub_i32 s0, 0, s2
2049; VI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2050; VI-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2051; VI-NEXT:    v_cvt_u32_f32_e32 v0, v0
2052; VI-NEXT:    v_mul_lo_u32 v1, s0, v0
2053; VI-NEXT:    v_mul_hi_u32 v1, v0, v1
2054; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2055; VI-NEXT:    v_mul_hi_u32 v0, s10, v0
2056; VI-NEXT:    v_readfirstlane_b32 s0, v0
2057; VI-NEXT:    s_mul_i32 s0, s0, s2
2058; VI-NEXT:    s_sub_i32 s0, s10, s0
2059; VI-NEXT:    s_sub_i32 s1, s0, s2
2060; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
2061; VI-NEXT:    s_cmp_ge_u32 s0, s2
2062; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2063; VI-NEXT:    s_cselect_b32 s0, s1, s0
2064; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2065; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
2066; VI-NEXT:    s_cmp_ge_u32 s0, s2
2067; VI-NEXT:    s_cselect_b64 vcc, -1, 0
2068; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
2069; VI-NEXT:    v_mov_b32_e32 v1, 0
2070; VI-NEXT:  .LBB16_3:
2071; VI-NEXT:    v_mov_b32_e32 v2, s8
2072; VI-NEXT:    v_mov_b32_e32 v3, s9
2073; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2074; VI-NEXT:    s_endpgm
2075; VI-NEXT:  .LBB16_4:
2076; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
2077; VI-NEXT:    s_branch .LBB16_2
2078;
2079; GFX9-LABEL: sudiv64:
2080; GFX9:       ; %bb.0:
2081; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2082; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
2083; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2084; GFX9-NEXT:    s_or_b64 s[0:1], s[10:11], s[2:3]
2085; GFX9-NEXT:    s_mov_b32 s0, 0
2086; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
2087; GFX9-NEXT:    s_cbranch_scc0 .LBB16_4
2088; GFX9-NEXT:  ; %bb.1:
2089; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
2090; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
2091; GFX9-NEXT:    s_sub_u32 s0, 0, s2
2092; GFX9-NEXT:    s_subb_u32 s1, 0, s3
2093; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
2094; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
2095; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
2096; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
2097; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2098; GFX9-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
2099; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2100; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2101; GFX9-NEXT:    v_readfirstlane_b32 s6, v1
2102; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
2103; GFX9-NEXT:    s_mul_i32 s12, s0, s6
2104; GFX9-NEXT:    s_mul_hi_u32 s14, s0, s7
2105; GFX9-NEXT:    s_mul_i32 s13, s1, s7
2106; GFX9-NEXT:    s_add_i32 s12, s14, s12
2107; GFX9-NEXT:    s_add_i32 s12, s12, s13
2108; GFX9-NEXT:    s_mul_i32 s15, s0, s7
2109; GFX9-NEXT:    s_mul_hi_u32 s13, s7, s12
2110; GFX9-NEXT:    s_mul_i32 s14, s7, s12
2111; GFX9-NEXT:    s_mul_hi_u32 s7, s7, s15
2112; GFX9-NEXT:    s_add_u32 s7, s7, s14
2113; GFX9-NEXT:    s_addc_u32 s13, 0, s13
2114; GFX9-NEXT:    s_mul_hi_u32 s16, s6, s15
2115; GFX9-NEXT:    s_mul_i32 s15, s6, s15
2116; GFX9-NEXT:    s_add_u32 s7, s7, s15
2117; GFX9-NEXT:    s_mul_hi_u32 s14, s6, s12
2118; GFX9-NEXT:    s_addc_u32 s7, s13, s16
2119; GFX9-NEXT:    s_addc_u32 s13, s14, 0
2120; GFX9-NEXT:    s_mul_i32 s12, s6, s12
2121; GFX9-NEXT:    s_add_u32 s7, s7, s12
2122; GFX9-NEXT:    s_addc_u32 s12, 0, s13
2123; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s7, v0
2124; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
2125; GFX9-NEXT:    s_addc_u32 s6, s6, s12
2126; GFX9-NEXT:    v_readfirstlane_b32 s12, v0
2127; GFX9-NEXT:    s_mul_i32 s7, s0, s6
2128; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s12
2129; GFX9-NEXT:    s_add_i32 s7, s13, s7
2130; GFX9-NEXT:    s_mul_i32 s1, s1, s12
2131; GFX9-NEXT:    s_add_i32 s7, s7, s1
2132; GFX9-NEXT:    s_mul_i32 s0, s0, s12
2133; GFX9-NEXT:    s_mul_hi_u32 s13, s6, s0
2134; GFX9-NEXT:    s_mul_i32 s14, s6, s0
2135; GFX9-NEXT:    s_mul_i32 s16, s12, s7
2136; GFX9-NEXT:    s_mul_hi_u32 s0, s12, s0
2137; GFX9-NEXT:    s_mul_hi_u32 s15, s12, s7
2138; GFX9-NEXT:    s_add_u32 s0, s0, s16
2139; GFX9-NEXT:    s_addc_u32 s12, 0, s15
2140; GFX9-NEXT:    s_add_u32 s0, s0, s14
2141; GFX9-NEXT:    s_mul_hi_u32 s1, s6, s7
2142; GFX9-NEXT:    s_addc_u32 s0, s12, s13
2143; GFX9-NEXT:    s_addc_u32 s1, s1, 0
2144; GFX9-NEXT:    s_mul_i32 s7, s6, s7
2145; GFX9-NEXT:    s_add_u32 s0, s0, s7
2146; GFX9-NEXT:    s_addc_u32 s1, 0, s1
2147; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
2148; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
2149; GFX9-NEXT:    s_addc_u32 s0, s6, s1
2150; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
2151; GFX9-NEXT:    s_mul_i32 s6, s10, s0
2152; GFX9-NEXT:    s_mul_hi_u32 s12, s10, s7
2153; GFX9-NEXT:    s_mul_hi_u32 s1, s10, s0
2154; GFX9-NEXT:    s_add_u32 s6, s12, s6
2155; GFX9-NEXT:    s_addc_u32 s1, 0, s1
2156; GFX9-NEXT:    s_mul_hi_u32 s13, s11, s7
2157; GFX9-NEXT:    s_mul_i32 s7, s11, s7
2158; GFX9-NEXT:    s_add_u32 s6, s6, s7
2159; GFX9-NEXT:    s_mul_hi_u32 s12, s11, s0
2160; GFX9-NEXT:    s_addc_u32 s1, s1, s13
2161; GFX9-NEXT:    s_addc_u32 s6, s12, 0
2162; GFX9-NEXT:    s_mul_i32 s0, s11, s0
2163; GFX9-NEXT:    s_add_u32 s7, s1, s0
2164; GFX9-NEXT:    s_addc_u32 s6, 0, s6
2165; GFX9-NEXT:    s_mul_i32 s0, s2, s6
2166; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s7
2167; GFX9-NEXT:    s_add_i32 s0, s1, s0
2168; GFX9-NEXT:    s_mul_i32 s1, s3, s7
2169; GFX9-NEXT:    s_add_i32 s12, s0, s1
2170; GFX9-NEXT:    s_mul_i32 s1, s2, s7
2171; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2172; GFX9-NEXT:    s_sub_i32 s0, s11, s12
2173; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s10, v0
2174; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
2175; GFX9-NEXT:    s_subb_u32 s13, s0, s3
2176; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s2, v0
2177; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
2178; GFX9-NEXT:    s_subb_u32 s13, s13, 0
2179; GFX9-NEXT:    s_cmp_ge_u32 s13, s3
2180; GFX9-NEXT:    s_cselect_b32 s14, -1, 0
2181; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v1
2182; GFX9-NEXT:    s_cmp_eq_u32 s13, s3
2183; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
2184; GFX9-NEXT:    v_mov_b32_e32 v2, s14
2185; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
2186; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
2187; GFX9-NEXT:    s_add_u32 s0, s7, 1
2188; GFX9-NEXT:    s_addc_u32 s13, s6, 0
2189; GFX9-NEXT:    s_add_u32 s1, s7, 2
2190; GFX9-NEXT:    s_addc_u32 s14, s6, 0
2191; GFX9-NEXT:    v_mov_b32_e32 v2, s0
2192; GFX9-NEXT:    v_mov_b32_e32 v3, s1
2193; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
2194; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
2195; GFX9-NEXT:    v_mov_b32_e32 v1, s13
2196; GFX9-NEXT:    v_mov_b32_e32 v3, s14
2197; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
2198; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
2199; GFX9-NEXT:    s_subb_u32 s0, s11, s12
2200; GFX9-NEXT:    s_cmp_ge_u32 s0, s3
2201; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
2202; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2203; GFX9-NEXT:    s_cmp_eq_u32 s0, s3
2204; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2205; GFX9-NEXT:    v_mov_b32_e32 v3, s1
2206; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
2207; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
2208; GFX9-NEXT:    v_mov_b32_e32 v3, s6
2209; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
2210; GFX9-NEXT:    v_mov_b32_e32 v0, s7
2211; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
2212; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2213; GFX9-NEXT:    s_cbranch_execnz .LBB16_3
2214; GFX9-NEXT:  .LBB16_2:
2215; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
2216; GFX9-NEXT:    s_sub_i32 s0, 0, s2
2217; GFX9-NEXT:    s_mov_b32 s1, 0
2218; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2219; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2220; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2221; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
2222; GFX9-NEXT:    s_mul_i32 s0, s0, s3
2223; GFX9-NEXT:    s_mul_hi_u32 s0, s3, s0
2224; GFX9-NEXT:    s_add_i32 s3, s3, s0
2225; GFX9-NEXT:    s_mul_hi_u32 s0, s10, s3
2226; GFX9-NEXT:    s_mul_i32 s4, s0, s2
2227; GFX9-NEXT:    s_sub_i32 s4, s10, s4
2228; GFX9-NEXT:    s_add_i32 s3, s0, 1
2229; GFX9-NEXT:    s_sub_i32 s5, s4, s2
2230; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
2231; GFX9-NEXT:    s_cselect_b32 s0, s3, s0
2232; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
2233; GFX9-NEXT:    s_add_i32 s3, s0, 1
2234; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
2235; GFX9-NEXT:    s_cselect_b32 s0, s3, s0
2236; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2237; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2238; GFX9-NEXT:  .LBB16_3:
2239; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2240; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
2241; GFX9-NEXT:    s_endpgm
2242; GFX9-NEXT:  .LBB16_4:
2243; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2244; GFX9-NEXT:    s_branch .LBB16_2
2245;
2246; GFX1010-LABEL: sudiv64:
2247; GFX1010:       ; %bb.0:
2248; GFX1010-NEXT:    s_clause 0x1
2249; GFX1010-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2250; GFX1010-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
2251; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
2252; GFX1010-NEXT:    s_or_b64 s[4:5], s[10:11], s[2:3]
2253; GFX1010-NEXT:    s_mov_b32 s4, 0
2254; GFX1010-NEXT:    s_cmp_lg_u64 s[4:5], 0
2255; GFX1010-NEXT:    s_cbranch_scc0 .LBB16_4
2256; GFX1010-NEXT:  ; %bb.1:
2257; GFX1010-NEXT:    v_cvt_f32_u32_e32 v0, s2
2258; GFX1010-NEXT:    v_cvt_f32_u32_e32 v1, s3
2259; GFX1010-NEXT:    s_sub_u32 s5, 0, s2
2260; GFX1010-NEXT:    s_subb_u32 s6, 0, s3
2261; GFX1010-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
2262; GFX1010-NEXT:    v_rcp_f32_e32 v0, v0
2263; GFX1010-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
2264; GFX1010-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
2265; GFX1010-NEXT:    v_trunc_f32_e32 v1, v1
2266; GFX1010-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
2267; GFX1010-NEXT:    v_cvt_u32_f32_e32 v1, v1
2268; GFX1010-NEXT:    v_cvt_u32_f32_e32 v0, v0
2269; GFX1010-NEXT:    v_readfirstlane_b32 s0, v1
2270; GFX1010-NEXT:    v_readfirstlane_b32 s1, v0
2271; GFX1010-NEXT:    s_mul_i32 s7, s5, s0
2272; GFX1010-NEXT:    s_mul_hi_u32 s13, s5, s1
2273; GFX1010-NEXT:    s_mul_i32 s12, s6, s1
2274; GFX1010-NEXT:    s_add_i32 s7, s13, s7
2275; GFX1010-NEXT:    s_mul_i32 s14, s5, s1
2276; GFX1010-NEXT:    s_add_i32 s7, s7, s12
2277; GFX1010-NEXT:    s_mul_hi_u32 s13, s1, s14
2278; GFX1010-NEXT:    s_mul_hi_u32 s15, s0, s14
2279; GFX1010-NEXT:    s_mul_i32 s12, s0, s14
2280; GFX1010-NEXT:    s_mul_hi_u32 s14, s1, s7
2281; GFX1010-NEXT:    s_mul_i32 s1, s1, s7
2282; GFX1010-NEXT:    s_mul_hi_u32 s16, s0, s7
2283; GFX1010-NEXT:    s_add_u32 s1, s13, s1
2284; GFX1010-NEXT:    s_addc_u32 s13, 0, s14
2285; GFX1010-NEXT:    s_add_u32 s1, s1, s12
2286; GFX1010-NEXT:    s_mul_i32 s7, s0, s7
2287; GFX1010-NEXT:    s_addc_u32 s1, s13, s15
2288; GFX1010-NEXT:    s_addc_u32 s12, s16, 0
2289; GFX1010-NEXT:    s_add_u32 s1, s1, s7
2290; GFX1010-NEXT:    s_addc_u32 s7, 0, s12
2291; GFX1010-NEXT:    v_add_co_u32 v0, s1, v0, s1
2292; GFX1010-NEXT:    s_cmp_lg_u32 s1, 0
2293; GFX1010-NEXT:    s_addc_u32 s0, s0, s7
2294; GFX1010-NEXT:    v_readfirstlane_b32 s1, v0
2295; GFX1010-NEXT:    s_mul_i32 s7, s5, s0
2296; GFX1010-NEXT:    s_mul_hi_u32 s12, s5, s1
2297; GFX1010-NEXT:    s_mul_i32 s6, s6, s1
2298; GFX1010-NEXT:    s_add_i32 s7, s12, s7
2299; GFX1010-NEXT:    s_mul_i32 s5, s5, s1
2300; GFX1010-NEXT:    s_add_i32 s7, s7, s6
2301; GFX1010-NEXT:    s_mul_hi_u32 s12, s0, s5
2302; GFX1010-NEXT:    s_mul_i32 s13, s0, s5
2303; GFX1010-NEXT:    s_mul_hi_u32 s5, s1, s5
2304; GFX1010-NEXT:    s_mul_hi_u32 s14, s1, s7
2305; GFX1010-NEXT:    s_mul_i32 s1, s1, s7
2306; GFX1010-NEXT:    s_mul_hi_u32 s6, s0, s7
2307; GFX1010-NEXT:    s_add_u32 s1, s5, s1
2308; GFX1010-NEXT:    s_addc_u32 s5, 0, s14
2309; GFX1010-NEXT:    s_add_u32 s1, s1, s13
2310; GFX1010-NEXT:    s_mul_i32 s7, s0, s7
2311; GFX1010-NEXT:    s_addc_u32 s1, s5, s12
2312; GFX1010-NEXT:    s_addc_u32 s5, s6, 0
2313; GFX1010-NEXT:    s_add_u32 s1, s1, s7
2314; GFX1010-NEXT:    s_addc_u32 s5, 0, s5
2315; GFX1010-NEXT:    v_add_co_u32 v0, s1, v0, s1
2316; GFX1010-NEXT:    s_cmp_lg_u32 s1, 0
2317; GFX1010-NEXT:    s_addc_u32 s0, s0, s5
2318; GFX1010-NEXT:    v_readfirstlane_b32 s1, v0
2319; GFX1010-NEXT:    s_mul_i32 s6, s10, s0
2320; GFX1010-NEXT:    s_mul_hi_u32 s5, s10, s0
2321; GFX1010-NEXT:    s_mul_hi_u32 s7, s11, s0
2322; GFX1010-NEXT:    s_mul_i32 s0, s11, s0
2323; GFX1010-NEXT:    s_mul_hi_u32 s12, s10, s1
2324; GFX1010-NEXT:    s_mul_hi_u32 s13, s11, s1
2325; GFX1010-NEXT:    s_mul_i32 s1, s11, s1
2326; GFX1010-NEXT:    s_add_u32 s6, s12, s6
2327; GFX1010-NEXT:    s_addc_u32 s5, 0, s5
2328; GFX1010-NEXT:    s_add_u32 s1, s6, s1
2329; GFX1010-NEXT:    s_addc_u32 s1, s5, s13
2330; GFX1010-NEXT:    s_addc_u32 s5, s7, 0
2331; GFX1010-NEXT:    s_add_u32 s1, s1, s0
2332; GFX1010-NEXT:    s_addc_u32 s5, 0, s5
2333; GFX1010-NEXT:    s_mul_hi_u32 s0, s2, s1
2334; GFX1010-NEXT:    s_mul_i32 s7, s2, s5
2335; GFX1010-NEXT:    s_mul_i32 s12, s2, s1
2336; GFX1010-NEXT:    s_add_i32 s0, s0, s7
2337; GFX1010-NEXT:    v_sub_co_u32 v0, s7, s10, s12
2338; GFX1010-NEXT:    s_mul_i32 s6, s3, s1
2339; GFX1010-NEXT:    s_add_i32 s0, s0, s6
2340; GFX1010-NEXT:    v_sub_co_u32 v1, s12, v0, s2
2341; GFX1010-NEXT:    s_sub_i32 s6, s11, s0
2342; GFX1010-NEXT:    s_cmp_lg_u32 s7, 0
2343; GFX1010-NEXT:    s_subb_u32 s6, s6, s3
2344; GFX1010-NEXT:    s_cmp_lg_u32 s12, 0
2345; GFX1010-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v1
2346; GFX1010-NEXT:    s_subb_u32 s6, s6, 0
2347; GFX1010-NEXT:    s_cmp_ge_u32 s6, s3
2348; GFX1010-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2349; GFX1010-NEXT:    s_cselect_b32 s12, -1, 0
2350; GFX1010-NEXT:    s_cmp_eq_u32 s6, s3
2351; GFX1010-NEXT:    s_cselect_b32 vcc_lo, -1, 0
2352; GFX1010-NEXT:    s_add_u32 s6, s1, 1
2353; GFX1010-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2354; GFX1010-NEXT:    s_addc_u32 s12, s5, 0
2355; GFX1010-NEXT:    s_add_u32 s13, s1, 2
2356; GFX1010-NEXT:    s_addc_u32 s14, s5, 0
2357; GFX1010-NEXT:    s_cmp_lg_u32 s7, 0
2358; GFX1010-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v0
2359; GFX1010-NEXT:    s_subb_u32 s0, s11, s0
2360; GFX1010-NEXT:    v_mov_b32_e32 v2, s13
2361; GFX1010-NEXT:    s_cmp_ge_u32 s0, s3
2362; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2363; GFX1010-NEXT:    s_cselect_b32 s7, -1, 0
2364; GFX1010-NEXT:    s_cmp_eq_u32 s0, s3
2365; GFX1010-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
2366; GFX1010-NEXT:    s_cselect_b32 s0, -1, 0
2367; GFX1010-NEXT:    v_mov_b32_e32 v1, s14
2368; GFX1010-NEXT:    v_cndmask_b32_e64 v0, s7, v0, s0
2369; GFX1010-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
2370; GFX1010-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2371; GFX1010-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
2372; GFX1010-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
2373; GFX1010-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
2374; GFX1010-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
2375; GFX1010-NEXT:    s_cbranch_vccnz .LBB16_3
2376; GFX1010-NEXT:  .LBB16_2:
2377; GFX1010-NEXT:    v_cvt_f32_u32_e32 v0, s2
2378; GFX1010-NEXT:    s_sub_i32 s1, 0, s2
2379; GFX1010-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2380; GFX1010-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2381; GFX1010-NEXT:    v_cvt_u32_f32_e32 v0, v0
2382; GFX1010-NEXT:    v_readfirstlane_b32 s0, v0
2383; GFX1010-NEXT:    s_mul_i32 s1, s1, s0
2384; GFX1010-NEXT:    s_mul_hi_u32 s1, s0, s1
2385; GFX1010-NEXT:    s_add_i32 s0, s0, s1
2386; GFX1010-NEXT:    s_mul_hi_u32 s0, s10, s0
2387; GFX1010-NEXT:    s_mul_i32 s1, s0, s2
2388; GFX1010-NEXT:    s_add_i32 s3, s0, 1
2389; GFX1010-NEXT:    s_sub_i32 s1, s10, s1
2390; GFX1010-NEXT:    s_sub_i32 s4, s1, s2
2391; GFX1010-NEXT:    s_cmp_ge_u32 s1, s2
2392; GFX1010-NEXT:    s_cselect_b32 s0, s3, s0
2393; GFX1010-NEXT:    s_cselect_b32 s1, s4, s1
2394; GFX1010-NEXT:    s_add_i32 s3, s0, 1
2395; GFX1010-NEXT:    s_cmp_ge_u32 s1, s2
2396; GFX1010-NEXT:    s_mov_b32 s1, 0
2397; GFX1010-NEXT:    s_cselect_b32 s0, s3, s0
2398; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
2399; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
2400; GFX1010-NEXT:  .LBB16_3:
2401; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
2402; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
2403; GFX1010-NEXT:    s_endpgm
2404; GFX1010-NEXT:  .LBB16_4:
2405; GFX1010-NEXT:    ; implicit-def: $vgpr0_vgpr1
2406; GFX1010-NEXT:    s_branch .LBB16_2
2407;
2408; GFX1030W32-LABEL: sudiv64:
2409; GFX1030W32:       ; %bb.0:
2410; GFX1030W32-NEXT:    s_clause 0x1
2411; GFX1030W32-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2412; GFX1030W32-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
2413; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
2414; GFX1030W32-NEXT:    s_or_b64 s[4:5], s[10:11], s[2:3]
2415; GFX1030W32-NEXT:    s_mov_b32 s4, 0
2416; GFX1030W32-NEXT:    s_cmp_lg_u64 s[4:5], 0
2417; GFX1030W32-NEXT:    s_cbranch_scc0 .LBB16_4
2418; GFX1030W32-NEXT:  ; %bb.1:
2419; GFX1030W32-NEXT:    v_cvt_f32_u32_e32 v0, s2
2420; GFX1030W32-NEXT:    v_cvt_f32_u32_e32 v1, s3
2421; GFX1030W32-NEXT:    s_sub_u32 s5, 0, s2
2422; GFX1030W32-NEXT:    s_subb_u32 s6, 0, s3
2423; GFX1030W32-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
2424; GFX1030W32-NEXT:    v_rcp_f32_e32 v0, v0
2425; GFX1030W32-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
2426; GFX1030W32-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
2427; GFX1030W32-NEXT:    v_trunc_f32_e32 v1, v1
2428; GFX1030W32-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
2429; GFX1030W32-NEXT:    v_cvt_u32_f32_e32 v1, v1
2430; GFX1030W32-NEXT:    v_cvt_u32_f32_e32 v0, v0
2431; GFX1030W32-NEXT:    v_readfirstlane_b32 s0, v1
2432; GFX1030W32-NEXT:    v_readfirstlane_b32 s1, v0
2433; GFX1030W32-NEXT:    s_mul_i32 s7, s5, s0
2434; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s5, s1
2435; GFX1030W32-NEXT:    s_mul_i32 s12, s6, s1
2436; GFX1030W32-NEXT:    s_add_i32 s7, s13, s7
2437; GFX1030W32-NEXT:    s_mul_i32 s14, s5, s1
2438; GFX1030W32-NEXT:    s_add_i32 s7, s7, s12
2439; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s1, s14
2440; GFX1030W32-NEXT:    s_mul_hi_u32 s15, s0, s14
2441; GFX1030W32-NEXT:    s_mul_i32 s12, s0, s14
2442; GFX1030W32-NEXT:    s_mul_hi_u32 s14, s1, s7
2443; GFX1030W32-NEXT:    s_mul_i32 s1, s1, s7
2444; GFX1030W32-NEXT:    s_mul_hi_u32 s16, s0, s7
2445; GFX1030W32-NEXT:    s_add_u32 s1, s13, s1
2446; GFX1030W32-NEXT:    s_addc_u32 s13, 0, s14
2447; GFX1030W32-NEXT:    s_add_u32 s1, s1, s12
2448; GFX1030W32-NEXT:    s_mul_i32 s7, s0, s7
2449; GFX1030W32-NEXT:    s_addc_u32 s1, s13, s15
2450; GFX1030W32-NEXT:    s_addc_u32 s12, s16, 0
2451; GFX1030W32-NEXT:    s_add_u32 s1, s1, s7
2452; GFX1030W32-NEXT:    s_addc_u32 s7, 0, s12
2453; GFX1030W32-NEXT:    v_add_co_u32 v0, s1, v0, s1
2454; GFX1030W32-NEXT:    s_cmp_lg_u32 s1, 0
2455; GFX1030W32-NEXT:    s_addc_u32 s0, s0, s7
2456; GFX1030W32-NEXT:    v_readfirstlane_b32 s1, v0
2457; GFX1030W32-NEXT:    s_mul_i32 s7, s5, s0
2458; GFX1030W32-NEXT:    s_mul_hi_u32 s12, s5, s1
2459; GFX1030W32-NEXT:    s_mul_i32 s6, s6, s1
2460; GFX1030W32-NEXT:    s_add_i32 s7, s12, s7
2461; GFX1030W32-NEXT:    s_mul_i32 s5, s5, s1
2462; GFX1030W32-NEXT:    s_add_i32 s7, s7, s6
2463; GFX1030W32-NEXT:    s_mul_hi_u32 s12, s0, s5
2464; GFX1030W32-NEXT:    s_mul_i32 s13, s0, s5
2465; GFX1030W32-NEXT:    s_mul_hi_u32 s5, s1, s5
2466; GFX1030W32-NEXT:    s_mul_hi_u32 s14, s1, s7
2467; GFX1030W32-NEXT:    s_mul_i32 s1, s1, s7
2468; GFX1030W32-NEXT:    s_mul_hi_u32 s6, s0, s7
2469; GFX1030W32-NEXT:    s_add_u32 s1, s5, s1
2470; GFX1030W32-NEXT:    s_addc_u32 s5, 0, s14
2471; GFX1030W32-NEXT:    s_add_u32 s1, s1, s13
2472; GFX1030W32-NEXT:    s_mul_i32 s7, s0, s7
2473; GFX1030W32-NEXT:    s_addc_u32 s1, s5, s12
2474; GFX1030W32-NEXT:    s_addc_u32 s5, s6, 0
2475; GFX1030W32-NEXT:    s_add_u32 s1, s1, s7
2476; GFX1030W32-NEXT:    s_addc_u32 s5, 0, s5
2477; GFX1030W32-NEXT:    v_add_co_u32 v0, s1, v0, s1
2478; GFX1030W32-NEXT:    s_cmp_lg_u32 s1, 0
2479; GFX1030W32-NEXT:    s_addc_u32 s0, s0, s5
2480; GFX1030W32-NEXT:    v_readfirstlane_b32 s1, v0
2481; GFX1030W32-NEXT:    s_mul_i32 s6, s10, s0
2482; GFX1030W32-NEXT:    s_mul_hi_u32 s5, s10, s0
2483; GFX1030W32-NEXT:    s_mul_hi_u32 s7, s11, s0
2484; GFX1030W32-NEXT:    s_mul_i32 s0, s11, s0
2485; GFX1030W32-NEXT:    s_mul_hi_u32 s12, s10, s1
2486; GFX1030W32-NEXT:    s_mul_hi_u32 s13, s11, s1
2487; GFX1030W32-NEXT:    s_mul_i32 s1, s11, s1
2488; GFX1030W32-NEXT:    s_add_u32 s6, s12, s6
2489; GFX1030W32-NEXT:    s_addc_u32 s5, 0, s5
2490; GFX1030W32-NEXT:    s_add_u32 s1, s6, s1
2491; GFX1030W32-NEXT:    s_addc_u32 s1, s5, s13
2492; GFX1030W32-NEXT:    s_addc_u32 s5, s7, 0
2493; GFX1030W32-NEXT:    s_add_u32 s1, s1, s0
2494; GFX1030W32-NEXT:    s_addc_u32 s5, 0, s5
2495; GFX1030W32-NEXT:    s_mul_hi_u32 s0, s2, s1
2496; GFX1030W32-NEXT:    s_mul_i32 s7, s2, s5
2497; GFX1030W32-NEXT:    s_mul_i32 s12, s2, s1
2498; GFX1030W32-NEXT:    s_add_i32 s0, s0, s7
2499; GFX1030W32-NEXT:    v_sub_co_u32 v0, s7, s10, s12
2500; GFX1030W32-NEXT:    s_mul_i32 s6, s3, s1
2501; GFX1030W32-NEXT:    s_add_i32 s0, s0, s6
2502; GFX1030W32-NEXT:    v_sub_co_u32 v1, s12, v0, s2
2503; GFX1030W32-NEXT:    s_sub_i32 s6, s11, s0
2504; GFX1030W32-NEXT:    s_cmp_lg_u32 s7, 0
2505; GFX1030W32-NEXT:    s_subb_u32 s6, s6, s3
2506; GFX1030W32-NEXT:    s_cmp_lg_u32 s12, 0
2507; GFX1030W32-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v1
2508; GFX1030W32-NEXT:    s_subb_u32 s6, s6, 0
2509; GFX1030W32-NEXT:    s_cmp_ge_u32 s6, s3
2510; GFX1030W32-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2511; GFX1030W32-NEXT:    s_cselect_b32 s12, -1, 0
2512; GFX1030W32-NEXT:    s_cmp_eq_u32 s6, s3
2513; GFX1030W32-NEXT:    s_cselect_b32 vcc_lo, -1, 0
2514; GFX1030W32-NEXT:    s_add_u32 s6, s1, 1
2515; GFX1030W32-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2516; GFX1030W32-NEXT:    s_addc_u32 s12, s5, 0
2517; GFX1030W32-NEXT:    s_add_u32 s13, s1, 2
2518; GFX1030W32-NEXT:    s_addc_u32 s14, s5, 0
2519; GFX1030W32-NEXT:    s_cmp_lg_u32 s7, 0
2520; GFX1030W32-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v0
2521; GFX1030W32-NEXT:    s_subb_u32 s0, s11, s0
2522; GFX1030W32-NEXT:    v_mov_b32_e32 v2, s13
2523; GFX1030W32-NEXT:    s_cmp_ge_u32 s0, s3
2524; GFX1030W32-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2525; GFX1030W32-NEXT:    s_cselect_b32 s7, -1, 0
2526; GFX1030W32-NEXT:    s_cmp_eq_u32 s0, s3
2527; GFX1030W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
2528; GFX1030W32-NEXT:    s_cselect_b32 s0, -1, 0
2529; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s14
2530; GFX1030W32-NEXT:    v_cndmask_b32_e64 v0, s7, v0, s0
2531; GFX1030W32-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
2532; GFX1030W32-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2533; GFX1030W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
2534; GFX1030W32-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
2535; GFX1030W32-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
2536; GFX1030W32-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
2537; GFX1030W32-NEXT:    s_cbranch_vccnz .LBB16_3
2538; GFX1030W32-NEXT:  .LBB16_2:
2539; GFX1030W32-NEXT:    v_cvt_f32_u32_e32 v0, s2
2540; GFX1030W32-NEXT:    s_sub_i32 s1, 0, s2
2541; GFX1030W32-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2542; GFX1030W32-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2543; GFX1030W32-NEXT:    v_cvt_u32_f32_e32 v0, v0
2544; GFX1030W32-NEXT:    v_readfirstlane_b32 s0, v0
2545; GFX1030W32-NEXT:    s_mul_i32 s1, s1, s0
2546; GFX1030W32-NEXT:    s_mul_hi_u32 s1, s0, s1
2547; GFX1030W32-NEXT:    s_add_i32 s0, s0, s1
2548; GFX1030W32-NEXT:    s_mul_hi_u32 s0, s10, s0
2549; GFX1030W32-NEXT:    s_mul_i32 s1, s0, s2
2550; GFX1030W32-NEXT:    s_add_i32 s3, s0, 1
2551; GFX1030W32-NEXT:    s_sub_i32 s1, s10, s1
2552; GFX1030W32-NEXT:    s_sub_i32 s4, s1, s2
2553; GFX1030W32-NEXT:    s_cmp_ge_u32 s1, s2
2554; GFX1030W32-NEXT:    s_cselect_b32 s0, s3, s0
2555; GFX1030W32-NEXT:    s_cselect_b32 s1, s4, s1
2556; GFX1030W32-NEXT:    s_add_i32 s3, s0, 1
2557; GFX1030W32-NEXT:    s_cmp_ge_u32 s1, s2
2558; GFX1030W32-NEXT:    s_mov_b32 s1, 0
2559; GFX1030W32-NEXT:    s_cselect_b32 s0, s3, s0
2560; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s0
2561; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s1
2562; GFX1030W32-NEXT:  .LBB16_3:
2563; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
2564; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
2565; GFX1030W32-NEXT:    s_endpgm
2566; GFX1030W32-NEXT:  .LBB16_4:
2567; GFX1030W32-NEXT:    ; implicit-def: $vgpr0_vgpr1
2568; GFX1030W32-NEXT:    s_branch .LBB16_2
2569;
2570; GFX1030W64-LABEL: sudiv64:
2571; GFX1030W64:       ; %bb.0:
2572; GFX1030W64-NEXT:    s_clause 0x1
2573; GFX1030W64-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
2574; GFX1030W64-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
2575; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
2576; GFX1030W64-NEXT:    s_or_b64 s[0:1], s[10:11], s[2:3]
2577; GFX1030W64-NEXT:    s_mov_b32 s0, 0
2578; GFX1030W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2579; GFX1030W64-NEXT:    s_cbranch_scc0 .LBB16_4
2580; GFX1030W64-NEXT:  ; %bb.1:
2581; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v0, s2
2582; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v1, s3
2583; GFX1030W64-NEXT:    s_sub_u32 s5, 0, s2
2584; GFX1030W64-NEXT:    s_subb_u32 s6, 0, s3
2585; GFX1030W64-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
2586; GFX1030W64-NEXT:    v_rcp_f32_e32 v0, v0
2587; GFX1030W64-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
2588; GFX1030W64-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
2589; GFX1030W64-NEXT:    v_trunc_f32_e32 v1, v1
2590; GFX1030W64-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
2591; GFX1030W64-NEXT:    v_cvt_u32_f32_e32 v1, v1
2592; GFX1030W64-NEXT:    v_cvt_u32_f32_e32 v0, v0
2593; GFX1030W64-NEXT:    v_readfirstlane_b32 s4, v1
2594; GFX1030W64-NEXT:    v_readfirstlane_b32 s0, v0
2595; GFX1030W64-NEXT:    s_mul_i32 s1, s5, s4
2596; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s5, s0
2597; GFX1030W64-NEXT:    s_mul_i32 s7, s6, s0
2598; GFX1030W64-NEXT:    s_add_i32 s1, s12, s1
2599; GFX1030W64-NEXT:    s_mul_i32 s13, s5, s0
2600; GFX1030W64-NEXT:    s_add_i32 s1, s1, s7
2601; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s0, s13
2602; GFX1030W64-NEXT:    s_mul_hi_u32 s14, s4, s13
2603; GFX1030W64-NEXT:    s_mul_i32 s7, s4, s13
2604; GFX1030W64-NEXT:    s_mul_hi_u32 s13, s0, s1
2605; GFX1030W64-NEXT:    s_mul_i32 s0, s0, s1
2606; GFX1030W64-NEXT:    s_mul_hi_u32 s15, s4, s1
2607; GFX1030W64-NEXT:    s_add_u32 s0, s12, s0
2608; GFX1030W64-NEXT:    s_addc_u32 s12, 0, s13
2609; GFX1030W64-NEXT:    s_add_u32 s0, s0, s7
2610; GFX1030W64-NEXT:    s_mul_i32 s1, s4, s1
2611; GFX1030W64-NEXT:    s_addc_u32 s0, s12, s14
2612; GFX1030W64-NEXT:    s_addc_u32 s7, s15, 0
2613; GFX1030W64-NEXT:    s_add_u32 s0, s0, s1
2614; GFX1030W64-NEXT:    s_addc_u32 s7, 0, s7
2615; GFX1030W64-NEXT:    v_add_co_u32 v0, s[0:1], v0, s0
2616; GFX1030W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2617; GFX1030W64-NEXT:    s_addc_u32 s4, s4, s7
2618; GFX1030W64-NEXT:    v_readfirstlane_b32 s0, v0
2619; GFX1030W64-NEXT:    s_mul_i32 s1, s5, s4
2620; GFX1030W64-NEXT:    s_mul_hi_u32 s7, s5, s0
2621; GFX1030W64-NEXT:    s_mul_i32 s6, s6, s0
2622; GFX1030W64-NEXT:    s_add_i32 s1, s7, s1
2623; GFX1030W64-NEXT:    s_mul_i32 s5, s5, s0
2624; GFX1030W64-NEXT:    s_add_i32 s1, s1, s6
2625; GFX1030W64-NEXT:    s_mul_hi_u32 s7, s4, s5
2626; GFX1030W64-NEXT:    s_mul_i32 s12, s4, s5
2627; GFX1030W64-NEXT:    s_mul_hi_u32 s5, s0, s5
2628; GFX1030W64-NEXT:    s_mul_hi_u32 s13, s0, s1
2629; GFX1030W64-NEXT:    s_mul_i32 s0, s0, s1
2630; GFX1030W64-NEXT:    s_mul_hi_u32 s6, s4, s1
2631; GFX1030W64-NEXT:    s_add_u32 s0, s5, s0
2632; GFX1030W64-NEXT:    s_addc_u32 s5, 0, s13
2633; GFX1030W64-NEXT:    s_add_u32 s0, s0, s12
2634; GFX1030W64-NEXT:    s_mul_i32 s1, s4, s1
2635; GFX1030W64-NEXT:    s_addc_u32 s0, s5, s7
2636; GFX1030W64-NEXT:    s_addc_u32 s5, s6, 0
2637; GFX1030W64-NEXT:    s_add_u32 s0, s0, s1
2638; GFX1030W64-NEXT:    s_addc_u32 s5, 0, s5
2639; GFX1030W64-NEXT:    v_add_co_u32 v0, s[0:1], v0, s0
2640; GFX1030W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2641; GFX1030W64-NEXT:    s_addc_u32 s0, s4, s5
2642; GFX1030W64-NEXT:    v_readfirstlane_b32 s1, v0
2643; GFX1030W64-NEXT:    s_mul_i32 s5, s10, s0
2644; GFX1030W64-NEXT:    s_mul_hi_u32 s4, s10, s0
2645; GFX1030W64-NEXT:    s_mul_hi_u32 s6, s11, s0
2646; GFX1030W64-NEXT:    s_mul_i32 s0, s11, s0
2647; GFX1030W64-NEXT:    s_mul_hi_u32 s7, s10, s1
2648; GFX1030W64-NEXT:    s_mul_hi_u32 s12, s11, s1
2649; GFX1030W64-NEXT:    s_mul_i32 s1, s11, s1
2650; GFX1030W64-NEXT:    s_add_u32 s5, s7, s5
2651; GFX1030W64-NEXT:    s_addc_u32 s4, 0, s4
2652; GFX1030W64-NEXT:    s_add_u32 s1, s5, s1
2653; GFX1030W64-NEXT:    s_addc_u32 s1, s4, s12
2654; GFX1030W64-NEXT:    s_addc_u32 s4, s6, 0
2655; GFX1030W64-NEXT:    s_add_u32 s6, s1, s0
2656; GFX1030W64-NEXT:    s_addc_u32 s7, 0, s4
2657; GFX1030W64-NEXT:    s_mul_hi_u32 s0, s2, s6
2658; GFX1030W64-NEXT:    s_mul_i32 s1, s2, s7
2659; GFX1030W64-NEXT:    s_mul_i32 s5, s2, s6
2660; GFX1030W64-NEXT:    s_add_i32 s12, s0, s1
2661; GFX1030W64-NEXT:    v_sub_co_u32 v0, s[0:1], s10, s5
2662; GFX1030W64-NEXT:    s_mul_i32 s4, s3, s6
2663; GFX1030W64-NEXT:    s_add_i32 s12, s12, s4
2664; GFX1030W64-NEXT:    v_sub_co_u32 v1, s[4:5], v0, s2
2665; GFX1030W64-NEXT:    s_sub_i32 s13, s11, s12
2666; GFX1030W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2667; GFX1030W64-NEXT:    s_subb_u32 s13, s13, s3
2668; GFX1030W64-NEXT:    s_cmp_lg_u64 s[4:5], 0
2669; GFX1030W64-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
2670; GFX1030W64-NEXT:    s_subb_u32 s4, s13, 0
2671; GFX1030W64-NEXT:    s_cmp_ge_u32 s4, s3
2672; GFX1030W64-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
2673; GFX1030W64-NEXT:    s_cselect_b32 s5, -1, 0
2674; GFX1030W64-NEXT:    s_cmp_eq_u32 s4, s3
2675; GFX1030W64-NEXT:    s_cselect_b64 vcc, -1, 0
2676; GFX1030W64-NEXT:    s_add_u32 s4, s6, 1
2677; GFX1030W64-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc
2678; GFX1030W64-NEXT:    s_addc_u32 s5, s7, 0
2679; GFX1030W64-NEXT:    s_add_u32 s13, s6, 2
2680; GFX1030W64-NEXT:    s_addc_u32 s14, s7, 0
2681; GFX1030W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
2682; GFX1030W64-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2683; GFX1030W64-NEXT:    s_subb_u32 s0, s11, s12
2684; GFX1030W64-NEXT:    v_mov_b32_e32 v2, s13
2685; GFX1030W64-NEXT:    s_cmp_ge_u32 s0, s3
2686; GFX1030W64-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
2687; GFX1030W64-NEXT:    s_cselect_b32 s11, -1, 0
2688; GFX1030W64-NEXT:    s_cmp_eq_u32 s0, s3
2689; GFX1030W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
2690; GFX1030W64-NEXT:    s_cselect_b64 s[0:1], -1, 0
2691; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s14
2692; GFX1030W64-NEXT:    v_cndmask_b32_e64 v0, s11, v0, s[0:1]
2693; GFX1030W64-NEXT:    v_cndmask_b32_e32 v2, s4, v2, vcc
2694; GFX1030W64-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc
2695; GFX1030W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
2696; GFX1030W64-NEXT:    v_cndmask_b32_e32 v1, s7, v1, vcc
2697; GFX1030W64-NEXT:    v_cndmask_b32_e32 v0, s6, v2, vcc
2698; GFX1030W64-NEXT:    s_cbranch_execnz .LBB16_3
2699; GFX1030W64-NEXT:  .LBB16_2:
2700; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v0, s2
2701; GFX1030W64-NEXT:    s_sub_i32 s1, 0, s2
2702; GFX1030W64-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2703; GFX1030W64-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2704; GFX1030W64-NEXT:    v_cvt_u32_f32_e32 v0, v0
2705; GFX1030W64-NEXT:    v_readfirstlane_b32 s0, v0
2706; GFX1030W64-NEXT:    s_mul_i32 s1, s1, s0
2707; GFX1030W64-NEXT:    s_mul_hi_u32 s1, s0, s1
2708; GFX1030W64-NEXT:    s_add_i32 s0, s0, s1
2709; GFX1030W64-NEXT:    s_mul_hi_u32 s0, s10, s0
2710; GFX1030W64-NEXT:    s_mul_i32 s1, s0, s2
2711; GFX1030W64-NEXT:    s_add_i32 s3, s0, 1
2712; GFX1030W64-NEXT:    s_sub_i32 s1, s10, s1
2713; GFX1030W64-NEXT:    s_sub_i32 s4, s1, s2
2714; GFX1030W64-NEXT:    s_cmp_ge_u32 s1, s2
2715; GFX1030W64-NEXT:    s_cselect_b32 s0, s3, s0
2716; GFX1030W64-NEXT:    s_cselect_b32 s1, s4, s1
2717; GFX1030W64-NEXT:    s_add_i32 s3, s0, 1
2718; GFX1030W64-NEXT:    s_cmp_ge_u32 s1, s2
2719; GFX1030W64-NEXT:    s_mov_b32 s1, 0
2720; GFX1030W64-NEXT:    s_cselect_b32 s0, s3, s0
2721; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s0
2722; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s1
2723; GFX1030W64-NEXT:  .LBB16_3:
2724; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
2725; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
2726; GFX1030W64-NEXT:    s_endpgm
2727; GFX1030W64-NEXT:  .LBB16_4:
2728; GFX1030W64-NEXT:    ; implicit-def: $vgpr0_vgpr1
2729; GFX1030W64-NEXT:    s_branch .LBB16_2
2730;
2731; GFX11-LABEL: sudiv64:
2732; GFX11:       ; %bb.0:
2733; GFX11-NEXT:    s_clause 0x1
2734; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
2735; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x34
2736; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2737; GFX11-NEXT:    s_or_b64 s[4:5], s[10:11], s[2:3]
2738; GFX11-NEXT:    s_mov_b32 s4, 0
2739; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2740; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
2741; GFX11-NEXT:    s_cbranch_scc0 .LBB16_4
2742; GFX11-NEXT:  ; %bb.1:
2743; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
2744; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, s3
2745; GFX11-NEXT:    s_sub_u32 s5, 0, s2
2746; GFX11-NEXT:    s_subb_u32 s6, 0, s3
2747; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2748; GFX11-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
2749; GFX11-NEXT:    v_rcp_f32_e32 v0, v0
2750; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2751; GFX11-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
2752; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2753; GFX11-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
2754; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
2755; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2756; GFX11-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
2757; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
2758; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
2759; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2760; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
2761; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
2762; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2763; GFX11-NEXT:    s_mul_i32 s7, s5, s0
2764; GFX11-NEXT:    s_mul_hi_u32 s13, s5, s1
2765; GFX11-NEXT:    s_mul_i32 s12, s6, s1
2766; GFX11-NEXT:    s_add_i32 s7, s13, s7
2767; GFX11-NEXT:    s_mul_i32 s14, s5, s1
2768; GFX11-NEXT:    s_add_i32 s7, s7, s12
2769; GFX11-NEXT:    s_mul_hi_u32 s13, s1, s14
2770; GFX11-NEXT:    s_mul_hi_u32 s15, s0, s14
2771; GFX11-NEXT:    s_mul_i32 s12, s0, s14
2772; GFX11-NEXT:    s_mul_hi_u32 s14, s1, s7
2773; GFX11-NEXT:    s_mul_i32 s1, s1, s7
2774; GFX11-NEXT:    s_mul_hi_u32 s16, s0, s7
2775; GFX11-NEXT:    s_add_u32 s1, s13, s1
2776; GFX11-NEXT:    s_addc_u32 s13, 0, s14
2777; GFX11-NEXT:    s_add_u32 s1, s1, s12
2778; GFX11-NEXT:    s_mul_i32 s7, s0, s7
2779; GFX11-NEXT:    s_addc_u32 s1, s13, s15
2780; GFX11-NEXT:    s_addc_u32 s12, s16, 0
2781; GFX11-NEXT:    s_add_u32 s1, s1, s7
2782; GFX11-NEXT:    s_addc_u32 s7, 0, s12
2783; GFX11-NEXT:    v_add_co_u32 v0, s1, v0, s1
2784; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2785; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
2786; GFX11-NEXT:    s_addc_u32 s0, s0, s7
2787; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
2788; GFX11-NEXT:    s_mul_i32 s7, s5, s0
2789; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2790; GFX11-NEXT:    s_mul_hi_u32 s12, s5, s1
2791; GFX11-NEXT:    s_mul_i32 s6, s6, s1
2792; GFX11-NEXT:    s_add_i32 s7, s12, s7
2793; GFX11-NEXT:    s_mul_i32 s5, s5, s1
2794; GFX11-NEXT:    s_add_i32 s7, s7, s6
2795; GFX11-NEXT:    s_mul_hi_u32 s12, s0, s5
2796; GFX11-NEXT:    s_mul_i32 s13, s0, s5
2797; GFX11-NEXT:    s_mul_hi_u32 s5, s1, s5
2798; GFX11-NEXT:    s_mul_hi_u32 s14, s1, s7
2799; GFX11-NEXT:    s_mul_i32 s1, s1, s7
2800; GFX11-NEXT:    s_mul_hi_u32 s6, s0, s7
2801; GFX11-NEXT:    s_add_u32 s1, s5, s1
2802; GFX11-NEXT:    s_addc_u32 s5, 0, s14
2803; GFX11-NEXT:    s_add_u32 s1, s1, s13
2804; GFX11-NEXT:    s_mul_i32 s7, s0, s7
2805; GFX11-NEXT:    s_addc_u32 s1, s5, s12
2806; GFX11-NEXT:    s_addc_u32 s5, s6, 0
2807; GFX11-NEXT:    s_add_u32 s1, s1, s7
2808; GFX11-NEXT:    s_addc_u32 s5, 0, s5
2809; GFX11-NEXT:    v_add_co_u32 v0, s1, v0, s1
2810; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2811; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
2812; GFX11-NEXT:    s_addc_u32 s0, s0, s5
2813; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
2814; GFX11-NEXT:    s_mul_i32 s6, s10, s0
2815; GFX11-NEXT:    s_mul_hi_u32 s5, s10, s0
2816; GFX11-NEXT:    s_mul_hi_u32 s7, s11, s0
2817; GFX11-NEXT:    s_mul_i32 s0, s11, s0
2818; GFX11-NEXT:    s_mul_hi_u32 s12, s10, s1
2819; GFX11-NEXT:    s_mul_hi_u32 s13, s11, s1
2820; GFX11-NEXT:    s_mul_i32 s1, s11, s1
2821; GFX11-NEXT:    s_add_u32 s6, s12, s6
2822; GFX11-NEXT:    s_addc_u32 s5, 0, s5
2823; GFX11-NEXT:    s_add_u32 s1, s6, s1
2824; GFX11-NEXT:    s_addc_u32 s1, s5, s13
2825; GFX11-NEXT:    s_addc_u32 s5, s7, 0
2826; GFX11-NEXT:    s_add_u32 s1, s1, s0
2827; GFX11-NEXT:    s_addc_u32 s5, 0, s5
2828; GFX11-NEXT:    s_mul_hi_u32 s0, s2, s1
2829; GFX11-NEXT:    s_mul_i32 s7, s2, s5
2830; GFX11-NEXT:    s_mul_i32 s12, s2, s1
2831; GFX11-NEXT:    s_add_i32 s0, s0, s7
2832; GFX11-NEXT:    v_sub_co_u32 v0, s7, s10, s12
2833; GFX11-NEXT:    s_mul_i32 s6, s3, s1
2834; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2835; GFX11-NEXT:    s_add_i32 s0, s0, s6
2836; GFX11-NEXT:    v_sub_co_u32 v1, s12, v0, s2
2837; GFX11-NEXT:    s_sub_i32 s6, s11, s0
2838; GFX11-NEXT:    s_cmp_lg_u32 s7, 0
2839; GFX11-NEXT:    s_subb_u32 s6, s6, s3
2840; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
2841; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v1
2842; GFX11-NEXT:    s_subb_u32 s6, s6, 0
2843; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2844; GFX11-NEXT:    s_cmp_ge_u32 s6, s3
2845; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2846; GFX11-NEXT:    s_cselect_b32 s12, -1, 0
2847; GFX11-NEXT:    s_cmp_eq_u32 s6, s3
2848; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
2849; GFX11-NEXT:    s_add_u32 s6, s1, 1
2850; GFX11-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2851; GFX11-NEXT:    s_addc_u32 s12, s5, 0
2852; GFX11-NEXT:    s_add_u32 s13, s1, 2
2853; GFX11-NEXT:    s_addc_u32 s14, s5, 0
2854; GFX11-NEXT:    v_mov_b32_e32 v2, s13
2855; GFX11-NEXT:    s_cmp_lg_u32 s7, 0
2856; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v0
2857; GFX11-NEXT:    s_subb_u32 s0, s11, s0
2858; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2859; GFX11-NEXT:    s_cmp_ge_u32 s0, s3
2860; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2861; GFX11-NEXT:    s_cselect_b32 s7, -1, 0
2862; GFX11-NEXT:    s_cmp_eq_u32 s0, s3
2863; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
2864; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
2865; GFX11-NEXT:    v_mov_b32_e32 v1, s14
2866; GFX11-NEXT:    v_cndmask_b32_e64 v0, s7, v0, s0
2867; GFX11-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
2868; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2869; GFX11-NEXT:    v_cndmask_b32_e32 v1, s12, v1, vcc_lo
2870; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
2871; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
2872; GFX11-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
2873; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
2874; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
2875; GFX11-NEXT:    s_cbranch_vccnz .LBB16_3
2876; GFX11-NEXT:  .LBB16_2:
2877; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
2878; GFX11-NEXT:    s_sub_i32 s1, 0, s2
2879; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2880; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2881; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2882; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2883; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
2884; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2885; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
2886; GFX11-NEXT:    s_mul_i32 s1, s1, s0
2887; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2888; GFX11-NEXT:    s_mul_hi_u32 s1, s0, s1
2889; GFX11-NEXT:    s_add_i32 s0, s0, s1
2890; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2891; GFX11-NEXT:    s_mul_hi_u32 s0, s10, s0
2892; GFX11-NEXT:    s_mul_i32 s1, s0, s2
2893; GFX11-NEXT:    s_add_i32 s3, s0, 1
2894; GFX11-NEXT:    s_sub_i32 s1, s10, s1
2895; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2896; GFX11-NEXT:    s_sub_i32 s4, s1, s2
2897; GFX11-NEXT:    s_cmp_ge_u32 s1, s2
2898; GFX11-NEXT:    s_cselect_b32 s0, s3, s0
2899; GFX11-NEXT:    s_cselect_b32 s1, s4, s1
2900; GFX11-NEXT:    s_add_i32 s3, s0, 1
2901; GFX11-NEXT:    s_cmp_ge_u32 s1, s2
2902; GFX11-NEXT:    s_mov_b32 s1, 0
2903; GFX11-NEXT:    s_cselect_b32 s0, s3, s0
2904; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2905; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2906; GFX11-NEXT:  .LBB16_3:
2907; GFX11-NEXT:    v_mov_b32_e32 v2, 0
2908; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[8:9]
2909; GFX11-NEXT:    s_endpgm
2910; GFX11-NEXT:  .LBB16_4:
2911; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
2912; GFX11-NEXT:    s_branch .LBB16_2
2913  %result = udiv i64 %x, %y
2914  store i64 %result, ptr addrspace(1) %out
2915  ret void
2916}
2917
2918
2919
; Intrinsic declarations and attribute groups.
; Every declaration below is tagged with attribute group #1.

; Unsigned add with overflow, i64: takes two i64 operands and returns a
; { i64, i1 } pair (result plus an i1 flag; per the LLVM LangRef the flag
; reports unsigned overflow, i.e. the carry-out).
2920declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
2921
; Same operation on i32 operands, returning { i32, i1 }.
2922declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
2923
; Unsigned subtract with overflow, i64: two i64 operands, returns { i64, i1 }
; (per the LLVM LangRef the i1 reports unsigned overflow, i.e. the borrow-out).
2924declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1
2925
; Same operation on i32 operands, returning { i32, i1 }.
2926declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
2927
; AMDGPU intrinsic taking no arguments and returning an i32.
; NOTE(review): presumably the work-item id in the x dimension, used by tests
; earlier in the file to obtain a per-lane (divergent) value - confirm.
2928declare i32 @llvm.amdgcn.workitem.id.x() #1
2929
; Attribute group #0: nounwind only. No reference to #0 is visible in this part
; of the file - presumably it is attached to functions defined earlier; verify
; before removing.
2930attributes #0 = { nounwind }
; Attribute group #1: nounwind + readnone; applied to all declarations above.
2931attributes #1 = { nounwind readnone }
2932
2933;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2934; GCN-ISEL: {{.*}}
2935