; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s

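; This file exercises the lowering of llvm.uadd.with.overflow.* on three
; AMDGPU generations (SI, VI/tonga, GFX9). The check lines below are
; autogenerated; the per-function notes are editorial summaries of the
; generated code, not checked output.

; With uniform i64 operands the add is done on the scalar unit
; (s_add_u32 + s_addc_u32) and the carry is recovered by an unsigned
; compare of the sum against one input, because the zext'ed carry feeds
; a second 64-bit add rather than a store.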
define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_uaddo_i64_zext:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_add_u32 s0, s2, s8
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    s_addc_u32 s1, s3, s9
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uaddo_i64_zext:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_add_u32 s0, s2, s4
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_addc_u32 s1, s3, s5
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_uaddo_i64_zext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    s_add_u32 s4, s2, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_addc_u32 s5, s3, s7
; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, ptr addrspace(1) %out, align 8
  ret void
}

; FIXME: Could do scalar

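; Editorial note: even with uniform operands the i32 overflow add is
; emitted on the vector unit, with the carry taken from vcc by
; v_cndmask_b32, which is what the FIXME above refers to.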
define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
; SI-LABEL: s_uaddo_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    v_add_i32_e32 v0, vcc, s12, v0
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    buffer_store_byte v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uaddo_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v4, s5
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT:    flat_store_dword v[0:1], v4
; VI-NEXT:    flat_store_byte v[2:3], v5
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_uaddo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s6, v1
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
; GFX9-NEXT:    s_endpgm
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

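; Editorial note: with operands loaded per lane, the same pattern is
; used (vector add writing vcc, then v_cndmask_b32 to produce the 0/1
; carry, stored as a byte). The workitem id is computed but unused, and
; the zero-index GEPs leave the pointers unchanged.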
define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    flat_load_dword v5, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT:    flat_store_dword v[0:1], v4
; VI-NEXT:    flat_store_byte v[2:3], v5
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[12:13]
; GFX9-NEXT:    global_load_dword v2, v0, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    global_store_dword v0, v1, s[8:9]
; GFX9-NEXT:    global_store_byte v0, v2, s[10:11]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
  %a = load i32, ptr addrspace(1) %a.gep, align 4
  %b = load i32, ptr addrspace(1) %b.gep, align 4
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

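; Editorial note: the inline asm clobbers vcc, so the carry is
; materialized into a VGPR with v_cndmask_b32 before the asm block, and
; the volatile stores keep the two writes ordered around it.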
define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_i32_novcc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    ;;#ASMSTART
; SI-NEXT:    ;;#ASMEND
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_i32_novcc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    flat_load_dword v5, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT:    flat_store_dword v[0:1], v4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    flat_store_byte v[2:3], v5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_i32_novcc:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[12:13]
; GFX9-NEXT:    global_load_dword v2, v0, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    global_store_dword v0, v1, s[8:9]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    global_store_byte v0, v2, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
  %a = load i32, ptr addrspace(1) %a.gep, align 4
  %b = load i32, ptr addrspace(1) %b.gep, align 4
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store volatile i32 %val, ptr addrspace(1) %out, align 4
  call void asm sideeffect "", "~{vcc}"() #0
  store volatile i1 %carry, ptr addrspace(1) %carryout
  ret void
}

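; Editorial note: a uniform i64 uaddo is done on the scalar unit; the
; carry is not read from scc but recomputed as (sum < lhs) with
; v_cmp_lt_u64 before being stored.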
define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
; SI-LABEL: s_uaddo_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_add_u32 s6, s4, s6
; SI-NEXT:    s_addc_u32 s7, s5, s7
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uaddo_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_add_u32 s0, s4, s6
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_addc_u32 s1, s5, s7
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_mov_b32_e32 v7, s1
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v6, s0
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_uaddo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_u32 s0, s12, s14
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    v_mov_b32_e32 v1, s13
; GFX9-NEXT:    s_addc_u32 s1, s13, s15
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT:    global_store_byte v4, v0, s[10:11]
; GFX9-NEXT:    s_endpgm
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  store i64 %val, ptr addrspace(1) %out, align 8
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

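; Editorial note: the divergent i64 case uses an add/addc pair and then
; the same (sum < lhs) compare to reconstruct the carry.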
define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    flat_store_byte v[6:7], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[12:13]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT:    global_store_byte v4, v0, s[10:11]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i64, ptr addrspace(1) %b.ptr
  %a = load i64, ptr addrspace(1) %a.gep
  %b = load i64, ptr addrspace(1) %b.gep
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  store i64 %val, ptr addrspace(1) %out
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

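; Editorial note: there is no 16-bit carry-out, so overflow is detected
; after the fact. On SI the sum is masked to 16 bits and compared for
; inequality with the unmasked sum; on VI both values are masked and
; compared with (sum < lhs); GFX9 folds the masking into an SDWA compare
; on the low words.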
define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_ushort v4, v[0:1]
; VI-NEXT:    flat_load_ushort v5, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v5, vcc, v4, v5
; VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v5
; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v6, v4
; VI-NEXT:    flat_store_short v[0:1], v5
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[12:13]
; GFX9-NEXT:    global_load_ushort v2, v0, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v2, v1, v2
; GFX9-NEXT:    v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT:    global_store_short v0, v2, s[8:9]
; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr
  %a = load i16, ptr addrspace(1) %a.gep
  %b = load i16, ptr addrspace(1) %b.gep
  %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
  %val = extractvalue { i16, i1 } %uadd, 0
  %carry = extractvalue { i16, i1 } %uadd, 1
  store i16 %val, ptr addrspace(1) %out
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

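; Editorial note: v2i32 simply decomposes into two independent 32-bit
; overflow adds; the two zext'ed carries are stored as a dwordx2 pair.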
define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_uaddo_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    flat_store_dwordx2 v[6:7], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[12:13]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT:    s_endpgm
  %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
  %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, ptr addrspace(1) %out, align 4
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, ptr addrspace(1) %carryout
  ret void
}

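; Editorial note: here the carry is consumed under control flow. It is
; kept in an SGPR pair, inverted with s_xor_b64 in the %if block, and
; only converted to 0/1 with v_cndmask_b32 at the store in %exit.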
define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
; SI-LABEL: s_uaddo_clamp_bit:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT:    s_cmp_eq_u32 s0, s1
; SI-NEXT:    s_mov_b64 s[0:1], 0
; SI-NEXT:    s_cbranch_scc1 .LBB8_2
; SI-NEXT:  ; %bb.1: ; %if
; SI-NEXT:    s_xor_b64 s[0:1], vcc, -1
; SI-NEXT:  .LBB8_2: ; %exit
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_byte v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uaddo_clamp_bit:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    s_cmp_eq_u32 s0, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_cbranch_scc1 .LBB8_2
; VI-NEXT:  ; %bb.1: ; %if
; VI-NEXT:    s_xor_b64 s[0:1], vcc, -1
; VI-NEXT:  .LBB8_2: ; %exit
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_mov_b32_e32 v3, s6
; VI-NEXT:    v_mov_b32_e32 v4, s7
; VI-NEXT:    flat_store_dword v[1:2], v0
; VI-NEXT:    flat_store_byte v[3:4], v5
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_uaddo_clamp_bit:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_cmp_eq_u32 s0, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:    s_cbranch_scc1 .LBB8_2
; GFX9-NEXT:  ; %bb.1: ; %if
; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
; GFX9-NEXT:  .LBB8_2: ; %exit
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v1, v0, s[8:9]
; GFX9-NEXT:    global_store_byte v1, v2, s[10:11]
; GFX9-NEXT:    s_endpgm
entry:
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %cout, ptr addrspace(1) %carryout
  ret void
}

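; Editorial note: the divergent variant uses the _e64 form of the add so
; the carry lands in an SGPR pair, leaving vcc free for the compare that
; drives the branch.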
define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_clamp_bit:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    s_mov_b32 s12, s10
; SI-NEXT:    s_mov_b32 s13, s11
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0
; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
; SI-NEXT:    s_mov_b64 s[8:9], 0
; SI-NEXT:    s_cbranch_vccnz .LBB9_2
; SI-NEXT:  ; %bb.1: ; %if
; SI-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
; SI-NEXT:  .LBB9_2: ; %exit
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_clamp_bit:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x24
; VI-NEXT:    s_mov_b64 s[2:3], 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    flat_load_dword v1, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v1, v2
; VI-NEXT:    s_cbranch_vccnz .LBB9_2
; VI-NEXT:  ; %bb.1: ; %if
; VI-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
; VI-NEXT:  .LBB9_2: ; %exit
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_mov_b32_e32 v3, s6
; VI-NEXT:    v_mov_b32_e32 v4, s7
; VI-NEXT:    flat_store_dword v[1:2], v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT:    flat_store_byte v[3:4], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_clamp_bit:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b64 s[2:3], 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[12:13]
; GFX9-NEXT:    global_load_dword v3, v0, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v2, v3
; GFX9-NEXT:    s_cbranch_vccnz .LBB9_2
; GFX9-NEXT:  ; %bb.1: ; %if
; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
; GFX9-NEXT:  .LBB9_2: ; %exit
; GFX9-NEXT:    global_store_dword v0, v1, s[8:9]
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
; GFX9-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
  %a = load i32, ptr addrspace(1) %a.gep
  %b = load i32, ptr addrspace(1) %b.gep
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %cout, ptr addrspace(1) %carryout
  ret void
}

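; Editorial note: i128 lowers to a four-instruction add/addc chain; the
; carry is reconstructed by comparing each 64-bit half of the sum
; against the corresponding input half, selecting the low-half result
; when the high halves are equal.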
define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) {
; SI-LABEL: sv_uaddo_i128:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT:    v_mov_b32_e32 v6, s1
; SI-NEXT:    v_mov_b32_e32 v7, s2
; SI-NEXT:    v_mov_b32_e32 v8, s3
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
; SI-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
; SI-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc
; SI-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; SI-NEXT:    v_and_b32_e32 v2, 1, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: sv_uaddo_i128:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v6, s1
; VI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v6, s3
; VI-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; VI-NEXT:    v_and_b32_e32 v2, 1, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: sv_uaddo_i128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_mov_b32_e32 v6, s1
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
; GFX9-NEXT:    v_mov_b32_e32 v6, s2
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
  %uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b)
  %carry = extractvalue { i128, i1 } %uadd, 1
  %carry.ext = zext i1 %carry to i32
  store i32 %carry.ext, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
declare { i128, i1 } @llvm.uadd.with.overflow.i128(i128, i128) #1
declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }