xref: /llvm-project/llvm/test/CodeGen/NVPTX/variadics-backend.ll (revision 560b72c0408a8f7e4340a1d4197b164a14cd30b0)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s --check-prefix=CHECK-PTX
3; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
4
; Aggregate types used by the struct-passing tests in this file:
; S1 is { i32, i8, i64 } and S2 is two i64 fields.
5%struct.S1 = type { i32, i8, i64 }
6%struct.S2 = type { i64, i64 }
7
; Constant initializers (every field is 1). @__const.bar.s1 is loaded
; piecewise by @bar; @__const.qux.s is the memcpy source in @qux.
8@__const.bar.s1 = private unnamed_addr constant %struct.S1 { i32 1, i8 1, i64 1 }, align 8
9@__const.qux.s = private unnamed_addr constant %struct.S2 { i64 1, i64 1 }, align 8
10
; Variadic callee: reads three i32, one i64 and two double values from the
; va_list (in that order) and returns %first plus all of them as an i32.
; The running sum is widened to i64 for the i64 argument and converted to
; double and back for each double argument.
;
; The expected PTX loads the va_list pointer from variadics1_param_1 and
; folds the cursor updates into constant offsets: the three i32 loads use
; offsets 0, 4 and 8, and each 8-byte-aligned read shows up as an
; add/and pair (add 19 = 12 + 7 for the first one, add 15 = 8 + 7 after that,
; each masked with -8).
11define dso_local i32 @variadics1(i32 noundef %first, ...) {
12; CHECK-PTX-LABEL: variadics1(
13; CHECK-PTX:       {
14; CHECK-PTX-NEXT:    .reg .b32 %r<11>;
15; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
16; CHECK-PTX-NEXT:    .reg .f64 %fd<7>;
17; CHECK-PTX-EMPTY:
18; CHECK-PTX-NEXT:  // %bb.0: // %entry
19; CHECK-PTX-NEXT:    ld.param.u32 %r1, [variadics1_param_0];
20; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics1_param_1];
21; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd1];
22; CHECK-PTX-NEXT:    add.s32 %r3, %r1, %r2;
23; CHECK-PTX-NEXT:    ld.u32 %r4, [%rd1+4];
24; CHECK-PTX-NEXT:    add.s32 %r5, %r3, %r4;
25; CHECK-PTX-NEXT:    ld.u32 %r6, [%rd1+8];
26; CHECK-PTX-NEXT:    add.s32 %r7, %r5, %r6;
27; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 19;
28; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
29; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3];
30; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r7;
31; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
32; CHECK-PTX-NEXT:    cvt.u32.u64 %r8, %rd6;
33; CHECK-PTX-NEXT:    add.s64 %rd7, %rd3, 15;
34; CHECK-PTX-NEXT:    and.b64 %rd8, %rd7, -8;
35; CHECK-PTX-NEXT:    ld.f64 %fd1, [%rd8];
36; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd2, %r8;
37; CHECK-PTX-NEXT:    add.rn.f64 %fd3, %fd2, %fd1;
38; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r9, %fd3;
39; CHECK-PTX-NEXT:    add.s64 %rd9, %rd8, 15;
40; CHECK-PTX-NEXT:    and.b64 %rd10, %rd9, -8;
41; CHECK-PTX-NEXT:    ld.f64 %fd4, [%rd10];
42; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd5, %r9;
43; CHECK-PTX-NEXT:    add.rn.f64 %fd6, %fd5, %fd4;
44; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r10, %fd6;
45; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r10;
46; CHECK-PTX-NEXT:    ret;
47entry:
  ; %vlist holds the va_list cursor; every read below reloads it, advances it
  ; past the value and stores it back.
48  %vlist = alloca ptr, align 8
49  call void @llvm.va_start.p0(ptr %vlist)
  ; Variadic arguments 1-3: i32 values, cursor advanced by 4 bytes each with
  ; no realignment.
50  %argp.cur = load ptr, ptr %vlist, align 8
51  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i64 4
52  store ptr %argp.next, ptr %vlist, align 8
53  %0 = load i32, ptr %argp.cur, align 4
54  %add = add nsw i32 %first, %0
55  %argp.cur1 = load ptr, ptr %vlist, align 8
56  %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i64 4
57  store ptr %argp.next2, ptr %vlist, align 8
58  %1 = load i32, ptr %argp.cur1, align 4
59  %add3 = add nsw i32 %add, %1
60  %argp.cur4 = load ptr, ptr %vlist, align 8
61  %argp.next5 = getelementptr inbounds i8, ptr %argp.cur4, i64 4
62  store ptr %argp.next5, ptr %vlist, align 8
63  %2 = load i32, ptr %argp.cur4, align 4
64  %add6 = add nsw i32 %add3, %2
  ; Variadic argument 4: i64. The cursor is rounded up to an 8-byte boundary
  ; (+7, then ptrmask with -8) before the load and advanced by 8 afterwards.
65  %argp.cur7 = load ptr, ptr %vlist, align 8
66  %3 = getelementptr inbounds i8, ptr %argp.cur7, i32 7
67  %argp.cur7.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %3, i64 -8)
68  %argp.next8 = getelementptr inbounds i8, ptr %argp.cur7.aligned, i64 8
69  store ptr %argp.next8, ptr %vlist, align 8
70  %4 = load i64, ptr %argp.cur7.aligned, align 8
71  %conv = sext i32 %add6 to i64
72  %add9 = add nsw i64 %conv, %4
73  %conv10 = trunc i64 %add9 to i32
  ; Variadic argument 5: double, same 8-byte round-up; the i32 sum is
  ; converted to double for the add and back to i32 afterwards.
74  %argp.cur11 = load ptr, ptr %vlist, align 8
75  %5 = getelementptr inbounds i8, ptr %argp.cur11, i32 7
76  %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %5, i64 -8)
77  %argp.next12 = getelementptr inbounds i8, ptr %argp.cur11.aligned, i64 8
78  store ptr %argp.next12, ptr %vlist, align 8
79  %6 = load double, ptr %argp.cur11.aligned, align 8
80  %conv13 = sitofp i32 %conv10 to double
81  %add14 = fadd double %conv13, %6
82  %conv15 = fptosi double %add14 to i32
  ; Variadic argument 6: double, handled exactly like argument 5.
83  %argp.cur16 = load ptr, ptr %vlist, align 8
84  %7 = getelementptr inbounds i8, ptr %argp.cur16, i32 7
85  %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %7, i64 -8)
86  %argp.next17 = getelementptr inbounds i8, ptr %argp.cur16.aligned, i64 8
87  store ptr %argp.next17, ptr %vlist, align 8
88  %8 = load double, ptr %argp.cur16.aligned, align 8
89  %conv18 = sitofp i32 %conv15 to double
90  %add19 = fadd double %conv18, %8
91  %conv20 = fptosi double %add19 to i32
92  call void @llvm.va_end.p0(ptr %vlist)
93  ret i32 %conv20
94}
95
; Intrinsics used by the variadic callees in this file: va_start/va_end
; bracket each va_list access, and ptrmask is used to round the va_list
; cursor up to the alignment of the next argument.
96declare void @llvm.va_start.p0(ptr)
97
98declare ptr @llvm.ptrmask.p0.i64(ptr, i64)
99
100declare void @llvm.va_end.p0(ptr)
101
; Caller for @variadics1. Passes the fixed argument 1 followed by six
; variadic arguments: three i32 (two of them produced by sext of constants),
; one i64 and two doubles (one produced by fpext of a float constant).
;
; The expected PTX stores the variadic arguments into a 40-byte, 8-aligned
; local buffer and passes the buffer address (%SP + 0) as param1:
;   - offset 0:  4294967297 (0x0000000100000001), i.e. the first two i32
;                values stored with a single 64-bit store
;   - offset 8:  third i32
;   - offset 16: i64
;   - offsets 24 and 32: 4607182418800017408 (0x3FF0000000000000), the bit
;                pattern of the double 1.0
102define dso_local i32 @foo() {
103; CHECK-PTX-LABEL: foo(
104; CHECK-PTX:       {
105; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot1[40];
106; CHECK-PTX-NEXT:    .reg .b64 %SP;
107; CHECK-PTX-NEXT:    .reg .b64 %SPL;
108; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
109; CHECK-PTX-NEXT:    .reg .b64 %rd<5>;
110; CHECK-PTX-EMPTY:
111; CHECK-PTX-NEXT:  // %bb.0: // %entry
112; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot1;
113; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
114; CHECK-PTX-NEXT:    mov.b64 %rd1, 4294967297;
115; CHECK-PTX-NEXT:    st.u64 [%SP], %rd1;
116; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
117; CHECK-PTX-NEXT:    st.u32 [%SP+8], %r1;
118; CHECK-PTX-NEXT:    mov.b64 %rd2, 1;
119; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd2;
120; CHECK-PTX-NEXT:    mov.b64 %rd3, 4607182418800017408;
121; CHECK-PTX-NEXT:    st.u64 [%SP+24], %rd3;
122; CHECK-PTX-NEXT:    st.u64 [%SP+32], %rd3;
123; CHECK-PTX-NEXT:    add.u64 %rd4, %SP, 0;
124; CHECK-PTX-NEXT:    { // callseq 0, 0
125; CHECK-PTX-NEXT:    .param .b32 param0;
126; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
127; CHECK-PTX-NEXT:    .param .b64 param1;
128; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd4;
129; CHECK-PTX-NEXT:    .param .b32 retval0;
130; CHECK-PTX-NEXT:    call.uni (retval0),
131; CHECK-PTX-NEXT:    variadics1,
132; CHECK-PTX-NEXT:    (
133; CHECK-PTX-NEXT:    param0,
134; CHECK-PTX-NEXT:    param1
135; CHECK-PTX-NEXT:    );
136; CHECK-PTX-NEXT:    ld.param.b32 %r2, [retval0];
137; CHECK-PTX-NEXT:    } // callseq 0
138; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r2;
139; CHECK-PTX-NEXT:    ret;
140entry:
  ; Widening casts of constants; the expected PTX above shows the resulting
  ; values as immediates.
141  %conv = sext i8 1 to i32
142  %conv1 = sext i16 1 to i32
143  %conv2 = fpext float 1.000000e+00 to double
144  %call = call i32 (i32, ...) @variadics1(i32 noundef 1, i32 noundef %conv, i32 noundef %conv1, i32 noundef 1, i64 noundef 1, double noundef %conv2, double noundef 1.000000e+00)
145  ret i32 %call
146}
147
; Variadic callee: reads one 16-byte, 8-byte-aligned aggregate from the
; va_list in separate pieces and returns
; trunc(sext(%first + i32 piece + sext(i8 piece)) + i64 piece).
; The pieces are an i32 at offset 0, an i8 at offset 4, three bytes at
; offsets 5..7 (memcpy'd into a local [3 x i8] alloca and not read again)
; and an i64 at offset 8.
; NOTE(review): the offsets and the "sroa" value names suggest this is the
; SROA-split form of a %struct.S1 read, with bytes 5..7 being padding --
; confirm against the original C source.
;
; The expected PTX keeps the 3-byte copy: the byte at offset 7 is stored to
; [%SP+2] and the bytes at offsets 5 and 6 are combined into one 16-bit
; store to [%SP], in a 4-byte local depot.
148define dso_local i32 @variadics2(i32 noundef %first, ...) {
149; CHECK-PTX-LABEL: variadics2(
150; CHECK-PTX:       {
151; CHECK-PTX-NEXT:    .local .align 2 .b8 __local_depot2[4];
152; CHECK-PTX-NEXT:    .reg .b64 %SP;
153; CHECK-PTX-NEXT:    .reg .b64 %SPL;
154; CHECK-PTX-NEXT:    .reg .b16 %rs<6>;
155; CHECK-PTX-NEXT:    .reg .b32 %r<7>;
156; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
157; CHECK-PTX-EMPTY:
158; CHECK-PTX-NEXT:  // %bb.0: // %entry
159; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot2;
160; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
161; CHECK-PTX-NEXT:    ld.param.u32 %r1, [variadics2_param_0];
162; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics2_param_1];
163; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
164; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
165; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd3];
166; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd3+4];
167; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd3+7];
168; CHECK-PTX-NEXT:    st.u8 [%SP+2], %rs1;
169; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd3+5];
170; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd3+6];
171; CHECK-PTX-NEXT:    shl.b16 %rs4, %rs3, 8;
172; CHECK-PTX-NEXT:    or.b16 %rs5, %rs4, %rs2;
173; CHECK-PTX-NEXT:    st.u16 [%SP], %rs5;
174; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3+8];
175; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
176; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
177; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r5;
178; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
179; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd6;
180; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r6;
181; CHECK-PTX-NEXT:    ret;
182entry:
183  %vlist = alloca ptr, align 8
  ; Destination of the 3-byte memcpy below; nothing in this function reads it.
184  %s1.sroa.3 = alloca [3 x i8], align 1
185  call void @llvm.va_start.p0(ptr %vlist)
  ; Round the cursor up to 8 bytes, then advance it past the 16-byte aggregate.
186  %argp.cur = load ptr, ptr %vlist, align 8
187  %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7
188  %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8)
189  %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16
190  store ptr %argp.next, ptr %vlist, align 8
  ; Piecewise reads at offsets 0 (i32), 4 (i8), 5..7 (3 raw bytes), 8 (i64).
191  %s1.sroa.0.0.copyload = load i32, ptr %argp.cur.aligned, align 8
192  %s1.sroa.2.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 4
193  %s1.sroa.2.0.copyload = load i8, ptr %s1.sroa.2.0.argp.cur.aligned.sroa_idx, align 4
194  %s1.sroa.3.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 5
195  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 %s1.sroa.3.0.argp.cur.aligned.sroa_idx, i64 3, i1 false)
196  %s1.sroa.31.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8
197  %s1.sroa.31.0.copyload = load i64, ptr %s1.sroa.31.0.argp.cur.aligned.sroa_idx, align 8
  ; Sum the fixed argument with the three loaded values and truncate to i32.
198  %add = add nsw i32 %first, %s1.sroa.0.0.copyload
199  %conv = sext i8 %s1.sroa.2.0.copyload to i32
200  %add1 = add nsw i32 %add, %conv
201  %conv2 = sext i32 %add1 to i64
202  %add3 = add nsw i64 %conv2, %s1.sroa.31.0.copyload
203  %conv4 = trunc i64 %add3 to i32
204  call void @llvm.va_end.p0(ptr %vlist)
205  ret i32 %conv4
206}
207
; memcpy intrinsic used by @variadics2 and @bar for the 3-byte copy and by
; @qux for the 16-byte struct copy.
208declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
209
; Caller for @variadics2. Loads the i32, i8 and i64 pieces of
; @__const.bar.s1 (plus a 3-byte memcpy of bytes 5..7 into a local alloca)
; and passes the three loaded values as variadic arguments after the fixed
; argument 1.
;
; The expected PTX uses a 24-byte, 8-aligned local depot: bytes 0..2 receive
; the 3-byte copy (read from the global with ld.global.nc.u8), and the
; variadic argument buffer starts at %SP + 8 with the i32 at +8, the i8 at
; +12 and the i64 at +16. The three variadic values are stored as the
; immediate 1 rather than being loaded from the global. param1 is the
; address %SP + 8.
210define dso_local i32 @bar() {
211; CHECK-PTX-LABEL: bar(
212; CHECK-PTX:       {
213; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot3[24];
214; CHECK-PTX-NEXT:    .reg .b64 %SP;
215; CHECK-PTX-NEXT:    .reg .b64 %SPL;
216; CHECK-PTX-NEXT:    .reg .b16 %rs<10>;
217; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
218; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
219; CHECK-PTX-EMPTY:
220; CHECK-PTX-NEXT:  // %bb.0: // %entry
221; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot3;
222; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
223; CHECK-PTX-NEXT:    mov.u64 %rd1, __const_$_bar_$_s1;
224; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
225; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs1, [%rd2];
226; CHECK-PTX-NEXT:    cvt.u16.u8 %rs2, %rs1;
227; CHECK-PTX-NEXT:    st.u8 [%SP+2], %rs2;
228; CHECK-PTX-NEXT:    add.s64 %rd3, %rd1, 5;
229; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs3, [%rd3];
230; CHECK-PTX-NEXT:    cvt.u16.u8 %rs4, %rs3;
231; CHECK-PTX-NEXT:    add.s64 %rd4, %rd1, 6;
232; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs5, [%rd4];
233; CHECK-PTX-NEXT:    cvt.u16.u8 %rs6, %rs5;
234; CHECK-PTX-NEXT:    shl.b16 %rs7, %rs6, 8;
235; CHECK-PTX-NEXT:    or.b16 %rs8, %rs7, %rs4;
236; CHECK-PTX-NEXT:    st.u16 [%SP], %rs8;
237; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
238; CHECK-PTX-NEXT:    st.u32 [%SP+8], %r1;
239; CHECK-PTX-NEXT:    mov.b16 %rs9, 1;
240; CHECK-PTX-NEXT:    st.u8 [%SP+12], %rs9;
241; CHECK-PTX-NEXT:    mov.b64 %rd5, 1;
242; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd5;
243; CHECK-PTX-NEXT:    add.u64 %rd6, %SP, 8;
244; CHECK-PTX-NEXT:    { // callseq 1, 0
245; CHECK-PTX-NEXT:    .param .b32 param0;
246; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
247; CHECK-PTX-NEXT:    .param .b64 param1;
248; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd6;
249; CHECK-PTX-NEXT:    .param .b32 retval0;
250; CHECK-PTX-NEXT:    call.uni (retval0),
251; CHECK-PTX-NEXT:    variadics2,
252; CHECK-PTX-NEXT:    (
253; CHECK-PTX-NEXT:    param0,
254; CHECK-PTX-NEXT:    param1
255; CHECK-PTX-NEXT:    );
256; CHECK-PTX-NEXT:    ld.param.b32 %r2, [retval0];
257; CHECK-PTX-NEXT:    } // callseq 1
258; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r2;
259; CHECK-PTX-NEXT:    ret;
260entry:
  ; Destination of the 3-byte memcpy below; it is not passed to the callee.
261  %s1.sroa.3 = alloca [3 x i8], align 1
  ; Piecewise reads of @__const.bar.s1 at offsets 0, 4, 5..7 and 8.
262  %s1.sroa.0.0.copyload = load i32, ptr @__const.bar.s1, align 8
263  %s1.sroa.2.0.copyload = load i8, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 4), align 4
264  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 getelementptr inbounds (i8, ptr @__const.bar.s1, i64 5), i64 3, i1 false)
265  %s1.sroa.31.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 8), align 8
266  %call = call i32 (i32, ...) @variadics2(i32 noundef 1, i32 %s1.sroa.0.0.copyload, i8 %s1.sroa.2.0.copyload, i64 %s1.sroa.31.0.copyload)
267  ret i32 %call
268}
269
; Variadic callee: reads one <4 x i32> vector from the va_list at a
; 16-byte-aligned address and returns the sum of its four lanes.
; %first is not used in the body.
;
; The expected PTX rounds the va_list pointer up to 16 bytes (add 15,
; and -16) and reads all four lanes with a single ld.v4.u32.
270define dso_local i32 @variadics3(i32 noundef %first, ...) {
271; CHECK-PTX-LABEL: variadics3(
272; CHECK-PTX:       {
273; CHECK-PTX-NEXT:    .reg .b32 %r<8>;
274; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
275; CHECK-PTX-EMPTY:
276; CHECK-PTX-NEXT:  // %bb.0: // %entry
277; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics3_param_1];
278; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 15;
279; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -16;
280; CHECK-PTX-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd3];
281; CHECK-PTX-NEXT:    add.s32 %r5, %r1, %r2;
282; CHECK-PTX-NEXT:    add.s32 %r6, %r5, %r3;
283; CHECK-PTX-NEXT:    add.s32 %r7, %r6, %r4;
284; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r7;
285; CHECK-PTX-NEXT:    ret;
286entry:
287  %vlist = alloca ptr, align 8
288  call void @llvm.va_start.p0(ptr %vlist)
  ; Round the cursor up to 16 bytes (+15, mask -16), then advance it by 16.
289  %argp.cur = load ptr, ptr %vlist, align 8
290  %0 = getelementptr inbounds i8, ptr %argp.cur, i32 15
291  %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -16)
292  %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16
293  store ptr %argp.next, ptr %vlist, align 8
294  %1 = load <4 x i32>, ptr %argp.cur.aligned, align 16
295  call void @llvm.va_end.p0(ptr %vlist)
  ; Horizontal sum of lanes 0..3.
296  %2 = extractelement <4 x i32> %1, i64 0
297  %3 = extractelement <4 x i32> %1, i64 1
298  %add = add nsw i32 %2, %3
299  %4 = extractelement <4 x i32> %1, i64 2
300  %add1 = add nsw i32 %add, %4
301  %5 = extractelement <4 x i32> %1, i64 3
302  %add2 = add nsw i32 %add1, %5
303  ret i32 %add2
304}
305
; Caller for @variadics3. Passes the fixed argument 1 and a single
; <4 x i32> variadic argument whose lanes are all 1.
;
; The expected PTX writes the vector with one st.v4.u32 into a 16-byte,
; 16-aligned local depot and passes its address (%SP + 0) as param1.
306define dso_local i32 @baz() {
307; CHECK-PTX-LABEL: baz(
308; CHECK-PTX:       {
309; CHECK-PTX-NEXT:    .local .align 16 .b8 __local_depot5[16];
310; CHECK-PTX-NEXT:    .reg .b64 %SP;
311; CHECK-PTX-NEXT:    .reg .b64 %SPL;
312; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
313; CHECK-PTX-NEXT:    .reg .b64 %rd<2>;
314; CHECK-PTX-EMPTY:
315; CHECK-PTX-NEXT:  // %bb.0: // %entry
316; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot5;
317; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
318; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
319; CHECK-PTX-NEXT:    st.v4.u32 [%SP], {%r1, %r1, %r1, %r1};
320; CHECK-PTX-NEXT:    add.u64 %rd1, %SP, 0;
321; CHECK-PTX-NEXT:    { // callseq 2, 0
322; CHECK-PTX-NEXT:    .param .b32 param0;
323; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
324; CHECK-PTX-NEXT:    .param .b64 param1;
325; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd1;
326; CHECK-PTX-NEXT:    .param .b32 retval0;
327; CHECK-PTX-NEXT:    call.uni (retval0),
328; CHECK-PTX-NEXT:    variadics3,
329; CHECK-PTX-NEXT:    (
330; CHECK-PTX-NEXT:    param0,
331; CHECK-PTX-NEXT:    param1
332; CHECK-PTX-NEXT:    );
333; CHECK-PTX-NEXT:    ld.param.b32 %r2, [retval0];
334; CHECK-PTX-NEXT:    } // callseq 2
335; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r2;
336; CHECK-PTX-NEXT:    ret;
337entry:
338  %call = call i32 (i32, ...) @variadics3(i32 noundef 1, <4 x i32> noundef <i32 1, i32 1, i32 1, i32 1>)
339  ret i32 %call
340}
341
; Variadic callee whose fixed parameter is a byval %struct.S2. Reads one
; i64 from the va_list (cursor rounded up to 8 bytes) and returns
; trunc(first.field0 + first.field1 + vararg) as an i32.
;
; The expected PTX reads the two byval fields directly from
; variadics4_param_0 and variadics4_param_0+8, and the variadic i64 through
; the pointer loaded from variadics4_param_1.
342define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) {
343; CHECK-PTX-LABEL: variadics4(
344; CHECK-PTX:       {
345; CHECK-PTX-NEXT:    .reg .b32 %r<2>;
346; CHECK-PTX-NEXT:    .reg .b64 %rd<9>;
347; CHECK-PTX-EMPTY:
348; CHECK-PTX-NEXT:  // %bb.0: // %entry
349; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics4_param_1];
350; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
351; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
352; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3];
353; CHECK-PTX-NEXT:    ld.param.u64 %rd5, [variadics4_param_0];
354; CHECK-PTX-NEXT:    ld.param.u64 %rd6, [variadics4_param_0+8];
355; CHECK-PTX-NEXT:    add.s64 %rd7, %rd5, %rd6;
356; CHECK-PTX-NEXT:    add.s64 %rd8, %rd7, %rd4;
357; CHECK-PTX-NEXT:    cvt.u32.u64 %r1, %rd8;
358; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r1;
359; CHECK-PTX-NEXT:    ret;
360entry:
361  %vlist = alloca ptr, align 8
362  call void @llvm.va_start.p0(ptr %vlist)
  ; Round the cursor up to 8 bytes (+7, mask -8), advance by 8, load the i64.
363  %argp.cur = load ptr, ptr %vlist, align 8
364  %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7
365  %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8)
366  %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8
367  store ptr %argp.next, ptr %vlist, align 8
368  %1 = load i64, ptr %argp.cur.aligned, align 8
  ; Load both fields of the byval struct and add them to the variadic value.
369  %x1 = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 0
370  %2 = load i64, ptr %x1, align 8
371  %y = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 1
372  %3 = load i64, ptr %y, align 8
373  %add = add nsw i64 %2, %3
374  %add2 = add nsw i64 %add, %1
375  %conv = trunc i64 %add2 to i32
376  call void @llvm.va_end.p0(ptr %vlist)
377  ret i32 %conv
378}
379
; Caller for @variadics4. Copies @__const.qux.s into a local %struct.S2,
; passes it byval as the fixed argument together with one variadic i64 (1),
; and discards the call result.
;
; The expected PTX uses a 24-byte, 8-aligned local depot: the struct copy
; occupies %SP..%SP+15 and the variadic i64 is stored at %SP+16. param0 is
; declared as a 16-byte, 8-aligned byte array filled with two 64-bit stores,
; and param1 is the address %SP + 16.
380define dso_local void @qux() {
381; CHECK-PTX-LABEL: qux(
382; CHECK-PTX:       {
383; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot7[24];
384; CHECK-PTX-NEXT:    .reg .b64 %SP;
385; CHECK-PTX-NEXT:    .reg .b64 %SPL;
386; CHECK-PTX-NEXT:    .reg .b32 %r<3>;
387; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
388; CHECK-PTX-EMPTY:
389; CHECK-PTX-NEXT:  // %bb.0: // %entry
390; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot7;
391; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
392; CHECK-PTX-NEXT:    ld.global.nc.u64 %rd1, [__const_$_qux_$_s];
393; CHECK-PTX-NEXT:    st.u64 [%SP], %rd1;
394; CHECK-PTX-NEXT:    mov.u64 %rd2, __const_$_qux_$_s;
395; CHECK-PTX-NEXT:    add.s64 %rd3, %rd2, 8;
396; CHECK-PTX-NEXT:    ld.global.nc.u64 %rd4, [%rd3];
397; CHECK-PTX-NEXT:    st.u64 [%SP+8], %rd4;
398; CHECK-PTX-NEXT:    mov.b64 %rd5, 1;
399; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd5;
400; CHECK-PTX-NEXT:    add.u64 %rd6, %SP, 16;
401; CHECK-PTX-NEXT:    { // callseq 3, 0
402; CHECK-PTX-NEXT:    .param .align 8 .b8 param0[16];
403; CHECK-PTX-NEXT:    st.param.b64 [param0], %rd1;
404; CHECK-PTX-NEXT:    st.param.b64 [param0+8], %rd4;
405; CHECK-PTX-NEXT:    .param .b64 param1;
406; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd6;
407; CHECK-PTX-NEXT:    .param .b32 retval0;
408; CHECK-PTX-NEXT:    call.uni (retval0),
409; CHECK-PTX-NEXT:    variadics4,
410; CHECK-PTX-NEXT:    (
411; CHECK-PTX-NEXT:    param0,
412; CHECK-PTX-NEXT:    param1
413; CHECK-PTX-NEXT:    );
414; CHECK-PTX-NEXT:    ld.param.b32 %r1, [retval0];
415; CHECK-PTX-NEXT:    } // callseq 3
416; CHECK-PTX-NEXT:    ret;
417entry:
  ; Local copy of the constant struct, passed byval to the callee.
418  %s = alloca %struct.S2, align 8
419  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %s, ptr align 8 @__const.qux.s, i64 16, i1 false)
  ; The i32 result is not used; the function returns void.
420  %call = call i32 (ptr, ...) @variadics4(ptr noundef byval(%struct.S2) align 8 %s, i64 noundef 1)
421  ret void
422}
423