xref: /llvm-project/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll (revision 0f0a96b8621fcc8e1d6b6a3d047c263bb17a7f39)
1; RUN: llc < %s -mtriple=nvptx64-unknown-unknown | FileCheck %s
2; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-unknown-unknown | %ptxas-verify %}
3;
4; Check that parameters of a __device__ function with private or internal
5; linkage called from a __global__ (kernel) function get increased alignment,
6; and additional vectorization is performed on loads/stores with that
7; parameters.
8;
9; Test IR is a minimized version of IR generated with the following command
10; from the source code below:
11; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu
12;
13; ----------------------------------------------------------------------------
14; #include <stdint.h>
15;
16; struct St4x1 { uint32_t field[1]; };
17; struct St4x2 { uint32_t field[2]; };
18; struct St4x3 { uint32_t field[3]; };
19; struct St4x4 { uint32_t field[4]; };
20; struct St4x5 { uint32_t field[5]; };
21; struct St4x6 { uint32_t field[6]; };
22; struct St4x7 { uint32_t field[7]; };
23; struct St4x8 { uint32_t field[8]; };
24; struct St8x1 { uint64_t field[1]; };
25; struct St8x2 { uint64_t field[2]; };
26; struct St8x3 { uint64_t field[3]; };
27; struct St8x4 { uint64_t field[4]; };
28;
29; #define DECLARE_CALLEE(StName)                                      \
30; static __device__  __attribute__((noinline))                        \
31; struct StName callee_##StName(struct StName in) {                   \
32;   struct StName ret;                                                \
33;   const unsigned size = sizeof(ret.field) / sizeof(*ret.field);     \
34;   for (unsigned i = 0; i != size; ++i)                              \
35;     ret.field[i] = in.field[i];                                     \
36;   return ret;                                                       \
37; }                                                                   \
38
39; #define DECLARE_CALLER(StName)                                      \
40; __global__                                                          \
41; void caller_##StName(struct StName in, struct StName* ret)          \
42; {                                                                   \
43;   *ret = callee_##StName(in);                                       \
44; }                                                                   \
45;
46; #define DECLARE_CALL(StName)  \
47;     DECLARE_CALLEE(StName)    \
48;     DECLARE_CALLER(StName)    \
49;
50; DECLARE_CALL(St4x1)
51; DECLARE_CALL(St4x2)
52; DECLARE_CALL(St4x3)
53; DECLARE_CALL(St4x4)
54; DECLARE_CALL(St4x5)
55; DECLARE_CALL(St4x6)
56; DECLARE_CALL(St4x7)
57; DECLARE_CALL(St4x8)
58; DECLARE_CALL(St8x1)
59; DECLARE_CALL(St8x2)
60; DECLARE_CALL(St8x3)
61; DECLARE_CALL(St8x4)
62; ----------------------------------------------------------------------------
63
64%struct.St4x1 = type { [1 x i32] }
65%struct.St4x2 = type { [2 x i32] }
66%struct.St4x3 = type { [3 x i32] }
67%struct.St4x4 = type { [4 x i32] }
68%struct.St4x5 = type { [5 x i32] }
69%struct.St4x6 = type { [6 x i32] }
70%struct.St4x7 = type { [7 x i32] }
71%struct.St4x8 = type { [8 x i32] }
72%struct.St8x1 = type { [1 x i64] }
73%struct.St8x2 = type { [2 x i64] }
74%struct.St8x3 = type { [3 x i64] }
75%struct.St8x4 = type { [4 x i64] }
76
77; Section 1 - checking that:
78; - function argument (including retval) vectorization is done with internal linkage;
79; - caller and callee specify correct alignment for callee's params.
80
81define dso_local void @caller_St4x1(ptr nocapture noundef readonly byval(%struct.St4x1) align 4 %in, ptr nocapture noundef writeonly %ret) {
82  ; CHECK-LABEL: .visible .func caller_St4x1(
83  ; CHECK:               .param .align 4 .b8 caller_St4x1_param_0[4],
84  ; CHECK:               .param .b64 caller_St4x1_param_1
85  ; CHECK:       )
86  ; CHECK:       .param .b32 param0;
87  ; CHECK:       st.param.b32 [param0], {{%r[0-9]+}};
88  ; CHECK:       .param .align 16 .b8 retval0[4];
89  ; CHECK:       call.uni (retval0),
90  ; CHECK-NEXT:  callee_St4x1,
91  ; CHECK-NEXT:  (
92  ; CHECK-NEXT:  param0
93  ; CHECK-NEXT:  );
94  ; CHECK:       ld.param.b32 {{%r[0-9]+}}, [retval0];
95  %1 = load i32, ptr %in, align 4
96  %call = tail call fastcc [1 x i32] @callee_St4x1(i32 %1) #2
97  %.fca.0.extract = extractvalue [1 x i32] %call, 0
98  store i32 %.fca.0.extract, ptr %ret, align 4
99  ret void
100}
101
102define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) {
103  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[4])
104  ; CHECK-LABEL: callee_St4x1(
105  ; CHECK-NEXT:  .param .b32 callee_St4x1_param_0
106  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x1_param_0];
107  ; CHECK:       st.param.b32 [func_retval0], [[R1]];
108  ; CHECK-NEXT:  ret;
109  %oldret = insertvalue [1 x i32] poison, i32 %in.0.val, 0
110  ret [1 x i32] %oldret
111}
112
113define dso_local void @caller_St4x2(ptr nocapture noundef readonly byval(%struct.St4x2) align 4 %in, ptr nocapture noundef writeonly %ret) {
114  ; CHECK-LABEL: .visible .func caller_St4x2(
115  ; CHECK:               .param .align 4 .b8 caller_St4x2_param_0[8],
116  ; CHECK:               .param .b64 caller_St4x2_param_1
117  ; CHECK:       )
118  ; CHECK:       .param .align 16 .b8 param0[8];
119  ; CHECK:       st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
120  ; CHECK:       .param .align 16 .b8 retval0[8];
121  ; CHECK:       call.uni (retval0),
122  ; CHECK-NEXT:  callee_St4x2,
123  ; CHECK-NEXT:  (
124  ; CHECK-NEXT:  param0
125  ; CHECK-NEXT:  );
126  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
127  %agg.tmp = alloca %struct.St4x2, align 8
128  %1 = load i64, ptr %in, align 4
129  store i64 %1, ptr %agg.tmp, align 8
130  %call = tail call fastcc [2 x i32] @callee_St4x2(ptr noundef nonnull byval(%struct.St4x2) align 4 %agg.tmp) #2
131  %.fca.0.extract = extractvalue [2 x i32] %call, 0
132  %.fca.1.extract = extractvalue [2 x i32] %call, 1
133  store i32 %.fca.0.extract, ptr %ret, align 4
134  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
135  store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
136  ret void
137}
138
139define internal fastcc [2 x i32] @callee_St4x2(ptr nocapture noundef readonly byval(%struct.St4x2) align 4 %in) {
140  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[8])
141  ; CHECK-LABEL: callee_St4x2(
142  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x2_param_0[8]
143  ; CHECK:       ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0];
144  ; CHECK:       st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
145  ; CHECK-NEXT:  ret;
146  %1 = load i32, ptr %in, align 4
147  %arrayidx.1 = getelementptr inbounds [2 x i32], ptr %in, i64 0, i64 1
148  %2 = load i32, ptr %arrayidx.1, align 4
149  %3 = insertvalue [2 x i32] poison, i32 %1, 0
150  %oldret = insertvalue [2 x i32] %3, i32 %2, 1
151  ret [2 x i32] %oldret
152}
153
154define dso_local void @caller_St4x3(ptr nocapture noundef readonly byval(%struct.St4x3) align 4 %in, ptr nocapture noundef writeonly %ret) {
155  ; CHECK-LABEL: .visible .func caller_St4x3(
156  ; CHECK:               .param .align 4 .b8 caller_St4x3_param_0[12],
157  ; CHECK:               .param .b64 caller_St4x3_param_1
158  ; CHECK:       )
159  ; CHECK:       .param .align 16 .b8 param0[12];
160  ; CHECK:       st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
161  ; CHECK:       st.param.b32    [param0+8], {{%r[0-9]+}};
162  ; CHECK:       .param .align 16 .b8 retval0[12];
163  ; CHECK:       call.uni (retval0),
164  ; CHECK-NEXT:  callee_St4x3,
165  ; CHECK-NEXT:  (
166  ; CHECK-NEXT:  param0
167  ; CHECK-NEXT:  );
168  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
169  ; CHECK:       ld.param.b32    {{%r[0-9]+}},  [retval0+8];
170  %call = tail call fastcc [3 x i32] @callee_St4x3(ptr noundef nonnull byval(%struct.St4x3) align 4 %in) #2
171  %.fca.0.extract = extractvalue [3 x i32] %call, 0
172  %.fca.1.extract = extractvalue [3 x i32] %call, 1
173  %.fca.2.extract = extractvalue [3 x i32] %call, 2
174  store i32 %.fca.0.extract, ptr %ret, align 4
175  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
176  store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
177  %ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
178  store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
179  ret void
180}
181
182define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly byval(%struct.St4x3) align 4 %in) {
183  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[12])
184  ; CHECK-LABEL: callee_St4x3(
185  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x3_param_0[12]
186  ; CHECK:       ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
187  ; CHECK:       ld.param.u32    [[R3:%r[0-9]+]],  [callee_St4x3_param_0+8];
188  ; CHECK:       st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
189  ; CHECK:       st.param.b32    [func_retval0+8], [[R3]];
190  ; CHECK-NEXT:  ret;
191  %1 = load i32, ptr %in, align 4
192  %arrayidx.1 = getelementptr inbounds [3 x i32], ptr %in, i64 0, i64 1
193  %2 = load i32, ptr %arrayidx.1, align 4
194  %arrayidx.2 = getelementptr inbounds [3 x i32], ptr %in, i64 0, i64 2
195  %3 = load i32, ptr %arrayidx.2, align 4
196  %4 = insertvalue [3 x i32] poison, i32 %1, 0
197  %5 = insertvalue [3 x i32] %4, i32 %2, 1
198  %oldret = insertvalue [3 x i32] %5, i32 %3, 2
199  ret [3 x i32] %oldret
200}
201
202define dso_local void @caller_St4x4(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in, ptr nocapture noundef writeonly %ret) {
203  ; CHECK-LABEL: .visible .func caller_St4x4(
204  ; CHECK:               .param .align 4 .b8 caller_St4x4_param_0[16],
205  ; CHECK:               .param .b64 caller_St4x4_param_1
206  ; CHECK:       )
207  ; CHECK:       .param .align 16 .b8 param0[16];
208  ; CHECK:       st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
209  ; CHECK:       .param .align 16 .b8 retval0[16];
210  ; CHECK:       call.uni (retval0),
211  ; CHECK-NEXT:  callee_St4x4,
212  ; CHECK-NEXT:  (
213  ; CHECK-NEXT:  param0
214  ; CHECK-NEXT:  );
215  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
216  %call = tail call fastcc [4 x i32] @callee_St4x4(ptr noundef nonnull byval(%struct.St4x4) align 4 %in) #2
217  %.fca.0.extract = extractvalue [4 x i32] %call, 0
218  %.fca.1.extract = extractvalue [4 x i32] %call, 1
219  %.fca.2.extract = extractvalue [4 x i32] %call, 2
220  %.fca.3.extract = extractvalue [4 x i32] %call, 3
221  store i32 %.fca.0.extract, ptr %ret, align 4
222  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
223  store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
224  %ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
225  store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
226  %ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
227  store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
228  ret void
229}
230
231define internal fastcc [4 x i32] @callee_St4x4(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
232  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
233  ; CHECK-LABEL: callee_St4x4(
234  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x4_param_0[16]
235  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0];
236  ; CHECK:       st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
237  ; CHECK-NEXT:  ret;
238  %1 = load i32, ptr %in, align 4
239  %arrayidx.1 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 1
240  %2 = load i32, ptr %arrayidx.1, align 4
241  %arrayidx.2 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 2
242  %3 = load i32, ptr %arrayidx.2, align 4
243  %arrayidx.3 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 3
244  %4 = load i32, ptr %arrayidx.3, align 4
245  %5 = insertvalue [4 x i32] poison, i32 %1, 0
246  %6 = insertvalue [4 x i32] %5, i32 %2, 1
247  %7 = insertvalue [4 x i32] %6, i32 %3, 2
248  %oldret = insertvalue [4 x i32] %7, i32 %4, 3
249  ret [4 x i32] %oldret
250}
251
252define dso_local void @caller_St4x5(ptr nocapture noundef readonly byval(%struct.St4x5) align 4 %in, ptr nocapture noundef writeonly %ret) {
253  ; CHECK-LABEL: .visible .func caller_St4x5(
254  ; CHECK:               .param .align 4 .b8 caller_St4x5_param_0[20],
255  ; CHECK:               .param .b64 caller_St4x5_param_1
256  ; CHECK:       )
257  ; CHECK:       .param .align 16 .b8 param0[20];
258  ; CHECK:       st.param.v4.b32 [param0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
259  ; CHECK:       st.param.b32    [param0+16], {{%r[0-9]+}};
260  ; CHECK:       .param .align 16 .b8 retval0[20];
261  ; CHECK:       call.uni (retval0),
262  ; CHECK-NEXT:  callee_St4x5,
263  ; CHECK-NEXT:  (
264  ; CHECK-NEXT:  param0
265  ; CHECK-NEXT:  );
266  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
267  ; CHECK:       ld.param.b32    {{%r[0-9]+}},  [retval0+16];
268  %call = tail call fastcc [5 x i32] @callee_St4x5(ptr noundef nonnull byval(%struct.St4x5) align 4 %in) #2
269  %.fca.0.extract = extractvalue [5 x i32] %call, 0
270  %.fca.1.extract = extractvalue [5 x i32] %call, 1
271  %.fca.2.extract = extractvalue [5 x i32] %call, 2
272  %.fca.3.extract = extractvalue [5 x i32] %call, 3
273  %.fca.4.extract = extractvalue [5 x i32] %call, 4
274  store i32 %.fca.0.extract, ptr %ret, align 4
275  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
276  store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
277  %ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
278  store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
279  %ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
280  store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
281  %ref.tmp.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
282  store i32 %.fca.4.extract, ptr %ref.tmp.sroa.7.0..sroa_idx, align 4
283  ret void
284}
285
286define internal fastcc [5 x i32] @callee_St4x5(ptr nocapture noundef readonly byval(%struct.St4x5) align 4 %in) {
287  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[20])
288  ; CHECK-LABEL: callee_St4x5(
289  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x5_param_0[20]
290  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0];
291  ; CHECK:       ld.param.u32    [[R5:%r[0-9]+]],   [callee_St4x5_param_0+16];
292  ; CHECK:       st.param.v4.b32 [func_retval0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
293  ; CHECK:       st.param.b32    [func_retval0+16], [[R5]];
294  ; CHECK-NEXT:  ret;
295  %1 = load i32, ptr %in, align 4
296  %arrayidx.1 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 1
297  %2 = load i32, ptr %arrayidx.1, align 4
298  %arrayidx.2 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 2
299  %3 = load i32, ptr %arrayidx.2, align 4
300  %arrayidx.3 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 3
301  %4 = load i32, ptr %arrayidx.3, align 4
302  %arrayidx.4 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 4
303  %5 = load i32, ptr %arrayidx.4, align 4
304  %6 = insertvalue [5 x i32] poison, i32 %1, 0
305  %7 = insertvalue [5 x i32] %6, i32 %2, 1
306  %8 = insertvalue [5 x i32] %7, i32 %3, 2
307  %9 = insertvalue [5 x i32] %8, i32 %4, 3
308  %oldret = insertvalue [5 x i32] %9, i32 %5, 4
309  ret [5 x i32] %oldret
310}
311
312define dso_local void @caller_St4x6(ptr nocapture noundef readonly byval(%struct.St4x6) align 4 %in, ptr nocapture noundef writeonly %ret) {
313  ; CHECK-LABEL: .visible .func caller_St4x6(
314  ; CHECK:               .param .align 4 .b8 caller_St4x6_param_0[24],
315  ; CHECK:               .param .b64 caller_St4x6_param_1
316  ; CHECK:       )
317  ; CHECK:       .param .align 16 .b8 param0[24];
318  ; CHECK:       st.param.v4.b32 [param0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
319  ; CHECK:       st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
320  ; CHECK:       .param .align 16 .b8 retval0[24];
321  ; CHECK:       call.uni (retval0),
322  ; CHECK-NEXT:  callee_St4x6,
323  ; CHECK-NEXT:  (
324  ; CHECK-NEXT:  param0
325  ; CHECK-NEXT:  );
326  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
327  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
328  %call = tail call fastcc [6 x i32] @callee_St4x6(ptr noundef nonnull byval(%struct.St4x6) align 4 %in) #2
329  %.fca.0.extract = extractvalue [6 x i32] %call, 0
330  %.fca.1.extract = extractvalue [6 x i32] %call, 1
331  %.fca.2.extract = extractvalue [6 x i32] %call, 2
332  %.fca.3.extract = extractvalue [6 x i32] %call, 3
333  %.fca.4.extract = extractvalue [6 x i32] %call, 4
334  %.fca.5.extract = extractvalue [6 x i32] %call, 5
335  store i32 %.fca.0.extract, ptr %ret, align 4
336  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
337  store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
338  %ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
339  store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
340  %ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
341  store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
342  %ref.tmp.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
343  store i32 %.fca.4.extract, ptr %ref.tmp.sroa.7.0..sroa_idx, align 4
344  %ref.tmp.sroa.8.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 20
345  store i32 %.fca.5.extract, ptr %ref.tmp.sroa.8.0..sroa_idx, align 4
346  ret void
347}
348
349define internal fastcc [6 x i32] @callee_St4x6(ptr nocapture noundef readonly byval(%struct.St4x6) align 4 %in) {
350  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[24])
351  ; CHECK-LABEL: callee_St4x6(
352  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x6_param_0[24]
353  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0];
354  ; CHECK:       ld.param.v2.u32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16];
355  ; CHECK:       st.param.v4.b32 [func_retval0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
356  ; CHECK:       st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
357  ; CHECK-NEXT:  ret;
358  %1 = load i32, ptr %in, align 4
359  %arrayidx.1 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 1
360  %2 = load i32, ptr %arrayidx.1, align 4
361  %arrayidx.2 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 2
362  %3 = load i32, ptr %arrayidx.2, align 4
363  %arrayidx.3 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 3
364  %4 = load i32, ptr %arrayidx.3, align 4
365  %arrayidx.4 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 4
366  %5 = load i32, ptr %arrayidx.4, align 4
367  %arrayidx.5 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 5
368  %6 = load i32, ptr %arrayidx.5, align 4
369  %7 = insertvalue [6 x i32] poison, i32 %1, 0
370  %8 = insertvalue [6 x i32] %7, i32 %2, 1
371  %9 = insertvalue [6 x i32] %8, i32 %3, 2
372  %10 = insertvalue [6 x i32] %9, i32 %4, 3
373  %11 = insertvalue [6 x i32] %10, i32 %5, 4
374  %oldret = insertvalue [6 x i32] %11, i32 %6, 5
375  ret [6 x i32] %oldret
376}
377
378define dso_local void @caller_St4x7(ptr nocapture noundef readonly byval(%struct.St4x7) align 4 %in, ptr nocapture noundef writeonly %ret) {
379  ; CHECK-LABEL: .visible .func caller_St4x7(
380  ; CHECK:               .param .align 4 .b8 caller_St4x7_param_0[28],
381  ; CHECK:               .param .b64 caller_St4x7_param_1
382  ; CHECK:       )
383  ; CHECK:       .param .align 16 .b8 param0[28];
384  ; CHECK:       st.param.v4.b32 [param0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
385  ; CHECK:       st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
386  ; CHECK:       st.param.b32    [param0+24], {{%r[0-9]+}};
387  ; CHECK:       .param .align 16 .b8 retval0[28];
388  ; CHECK:       call.uni (retval0),
389  ; CHECK-NEXT:  callee_St4x7,
390  ; CHECK-NEXT:  (
391  ; CHECK-NEXT:  param0
392  ; CHECK-NEXT:  );
393  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
394  ; CHECK:       ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
395  ; CHECK:       ld.param.b32    {{%r[0-9]+}}, [retval0+24];
396  %call = tail call fastcc [7 x i32] @callee_St4x7(ptr noundef nonnull byval(%struct.St4x7) align 4 %in) #2
397  %.fca.0.extract = extractvalue [7 x i32] %call, 0
398  %.fca.1.extract = extractvalue [7 x i32] %call, 1
399  %.fca.2.extract = extractvalue [7 x i32] %call, 2
400  %.fca.3.extract = extractvalue [7 x i32] %call, 3
401  %.fca.4.extract = extractvalue [7 x i32] %call, 4
402  %.fca.5.extract = extractvalue [7 x i32] %call, 5
403  %.fca.6.extract = extractvalue [7 x i32] %call, 6
404  store i32 %.fca.0.extract, ptr %ret, align 4
405  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
406  store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
407  %ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
408  store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
409  %ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
410  store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
411  %ref.tmp.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
412  store i32 %.fca.4.extract, ptr %ref.tmp.sroa.7.0..sroa_idx, align 4
413  %ref.tmp.sroa.8.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 20
414  store i32 %.fca.5.extract, ptr %ref.tmp.sroa.8.0..sroa_idx, align 4
415  %ref.tmp.sroa.9.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 24
416  store i32 %.fca.6.extract, ptr %ref.tmp.sroa.9.0..sroa_idx, align 4
417  ret void
418}
419
420define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly byval(%struct.St4x7) align 4 %in) {
421  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[28])
422  ; CHECK-LABEL: callee_St4x7(
423  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x7_param_0[28]
424  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
425  ; CHECK:       ld.param.v2.u32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
426  ; CHECK:       ld.param.u32    [[R7:%r[0-9]+]],   [callee_St4x7_param_0+24];
427  ; CHECK:       st.param.v4.b32 [func_retval0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
428  ; CHECK:       st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
429  ; CHECK:       st.param.b32    [func_retval0+24], [[R7]];
430  ; CHECK-NEXT:  ret;
431  %1 = load i32, ptr %in, align 4
432  %arrayidx.1 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 1
433  %2 = load i32, ptr %arrayidx.1, align 4
434  %arrayidx.2 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 2
435  %3 = load i32, ptr %arrayidx.2, align 4
436  %arrayidx.3 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 3
437  %4 = load i32, ptr %arrayidx.3, align 4
438  %arrayidx.4 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 4
439  %5 = load i32, ptr %arrayidx.4, align 4
440  %arrayidx.5 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 5
441  %6 = load i32, ptr %arrayidx.5, align 4
442  %arrayidx.6 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 6
443  %7 = load i32, ptr %arrayidx.6, align 4
444  %8 = insertvalue [7 x i32] poison, i32 %1, 0
445  %9 = insertvalue [7 x i32] %8, i32 %2, 1
446  %10 = insertvalue [7 x i32] %9, i32 %3, 2
447  %11 = insertvalue [7 x i32] %10, i32 %4, 3
448  %12 = insertvalue [7 x i32] %11, i32 %5, 4
449  %13 = insertvalue [7 x i32] %12, i32 %6, 5
450  %oldret = insertvalue [7 x i32] %13, i32 %7, 6
451  ret [7 x i32] %oldret
452}
453
454define dso_local void @caller_St4x8(ptr nocapture noundef readonly byval(%struct.St4x8) align 4 %in, ptr nocapture noundef writeonly %ret) {
455  ; CHECK-LABEL: .visible .func caller_St4x8(
456  ; CHECK:               .param .align 4 .b8 caller_St4x8_param_0[32],
457  ; CHECK:               .param .b64 caller_St4x8_param_1
458  ; CHECK:       )
459  ; CHECK:       .param .align 16 .b8 param0[32];
460  ; CHECK:       st.param.v4.b32 [param0],  {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
461  ; CHECK:       st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
462  ; CHECK:       .param .align 16 .b8 retval0[32];
463  ; CHECK:       call.uni (retval0),
464  ; CHECK-NEXT:  callee_St4x8,
465  ; CHECK-NEXT:  (
466  ; CHECK-NEXT:  param0
467  ; CHECK-NEXT:  );
468  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
469  ; CHECK:       ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
470  %call = tail call fastcc [8 x i32] @callee_St4x8(ptr noundef nonnull byval(%struct.St4x8) align 4 %in) #2
471  %.fca.0.extract = extractvalue [8 x i32] %call, 0
472  %.fca.1.extract = extractvalue [8 x i32] %call, 1
473  %.fca.2.extract = extractvalue [8 x i32] %call, 2
474  %.fca.3.extract = extractvalue [8 x i32] %call, 3
475  %.fca.4.extract = extractvalue [8 x i32] %call, 4
476  %.fca.5.extract = extractvalue [8 x i32] %call, 5
477  %.fca.6.extract = extractvalue [8 x i32] %call, 6
478  %.fca.7.extract = extractvalue [8 x i32] %call, 7
479  store i32 %.fca.0.extract, ptr %ret, align 4
480  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
481  store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
482  %ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
483  store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
484  %ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
485  store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
486  %ref.tmp.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
487  store i32 %.fca.4.extract, ptr %ref.tmp.sroa.7.0..sroa_idx, align 4
488  %ref.tmp.sroa.8.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 20
489  store i32 %.fca.5.extract, ptr %ref.tmp.sroa.8.0..sroa_idx, align 4
490  %ref.tmp.sroa.9.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 24
491  store i32 %.fca.6.extract, ptr %ref.tmp.sroa.9.0..sroa_idx, align 4
492  %ref.tmp.sroa.10.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 28
493  store i32 %.fca.7.extract, ptr %ref.tmp.sroa.10.0..sroa_idx, align 4
494  ret void
495}
496
497define internal fastcc [8 x i32] @callee_St4x8(ptr nocapture noundef readonly byval(%struct.St4x8) align 4 %in) {
498  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[32])
499  ; CHECK-LABEL: callee_St4x8(
500  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x8_param_0[32]
501  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0];
502  ; CHECK:       ld.param.v4.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16];
503  ; CHECK:       st.param.v4.b32 [func_retval0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
504  ; CHECK:       st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]};
505  ; CHECK-NEXT:  ret;
506  %1 = load i32, ptr %in, align 4
507  %arrayidx.1 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 1
508  %2 = load i32, ptr %arrayidx.1, align 4
509  %arrayidx.2 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 2
510  %3 = load i32, ptr %arrayidx.2, align 4
511  %arrayidx.3 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 3
512  %4 = load i32, ptr %arrayidx.3, align 4
513  %arrayidx.4 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 4
514  %5 = load i32, ptr %arrayidx.4, align 4
515  %arrayidx.5 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 5
516  %6 = load i32, ptr %arrayidx.5, align 4
517  %arrayidx.6 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 6
518  %7 = load i32, ptr %arrayidx.6, align 4
519  %arrayidx.7 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 7
520  %8 = load i32, ptr %arrayidx.7, align 4
521  %9 = insertvalue [8 x i32] poison, i32 %1, 0
522  %10 = insertvalue [8 x i32] %9, i32 %2, 1
523  %11 = insertvalue [8 x i32] %10, i32 %3, 2
524  %12 = insertvalue [8 x i32] %11, i32 %4, 3
525  %13 = insertvalue [8 x i32] %12, i32 %5, 4
526  %14 = insertvalue [8 x i32] %13, i32 %6, 5
527  %15 = insertvalue [8 x i32] %14, i32 %7, 6
528  %oldret = insertvalue [8 x i32] %15, i32 %8, 7
529  ret [8 x i32] %oldret
530}
531
532define dso_local void @caller_St8x1(ptr nocapture noundef readonly byval(%struct.St8x1) align 8 %in, ptr nocapture noundef writeonly %ret) {
533  ; CHECK-LABEL: .visible .func caller_St8x1(
534  ; CHECK:               .param .align 8 .b8 caller_St8x1_param_0[8],
535  ; CHECK:               .param .b64 caller_St8x1_param_1
536  ; CHECK:       )
537  ; CHECK:       .param .b64 param0;
538  ; CHECK:       st.param.b64 [param0], {{%rd[0-9]+}};
539  ; CHECK:       .param .align 16 .b8 retval0[8];
540  ; CHECK:       call.uni (retval0),
541  ; CHECK-NEXT:  callee_St8x1,
542  ; CHECK-NEXT:  (
543  ; CHECK-NEXT:  param0
544  ; CHECK-NEXT:  );
545  ; CHECK:       ld.param.b64 {{%rd[0-9]+}}, [retval0];
546  %1 = load i64, ptr %in, align 8
547  %call = tail call fastcc [1 x i64] @callee_St8x1(i64 %1) #2
548  %.fca.0.extract = extractvalue [1 x i64] %call, 0
549  store i64 %.fca.0.extract, ptr %ret, align 8
550  ret void
551}
552
553define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) {
554  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[8])
555  ; CHECK-LABEL: callee_St8x1(
556  ; CHECK-NEXT:  .param .b64 callee_St8x1_param_0
557  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0];
558  ; CHECK:       st.param.b64 [func_retval0],  [[RD1]];
559  ; CHECK-NEXT:  ret;
560  %oldret = insertvalue [1 x i64] poison, i64 %in.0.val, 0
561  ret [1 x i64] %oldret
562}
563
564define dso_local void @caller_St8x2(ptr nocapture noundef readonly byval(%struct.St8x2) align 8 %in, ptr nocapture noundef writeonly %ret) {
565  ; CHECK-LABEL: .visible .func caller_St8x2(
566  ; CHECK:               .param .align 8 .b8 caller_St8x2_param_0[16],
567  ; CHECK:               .param .b64 caller_St8x2_param_1
568  ; CHECK:       )
569  ; CHECK:       .param .align 16 .b8 param0[16];
570  ; CHECK:       st.param.v2.b64 [param0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
571  ; CHECK:       .param .align 16 .b8 retval0[16];
572  ; CHECK:       call.uni (retval0),
573  ; CHECK-NEXT:  callee_St8x2,
574  ; CHECK-NEXT:  (
575  ; CHECK-NEXT:  param0
576  ; CHECK-NEXT:  );
577  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
578  %call = tail call fastcc [2 x i64] @callee_St8x2(ptr noundef nonnull byval(%struct.St8x2) align 8 %in) #2
579  %.fca.0.extract = extractvalue [2 x i64] %call, 0
580  %.fca.1.extract = extractvalue [2 x i64] %call, 1
581  store i64 %.fca.0.extract, ptr %ret, align 8
582  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
583  store i64 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 8
584  ret void
585}
586
587define internal fastcc [2 x i64] @callee_St8x2(ptr nocapture noundef readonly byval(%struct.St8x2) align 8 %in) {
588  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
589  ; CHECK-LABEL: callee_St8x2(
590  ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x2_param_0[16]
591  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0];
592  ; CHECK:       st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]};
593  ; CHECK-NEXT:  ret;
594  %1 = load i64, ptr %in, align 8
595  %arrayidx.1 = getelementptr inbounds [2 x i64], ptr %in, i64 0, i64 1
596  %2 = load i64, ptr %arrayidx.1, align 8
597  %3 = insertvalue [2 x i64] poison, i64 %1, 0
598  %oldret = insertvalue [2 x i64] %3, i64 %2, 1
599  ret [2 x i64] %oldret
600}
601
602define dso_local void @caller_St8x3(ptr nocapture noundef readonly byval(%struct.St8x3) align 8 %in, ptr nocapture noundef writeonly %ret) {
603  ; CHECK-LABEL: .visible .func caller_St8x3(
604  ; CHECK:               .param .align 8 .b8 caller_St8x3_param_0[24],
605  ; CHECK:               .param .b64 caller_St8x3_param_1
606  ; CHECK:       )
607  ; CHECK:       .param .align 16 .b8 param0[24];
608  ; CHECK:       st.param.v2.b64 [param0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
609  ; CHECK:       st.param.b64    [param0+16], {{%rd[0-9]+}};
610  ; CHECK:       .param .align 16 .b8 retval0[24];
611  ; CHECK:       call.uni (retval0),
612  ; CHECK-NEXT:  callee_St8x3,
613  ; CHECK-NEXT:  (
614  ; CHECK-NEXT:  param0
615  ; CHECK-NEXT:  );
616  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
617  ; CHECK:       ld.param.b64    {{%rd[0-9]+}}, [retval0+16];
618  %call = tail call fastcc [3 x i64] @callee_St8x3(ptr noundef nonnull byval(%struct.St8x3) align 8 %in) #2
619  %.fca.0.extract = extractvalue [3 x i64] %call, 0
620  %.fca.1.extract = extractvalue [3 x i64] %call, 1
621  %.fca.2.extract = extractvalue [3 x i64] %call, 2
622  store i64 %.fca.0.extract, ptr %ret, align 8
623  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
624  store i64 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 8
625  %ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
626  store i64 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 8
627  ret void
628}
629
630define internal fastcc [3 x i64] @callee_St8x3(ptr nocapture noundef readonly byval(%struct.St8x3) align 8 %in) {
631  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[24])
632  ; CHECK-LABEL: callee_St8x3(
633  ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x3_param_0[24]
634  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0];
635  ; CHECK:       ld.param.u64    [[RD3:%rd[0-9]+]],  [callee_St8x3_param_0+16];
636  ; CHECK:       st.param.v2.b64 [func_retval0],   {[[RD1]], [[RD2]]};
637  ; CHECK:       st.param.b64    [func_retval0+16],  [[RD3]];
638  ; CHECK-NEXT:  ret;
639  %1 = load i64, ptr %in, align 8
640  %arrayidx.1 = getelementptr inbounds [3 x i64], ptr %in, i64 0, i64 1
641  %2 = load i64, ptr %arrayidx.1, align 8
642  %arrayidx.2 = getelementptr inbounds [3 x i64], ptr %in, i64 0, i64 2
643  %3 = load i64, ptr %arrayidx.2, align 8
644  %4 = insertvalue [3 x i64] poison, i64 %1, 0
645  %5 = insertvalue [3 x i64] %4, i64 %2, 1
646  %oldret = insertvalue [3 x i64] %5, i64 %3, 2
647  ret [3 x i64] %oldret
648}
649
650define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) {
651  ; CHECK-LABEL: .visible .func caller_St8x4(
652  ; CHECK:               .param .align 8 .b8 caller_St8x4_param_0[32],
653  ; CHECK:               .param .b64 caller_St8x4_param_1
654  ; CHECK:       )
655  ; CHECK:       .param .align 16 .b8 param0[32];
656  ; CHECK:       st.param.v2.b64 [param0],  {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
657  ; CHECK:       st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
658  ; CHECK:       .param .align 16 .b8 retval0[32];
659  ; CHECK:       call.uni (retval0),
660  ; CHECK-NEXT:  callee_St8x4,
661  ; CHECK-NEXT:  (
662  ; CHECK-NEXT:  param0
663  ; CHECK-NEXT:  );
664  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
665  ; CHECK:       ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16];
666  %call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2
667  %.fca.0.extract = extractvalue [4 x i64] %call, 0
668  %.fca.1.extract = extractvalue [4 x i64] %call, 1
669  %.fca.2.extract = extractvalue [4 x i64] %call, 2
670  %.fca.3.extract = extractvalue [4 x i64] %call, 3
671  store i64 %.fca.0.extract, ptr %ret, align 8
672  %ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
673  store i64 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 8
674  %ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
675  store i64 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 8
676  %ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 24
677  store i64 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 8
678  ret void
679}
680
681define internal fastcc [4 x i64] @callee_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in) {
682  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[32])
683  ; CHECK-LABEL: callee_St8x4(
684  ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x4_param_0[32]
685  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0];
686  ; CHECK:       ld.param.v2.u64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16];
687  ; CHECK:       st.param.v2.b64 [func_retval0],  {[[RD1]], [[RD2]]};
688  ; CHECK:       st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]};
689  ; CHECK-NEXT:  ret;
690  %1 = load i64, ptr %in, align 8
691  %arrayidx.1 = getelementptr inbounds [4 x i64], ptr %in, i64 0, i64 1
692  %2 = load i64, ptr %arrayidx.1, align 8
693  %arrayidx.2 = getelementptr inbounds [4 x i64], ptr %in, i64 0, i64 2
694  %3 = load i64, ptr %arrayidx.2, align 8
695  %arrayidx.3 = getelementptr inbounds [4 x i64], ptr %in, i64 0, i64 3
696  %4 = load i64, ptr %arrayidx.3, align 8
697  %5 = insertvalue [4 x i64] poison, i64 %1, 0
698  %6 = insertvalue [4 x i64] %5, i64 %2, 1
699  %7 = insertvalue [4 x i64] %6, i64 %3, 2
700  %oldret = insertvalue [4 x i64] %7, i64 %4, 3
701  ret [4 x i64] %oldret
702}
703
704; Section 2 - checking that function argument (including retval) vectorization is done with private linkage.
705
706define private fastcc [4 x i32] @callee_St4x4_private(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
707  ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
708  ; CHECK-LABEL: callee_St4x4_private(
709  ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x4_private_param_0[16]
710  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0];
711  ; CHECK:       st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
712  ; CHECK-NEXT:  ret;
713  %1 = load i32, ptr %in, align 4
714  %arrayidx.1 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 1
715  %2 = load i32, ptr %arrayidx.1, align 4
716  %arrayidx.2 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 2
717  %3 = load i32, ptr %arrayidx.2, align 4
718  %arrayidx.3 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 3
719  %4 = load i32, ptr %arrayidx.3, align 4
720  %5 = insertvalue [4 x i32] poison, i32 %1, 0
721  %6 = insertvalue [4 x i32] %5, i32 %2, 1
722  %7 = insertvalue [4 x i32] %6, i32 %3, 2
723  %oldret = insertvalue [4 x i32] %7, i32 %4, 3
724  ret [4 x i32] %oldret
725}
726
727; Section 3 - checking that function argument (including retval) vectorization
728; is NOT done with linkage types other than internal and private.
729
730define external fastcc [4 x i32] @callee_St4x4_external(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
731  ; CHECK:       .func  (.param .align 4 .b8 func_retval0[16])
732  ; CHECK-LABEL: callee_St4x4_external(
733  ; CHECK-NEXT:  .param .align 4 .b8 callee_St4x4_external_param_0[16]
734  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]],   [callee_St4x4_external_param_0];
735  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]],   [callee_St4x4_external_param_0+4];
736  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]],   [callee_St4x4_external_param_0+8];
737  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]],   [callee_St4x4_external_param_0+12];
738  ; CHECK:       st.param.b32 [func_retval0],  [[R1]];
739  ; CHECK:       st.param.b32 [func_retval0+4],  [[R2]];
740  ; CHECK:       st.param.b32 [func_retval0+8],  [[R3]];
741  ; CHECK:       st.param.b32 [func_retval0+12], [[R4]];
742  ; CHECK-NEXT:  ret;
743  %1 = load i32, ptr %in, align 4
744  %arrayidx.1 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 1
745  %2 = load i32, ptr %arrayidx.1, align 4
746  %arrayidx.2 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 2
747  %3 = load i32, ptr %arrayidx.2, align 4
748  %arrayidx.3 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 3
749  %4 = load i32, ptr %arrayidx.3, align 4
750  %5 = insertvalue [4 x i32] poison, i32 %1, 0
751  %6 = insertvalue [4 x i32] %5, i32 %2, 1
752  %7 = insertvalue [4 x i32] %6, i32 %3, 2
753  %oldret = insertvalue [4 x i32] %7, i32 %4, 3
754  ret [4 x i32] %oldret
755}
756
757