xref: /llvm-project/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll (revision 69f7d81d0a47781e5d4820873c20f725f3d0236e)
1;RUN: llc < %s -mtriple=amdgcn-pal -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
2;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
3
4; CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
5; CHECK: NumVgprs: 4
; amdgpu_ps function that reads exactly one input value: element 1 of %arg3.
; That value is placed in element 0 of the returned <4 x float>; the remaining
; result elements stay undef and no other argument is referenced.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
6define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
7.entry:
8  %i1 = extractelement <2 x float> %arg3, i32 1
9  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
10  %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1, 0
11  ret { <4 x float> } %ret2
12}
13
14; CHECK-LABEL: {{^}}_amdgpu_ps_3_arg:
15; CHECK: NumVgprs: 6
; amdgpu_ps function that reads the first three non-inreg arguments:
; %arg3[1], %arg4[0] and %arg5[1] are written to result elements 0, 1 and 2.
; Result element 3 stays undef and no other argument is referenced.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
16define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_3_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
17.entry:
18  %i1 = extractelement <2 x float> %arg3, i32 1
19  %i2 = extractelement <2 x float> %arg4, i32 0
20  %i3 = extractelement <2 x float> %arg5, i32 1
21  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
22  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
23  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
24  %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0
25  ret { <4 x float> } %ret2
26}
27
28; CHECK-LABEL: {{^}}_amdgpu_ps_2_arg_gap:
29; CHECK: NumVgprs: 4
; amdgpu_ps function that reads %arg3[1] and %arg5[1] but never touches %arg4,
; leaving a gap between the two referenced arguments. The two values go to
; result elements 0 and 1; elements 2 and 3 stay undef.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
30define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_2_arg_gap(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
31.entry:
32  %i1 = extractelement <2 x float> %arg3, i32 1
33  %i3 = extractelement <2 x float> %arg5, i32 1
34  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
35  %ret1.2 = insertelement <4 x float> %ret1, float %i3, i32 1
36  %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0
37  ret { <4 x float> } %ret2
38}
39
40; Using an InitialPSInputAddr of 0x2 forces the 2nd VGPR argument (%arg4) to be included in the packing. This increases the total
41; number of VGPRs and means the 3rd VGPR argument (%arg5) is not packed adjacent to the 1st (%arg3), although those are the only two arguments used.
42; CHECK-LABEL: {{^}}_amdgpu_ps_2_arg_no_pack:
43; CHECK: NumVgprs: 6
; Same body as the %arg3/%arg5 "gap" case: %arg3[1] and %arg5[1] are written to
; result elements 0 and 1 and %arg4 is never referenced.
; The difference is attribute group #1, which sets "InitialPSInputAddr"="2".
44define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_2_arg_no_pack(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #1 {
45.entry:
46  %i1 = extractelement <2 x float> %arg3, i32 1
47  %i3 = extractelement <2 x float> %arg5, i32 1
48  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
49  %ret1.2 = insertelement <4 x float> %ret1, float %i3, i32 1
50  %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0
51  ret { <4 x float> } %ret2
52}
53
54; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg:
55; CHECK: NumVgprs: 24
; amdgpu_ps function that references every non-inreg argument (%arg3..%arg18)
; and returns the values packed into four <4 x float> vectors.
; The three inreg arguments are not referenced.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
56define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
57.entry:
  ; Read one element from each vector argument %arg3..%arg9.
58  %i1 = extractelement <2 x float> %arg3, i32 1
59  %i2 = extractelement <2 x float> %arg4, i32 0
60  %i3 = extractelement <2 x float> %arg5, i32 1
61  %i4 = extractelement <3 x float> %arg6, i32 1
62  %i5 = extractelement <2 x float> %arg7, i32 0
63  %i6 = extractelement <2 x float> %arg8, i32 0
64  %i7 = extractelement <2 x float> %arg9, i32 1
65
  ; First result vector: values taken from %arg3..%arg6.
66  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
67  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
68  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
69  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
70
  ; Second result vector: values taken from %arg7..%arg9 plus scalar %arg10.
71  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
72  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
73  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
74  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
75
  ; Third result vector: scalar floats %arg11..%arg14.
76  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
77  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
78  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
79  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
80
  ; The i32 arguments are reinterpreted as float so they fit the float result.
81  %arg15.f = bitcast i32 %arg15 to float
82  %arg16.f = bitcast i32 %arg16 to float
83  %arg17.f = bitcast i32 %arg17 to float
84  %arg18.f = bitcast i32 %arg18 to float
85
  ; Fourth result vector: the bitcast values of %arg15..%arg18.
86  %ret4 = insertelement <4 x float> undef, float %arg15.f, i32 0
87  %ret4.1 = insertelement <4 x float> %ret4, float %arg16.f, i32 1
88  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg17.f, i32 2
89  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg18.f, i32 3
90
  ; Assemble the four vectors into the returned struct.
91  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
92  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
93  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
94  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
95
96  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
97}
98
99; Extra arguments have to be allocated even if they're unused
100; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg_extra_unused:
101; CHECK: NumVgprs: 26
; amdgpu_ps function that references every argument from %arg3 to %arg18 and
; packs them into four <4 x float> vectors. The signature also declares two
; trailing float arguments, %extra_arg1 and %extra_arg2, which the body never
; references. Attribute group #0 applies (no InitialPSInputAddr attribute).
102define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
103.entry:
  ; Read one element from each vector argument %arg3..%arg9.
104  %i1 = extractelement <2 x float> %arg3, i32 1
105  %i2 = extractelement <2 x float> %arg4, i32 0
106  %i3 = extractelement <2 x float> %arg5, i32 1
107  %i4 = extractelement <3 x float> %arg6, i32 1
108  %i5 = extractelement <2 x float> %arg7, i32 0
109  %i6 = extractelement <2 x float> %arg8, i32 0
110  %i7 = extractelement <2 x float> %arg9, i32 1
111
  ; First result vector: values taken from %arg3..%arg6.
112  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
113  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
114  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
115  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
116
  ; Second result vector: values taken from %arg7..%arg9 plus scalar %arg10.
117  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
118  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
119  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
120  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
121
  ; Third result vector: scalar floats %arg11..%arg14.
122  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
123  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
124  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
125  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
126
  ; The i32 arguments are reinterpreted as float so they fit the float result.
127  %arg15.f = bitcast i32 %arg15 to float
128  %arg16.f = bitcast i32 %arg16 to float
129  %arg17.f = bitcast i32 %arg17 to float
130  %arg18.f = bitcast i32 %arg18 to float
131
  ; Fourth result vector: the bitcast values of %arg15..%arg18.
132  %ret4 = insertelement <4 x float> undef, float %arg15.f, i32 0
133  %ret4.1 = insertelement <4 x float> %ret4, float %arg16.f, i32 1
134  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg17.f, i32 2
135  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg18.f, i32 3
136
  ; Assemble the four vectors into the returned struct.
137  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
138  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
139  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
140  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
141
142  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
143}
144
145; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg_extra:
146; CHECK: NumVgprs: 26
147; CHECK: NumVGPRsForWavesPerEU: 26
; amdgpu_ps function that references every argument from %arg3 to %arg18 as
; well as the two trailing float arguments %extra_arg1 and %extra_arg2.
; %arg15..%arg18 are summed pairwise so that all four still feed the result
; while two elements of the last vector are left for the extra arguments.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
148define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
149.entry:
  ; Read one element from each vector argument %arg3..%arg9.
150  %i1 = extractelement <2 x float> %arg3, i32 1
151  %i2 = extractelement <2 x float> %arg4, i32 0
152  %i3 = extractelement <2 x float> %arg5, i32 1
153  %i4 = extractelement <3 x float> %arg6, i32 1
154  %i5 = extractelement <2 x float> %arg7, i32 0
155  %i6 = extractelement <2 x float> %arg8, i32 0
156  %i7 = extractelement <2 x float> %arg9, i32 1
157
  ; First result vector: values taken from %arg3..%arg6.
158  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
159  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
160  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
161  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
162
  ; Second result vector: values taken from %arg7..%arg9 plus scalar %arg10.
163  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
164  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
165  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
166  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
167
  ; Third result vector: scalar floats %arg11..%arg14.
168  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
169  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
170  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
171  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
172
  ; The i32 arguments are reinterpreted as float before being combined.
173  %arg15.f = bitcast i32 %arg15 to float
174  %arg16.f = bitcast i32 %arg16 to float
175  %arg17.f = bitcast i32 %arg17 to float
176  %arg18.f = bitcast i32 %arg18 to float
177
  ; Pairwise sums reduce the four values to two result elements.
178  %arg15_16.f = fadd float %arg15.f, %arg16.f
179  %arg17_18.f = fadd float %arg17.f, %arg18.f
180
  ; Fourth result vector: the two extra arguments followed by the two sums.
181  %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0
182  %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1
183  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg15_16.f, i32 2
184  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg17_18.f, i32 3
185
  ; Assemble the four vectors into the returned struct.
186  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
187  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
188  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
189  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
190
191  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
192}
193
194; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
195; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
196; CHECK: NumVgprs: 4
; amdgpu_ps function whose body references none of its arguments: it returns
; an undef { <4 x float> } immediately.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
197define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
198.entry:
199  ret { <4 x float> } undef
200}
201
202; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
203; Additionally set the PSInputAddr to 0 via the metadata
204; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
205; CHECK: NumVgprs: 4
; amdgpu_ps function whose body references none of its arguments: it returns
; an undef { <4 x float> } immediately.
; Attribute group #3 applies, which sets "InitialPSInputAddr"="0" explicitly.
206define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
207.entry:
208  ret { <4 x float> } undef
209}
210
211; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_used:
212; CHECK: NumVgprs: 4
; amdgpu_ps function that references only the two trailing float arguments:
; %extra_arg1 and %extra_arg2 go to result elements 0 and 1. None of
; %arg..%arg18 is referenced and result elements 2 and 3 stay undef.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
213define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_used(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
214.entry:
215  %ret4.1 = insertelement <4 x float> undef, float %extra_arg1, i32 0
216  %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg2, i32 1
217
218  %ret.res  = insertvalue { <4 x float> } undef, <4 x float> %ret4.2, 0
219
220  ret { <4 x float> } %ret.res
221}
222
223; CHECK-LABEL: {{^}}_amdgpu_ps_part_unused_extra_used:
224; CHECK: NumVgprs: 5
; amdgpu_ps function that references one regular argument and both trailing
; ones: %arg14, %extra_arg1 and %extra_arg2 go to result elements 0, 1 and 2.
; No other argument is referenced and result element 3 stays undef.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
225define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_part_unused_extra_used(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
226.entry:
227  %ret4.1 = insertelement <4 x float> undef, float %arg14, i32 0
228  %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg1, i32 1
229  %ret4.3 = insertelement <4 x float> %ret4.2, float %extra_arg2, i32 2
230
231  %ret.res  = insertvalue { <4 x float> } undef, <4 x float> %ret4.3, 0
232
233  ret { <4 x float> } %ret.res
234}
235
236; CHECK-LABEL: {{^}}_amdgpu_ps_part_unused_extra_unused:
237; CHECK: NumVgprs: 7
; amdgpu_ps function that references three regular arguments only: %arg12,
; %arg13 and %arg14 go to result elements 0, 1 and 2. The trailing
; %extra_arg1 and %extra_arg2 are declared but never referenced, and result
; element 3 stays undef.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
238define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_part_unused_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
239.entry:
240  %ret4.1 = insertelement <4 x float> undef, float %arg12, i32 0
241  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg13, i32 1
242  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg14, i32 2
243
244  %ret.res  = insertvalue { <4 x float> } undef, <4 x float> %ret4.3, 0
245
246  ret { <4 x float> } %ret.res
247}
248
249; Extra unused inputs are always added to the allocation
250; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_unused:
251; CHECK: NumVgprs: 4
; amdgpu_ps function that declares the two trailing float arguments
; (%extra_arg1, %extra_arg2) but references no argument at all: it returns an
; undef { <4 x float> } immediately.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
252define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
253.entry:
254
255  ret { <4 x float> } undef
256}
257
258; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_used_no_packing:
259; CHECK: NumVgprs: 26
; amdgpu_ps function that references only the two trailing float arguments:
; %extra_arg1 and %extra_arg2 go to result elements 0 and 1, and none of
; %arg..%arg18 is referenced.
; Attribute group #2 applies, which sets "InitialPSInputAddr"="0xffff".
; Presumably that mask is what disables packing, as the function name suggests;
; confirm against the backend before relying on it.
260define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_used_no_packing(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 {
261.entry:
262  %ret4.1 = insertelement <4 x float> undef, float %extra_arg1, i32 0
263  %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg2, i32 1
264
265  %ret.res  = insertvalue { <4 x float> } undef, <4 x float> %ret4.2, 0
266
267  ret { <4 x float> } %ret.res
268}
269
270; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_unused_no_packing:
271; CHECK: NumVgprs: 26
; amdgpu_ps function that declares the two trailing float arguments but
; references no argument at all: it returns an undef { <4 x float> } immediately.
; Attribute group #2 applies, which sets "InitialPSInputAddr"="0xffff".
272define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_unused_no_packing(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 {
273.entry:
274  ret { <4 x float> } undef
275}
276
277; CHECK-LABEL: {{^}}_amdgpu_ps_some_unused_arg_extra:
278; CHECK: NumVgprs: 24
279; CHECK: NumVGPRsForWavesPerEU: 24
; amdgpu_ps function that references %arg3..%arg16 plus the trailing
; %extra_arg1 and %extra_arg2, but never references %arg17 or %arg18.
; The values are packed into four <4 x float> vectors.
; Attribute group #0 applies (it carries no InitialPSInputAddr attribute).
280define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_some_unused_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
281.entry:
  ; Read one element from each vector argument %arg3..%arg9.
282  %i1 = extractelement <2 x float> %arg3, i32 1
283  %i2 = extractelement <2 x float> %arg4, i32 0
284  %i3 = extractelement <2 x float> %arg5, i32 1
285  %i4 = extractelement <3 x float> %arg6, i32 1
286  %i5 = extractelement <2 x float> %arg7, i32 0
287  %i6 = extractelement <2 x float> %arg8, i32 0
288  %i7 = extractelement <2 x float> %arg9, i32 1
289
  ; First result vector: values taken from %arg3..%arg6.
290  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
291  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
292  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
293  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
294
  ; Second result vector: values taken from %arg7..%arg9 plus scalar %arg10.
295  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
296  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
297  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
298  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
299
  ; Third result vector: scalar floats %arg11..%arg14.
300  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
301  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
302  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
303  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
304
  ; Only %arg15 and %arg16 are converted; %arg17 and %arg18 stay unreferenced.
305  %arg15.f = bitcast i32 %arg15 to float
306  %arg16.f = bitcast i32 %arg16 to float
307
  ; Fourth result vector: the two extra arguments, then %arg15 and %arg16.
308  %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0
309  %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1
310  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg15.f, i32 2
311  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg16.f, i32 3
312
  ; Assemble the four vectors into the returned struct.
313  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
314  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
315  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
316  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
317
318  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
319}
320
321; CHECK-LABEL: {{^}}_amdgpu_ps_some_unused_no_packing_arg_extra:
322; CHECK: NumVgprs: 26
323; CHECK: NumVGPRsForWavesPerEU: 26
; amdgpu_ps function that references %arg3..%arg14 plus the trailing
; %extra_arg1 and %extra_arg2, but never references %arg15..%arg18.
; The last result vector therefore has only elements 0 and 1 populated.
; Attribute group #2 applies, which sets "InitialPSInputAddr"="0xffff".
324define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_some_unused_no_packing_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 {
325.entry:
  ; Read one element from each vector argument %arg3..%arg9.
326  %i1 = extractelement <2 x float> %arg3, i32 1
327  %i2 = extractelement <2 x float> %arg4, i32 0
328  %i3 = extractelement <2 x float> %arg5, i32 1
329  %i4 = extractelement <3 x float> %arg6, i32 1
330  %i5 = extractelement <2 x float> %arg7, i32 0
331  %i6 = extractelement <2 x float> %arg8, i32 0
332  %i7 = extractelement <2 x float> %arg9, i32 1
333
  ; First result vector: values taken from %arg3..%arg6.
334  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
335  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
336  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
337  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
338
  ; Second result vector: values taken from %arg7..%arg9 plus scalar %arg10.
339  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
340  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
341  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
342  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
343
  ; Third result vector: scalar floats %arg11..%arg14.
344  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
345  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
346  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
347  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
348
  ; Fourth result vector: only the two extra arguments; elements 2 and 3 stay undef.
349  %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0
350  %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1
351
  ; Assemble the four vectors into the returned struct.
352  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
353  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
354  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
355  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.1, 3
356
357  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
358}
359
; Attribute groups referenced by the functions in this file. All four carry
; nounwind and the same target-features string (wavefrontsize64 and cumode
; enabled); they differ only in the InitialPSInputAddr string attribute.
; #0: no InitialPSInputAddr attribute.
360attributes #0 = { nounwind "target-features"=",+wavefrontsize64,+cumode"  }
; #1: InitialPSInputAddr of 2 (used by _amdgpu_ps_2_arg_no_pack).
361attributes #1 = { nounwind "InitialPSInputAddr"="2" "target-features"=",+wavefrontsize64,+cumode" }
; #2: InitialPSInputAddr of 0xffff (used by the *_no_packing functions).
362attributes #2 = { nounwind "InitialPSInputAddr"="0xffff" "target-features"=",+wavefrontsize64,+cumode" }
; #3: InitialPSInputAddr of 0 (used by _amdgpu_ps_all_unused_ia0).
363attributes #3 = { nounwind "InitialPSInputAddr"="0" "target-features"=",+wavefrontsize64,+cumode" }
364