xref: /llvm-project/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll (revision 932d9c13faa3de1deca3874d3b864901aa5ec9a5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc < %s | FileCheck -check-prefix=ENABLED %s
3; RUN: llc -disable-nvptx-load-store-vectorizer < %s | FileCheck -check-prefix=DISABLED %s
4; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
5; RUN: %if ptxas %{ llc -disable-nvptx-load-store-vectorizer < %s | %ptxas-verify %}
6
7target triple = "nvptx64-nvidia-cuda"
8
9; Check that the load-store vectorizer is enabled by default for nvptx, and
10; that it's disabled by the appropriate flag.
11
; @f: minimal case. Two adjacent i32 loads -- one from %p (align 8) and one
; from %p.1 = %p + 1 x i32 (align 4) -- are added and the sum is returned.
; In the default run the checks expect the pair to be fetched by a single
; ld.v2.u32; in the -disable-nvptx-load-store-vectorizer run they expect two
; scalar ld.u32 at byte offsets 0 and 4.
12define i32 @f(ptr %p) {
13; ENABLED-LABEL: f(
14; ENABLED:       {
15; ENABLED-NEXT:    .reg .b32 %r<4>;
16; ENABLED-NEXT:    .reg .b64 %rd<2>;
17; ENABLED-EMPTY:
18; ENABLED-NEXT:  // %bb.0:
19; ENABLED-NEXT:    ld.param.u64 %rd1, [f_param_0];
20; ENABLED-NEXT:    ld.v2.u32 {%r1, %r2}, [%rd1];
21; ENABLED-NEXT:    add.s32 %r3, %r1, %r2;
22; ENABLED-NEXT:    st.param.b32 [func_retval0], %r3;
23; ENABLED-NEXT:    ret;
24;
25; DISABLED-LABEL: f(
26; DISABLED:       {
27; DISABLED-NEXT:    .reg .b32 %r<4>;
28; DISABLED-NEXT:    .reg .b64 %rd<2>;
29; DISABLED-EMPTY:
30; DISABLED-NEXT:  // %bb.0:
31; DISABLED-NEXT:    ld.param.u64 %rd1, [f_param_0];
32; DISABLED-NEXT:    ld.u32 %r1, [%rd1];
33; DISABLED-NEXT:    ld.u32 %r2, [%rd1+4];
34; DISABLED-NEXT:    add.s32 %r3, %r1, %r2;
35; DISABLED-NEXT:    st.param.b32 [func_retval0], %r3;
36; DISABLED-NEXT:    ret;
37  %p.1 = getelementptr i32, ptr %p, i32 1
38  %v0 = load i32, ptr %p, align 8
39  %v1 = load i32, ptr %p.1, align 4
40  %sum = add i32 %v0, %v1
41  ret i32 %sum
42}
43
; @fh: five consecutive half loads starting at %p, reduced with fadd and
; returned. In the default run the checks expect the first four elements to be
; fetched by one ld.v4.b16 and the fifth by a scalar ld.b16 at byte offset 8;
; in the -disable-nvptx-load-store-vectorizer run they expect five scalar
; ld.b16 at offsets 0, 2, 4, 6 and 8.
;
; Alignment: the first load asserts that %p is 64-byte aligned, so the
; elements at byte offsets 2 and 6 (%p.1 and %p.3) can only be 2-byte aligned.
; Their loads therefore use align 2; claiming align 4 there would overestimate
; the alignment, which is undefined behavior for a load.
44define half @fh(ptr %p) {
45; ENABLED-LABEL: fh(
46; ENABLED:       {
47; ENABLED-NEXT:    .reg .b16 %rs<10>;
48; ENABLED-NEXT:    .reg .f32 %f<13>;
49; ENABLED-NEXT:    .reg .b64 %rd<2>;
50; ENABLED-EMPTY:
51; ENABLED-NEXT:  // %bb.0:
52; ENABLED-NEXT:    ld.param.u64 %rd1, [fh_param_0];
53; ENABLED-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
54; ENABLED-NEXT:    ld.b16 %rs5, [%rd1+8];
55; ENABLED-NEXT:    cvt.f32.f16 %f1, %rs2;
56; ENABLED-NEXT:    cvt.f32.f16 %f2, %rs1;
57; ENABLED-NEXT:    add.rn.f32 %f3, %f2, %f1;
58; ENABLED-NEXT:    cvt.rn.f16.f32 %rs6, %f3;
59; ENABLED-NEXT:    cvt.f32.f16 %f4, %rs4;
60; ENABLED-NEXT:    cvt.f32.f16 %f5, %rs3;
61; ENABLED-NEXT:    add.rn.f32 %f6, %f5, %f4;
62; ENABLED-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
63; ENABLED-NEXT:    cvt.f32.f16 %f7, %rs7;
64; ENABLED-NEXT:    cvt.f32.f16 %f8, %rs6;
65; ENABLED-NEXT:    add.rn.f32 %f9, %f8, %f7;
66; ENABLED-NEXT:    cvt.rn.f16.f32 %rs8, %f9;
67; ENABLED-NEXT:    cvt.f32.f16 %f10, %rs8;
68; ENABLED-NEXT:    cvt.f32.f16 %f11, %rs5;
69; ENABLED-NEXT:    add.rn.f32 %f12, %f10, %f11;
70; ENABLED-NEXT:    cvt.rn.f16.f32 %rs9, %f12;
71; ENABLED-NEXT:    st.param.b16 [func_retval0], %rs9;
72; ENABLED-NEXT:    ret;
73;
74; DISABLED-LABEL: fh(
75; DISABLED:       {
76; DISABLED-NEXT:    .reg .b16 %rs<10>;
77; DISABLED-NEXT:    .reg .f32 %f<13>;
78; DISABLED-NEXT:    .reg .b64 %rd<2>;
79; DISABLED-EMPTY:
80; DISABLED-NEXT:  // %bb.0:
81; DISABLED-NEXT:    ld.param.u64 %rd1, [fh_param_0];
82; DISABLED-NEXT:    ld.b16 %rs1, [%rd1];
83; DISABLED-NEXT:    ld.b16 %rs2, [%rd1+2];
84; DISABLED-NEXT:    ld.b16 %rs3, [%rd1+4];
85; DISABLED-NEXT:    ld.b16 %rs4, [%rd1+6];
86; DISABLED-NEXT:    ld.b16 %rs5, [%rd1+8];
87; DISABLED-NEXT:    cvt.f32.f16 %f1, %rs2;
88; DISABLED-NEXT:    cvt.f32.f16 %f2, %rs1;
89; DISABLED-NEXT:    add.rn.f32 %f3, %f2, %f1;
90; DISABLED-NEXT:    cvt.rn.f16.f32 %rs6, %f3;
91; DISABLED-NEXT:    cvt.f32.f16 %f4, %rs4;
92; DISABLED-NEXT:    cvt.f32.f16 %f5, %rs3;
93; DISABLED-NEXT:    add.rn.f32 %f6, %f5, %f4;
94; DISABLED-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
95; DISABLED-NEXT:    cvt.f32.f16 %f7, %rs7;
96; DISABLED-NEXT:    cvt.f32.f16 %f8, %rs6;
97; DISABLED-NEXT:    add.rn.f32 %f9, %f8, %f7;
98; DISABLED-NEXT:    cvt.rn.f16.f32 %rs8, %f9;
99; DISABLED-NEXT:    cvt.f32.f16 %f10, %rs8;
100; DISABLED-NEXT:    cvt.f32.f16 %f11, %rs5;
101; DISABLED-NEXT:    add.rn.f32 %f12, %f10, %f11;
102; DISABLED-NEXT:    cvt.rn.f16.f32 %rs9, %f12;
103; DISABLED-NEXT:    st.param.b16 [func_retval0], %rs9;
104; DISABLED-NEXT:    ret;
105  %p.1 = getelementptr half, ptr %p, i32 1
106  %p.2 = getelementptr half, ptr %p, i32 2
107  %p.3 = getelementptr half, ptr %p, i32 3
108  %p.4 = getelementptr half, ptr %p, i32 4
109  %v0 = load half, ptr %p, align 64
110  %v1 = load half, ptr %p.1, align 2
111  %v2 = load half, ptr %p.2, align 4
112  %v3 = load half, ptr %p.3, align 2
113  %v4 = load half, ptr %p.4, align 4
114  %sum1 = fadd half %v0, %v1
115  %sum2 = fadd half %v2, %v3
116  %sum3 = fadd half %sum1, %sum2
117  %sum = fadd half %sum3, %v4
118  ret half %sum
119}
120
; @ff: float counterpart of the five-element reduction. Five consecutive float
; loads starting at %p (first load align 64, the rest align 4) are summed with
; fadd and the result is returned. In the default run the checks expect one
; ld.v4.f32 for the first four elements plus a scalar ld.f32 at byte offset 16;
; in the -disable-nvptx-load-store-vectorizer run they expect five scalar
; ld.f32 at offsets 0, 4, 8, 12 and 16.
121define float @ff(ptr %p) {
122; ENABLED-LABEL: ff(
123; ENABLED:       {
124; ENABLED-NEXT:    .reg .f32 %f<10>;
125; ENABLED-NEXT:    .reg .b64 %rd<2>;
126; ENABLED-EMPTY:
127; ENABLED-NEXT:  // %bb.0:
128; ENABLED-NEXT:    ld.param.u64 %rd1, [ff_param_0];
129; ENABLED-NEXT:    ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
130; ENABLED-NEXT:    ld.f32 %f5, [%rd1+16];
131; ENABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
132; ENABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
133; ENABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
134; ENABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
135; ENABLED-NEXT:    st.param.f32 [func_retval0], %f9;
136; ENABLED-NEXT:    ret;
137;
138; DISABLED-LABEL: ff(
139; DISABLED:       {
140; DISABLED-NEXT:    .reg .f32 %f<10>;
141; DISABLED-NEXT:    .reg .b64 %rd<2>;
142; DISABLED-EMPTY:
143; DISABLED-NEXT:  // %bb.0:
144; DISABLED-NEXT:    ld.param.u64 %rd1, [ff_param_0];
145; DISABLED-NEXT:    ld.f32 %f1, [%rd1];
146; DISABLED-NEXT:    ld.f32 %f2, [%rd1+4];
147; DISABLED-NEXT:    ld.f32 %f3, [%rd1+8];
148; DISABLED-NEXT:    ld.f32 %f4, [%rd1+12];
149; DISABLED-NEXT:    ld.f32 %f5, [%rd1+16];
150; DISABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
151; DISABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
152; DISABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
153; DISABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
154; DISABLED-NEXT:    st.param.f32 [func_retval0], %f9;
155; DISABLED-NEXT:    ret;
156  %p.1 = getelementptr float, ptr %p, i32 1
157  %p.2 = getelementptr float, ptr %p, i32 2
158  %p.3 = getelementptr float, ptr %p, i32 3
159  %p.4 = getelementptr float, ptr %p, i32 4
160  %v0 = load float, ptr %p, align 64
161  %v1 = load float, ptr %p.1, align 4
162  %v2 = load float, ptr %p.2, align 4
163  %v3 = load float, ptr %p.3, align 4
164  %v4 = load float, ptr %p.4, align 4
165  %sum1 = fadd float %v0, %v1
166  %sum2 = fadd float %v2, %v3
167  %sum3 = fadd float %sum1, %sum2
168  %sum = fadd float %sum3, %v4
169  ret float %sum
170}
171
; @combine_v16i8: sixteen consecutive i8 loads from %ptr1 (declared align 16;
; each load carries the alignment implied by its byte offset), each
; zero-extended to i32, summed in order, and the i32 total stored to %ptr2.
; In the default run the checks expect all sixteen bytes to arrive through a
; single ld.v4.b32, with the individual bytes extracted by bfe.u32; in the
; -disable-nvptx-load-store-vectorizer run they expect sixteen scalar ld.u8.
172define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
173; ENABLED-LABEL: combine_v16i8(
174; ENABLED:       {
175; ENABLED-NEXT:    .reg .b32 %r<36>;
176; ENABLED-NEXT:    .reg .b64 %rd<3>;
177; ENABLED-EMPTY:
178; ENABLED-NEXT:  // %bb.0:
179; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_param_0];
180; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
181; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_param_1];
182; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
183; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
184; ENABLED-NEXT:    bfe.u32 %r7, %r1, 16, 8;
185; ENABLED-NEXT:    bfe.u32 %r8, %r1, 24, 8;
186; ENABLED-NEXT:    bfe.u32 %r9, %r2, 0, 8;
187; ENABLED-NEXT:    bfe.u32 %r10, %r2, 8, 8;
188; ENABLED-NEXT:    bfe.u32 %r11, %r2, 16, 8;
189; ENABLED-NEXT:    bfe.u32 %r12, %r2, 24, 8;
190; ENABLED-NEXT:    bfe.u32 %r13, %r3, 0, 8;
191; ENABLED-NEXT:    bfe.u32 %r14, %r3, 8, 8;
192; ENABLED-NEXT:    bfe.u32 %r15, %r3, 16, 8;
193; ENABLED-NEXT:    bfe.u32 %r16, %r3, 24, 8;
194; ENABLED-NEXT:    bfe.u32 %r17, %r4, 0, 8;
195; ENABLED-NEXT:    bfe.u32 %r18, %r4, 8, 8;
196; ENABLED-NEXT:    bfe.u32 %r19, %r4, 16, 8;
197; ENABLED-NEXT:    bfe.u32 %r20, %r4, 24, 8;
198; ENABLED-NEXT:    add.s32 %r21, %r5, %r6;
199; ENABLED-NEXT:    add.s32 %r22, %r21, %r7;
200; ENABLED-NEXT:    add.s32 %r23, %r22, %r8;
201; ENABLED-NEXT:    add.s32 %r24, %r23, %r9;
202; ENABLED-NEXT:    add.s32 %r25, %r24, %r10;
203; ENABLED-NEXT:    add.s32 %r26, %r25, %r11;
204; ENABLED-NEXT:    add.s32 %r27, %r26, %r12;
205; ENABLED-NEXT:    add.s32 %r28, %r27, %r13;
206; ENABLED-NEXT:    add.s32 %r29, %r28, %r14;
207; ENABLED-NEXT:    add.s32 %r30, %r29, %r15;
208; ENABLED-NEXT:    add.s32 %r31, %r30, %r16;
209; ENABLED-NEXT:    add.s32 %r32, %r31, %r17;
210; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
211; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
212; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
213; ENABLED-NEXT:    st.u32 [%rd2], %r35;
214; ENABLED-NEXT:    ret;
215;
216; DISABLED-LABEL: combine_v16i8(
217; DISABLED:       {
218; DISABLED-NEXT:    .reg .b32 %r<32>;
219; DISABLED-NEXT:    .reg .b64 %rd<3>;
220; DISABLED-EMPTY:
221; DISABLED-NEXT:  // %bb.0:
222; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_param_0];
223; DISABLED-NEXT:    ld.u8 %r1, [%rd1];
224; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_param_1];
225; DISABLED-NEXT:    ld.u8 %r2, [%rd1+1];
226; DISABLED-NEXT:    ld.u8 %r3, [%rd1+2];
227; DISABLED-NEXT:    ld.u8 %r4, [%rd1+3];
228; DISABLED-NEXT:    ld.u8 %r5, [%rd1+4];
229; DISABLED-NEXT:    ld.u8 %r6, [%rd1+5];
230; DISABLED-NEXT:    ld.u8 %r7, [%rd1+6];
231; DISABLED-NEXT:    ld.u8 %r8, [%rd1+7];
232; DISABLED-NEXT:    ld.u8 %r9, [%rd1+8];
233; DISABLED-NEXT:    ld.u8 %r10, [%rd1+9];
234; DISABLED-NEXT:    ld.u8 %r11, [%rd1+10];
235; DISABLED-NEXT:    ld.u8 %r12, [%rd1+11];
236; DISABLED-NEXT:    ld.u8 %r13, [%rd1+12];
237; DISABLED-NEXT:    ld.u8 %r14, [%rd1+13];
238; DISABLED-NEXT:    ld.u8 %r15, [%rd1+14];
239; DISABLED-NEXT:    ld.u8 %r16, [%rd1+15];
240; DISABLED-NEXT:    add.s32 %r17, %r1, %r2;
241; DISABLED-NEXT:    add.s32 %r18, %r17, %r3;
242; DISABLED-NEXT:    add.s32 %r19, %r18, %r4;
243; DISABLED-NEXT:    add.s32 %r20, %r19, %r5;
244; DISABLED-NEXT:    add.s32 %r21, %r20, %r6;
245; DISABLED-NEXT:    add.s32 %r22, %r21, %r7;
246; DISABLED-NEXT:    add.s32 %r23, %r22, %r8;
247; DISABLED-NEXT:    add.s32 %r24, %r23, %r9;
248; DISABLED-NEXT:    add.s32 %r25, %r24, %r10;
249; DISABLED-NEXT:    add.s32 %r26, %r25, %r11;
250; DISABLED-NEXT:    add.s32 %r27, %r26, %r12;
251; DISABLED-NEXT:    add.s32 %r28, %r27, %r13;
252; DISABLED-NEXT:    add.s32 %r29, %r28, %r14;
253; DISABLED-NEXT:    add.s32 %r30, %r29, %r15;
254; DISABLED-NEXT:    add.s32 %r31, %r30, %r16;
255; DISABLED-NEXT:    st.u32 [%rd2], %r31;
256; DISABLED-NEXT:    ret;
257  %val0 = load i8, ptr %ptr1, align 16
258  %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
259  %val1 = load i8, ptr %ptr1.1, align 1
260  %ptr1.2 = getelementptr inbounds i8, ptr %ptr1, i64 2
261  %val2 = load i8, ptr %ptr1.2, align 2
262  %ptr1.3 = getelementptr inbounds i8, ptr %ptr1, i64 3
263  %val3 = load i8, ptr %ptr1.3, align 1
264  %ptr1.4 = getelementptr inbounds i8, ptr %ptr1, i64 4
265  %val4 = load i8, ptr %ptr1.4, align 4
266  %ptr1.5 = getelementptr inbounds i8, ptr %ptr1, i64 5
267  %val5 = load i8, ptr %ptr1.5, align 1
268  %ptr1.6 = getelementptr inbounds i8, ptr %ptr1, i64 6
269  %val6 = load i8, ptr %ptr1.6, align 2
270  %ptr1.7 = getelementptr inbounds i8, ptr %ptr1, i64 7
271  %val7 = load i8, ptr %ptr1.7, align 1
272  %ptr1.8 = getelementptr inbounds i8, ptr %ptr1, i64 8
273  %val8 = load i8, ptr %ptr1.8, align 8
274  %ptr1.9 = getelementptr inbounds i8, ptr %ptr1, i64 9
275  %val9 = load i8, ptr %ptr1.9, align 1
276  %ptr1.10 = getelementptr inbounds i8, ptr %ptr1, i64 10
277  %val10 = load i8, ptr %ptr1.10, align 2
278  %ptr1.11 = getelementptr inbounds i8, ptr %ptr1, i64 11
279  %val11 = load i8, ptr %ptr1.11, align 1
280  %ptr1.12 = getelementptr inbounds i8, ptr %ptr1, i64 12
281  %val12 = load i8, ptr %ptr1.12, align 4
282  %ptr1.13 = getelementptr inbounds i8, ptr %ptr1, i64 13
283  %val13 = load i8, ptr %ptr1.13, align 1
284  %ptr1.14 = getelementptr inbounds i8, ptr %ptr1, i64 14
285  %val14 = load i8, ptr %ptr1.14, align 2
286  %ptr1.15 = getelementptr inbounds i8, ptr %ptr1, i64 15
287  %val15 = load i8, ptr %ptr1.15, align 1
288  %lane0 = zext i8 %val0 to i32
289  %lane1 = zext i8 %val1 to i32
290  %lane2 = zext i8 %val2 to i32
291  %lane3 = zext i8 %val3 to i32
292  %lane4 = zext i8 %val4 to i32
293  %lane5 = zext i8 %val5 to i32
294  %lane6 = zext i8 %val6 to i32
295  %lane7 = zext i8 %val7 to i32
296  %lane8 = zext i8 %val8 to i32
297  %lane9 = zext i8 %val9 to i32
298  %lane10 = zext i8 %val10 to i32
299  %lane11 = zext i8 %val11 to i32
300  %lane12 = zext i8 %val12 to i32
301  %lane13 = zext i8 %val13 to i32
302  %lane14 = zext i8 %val14 to i32
303  %lane15 = zext i8 %val15 to i32
304  %red.1 = add i32 %lane0, %lane1
305  %red.2 = add i32 %red.1, %lane2
306  %red.3 = add i32 %red.2, %lane3
307  %red.4 = add i32 %red.3, %lane4
308  %red.5 = add i32 %red.4, %lane5
309  %red.6 = add i32 %red.5, %lane6
310  %red.7 = add i32 %red.6, %lane7
311  %red.8 = add i32 %red.7, %lane8
312  %red.9 = add i32 %red.8, %lane9
313  %red.10 = add i32 %red.9, %lane10
314  %red.11 = add i32 %red.10, %lane11
315  %red.12 = add i32 %red.11, %lane12
316  %red.13 = add i32 %red.12, %lane13
317  %red.14 = add i32 %red.13, %lane14
318  %red = add i32 %red.14, %lane15
319  store i32 %red, ptr %ptr2, align 4
320  ret void
321}
322
; @combine_v16i8_unaligned: same sixteen-byte zext-and-sum reduction as the
; 16-byte-aligned i8 variant, but %ptr1 is only declared align 8 and the first
; load is align 8. In the default run the checks expect the sixteen bytes to
; be fetched as two ld.v2.b32 (byte offsets 0 and 8) rather than one
; ld.v4.b32, followed by bfe.u32 byte extraction; in the
; -disable-nvptx-load-store-vectorizer run they expect sixteen scalar ld.u8.
323define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef align 16 %ptr2) {
324; ENABLED-LABEL: combine_v16i8_unaligned(
325; ENABLED:       {
326; ENABLED-NEXT:    .reg .b32 %r<36>;
327; ENABLED-NEXT:    .reg .b64 %rd<3>;
328; ENABLED-EMPTY:
329; ENABLED-NEXT:  // %bb.0:
330; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
331; ENABLED-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
332; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
333; ENABLED-NEXT:    ld.v2.b32 {%r3, %r4}, [%rd1+8];
334; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
335; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
336; ENABLED-NEXT:    bfe.u32 %r7, %r1, 16, 8;
337; ENABLED-NEXT:    bfe.u32 %r8, %r1, 24, 8;
338; ENABLED-NEXT:    bfe.u32 %r9, %r2, 0, 8;
339; ENABLED-NEXT:    bfe.u32 %r10, %r2, 8, 8;
340; ENABLED-NEXT:    bfe.u32 %r11, %r2, 16, 8;
341; ENABLED-NEXT:    bfe.u32 %r12, %r2, 24, 8;
342; ENABLED-NEXT:    bfe.u32 %r13, %r3, 0, 8;
343; ENABLED-NEXT:    bfe.u32 %r14, %r3, 8, 8;
344; ENABLED-NEXT:    bfe.u32 %r15, %r3, 16, 8;
345; ENABLED-NEXT:    bfe.u32 %r16, %r3, 24, 8;
346; ENABLED-NEXT:    bfe.u32 %r17, %r4, 0, 8;
347; ENABLED-NEXT:    bfe.u32 %r18, %r4, 8, 8;
348; ENABLED-NEXT:    bfe.u32 %r19, %r4, 16, 8;
349; ENABLED-NEXT:    bfe.u32 %r20, %r4, 24, 8;
350; ENABLED-NEXT:    add.s32 %r21, %r5, %r6;
351; ENABLED-NEXT:    add.s32 %r22, %r21, %r7;
352; ENABLED-NEXT:    add.s32 %r23, %r22, %r8;
353; ENABLED-NEXT:    add.s32 %r24, %r23, %r9;
354; ENABLED-NEXT:    add.s32 %r25, %r24, %r10;
355; ENABLED-NEXT:    add.s32 %r26, %r25, %r11;
356; ENABLED-NEXT:    add.s32 %r27, %r26, %r12;
357; ENABLED-NEXT:    add.s32 %r28, %r27, %r13;
358; ENABLED-NEXT:    add.s32 %r29, %r28, %r14;
359; ENABLED-NEXT:    add.s32 %r30, %r29, %r15;
360; ENABLED-NEXT:    add.s32 %r31, %r30, %r16;
361; ENABLED-NEXT:    add.s32 %r32, %r31, %r17;
362; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
363; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
364; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
365; ENABLED-NEXT:    st.u32 [%rd2], %r35;
366; ENABLED-NEXT:    ret;
367;
368; DISABLED-LABEL: combine_v16i8_unaligned(
369; DISABLED:       {
370; DISABLED-NEXT:    .reg .b32 %r<32>;
371; DISABLED-NEXT:    .reg .b64 %rd<3>;
372; DISABLED-EMPTY:
373; DISABLED-NEXT:  // %bb.0:
374; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
375; DISABLED-NEXT:    ld.u8 %r1, [%rd1];
376; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
377; DISABLED-NEXT:    ld.u8 %r2, [%rd1+1];
378; DISABLED-NEXT:    ld.u8 %r3, [%rd1+2];
379; DISABLED-NEXT:    ld.u8 %r4, [%rd1+3];
380; DISABLED-NEXT:    ld.u8 %r5, [%rd1+4];
381; DISABLED-NEXT:    ld.u8 %r6, [%rd1+5];
382; DISABLED-NEXT:    ld.u8 %r7, [%rd1+6];
383; DISABLED-NEXT:    ld.u8 %r8, [%rd1+7];
384; DISABLED-NEXT:    ld.u8 %r9, [%rd1+8];
385; DISABLED-NEXT:    ld.u8 %r10, [%rd1+9];
386; DISABLED-NEXT:    ld.u8 %r11, [%rd1+10];
387; DISABLED-NEXT:    ld.u8 %r12, [%rd1+11];
388; DISABLED-NEXT:    ld.u8 %r13, [%rd1+12];
389; DISABLED-NEXT:    ld.u8 %r14, [%rd1+13];
390; DISABLED-NEXT:    ld.u8 %r15, [%rd1+14];
391; DISABLED-NEXT:    ld.u8 %r16, [%rd1+15];
392; DISABLED-NEXT:    add.s32 %r17, %r1, %r2;
393; DISABLED-NEXT:    add.s32 %r18, %r17, %r3;
394; DISABLED-NEXT:    add.s32 %r19, %r18, %r4;
395; DISABLED-NEXT:    add.s32 %r20, %r19, %r5;
396; DISABLED-NEXT:    add.s32 %r21, %r20, %r6;
397; DISABLED-NEXT:    add.s32 %r22, %r21, %r7;
398; DISABLED-NEXT:    add.s32 %r23, %r22, %r8;
399; DISABLED-NEXT:    add.s32 %r24, %r23, %r9;
400; DISABLED-NEXT:    add.s32 %r25, %r24, %r10;
401; DISABLED-NEXT:    add.s32 %r26, %r25, %r11;
402; DISABLED-NEXT:    add.s32 %r27, %r26, %r12;
403; DISABLED-NEXT:    add.s32 %r28, %r27, %r13;
404; DISABLED-NEXT:    add.s32 %r29, %r28, %r14;
405; DISABLED-NEXT:    add.s32 %r30, %r29, %r15;
406; DISABLED-NEXT:    add.s32 %r31, %r30, %r16;
407; DISABLED-NEXT:    st.u32 [%rd2], %r31;
408; DISABLED-NEXT:    ret;
409  %val0 = load i8, ptr %ptr1, align 8
410  %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
411  %val1 = load i8, ptr %ptr1.1, align 1
412  %ptr1.2 = getelementptr inbounds i8, ptr %ptr1, i64 2
413  %val2 = load i8, ptr %ptr1.2, align 2
414  %ptr1.3 = getelementptr inbounds i8, ptr %ptr1, i64 3
415  %val3 = load i8, ptr %ptr1.3, align 1
416  %ptr1.4 = getelementptr inbounds i8, ptr %ptr1, i64 4
417  %val4 = load i8, ptr %ptr1.4, align 4
418  %ptr1.5 = getelementptr inbounds i8, ptr %ptr1, i64 5
419  %val5 = load i8, ptr %ptr1.5, align 1
420  %ptr1.6 = getelementptr inbounds i8, ptr %ptr1, i64 6
421  %val6 = load i8, ptr %ptr1.6, align 2
422  %ptr1.7 = getelementptr inbounds i8, ptr %ptr1, i64 7
423  %val7 = load i8, ptr %ptr1.7, align 1
424  %ptr1.8 = getelementptr inbounds i8, ptr %ptr1, i64 8
425  %val8 = load i8, ptr %ptr1.8, align 8
426  %ptr1.9 = getelementptr inbounds i8, ptr %ptr1, i64 9
427  %val9 = load i8, ptr %ptr1.9, align 1
428  %ptr1.10 = getelementptr inbounds i8, ptr %ptr1, i64 10
429  %val10 = load i8, ptr %ptr1.10, align 2
430  %ptr1.11 = getelementptr inbounds i8, ptr %ptr1, i64 11
431  %val11 = load i8, ptr %ptr1.11, align 1
432  %ptr1.12 = getelementptr inbounds i8, ptr %ptr1, i64 12
433  %val12 = load i8, ptr %ptr1.12, align 4
434  %ptr1.13 = getelementptr inbounds i8, ptr %ptr1, i64 13
435  %val13 = load i8, ptr %ptr1.13, align 1
436  %ptr1.14 = getelementptr inbounds i8, ptr %ptr1, i64 14
437  %val14 = load i8, ptr %ptr1.14, align 2
438  %ptr1.15 = getelementptr inbounds i8, ptr %ptr1, i64 15
439  %val15 = load i8, ptr %ptr1.15, align 1
440  %lane0 = zext i8 %val0 to i32
441  %lane1 = zext i8 %val1 to i32
442  %lane2 = zext i8 %val2 to i32
443  %lane3 = zext i8 %val3 to i32
444  %lane4 = zext i8 %val4 to i32
445  %lane5 = zext i8 %val5 to i32
446  %lane6 = zext i8 %val6 to i32
447  %lane7 = zext i8 %val7 to i32
448  %lane8 = zext i8 %val8 to i32
449  %lane9 = zext i8 %val9 to i32
450  %lane10 = zext i8 %val10 to i32
451  %lane11 = zext i8 %val11 to i32
452  %lane12 = zext i8 %val12 to i32
453  %lane13 = zext i8 %val13 to i32
454  %lane14 = zext i8 %val14 to i32
455  %lane15 = zext i8 %val15 to i32
456  %red.1 = add i32 %lane0, %lane1
457  %red.2 = add i32 %red.1, %lane2
458  %red.3 = add i32 %red.2, %lane3
459  %red.4 = add i32 %red.3, %lane4
460  %red.5 = add i32 %red.4, %lane5
461  %red.6 = add i32 %red.5, %lane6
462  %red.7 = add i32 %red.6, %lane7
463  %red.8 = add i32 %red.7, %lane8
464  %red.9 = add i32 %red.8, %lane9
465  %red.10 = add i32 %red.9, %lane10
466  %red.11 = add i32 %red.10, %lane11
467  %red.12 = add i32 %red.11, %lane12
468  %red.13 = add i32 %red.12, %lane13
469  %red.14 = add i32 %red.13, %lane14
470  %red = add i32 %red.14, %lane15
471  store i32 %red, ptr %ptr2, align 4
472  ret void
473}
474
475
; @combine_v8i16: eight consecutive i16 loads from %ptr1 (declared align 16,
; first load align 16), each zero-extended to i32, summed in order, and the
; i32 total stored to %ptr2. In the default run the checks expect one
; ld.v4.b32 whose 32-bit lanes are split into 16-bit halves with mov.b32 and
; widened with cvt.u32.u16; in the -disable-nvptx-load-store-vectorizer run
; they expect eight scalar ld.u16.
476define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
477; ENABLED-LABEL: combine_v8i16(
478; ENABLED:       {
479; ENABLED-NEXT:    .reg .b16 %rs<9>;
480; ENABLED-NEXT:    .reg .b32 %r<20>;
481; ENABLED-NEXT:    .reg .b64 %rd<3>;
482; ENABLED-EMPTY:
483; ENABLED-NEXT:  // %bb.0:
484; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v8i16_param_0];
485; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
486; ENABLED-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
487; ENABLED-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
488; ENABLED-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
489; ENABLED-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
490; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v8i16_param_1];
491; ENABLED-NEXT:    cvt.u32.u16 %r5, %rs7;
492; ENABLED-NEXT:    cvt.u32.u16 %r6, %rs8;
493; ENABLED-NEXT:    cvt.u32.u16 %r7, %rs5;
494; ENABLED-NEXT:    cvt.u32.u16 %r8, %rs6;
495; ENABLED-NEXT:    cvt.u32.u16 %r9, %rs3;
496; ENABLED-NEXT:    cvt.u32.u16 %r10, %rs4;
497; ENABLED-NEXT:    cvt.u32.u16 %r11, %rs1;
498; ENABLED-NEXT:    cvt.u32.u16 %r12, %rs2;
499; ENABLED-NEXT:    add.s32 %r13, %r5, %r6;
500; ENABLED-NEXT:    add.s32 %r14, %r13, %r7;
501; ENABLED-NEXT:    add.s32 %r15, %r14, %r8;
502; ENABLED-NEXT:    add.s32 %r16, %r15, %r9;
503; ENABLED-NEXT:    add.s32 %r17, %r16, %r10;
504; ENABLED-NEXT:    add.s32 %r18, %r17, %r11;
505; ENABLED-NEXT:    add.s32 %r19, %r18, %r12;
506; ENABLED-NEXT:    st.u32 [%rd2], %r19;
507; ENABLED-NEXT:    ret;
508;
509; DISABLED-LABEL: combine_v8i16(
510; DISABLED:       {
511; DISABLED-NEXT:    .reg .b32 %r<16>;
512; DISABLED-NEXT:    .reg .b64 %rd<3>;
513; DISABLED-EMPTY:
514; DISABLED-NEXT:  // %bb.0:
515; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v8i16_param_0];
516; DISABLED-NEXT:    ld.u16 %r1, [%rd1];
517; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v8i16_param_1];
518; DISABLED-NEXT:    ld.u16 %r2, [%rd1+2];
519; DISABLED-NEXT:    ld.u16 %r3, [%rd1+4];
520; DISABLED-NEXT:    ld.u16 %r4, [%rd1+6];
521; DISABLED-NEXT:    ld.u16 %r5, [%rd1+8];
522; DISABLED-NEXT:    ld.u16 %r6, [%rd1+10];
523; DISABLED-NEXT:    ld.u16 %r7, [%rd1+12];
524; DISABLED-NEXT:    ld.u16 %r8, [%rd1+14];
525; DISABLED-NEXT:    add.s32 %r9, %r1, %r2;
526; DISABLED-NEXT:    add.s32 %r10, %r9, %r3;
527; DISABLED-NEXT:    add.s32 %r11, %r10, %r4;
528; DISABLED-NEXT:    add.s32 %r12, %r11, %r5;
529; DISABLED-NEXT:    add.s32 %r13, %r12, %r6;
530; DISABLED-NEXT:    add.s32 %r14, %r13, %r7;
531; DISABLED-NEXT:    add.s32 %r15, %r14, %r8;
532; DISABLED-NEXT:    st.u32 [%rd2], %r15;
533; DISABLED-NEXT:    ret;
534  %val0 = load i16, ptr %ptr1, align 16
535  %ptr1.1 = getelementptr inbounds i16, ptr %ptr1, i64 1
536  %val1 = load i16, ptr %ptr1.1, align 2
537  %ptr1.2 = getelementptr inbounds i16, ptr %ptr1, i64 2
538  %val2 = load i16, ptr %ptr1.2, align 4
539  %ptr1.3 = getelementptr inbounds i16, ptr %ptr1, i64 3
540  %val3 = load i16, ptr %ptr1.3, align 2
541  %ptr1.4 = getelementptr inbounds i16, ptr %ptr1, i64 4
542  %val4 = load i16, ptr %ptr1.4, align 4
543  %ptr1.5 = getelementptr inbounds i16, ptr %ptr1, i64 5
544  %val5 = load i16, ptr %ptr1.5, align 2
545  %ptr1.6 = getelementptr inbounds i16, ptr %ptr1, i64 6
546  %val6 = load i16, ptr %ptr1.6, align 4
547  %ptr1.7 = getelementptr inbounds i16, ptr %ptr1, i64 7
548  %val7 = load i16, ptr %ptr1.7, align 2
549  %lane0 = zext i16 %val0 to i32
550  %lane1 = zext i16 %val1 to i32
551  %lane2 = zext i16 %val2 to i32
552  %lane3 = zext i16 %val3 to i32
553  %lane4 = zext i16 %val4 to i32
554  %lane5 = zext i16 %val5 to i32
555  %lane6 = zext i16 %val6 to i32
556  %lane7 = zext i16 %val7 to i32
557  %red.1 = add i32 %lane0, %lane1
558  %red.2 = add i32 %red.1, %lane2
559  %red.3 = add i32 %red.2, %lane3
560  %red.4 = add i32 %red.3, %lane4
561  %red.5 = add i32 %red.4, %lane5
562  %red.6 = add i32 %red.5, %lane6
563  %red = add i32 %red.6, %lane7
564  store i32 %red, ptr %ptr2, align 4
565  ret void
566}
567
; @combine_v4i32: four consecutive i32 loads from %ptr1 (declared align 16,
; first load align 16), summed in order, with the i32 total stored to %ptr2.
; In the default run the checks expect a single ld.v4.u32; in the
; -disable-nvptx-load-store-vectorizer run they expect four scalar ld.u32 at
; byte offsets 0, 4, 8 and 12.
568define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
569; ENABLED-LABEL: combine_v4i32(
570; ENABLED:       {
571; ENABLED-NEXT:    .reg .b32 %r<8>;
572; ENABLED-NEXT:    .reg .b64 %rd<3>;
573; ENABLED-EMPTY:
574; ENABLED-NEXT:  // %bb.0:
575; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v4i32_param_0];
576; ENABLED-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
577; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v4i32_param_1];
578; ENABLED-NEXT:    add.s32 %r5, %r1, %r2;
579; ENABLED-NEXT:    add.s32 %r6, %r5, %r3;
580; ENABLED-NEXT:    add.s32 %r7, %r6, %r4;
581; ENABLED-NEXT:    st.u32 [%rd2], %r7;
582; ENABLED-NEXT:    ret;
583;
584; DISABLED-LABEL: combine_v4i32(
585; DISABLED:       {
586; DISABLED-NEXT:    .reg .b32 %r<8>;
587; DISABLED-NEXT:    .reg .b64 %rd<3>;
588; DISABLED-EMPTY:
589; DISABLED-NEXT:  // %bb.0:
590; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v4i32_param_0];
591; DISABLED-NEXT:    ld.u32 %r1, [%rd1];
592; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v4i32_param_1];
593; DISABLED-NEXT:    ld.u32 %r2, [%rd1+4];
594; DISABLED-NEXT:    ld.u32 %r3, [%rd1+8];
595; DISABLED-NEXT:    ld.u32 %r4, [%rd1+12];
596; DISABLED-NEXT:    add.s32 %r5, %r1, %r2;
597; DISABLED-NEXT:    add.s32 %r6, %r5, %r3;
598; DISABLED-NEXT:    add.s32 %r7, %r6, %r4;
599; DISABLED-NEXT:    st.u32 [%rd2], %r7;
600; DISABLED-NEXT:    ret;
601  %val0 = load i32, ptr %ptr1, align 16
602  %ptr1.1 = getelementptr inbounds i32, ptr %ptr1, i64 1
603  %val1 = load i32, ptr %ptr1.1, align 4
604  %ptr1.2 = getelementptr inbounds i32, ptr %ptr1, i64 2
605  %val2 = load i32, ptr %ptr1.2, align 8
606  %ptr1.3 = getelementptr inbounds i32, ptr %ptr1, i64 3
607  %val3 = load i32, ptr %ptr1.3, align 4
608  %red.1 = add i32 %val0, %val1
609  %red.2 = add i32 %red.1, %val2
610  %red = add i32 %red.2, %val3
611  store i32 %red, ptr %ptr2, align 4
612  ret void
613}
614