; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
; RUN: -mattr=+v,+zvfhmin,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
; RUN: -mattr=+v,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL128
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
; RUN: -mattr=+v,+zvl256b,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL256
; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
; RUN: -mattr=+v,+zvl512b,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL512

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"

; The first batch of tests covers simple reductions of various widths
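; The chains below are the IR for a fully unrolled sum, roughly (illustrative
; C only, not part of the generated checks):
;   int64_t red_ld_4xi64(int64_t *p) { return p[0] + p[1] + p[2] + p[3]; }
; Once the chain is wide enough to be profitable, SLP should collapse it into
; a single vector load feeding @llvm.vector.reduce.add; per the checks, the
; 2-element case is expected to stay scalar.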

define i64 @red_ld_2xi64(ptr %ptr) {
; CHECK-LABEL: @red_ld_2xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[LD0:%.*]] = load i64, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 1
; CHECK-NEXT:    [[LD1:%.*]] = load i64, ptr [[GEP]], align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[LD0]], [[LD1]]
; CHECK-NEXT:    ret i64 [[ADD_1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 1
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  ret i64 %add.1
}

define i64 @red_ld_4xi64(ptr %ptr) {
; CHECK-LABEL: @red_ld_4xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP0]])
; CHECK-NEXT:    ret i64 [[TMP1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 1
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  %gep.1 = getelementptr inbounds i64, ptr %ptr, i64 2
  %ld2 = load i64, ptr %gep.1
  %add.2 = add nuw nsw i64 %add.1, %ld2
  %gep.2 = getelementptr inbounds i64, ptr %ptr, i64 3
  %ld3 = load i64, ptr %gep.2
  %add.3 = add nuw nsw i64 %add.2, %ld3
  ret i64 %add.3
}

define i64 @red_ld_8xi64(ptr %ptr) {
; CHECK-LABEL: @red_ld_8xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP0]])
; CHECK-NEXT:    ret i64 [[TMP1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 1
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  %gep.1 = getelementptr inbounds i64, ptr %ptr, i64 2
  %ld2 = load i64, ptr %gep.1
  %add.2 = add nuw nsw i64 %add.1, %ld2
  %gep.2 = getelementptr inbounds i64, ptr %ptr, i64 3
  %ld3 = load i64, ptr %gep.2
  %add.3 = add nuw nsw i64 %add.2, %ld3
  %gep.3 = getelementptr inbounds i64, ptr %ptr, i64 4
  %ld4 = load i64, ptr %gep.3
  %add.4 = add nuw nsw i64 %add.3, %ld4
  %gep.4 = getelementptr inbounds i64, ptr %ptr, i64 5
  %ld5 = load i64, ptr %gep.4
  %add.5 = add nuw nsw i64 %add.4, %ld5
  %gep.5 = getelementptr inbounds i64, ptr %ptr, i64 6
  %ld6 = load i64, ptr %gep.5
  %add.6 = add nuw nsw i64 %add.5, %ld6
  %gep.6 = getelementptr inbounds i64, ptr %ptr, i64 7
  %ld7 = load i64, ptr %gep.6
  %add.7 = add nuw nsw i64 %add.6, %ld7
  ret i64 %add.7
}

define i64 @red_ld_16xi64(ptr %ptr) {
; CHECK-LABEL: @red_ld_16xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i64>, ptr [[PTR:%.*]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP0]])
; CHECK-NEXT:    ret i64 [[TMP1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 1
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  %gep.1 = getelementptr inbounds i64, ptr %ptr, i64 2
  %ld2 = load i64, ptr %gep.1
  %add.2 = add nuw nsw i64 %add.1, %ld2
  %gep.2 = getelementptr inbounds i64, ptr %ptr, i64 3
  %ld3 = load i64, ptr %gep.2
  %add.3 = add nuw nsw i64 %add.2, %ld3
  %gep.3 = getelementptr inbounds i64, ptr %ptr, i64 4
  %ld4 = load i64, ptr %gep.3
  %add.4 = add nuw nsw i64 %add.3, %ld4
  %gep.4 = getelementptr inbounds i64, ptr %ptr, i64 5
  %ld5 = load i64, ptr %gep.4
  %add.5 = add nuw nsw i64 %add.4, %ld5
  %gep.5 = getelementptr inbounds i64, ptr %ptr, i64 6
  %ld6 = load i64, ptr %gep.5
  %add.6 = add nuw nsw i64 %add.5, %ld6
  %gep.6 = getelementptr inbounds i64, ptr %ptr, i64 7
  %ld7 = load i64, ptr %gep.6
  %add.7 = add nuw nsw i64 %add.6, %ld7
  %gep.7 = getelementptr inbounds i64, ptr %ptr, i64 8
  %ld8 = load i64, ptr %gep.7
  %add.8 = add nuw nsw i64 %add.7, %ld8
  %gep.8 = getelementptr inbounds i64, ptr %ptr, i64 9
  %ld9 = load i64, ptr %gep.8
  %add.9 = add nuw nsw i64 %add.8, %ld9
  %gep.9 = getelementptr inbounds i64, ptr %ptr, i64 10
  %ld10 = load i64, ptr %gep.9
  %add.10 = add nuw nsw i64 %add.9, %ld10
  %gep.10 = getelementptr inbounds i64, ptr %ptr, i64 11
  %ld11 = load i64, ptr %gep.10
  %add.11 = add nuw nsw i64 %add.10, %ld11
  %gep.11 = getelementptr inbounds i64, ptr %ptr, i64 12
  %ld12 = load i64, ptr %gep.11
  %add.12 = add nuw nsw i64 %add.11, %ld12
  %gep.12 = getelementptr inbounds i64, ptr %ptr, i64 13
  %ld13 = load i64, ptr %gep.12
  %add.13 = add nuw nsw i64 %add.12, %ld13
  %gep.13 = getelementptr inbounds i64, ptr %ptr, i64 14
  %ld14 = load i64, ptr %gep.13
  %add.14 = add nuw nsw i64 %add.13, %ld14
  %gep.14 = getelementptr inbounds i64, ptr %ptr, i64 15
  %ld15 = load i64, ptr %gep.14
  %add.15 = add nuw nsw i64 %add.14, %ld15
  ret i64 %add.15
}

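; Same sum as above, but the elements are two i64s apart, so a plain vector
; load cannot be used; per the checks, the chain becomes a stride-2
; @llvm.experimental.vp.strided.load (stride operand 16 = 2 elements * 8
; bytes) feeding the same add reduction.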
define i64 @red_strided_ld_16xi64(ptr %ptr) {
; CHECK-LABEL: @red_strided_ld_16xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr align 8 [[PTR:%.*]], i64 16, <16 x i1> splat (i1 true), i32 16)
; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP0]])
; CHECK-NEXT:    ret i64 [[TMP1]]
;
entry:
  %ld0 = load i64, ptr %ptr
  %gep = getelementptr inbounds i64, ptr %ptr, i64 2
  %ld1 = load i64, ptr %gep
  %add.1 = add nuw nsw i64 %ld0, %ld1
  %gep.1 = getelementptr inbounds i64, ptr %ptr, i64 4
  %ld2 = load i64, ptr %gep.1
  %add.2 = add nuw nsw i64 %add.1, %ld2
  %gep.2 = getelementptr inbounds i64, ptr %ptr, i64 6
  %ld3 = load i64, ptr %gep.2
  %add.3 = add nuw nsw i64 %add.2, %ld3
  %gep.3 = getelementptr inbounds i64, ptr %ptr, i64 8
  %ld4 = load i64, ptr %gep.3
  %add.4 = add nuw nsw i64 %add.3, %ld4
  %gep.4 = getelementptr inbounds i64, ptr %ptr, i64 10
  %ld5 = load i64, ptr %gep.4
  %add.5 = add nuw nsw i64 %add.4, %ld5
  %gep.5 = getelementptr inbounds i64, ptr %ptr, i64 12
  %ld6 = load i64, ptr %gep.5
  %add.6 = add nuw nsw i64 %add.5, %ld6
  %gep.6 = getelementptr inbounds i64, ptr %ptr, i64 14
  %ld7 = load i64, ptr %gep.6
  %add.7 = add nuw nsw i64 %add.6, %ld7
  %gep.7 = getelementptr inbounds i64, ptr %ptr, i64 16
  %ld8 = load i64, ptr %gep.7
  %add.8 = add nuw nsw i64 %add.7, %ld8
  %gep.8 = getelementptr inbounds i64, ptr %ptr, i64 18
  %ld9 = load i64, ptr %gep.8
  %add.9 = add nuw nsw i64 %add.8, %ld9
  %gep.9 = getelementptr inbounds i64, ptr %ptr, i64 20
  %ld10 = load i64, ptr %gep.9
  %add.10 = add nuw nsw i64 %add.9, %ld10
  %gep.10 = getelementptr inbounds i64, ptr %ptr, i64 22
  %ld11 = load i64, ptr %gep.10
  %add.11 = add nuw nsw i64 %add.10, %ld11
  %gep.11 = getelementptr inbounds i64, ptr %ptr, i64 24
  %ld12 = load i64, ptr %gep.11
  %add.12 = add nuw nsw i64 %add.11, %ld12
  %gep.12 = getelementptr inbounds i64, ptr %ptr, i64 26
  %ld13 = load i64, ptr %gep.12
  %add.13 = add nuw nsw i64 %add.12, %ld13
  %gep.13 = getelementptr inbounds i64, ptr %ptr, i64 28
  %ld14 = load i64, ptr %gep.13
  %add.14 = add nuw nsw i64 %add.13, %ld14
  %gep.14 = getelementptr inbounds i64, ptr %ptr, i64 30
  %ld15 = load i64, ptr %gep.14
  %add.15 = add nuw nsw i64 %add.14, %ld15
  ret i64 %add.15
}

; The next batch tests different reduction kinds
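; Each test feeds an elementwise xor/and of two loaded <8 x i8> values into a
; different scalar reduction: and, or, xor, add, smin, smax, umax, umin. SLP
; should select the matching @llvm.vector.reduce.* intrinsic, splitting any
; extra constant operand (e.g. the trailing `and ..., 1`) out into a scalar
; OP_RDX op applied after the reduction.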

%struct.buf = type { [8 x i8] }

define i8 @reduce_and(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_and(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = and i8 [[TMP3]], 1
; CHECK-NEXT:    ret i8 [[OP_RDX]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %xor12 = xor i8 %1, %0
  %and13 = and i8 %xor12, 1
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %xor12.1 = xor i8 %3, %2
  %and13.1 = and i8 %xor12.1, %and13
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %4 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx3.2, align 1
  %xor12.2 = xor i8 %5, %4
  %and13.2 = and i8 %xor12.2, %and13.1
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %6 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %7 = load i8, ptr %arrayidx3.3, align 1
  %xor12.3 = xor i8 %7, %6
  %and13.3 = and i8 %xor12.3, %and13.2
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %8 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %9 = load i8, ptr %arrayidx3.4, align 1
  %xor12.4 = xor i8 %9, %8
  %and13.4 = and i8 %xor12.4, %and13.3
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %10 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %11 = load i8, ptr %arrayidx3.5, align 1
  %xor12.5 = xor i8 %11, %10
  %and13.5 = and i8 %xor12.5, %and13.4
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %12 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %13 = load i8, ptr %arrayidx3.6, align 1
  %xor12.6 = xor i8 %13, %12
  %and13.6 = and i8 %xor12.6, %and13.5
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %14 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %15 = load i8, ptr %arrayidx3.7, align 1
  %xor12.7 = xor i8 %15, %14
  %and13.7 = and i8 %xor12.7, %and13.6
  ret i8 %and13.7
}

define i8 @reduce_or_1(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_or_1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;

entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %xor12 = xor i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %xor12.1 = xor i8 %3, %2
  %or13.1 = or i8 %xor12.1, %xor12
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %4 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx3.2, align 1
  %xor12.2 = xor i8 %5, %4
  %or13.2 = or i8 %xor12.2, %or13.1
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %6 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %7 = load i8, ptr %arrayidx3.3, align 1
  %xor12.3 = xor i8 %7, %6
  %or13.3 = or i8 %xor12.3, %or13.2
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %8 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %9 = load i8, ptr %arrayidx3.4, align 1
  %xor12.4 = xor i8 %9, %8
  %or13.4 = or i8 %xor12.4, %or13.3
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %10 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %11 = load i8, ptr %arrayidx3.5, align 1
  %xor12.5 = xor i8 %11, %10
  %or13.5 = or i8 %xor12.5, %or13.4
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %12 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %13 = load i8, ptr %arrayidx3.6, align 1
  %xor12.6 = xor i8 %13, %12
  %or13.6 = or i8 %xor12.6, %or13.5
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %14 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %15 = load i8, ptr %arrayidx3.7, align 1
  %xor12.7 = xor i8 %15, %14
  %or13.7 = or i8 %xor12.7, %or13.6
  ret i8 %or13.7
}

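; An or-reduction of i1 compare results. Per the checks, with a 128- or
; 256-bit minimum VLEN the chain splits into two <16 x i1> or-reductions
; combined by a scalar or, while zvl512b gets a single <32 x i1> reduction.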
define void @reduce_or_2() {
; ZVFHMIN-LABEL: @reduce_or_2(
; ZVFHMIN-NEXT:    [[TMP1:%.*]] = shl i64 0, 0
; ZVFHMIN-NEXT:    [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
; ZVFHMIN-NEXT:    [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
; ZVFHMIN-NEXT:    [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
; ZVFHMIN-NEXT:    [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
; ZVFHMIN-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; ZVFHMIN-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
; ZVFHMIN-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
; ZVFHMIN-NEXT:    br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
; ZVFHMIN:       8:
; ZVFHMIN-NEXT:    ret void
; ZVFHMIN:       9:
; ZVFHMIN-NEXT:    ret void
;
; ZVL128-LABEL: @reduce_or_2(
; ZVL128-NEXT:    [[TMP1:%.*]] = shl i64 0, 0
; ZVL128-NEXT:    [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
; ZVL128-NEXT:    [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
; ZVL128-NEXT:    [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
; ZVL128-NEXT:    [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
; ZVL128-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; ZVL128-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
; ZVL128-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
; ZVL128-NEXT:    br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
; ZVL128:       8:
; ZVL128-NEXT:    ret void
; ZVL128:       9:
; ZVL128-NEXT:    ret void
;
; ZVL256-LABEL: @reduce_or_2(
; ZVL256-NEXT:    [[TMP1:%.*]] = shl i64 0, 0
; ZVL256-NEXT:    [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
; ZVL256-NEXT:    [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
; ZVL256-NEXT:    [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
; ZVL256-NEXT:    [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
; ZVL256-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; ZVL256-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
; ZVL256-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
; ZVL256-NEXT:    br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
; ZVL256:       8:
; ZVL256-NEXT:    ret void
; ZVL256:       9:
; ZVL256-NEXT:    ret void
;
; ZVL512-LABEL: @reduce_or_2(
; ZVL512-NEXT:    [[TMP1:%.*]] = shl i64 0, 0
; ZVL512-NEXT:    [[TMP2:%.*]] = insertelement <32 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 15
; ZVL512-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i64> [[TMP2]], <32 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 15, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; ZVL512-NEXT:    [[TMP4:%.*]] = icmp ult <32 x i64> [[TMP3]], zeroinitializer
; ZVL512-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> [[TMP4]])
; ZVL512-NEXT:    br i1 [[TMP5]], label [[TMP7:%.*]], label [[TMP6:%.*]]
; ZVL512:       6:
; ZVL512-NEXT:    ret void
; ZVL512:       7:
; ZVL512-NEXT:    ret void
;
  %1 = shl i64 0, 0
  %2 = icmp ult i64 0, 0
  %3 = icmp ult i64 0, 0
  %4 = or i1 %2, %3
  %5 = icmp ult i64 0, 0
  %6 = or i1 %4, %5
  %7 = icmp ult i64 0, 0
  %8 = or i1 %6, %7
  %9 = icmp ult i64 0, 0
  %10 = or i1 %8, %9
  %11 = icmp ult i64 0, 0
  %12 = or i1 %10, %11
  %13 = icmp ult i64 0, 0
  %14 = or i1 %12, %13
  %15 = icmp ult i64 0, 0
  %16 = or i1 %14, %15
  %17 = icmp ult i64 0, 0
  %18 = or i1 %16, %17
  %19 = icmp ult i64 0, 0
  %20 = or i1 %18, %19
  %21 = icmp ult i64 0, 0
  %22 = or i1 %20, %21
  %23 = icmp ult i64 0, 0
  %24 = or i1 %22, %23
  %25 = icmp ult i64 0, 0
  %26 = or i1 %24, %25
  %27 = icmp ult i64 0, 0
  %28 = or i1 %26, %27
  %29 = icmp ult i64 0, 0
  %30 = or i1 %28, %29
  %31 = icmp ult i64 %1, 0
  %32 = or i1 %30, %31
  %33 = icmp ult i64 0, 0
  %34 = or i1 %32, %33
  %35 = icmp ult i64 0, 0
  %36 = or i1 %34, %35
  %37 = icmp ult i64 0, 0
  %38 = or i1 %36, %37
  %39 = icmp ult i64 0, 0
  %40 = or i1 %38, %39
  %41 = icmp ult i64 0, 0
  %42 = or i1 %40, %41
  %43 = icmp ult i64 0, 0
  %44 = or i1 %42, %43
  %45 = icmp ult i64 %1, 0
  %46 = or i1 %44, %45
  %47 = icmp ult i64 0, 0
  %48 = or i1 %46, %47
  %49 = icmp ult i64 0, 0
  %50 = or i1 %48, %49
  %51 = icmp ult i64 0, 0
  %52 = or i1 %50, %51
  %53 = icmp ult i64 0, 0
  %54 = or i1 %52, %53
  %55 = icmp ult i64 0, 0
  %56 = or i1 %54, %55
  %57 = icmp ult i64 0, 0
  %58 = or i1 %56, %57
  %59 = icmp ult i64 0, 0
  %60 = or i1 %58, %59
  %61 = icmp ult i64 0, 0
  %62 = or i1 %60, %61
  %63 = icmp ult i64 0, 0
  %64 = or i1 %62, %63
  br i1 %64, label %66, label %65

65:                                               ; preds = %0
  ret void

66:                                               ; preds = %0
  ret void
}

define i8 @reduce_xor(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_xor(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i8 [[TMP3]], 1
; CHECK-NEXT:    ret i8 [[OP_RDX]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = xor i8 %and12, %and12.1
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = xor i8 %4, %and12.2
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = xor i8 %7, %and12.3
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = xor i8 %10, %and12.4
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = xor i8 %13, %and12.5
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = xor i8 %16, %and12.6
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = xor i8 %19, %and12.7
  %xor13.7 = xor i8 %22, 1
  ret i8 %xor13.7
}



define i8 @reduce_add(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_add(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    [[OP_RDX:%.*]] = add i8 [[TMP3]], 1
; CHECK-NEXT:    ret i8 [[OP_RDX]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = add i8 %and12, %and12.1
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = add i8 %4, %and12.2
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = add i8 %7, %and12.3
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = add i8 %10, %and12.4
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = add i8 %13, %and12.5
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = add i8 %16, %and12.6
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = add i8 %19, %and12.7
  %add13.7 = add i8 %22, 1
  ret i8 %add13.7
}

declare i8 @llvm.smin.i8(i8, i8)

define i8 @reduce_smin(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_smin(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = tail call i8 @llvm.smin.i8(i8 %and12, i8 %and12.1)
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = tail call i8 @llvm.smin.i8(i8 %4, i8 %and12.2)
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = tail call i8 @llvm.smin.i8(i8 %7, i8 %and12.3)
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = tail call i8 @llvm.smin.i8(i8 %10, i8 %and12.4)
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = tail call i8 @llvm.smin.i8(i8 %13, i8 %and12.5)
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = tail call i8 @llvm.smin.i8(i8 %16, i8 %and12.6)
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = tail call i8 @llvm.smin.i8(i8 %19, i8 %and12.7)
  ret i8 %22
}

declare i8 @llvm.smax.i8(i8, i8)

define i8 @reduce_smax(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_smax(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = tail call i8 @llvm.smax.i8(i8 %and12, i8 %and12.1)
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = tail call i8 @llvm.smax.i8(i8 %4, i8 %and12.2)
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = tail call i8 @llvm.smax.i8(i8 %7, i8 %and12.3)
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = tail call i8 @llvm.smax.i8(i8 %10, i8 %and12.4)
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = tail call i8 @llvm.smax.i8(i8 %13, i8 %and12.5)
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = tail call i8 @llvm.smax.i8(i8 %16, i8 %and12.6)
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = tail call i8 @llvm.smax.i8(i8 %19, i8 %and12.7)
  ret i8 %22
}

declare i8 @llvm.umax.i8(i8, i8)

define i8 @reduce_umax(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_umax(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = tail call i8 @llvm.umax.i8(i8 %and12, i8 %and12.1)
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = tail call i8 @llvm.umax.i8(i8 %4, i8 %and12.2)
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = tail call i8 @llvm.umax.i8(i8 %7, i8 %and12.3)
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = tail call i8 @llvm.umax.i8(i8 %10, i8 %and12.4)
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = tail call i8 @llvm.umax.i8(i8 %13, i8 %and12.5)
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = tail call i8 @llvm.umax.i8(i8 %16, i8 %and12.6)
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = tail call i8 @llvm.umax.i8(i8 %19, i8 %and12.7)
  ret i8 %22
}

declare i8 @llvm.umin.i8(i8, i8)

define i8 @reduce_umin(ptr %a, ptr %b) {
; CHECK-LABEL: @reduce_umin(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_BUF:%.*]], ptr [[A:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_BUF]], ptr [[B:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> [[TMP2]])
; CHECK-NEXT:    ret i8 [[TMP3]]
;
entry:
  %arrayidx = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 0
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 0
  %1 = load i8, ptr %arrayidx3, align 1
  %and12 = and i8 %1, %0
  %arrayidx.1 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 1
  %2 = load i8, ptr %arrayidx.1, align 1
  %arrayidx3.1 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 1
  %3 = load i8, ptr %arrayidx3.1, align 1
  %and12.1 = and i8 %3, %2
  %4 = tail call i8 @llvm.umin.i8(i8 %and12, i8 %and12.1)
  %arrayidx.2 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 2
  %5 = load i8, ptr %arrayidx.2, align 1
  %arrayidx3.2 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 2
  %6 = load i8, ptr %arrayidx3.2, align 1
  %and12.2 = and i8 %6, %5
  %7 = tail call i8 @llvm.umin.i8(i8 %4, i8 %and12.2)
  %arrayidx.3 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 3
  %8 = load i8, ptr %arrayidx.3, align 1
  %arrayidx3.3 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 3
  %9 = load i8, ptr %arrayidx3.3, align 1
  %and12.3 = and i8 %9, %8
  %10 = tail call i8 @llvm.umin.i8(i8 %7, i8 %and12.3)
  %arrayidx.4 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 4
  %11 = load i8, ptr %arrayidx.4, align 1
  %arrayidx3.4 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 4
  %12 = load i8, ptr %arrayidx3.4, align 1
  %and12.4 = and i8 %12, %11
  %13 = tail call i8 @llvm.umin.i8(i8 %10, i8 %and12.4)
  %arrayidx.5 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 5
  %14 = load i8, ptr %arrayidx.5, align 1
  %arrayidx3.5 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 5
  %15 = load i8, ptr %arrayidx3.5, align 1
  %and12.5 = and i8 %15, %14
  %16 = tail call i8 @llvm.umin.i8(i8 %13, i8 %and12.5)
  %arrayidx.6 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 6
  %17 = load i8, ptr %arrayidx.6, align 1
  %arrayidx3.6 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 6
  %18 = load i8, ptr %arrayidx3.6, align 1
  %and12.6 = and i8 %18, %17
  %19 = tail call i8 @llvm.umin.i8(i8 %16, i8 %and12.6)
  %arrayidx.7 = getelementptr inbounds %struct.buf, ptr %a, i64 0, i32 0, i64 7
  %20 = load i8, ptr %arrayidx.7, align 1
  %arrayidx3.7 = getelementptr inbounds %struct.buf, ptr %b, i64 0, i32 0, i64 7
  %21 = load i8, ptr %arrayidx3.7, align 1
  %and12.7 = and i8 %21, %20
  %22 = tail call i8 @llvm.umin.i8(i8 %19, i8 %and12.7)
  ret i8 %22
}

; The next batch exercises reductions involving zext of narrower loads
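; Roughly (illustrative C only, not part of the generated checks):
;   int64_t f(uint8_t *p) { int64_t s = 0; for (int i = 0; i < n; i++) s += p[i]; return s; }
; Note that in the 4-element case the checks show the reduction being done in
; i16 with only the final sum zero-extended to i64, while the wider cases keep
; the i64 accumulator type.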

define i64 @red_zext_ld_2xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_2xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[LD0:%.*]] = load i8, ptr [[PTR:%.*]], align 1
; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[LD0]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP]], align 1
; CHECK-NEXT:    [[ZEXT_1:%.*]] = zext i8 [[LD1]] to i64
; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[ZEXT]], [[ZEXT_1]]
; CHECK-NEXT:    ret i64 [[ADD_1]]
;
entry:
  %ld0 = load i8, ptr %ptr
  %zext = zext i8 %ld0 to i64
  %gep = getelementptr inbounds i8, ptr %ptr, i64 1
  %ld1 = load i8, ptr %gep
  %zext.1 = zext i8 %ld1 to i64
  %add.1 = add nuw nsw i64 %zext, %zext.1
  ret i64 %add.1
}

define i64 @red_zext_ld_4xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_4xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
; CHECK-NEXT:    [[ADD_3:%.*]] = zext i16 [[TMP2]] to i64
; CHECK-NEXT:    ret i64 [[ADD_3]]
;
entry:
  %ld0 = load i8, ptr %ptr
  %zext = zext i8 %ld0 to i64
  %gep = getelementptr inbounds i8, ptr %ptr, i64 1
  %ld1 = load i8, ptr %gep
  %zext.1 = zext i8 %ld1 to i64
  %add.1 = add nuw nsw i64 %zext, %zext.1
  %gep.1 = getelementptr inbounds i8, ptr %ptr, i64 2
  %ld2 = load i8, ptr %gep.1
  %zext.2 = zext i8 %ld2 to i64
  %add.2 = add nuw nsw i64 %add.1, %zext.2
  %gep.2 = getelementptr inbounds i8, ptr %ptr, i64 3
  %ld3 = load i8, ptr %gep.2
  %zext.3 = zext i8 %ld3 to i64
  %add.3 = add nuw nsw i64 %add.2, %zext.3
  ret i64 %add.3
}

define i64 @red_zext_ld_8xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_8xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i64>
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
; CHECK-NEXT:    ret i64 [[TMP2]]
;
entry:
  %ld0 = load i8, ptr %ptr
  %zext = zext i8 %ld0 to i64
  %gep = getelementptr inbounds i8, ptr %ptr, i64 1
  %ld1 = load i8, ptr %gep
  %zext.1 = zext i8 %ld1 to i64
  %add.1 = add nuw nsw i64 %zext, %zext.1
  %gep.1 = getelementptr inbounds i8, ptr %ptr, i64 2
  %ld2 = load i8, ptr %gep.1
  %zext.2 = zext i8 %ld2 to i64
  %add.2 = add nuw nsw i64 %add.1, %zext.2
  %gep.2 = getelementptr inbounds i8, ptr %ptr, i64 3
  %ld3 = load i8, ptr %gep.2
  %zext.3 = zext i8 %ld3 to i64
  %add.3 = add nuw nsw i64 %add.2, %zext.3
  %gep.3 = getelementptr inbounds i8, ptr %ptr, i64 4
  %ld4 = load i8, ptr %gep.3
  %zext.4 = zext i8 %ld4 to i64
  %add.4 = add nuw nsw i64 %add.3, %zext.4
  %gep.4 = getelementptr inbounds i8, ptr %ptr, i64 5
  %ld5 = load i8, ptr %gep.4
  %zext.5 = zext i8 %ld5 to i64
  %add.5 = add nuw nsw i64 %add.4, %zext.5
  %gep.5 = getelementptr inbounds i8, ptr %ptr, i64 6
  %ld6 = load i8, ptr %gep.5
  %zext.6 = zext i8 %ld6 to i64
  %add.6 = add nuw nsw i64 %add.5, %zext.6
  %gep.6 = getelementptr inbounds i8, ptr %ptr, i64 7
  %ld7 = load i8, ptr %gep.6
  %zext.7 = zext i8 %ld7 to i64
  %add.7 = add nuw nsw i64 %add.6, %zext.7
  ret i64 %add.7
}

define i64 @red_zext_ld_16xi64(ptr %ptr) {
; CHECK-LABEL: @red_zext_ld_16xi64(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[PTR:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i64>
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
; CHECK-NEXT:    ret i64 [[TMP2]]
;
entry:
  %ld0 = load i8, ptr %ptr
  %zext = zext i8 %ld0 to i64
  %gep = getelementptr inbounds i8, ptr %ptr, i64 1
  %ld1 = load i8, ptr %gep
  %zext.1 = zext i8 %ld1 to i64
  %add.1 = add nuw nsw i64 %zext, %zext.1
  %gep.1 = getelementptr inbounds i8, ptr %ptr, i64 2
  %ld2 = load i8, ptr %gep.1
  %zext.2 = zext i8 %ld2 to i64
  %add.2 = add nuw nsw i64 %add.1, %zext.2
  %gep.2 = getelementptr inbounds i8, ptr %ptr, i64 3
  %ld3 = load i8, ptr %gep.2
  %zext.3 = zext i8 %ld3 to i64
  %add.3 = add nuw nsw i64 %add.2, %zext.3
  %gep.3 = getelementptr inbounds i8, ptr %ptr, i64 4
  %ld4 = load i8, ptr %gep.3
  %zext.4 = zext i8 %ld4 to i64
  %add.4 = add nuw nsw i64 %add.3, %zext.4
  %gep.4 = getelementptr inbounds i8, ptr %ptr, i64 5
  %ld5 = load i8, ptr %gep.4
  %zext.5 = zext i8 %ld5 to i64
  %add.5 = add nuw nsw i64 %add.4, %zext.5
  %gep.5 = getelementptr inbounds i8, ptr %ptr, i64 6
  %ld6 = load i8, ptr %gep.5
  %zext.6 = zext i8 %ld6 to i64
  %add.6 = add nuw nsw i64 %add.5, %zext.6
  %gep.6 = getelementptr inbounds i8, ptr %ptr, i64 7
  %ld7 = load i8, ptr %gep.6
  %zext.7 = zext i8 %ld7 to i64
  %add.7 = add nuw nsw i64 %add.6, %zext.7
  %gep.7 = getelementptr inbounds i8, ptr %ptr, i64 8
  %ld8 = load i8, ptr %gep.7
  %zext.8 = zext i8 %ld8 to i64
  %add.8 = add nuw nsw i64 %add.7, %zext.8
  %gep.8 = getelementptr inbounds i8, ptr %ptr, i64 9
  %ld9 = load i8, ptr %gep.8
  %zext.9 = zext i8 %ld9 to i64
  %add.9 = add nuw nsw i64 %add.8, %zext.9
  %gep.9 = getelementptr inbounds i8, ptr %ptr, i64 10
  %ld10 = load i8, ptr %gep.9
  %zext.10 = zext i8 %ld10 to i64
  %add.10 = add nuw nsw i64 %add.9, %zext.10
  %gep.10 = getelementptr inbounds i8, ptr %ptr, i64 11
  %ld11 = load i8, ptr %gep.10
  %zext.11 = zext i8 %ld11 to i64
  %add.11 = add nuw nsw i64 %add.10, %zext.11
  %gep.11 = getelementptr inbounds i8, ptr %ptr, i64 12
  %ld12 = load i8, ptr %gep.11
  %zext.12 = zext i8 %ld12 to i64
  %add.12 = add nuw nsw i64 %add.11, %zext.12
  %gep.12 = getelementptr inbounds i8, ptr %ptr, i64 13
  %ld13 = load i8, ptr %gep.12
  %zext.13 = zext i8 %ld13 to i64
  %add.13 = add nuw nsw i64 %add.12, %zext.13
  %gep.13 = getelementptr inbounds i8, ptr %ptr, i64 14
  %ld14 = load i8, ptr %gep.13
  %zext.14 = zext i8 %ld14 to i64
  %add.14 = add nuw nsw i64 %add.13, %zext.14
  %gep.14 = getelementptr inbounds i8, ptr %ptr, i64 15
  %ld15 = load i8, ptr %gep.14
  %zext.15 = zext i8 %ld15 to i64
  %add.15 = add nuw nsw i64 %add.14, %zext.15
  ret i64 %add.15
}

declare i32 @llvm.abs.i32(i32, i1)

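; A sum-of-absolute-differences style pattern over two stride-separated pairs;
; per the checks, the two <2 x i32> halves are glued into one <4 x i32> via
; @llvm.vector.insert before the abs and the add reduction.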
define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {
; CHECK-LABEL: @stride_sum_abs_diff(
; CHECK-NEXT:    [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]]
; CHECK-NEXT:    [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0)
; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2)
; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0)
; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2)
; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT:    ret i32 [[TMP11]]
;
  %x.0 = load i32, ptr %p
  %y.0 = load i32, ptr %q
  %sub.0 = sub i32 %x.0, %y.0
  %abs.0 = tail call i32 @llvm.abs.i32(i32 %sub.0, i1 true)

  %p.1 = getelementptr inbounds i32, ptr %p, i64 1
  %x.1 = load i32, ptr %p.1
  %q.1 = getelementptr inbounds i32, ptr %q, i64 1
  %y.1 = load i32, ptr %q.1
  %sub.1 = sub i32 %x.1, %y.1
  %abs.1 = tail call i32 @llvm.abs.i32(i32 %sub.1, i1 true)
  %sum.0 = add i32 %abs.0, %abs.1

  %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride
  %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride

  %x.2 = load i32, ptr %p.2
  %y.2 = load i32, ptr %q.2
  %sub.2 = sub i32 %x.2, %y.2
  %abs.2 = tail call i32 @llvm.abs.i32(i32 %sub.2, i1 true)
  %sum.1 = add i32 %sum.0, %abs.2

  %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1
  %x.3 = load i32, ptr %p.3
  %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1
  %y.3 = load i32, ptr %q.3
  %sub.3 = sub i32 %x.3, %y.3
  %abs.3 = tail call i32 @llvm.abs.i32(i32 %sub.3, i1 true)
  %sum.2 = add i32 %sum.1, %abs.3

  ret i32 %sum.2
}

define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) {
; CHECK-LABEL: @reduce_sum_2arrays_a(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0)
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4)
; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
; CHECK-NEXT:    ret i32 [[TMP5]]
;
entry:
  %x.0 = load i8, ptr %p, align 1
  %conv = zext i8 %x.0 to i32
  %y.0 = load i8, ptr %q, align 1
  %conv3 = zext i8 %y.0 to i32
  %add4 = add nuw nsw i32 %conv, %conv3

  %arrayidx.1 = getelementptr inbounds i8, ptr %p, i64 1
  %x.1 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %x.1 to i32
  %arrayidx2.1 = getelementptr inbounds i8, ptr %q, i64 1
  %y.1 = load i8, ptr %arrayidx2.1, align 1
  %conv3.1 = zext i8 %y.1 to i32
  %add.1 = add nuw nsw i32 %add4, %conv.1
  %add4.1 = add nuw nsw i32 %add.1, %conv3.1

  %arrayidx.2 = getelementptr inbounds i8, ptr %p, i64 2
  %x.2 = load i8, ptr %arrayidx.2, align 1
  %conv.2 = zext i8 %x.2 to i32
  %arrayidx2.2 = getelementptr inbounds i8, ptr %q, i64 2
  %y.2 = load i8, ptr %arrayidx2.2, align 1
  %conv3.2 = zext i8 %y.2 to i32
  %add.2 = add nuw nsw i32 %add4.1, %conv.2
  %add4.2 = add nuw nsw i32 %add.2, %conv3.2

  %arrayidx.3 = getelementptr inbounds i8, ptr %p, i64 3
  %x.3 = load i8, ptr %arrayidx.3, align 1
  %conv.3 = zext i8 %x.3 to i32
  %arrayidx2.3 = getelementptr inbounds i8, ptr %q, i64 3
  %y.3 = load i8, ptr %arrayidx2.3, align 1
  %conv3.3 = zext i8 %y.3 to i32
  %add.3 = add nuw nsw i32 %add4.2, %conv.3
  %add4.3 = add nuw nsw i32 %add.3, %conv3.3

  ret i32 %add4.3
}

define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) {
; CHECK-LABEL: @reduce_sum_2arrays_b(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0)
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4)
; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
; CHECK-NEXT:    ret i32 [[TMP5]]
;
  entry:
  %0 = load i8, ptr %x, align 1
  %conv = zext i8 %0 to i32
  %arrayidx.1 = getelementptr inbounds i8, ptr %x, i64 1
  %1 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %1 to i32
  %add.1 = add nuw nsw i32 %conv, %conv.1
  %arrayidx.2 = getelementptr inbounds i8, ptr %x, i64 2
  %2 = load i8, ptr %arrayidx.2, align 1
  %conv.2 = zext i8 %2 to i32
  %add.2 = add nuw nsw i32 %add.1, %conv.2
  %arrayidx.3 = getelementptr inbounds i8, ptr %x, i64 3
  %3 = load i8, ptr %arrayidx.3, align 1
  %conv.3 = zext i8 %3 to i32
  %add.3 = add nuw nsw i32 %add.2, %conv.3
  %4 = load i8, ptr %y, align 1
  %conv9 = zext i8 %4 to i32
  %add10 = add nuw nsw i32 %add.3, %conv9
  %arrayidx8.1 = getelementptr inbounds i8, ptr %y, i64 1
  %5 = load i8, ptr %arrayidx8.1, align 1
  %conv9.1 = zext i8 %5 to i32
  %add10.1 = add nuw nsw i32 %add10, %conv9.1
  %arrayidx8.2 = getelementptr inbounds i8, ptr %y, i64 2
  %6 = load i8, ptr %arrayidx8.2, align 1
  %conv9.2 = zext i8 %6 to i32
  %add10.2 = add nuw nsw i32 %add10.1, %conv9.2
  %arrayidx8.3 = getelementptr inbounds i8, ptr %y, i64 3
  %7 = load i8, ptr %arrayidx8.3, align 1
  %conv9.3 = zext i8 %7 to i32
  %add10.3 = add nuw nsw i32 %add10.2, %conv9.3
  ret i32 %add10.3
}

; Shouldn't vectorize to a reduction because we can't promote it
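; (zvfbfmin only provides bf16<->f32 conversions, not bf16 arithmetic, and the
; vectorizer does not currently promote this reduction to f32.)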
define bfloat @fadd_4xbf16(ptr %p) {
; CHECK-LABEL: @fadd_4xbf16(
; CHECK-NEXT:    [[X0:%.*]] = load bfloat, ptr [[P:%.*]], align 2
; CHECK-NEXT:    [[P1:%.*]] = getelementptr bfloat, ptr [[P]], i32 1
; CHECK-NEXT:    [[X1:%.*]] = load bfloat, ptr [[P1]], align 2
; CHECK-NEXT:    [[P2:%.*]] = getelementptr bfloat, ptr [[P]], i32 2
; CHECK-NEXT:    [[X2:%.*]] = load bfloat, ptr [[P2]], align 2
; CHECK-NEXT:    [[P3:%.*]] = getelementptr bfloat, ptr [[P]], i32 3
; CHECK-NEXT:    [[X3:%.*]] = load bfloat, ptr [[P3]], align 2
; CHECK-NEXT:    [[R0:%.*]] = fadd fast bfloat [[X0]], [[X1]]
; CHECK-NEXT:    [[R1:%.*]] = fadd fast bfloat [[R0]], [[X2]]
; CHECK-NEXT:    [[R2:%.*]] = fadd fast bfloat [[R1]], [[X3]]
; CHECK-NEXT:    ret bfloat [[R2]]
;
  %x0 = load bfloat, ptr %p
  %p1 = getelementptr bfloat, ptr %p, i32 1
  %x1 = load bfloat, ptr %p1
  %p2 = getelementptr bfloat, ptr %p, i32 2
  %x2 = load bfloat, ptr %p2
  %p3 = getelementptr bfloat, ptr %p, i32 3
  %x3 = load bfloat, ptr %p3

  %r0 = fadd fast bfloat %x0, %x1
  %r1 = fadd fast bfloat %r0, %x2
  %r2 = fadd fast bfloat %r1, %x3

  ret bfloat %r2
}

; Shouldn't vectorize to a reduction because there's no vfred{u,o}mul.vs
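; (RVV has ordered/unordered floating-point sum reductions, vfredosum.vs and
; vfredusum.vs, but no multiply counterpart.)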
define bfloat @fmul_4xbf16(ptr %p) {
; CHECK-LABEL: @fmul_4xbf16(
; CHECK-NEXT:    [[X0:%.*]] = load bfloat, ptr [[P:%.*]], align 2
; CHECK-NEXT:    [[P1:%.*]] = getelementptr bfloat, ptr [[P]], i32 1
; CHECK-NEXT:    [[X1:%.*]] = load bfloat, ptr [[P1]], align 2
; CHECK-NEXT:    [[P2:%.*]] = getelementptr bfloat, ptr [[P]], i32 2
; CHECK-NEXT:    [[X2:%.*]] = load bfloat, ptr [[P2]], align 2
; CHECK-NEXT:    [[P3:%.*]] = getelementptr bfloat, ptr [[P]], i32 3
; CHECK-NEXT:    [[X3:%.*]] = load bfloat, ptr [[P3]], align 2
; CHECK-NEXT:    [[R0:%.*]] = fmul fast bfloat [[X0]], [[X1]]
; CHECK-NEXT:    [[R1:%.*]] = fmul fast bfloat [[R0]], [[X2]]
; CHECK-NEXT:    [[R2:%.*]] = fmul fast bfloat [[R1]], [[X3]]
; CHECK-NEXT:    ret bfloat [[R2]]
;
  %x0 = load bfloat, ptr %p
  %p1 = getelementptr bfloat, ptr %p, i32 1
  %x1 = load bfloat, ptr %p1
  %p2 = getelementptr bfloat, ptr %p, i32 2
  %x2 = load bfloat, ptr %p2
  %p3 = getelementptr bfloat, ptr %p, i32 3
  %x3 = load bfloat, ptr %p3

  %r0 = fmul fast bfloat %x0, %x1
  %r1 = fmul fast bfloat %r0, %x2
  %r2 = fmul fast bfloat %r1, %x3

  ret bfloat %r2
}

; Shouldn't vectorize to a reduction on zvfhmin because we can't promote it
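; (zvfhmin only provides f16<->f32 conversions, while zvfh adds full fp16
; arithmetic, so the ZVFH checks below collapse the chain to
; @llvm.vector.reduce.fadd on <4 x half>.)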
define half @fadd_4xf16(ptr %p) {
; ZVFHMIN-LABEL: @fadd_4xf16(
; ZVFHMIN-NEXT:    [[X0:%.*]] = load half, ptr [[P:%.*]], align 2
; ZVFHMIN-NEXT:    [[P1:%.*]] = getelementptr half, ptr [[P]], i32 1
; ZVFHMIN-NEXT:    [[X1:%.*]] = load half, ptr [[P1]], align 2
; ZVFHMIN-NEXT:    [[P2:%.*]] = getelementptr half, ptr [[P]], i32 2
; ZVFHMIN-NEXT:    [[X2:%.*]] = load half, ptr [[P2]], align 2
; ZVFHMIN-NEXT:    [[P3:%.*]] = getelementptr half, ptr [[P]], i32 3
; ZVFHMIN-NEXT:    [[X3:%.*]] = load half, ptr [[P3]], align 2
; ZVFHMIN-NEXT:    [[R0:%.*]] = fadd fast half [[X0]], [[X1]]
; ZVFHMIN-NEXT:    [[R1:%.*]] = fadd fast half [[R0]], [[X2]]
; ZVFHMIN-NEXT:    [[R2:%.*]] = fadd fast half [[R1]], [[X3]]
; ZVFHMIN-NEXT:    ret half [[R2]]
;
; ZVFH-LABEL: @fadd_4xf16(
; ZVFH-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr [[P:%.*]], align 2
; ZVFH-NEXT:    [[TMP2:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP1]])
; ZVFH-NEXT:    ret half [[TMP2]]
;
  %x0 = load half, ptr %p
  %p1 = getelementptr half, ptr %p, i32 1
  %x1 = load half, ptr %p1
  %p2 = getelementptr half, ptr %p, i32 2
  %x2 = load half, ptr %p2
  %p3 = getelementptr half, ptr %p, i32 3
  %x3 = load half, ptr %p3

  %r0 = fadd fast half %x0, %x1
  %r1 = fadd fast half %r0, %x2
  %r2 = fadd fast half %r1, %x3

  ret half %r2
}

; Shouldn't vectorize to a reduction because there's no vfred{u,o}mul.vs
define half @fmul_4xf16(ptr %p) {
; CHECK-LABEL: @fmul_4xf16(
; CHECK-NEXT:    [[X0:%.*]] = load half, ptr [[P:%.*]], align 2
; CHECK-NEXT:    [[P1:%.*]] = getelementptr half, ptr [[P]], i32 1
; CHECK-NEXT:    [[X1:%.*]] = load half, ptr [[P1]], align 2
; CHECK-NEXT:    [[P2:%.*]] = getelementptr half, ptr [[P]], i32 2
; CHECK-NEXT:    [[X2:%.*]] = load half, ptr [[P2]], align 2
; CHECK-NEXT:    [[P3:%.*]] = getelementptr half, ptr [[P]], i32 3
; CHECK-NEXT:    [[X3:%.*]] = load half, ptr [[P3]], align 2
; CHECK-NEXT:    [[R0:%.*]] = fmul fast half [[X0]], [[X1]]
; CHECK-NEXT:    [[R1:%.*]] = fmul fast half [[R0]], [[X2]]
; CHECK-NEXT:    [[R2:%.*]] = fmul fast half [[R1]], [[X3]]
; CHECK-NEXT:    ret half [[R2]]
;
  %x0 = load half, ptr %p
  %p1 = getelementptr half, ptr %p, i32 1
  %x1 = load half, ptr %p1
  %p2 = getelementptr half, ptr %p, i32 2
  %x2 = load half, ptr %p2
  %p3 = getelementptr half, ptr %p, i32 3
  %x3 = load half, ptr %p3

  %r0 = fmul fast half %x0, %x1
  %r1 = fmul fast half %r0, %x2
  %r2 = fmul fast half %r1, %x3

  ret half %r2
}