; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

; These tests should allow scheduling of the loads before the stores.
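; The negative_* tests use offsets smaller than the accessed vector size, so the
; accesses may overlap and the loads must stay after the earlier stores.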

define void @scalable_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z0.b
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z1.b
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    st1b { z1.b }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  ret void
}

define void @scalable_v8i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z0.h
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z1.h
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    st1h { z1.h }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 8 x i16>, ptr %l0, align 16
  %l5 = mul <vscale x 8 x i16> %l3, %l3
  %l6 = xor <vscale x 8 x i16> %l5, %l3
  store <vscale x 8 x i16> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 8 x i16>, ptr %l9, align 16
  %l13 = mul <vscale x 8 x i16> %l11, %l11
  %l14 = xor <vscale x 8 x i16> %l13, %l11
  store <vscale x 8 x i16> %l14, ptr %l9, align 16
  ret void
}

define void @scalable_v4i32(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z0.s
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z1.s
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 4 x i32>, ptr %l0, align 16
  %l5 = mul <vscale x 4 x i32> %l3, %l3
  %l6 = xor <vscale x 4 x i32> %l5, %l3
  store <vscale x 4 x i32> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 4 x i32>, ptr %l9, align 16
  %l13 = mul <vscale x 4 x i32> %l11, %l11
  %l14 = xor <vscale x 4 x i32> %l13, %l11
  store <vscale x 4 x i32> %l14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i64(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i64>, ptr %l0, align 16
  %l5 = mul <vscale x 2 x i64> %l3, %l3
  %l6 = xor <vscale x 2 x i64> %l5, %l3
  store <vscale x 2 x i64> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i64>, ptr %l9, align 16
  %l13 = mul <vscale x 2 x i64> %l11, %l11
  %l14 = xor <vscale x 2 x i64> %l13, %l11
  store <vscale x 2 x i64> %l14, ptr %l9, align 16
  ret void
}

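; The narrower-element cases below sign-extend before the multiply and truncate
; before the store, but the offset still equals the in-memory access size, so
; the loads can still be scheduled before the stores.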
define void @scalable_v8i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1sb { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1sb { z1.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z0.h
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z1.h
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
; CHECK-NEXT:    st1b { z1.h }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 8 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 8 x i8> %l3 to <vscale x 8 x i16>
  %l5 = mul <vscale x 8 x i16> %s3, %s3
  %l6 = xor <vscale x 8 x i16> %l5, %s3
  %t6 = trunc <vscale x 8 x i16> %l6 to <vscale x 8 x i8>
  store <vscale x 8 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 8 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 8 x i8> %l11 to <vscale x 8 x i16>
  %l13 = mul <vscale x 8 x i16> %s11, %s11
  %l14 = xor <vscale x 8 x i16> %l13, %s11
  %t14 = trunc <vscale x 8 x i16> %l14 to <vscale x 8 x i8>
  store <vscale x 8 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v4i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1sb { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z0.s
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z1.s
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
; CHECK-NEXT:    st1b { z1.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 4 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 4 x i8> %l3 to <vscale x 4 x i32>
  %l5 = mul <vscale x 4 x i32> %s3, %s3
  %l6 = xor <vscale x 4 x i32> %l5, %s3
  %t6 = trunc <vscale x 4 x i32> %l6 to <vscale x 4 x i8>
  store <vscale x 4 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 4 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 4 x i8> %l11 to <vscale x 4 x i32>
  %l13 = mul <vscale x 4 x i32> %s11, %s11
  %l14 = xor <vscale x 4 x i32> %l13, %s11
  %t14 = trunc <vscale x 4 x i32> %l14 to <vscale x 4 x i8>
  store <vscale x 4 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sb { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1b { z0.d }, p0, [x0]
; CHECK-NEXT:    st1b { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i8> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 1
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i8> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v4i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1sh { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z0.s
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z1.s
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    st1h { z1.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 4 x i16>, ptr %l0, align 16
  %s3 = sext <vscale x 4 x i16> %l3 to <vscale x 4 x i32>
  %l5 = mul <vscale x 4 x i32> %s3, %s3
  %l6 = xor <vscale x 4 x i32> %l5, %s3
  %t6 = trunc <vscale x 4 x i32> %l6 to <vscale x 4 x i16>
  store <vscale x 4 x i16> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 4 x i16>, ptr %l9, align 16
  %s11 = sext <vscale x 4 x i16> %l11 to <vscale x 4 x i32>
  %l13 = mul <vscale x 4 x i32> %s11, %s11
  %l14 = xor <vscale x 4 x i32> %l13, %s11
  %t14 = trunc <vscale x 4 x i32> %l14 to <vscale x 4 x i16>
  store <vscale x 4 x i16> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sh { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x0]
; CHECK-NEXT:    st1h { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i16>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i16> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i16>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i16> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i32(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
; CHECK-NEXT:    st1w { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i32>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i32> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i32>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i32> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t14, ptr %l9, align 16
  ret void
}

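; The offset is only half a vector (vscale x 8 bytes) while each access covers
; vscale x 16 bytes, so the regions may overlap and the second load must stay
; after the first store.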
define void @negative_tooshort_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_tooshort_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    cnth x8
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  ret void
}

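; A shift of 0 gives an offset of just vscale bytes, less than the
; vscale x 2-byte accesses, so the load/store ordering must be preserved.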
define void @negative_scalable_v2i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_scalable_v2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    lsr x8, x8, #4
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.d }, p0, [x0]
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.d }, p0, [x0, x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i8> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 0
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i8> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t14, ptr %l9, align 16
  ret void
}

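; Offset of vscale x 2 bytes versus vscale x 4-byte accesses: possible overlap,
; so no reordering of the second load above the first store.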
define void @negative_scalable_v2i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_scalable_v2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    add x8, x0, x8
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x0]
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i16>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i16> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 1
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i16>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i16> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t14, ptr %l9, align 16
  ret void
}

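; Offset of vscale x 4 bytes versus vscale x 8-byte accesses: possible overlap,
; so no reordering of the second load above the first store.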
define void @negative_scalable_v2i32(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_scalable_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    add x8, x0, x8
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i32>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i32> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i32>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i32> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t14, ptr %l9, align 16
  ret void
}

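; Three consecutive vscale x 16-byte blocks are disjoint, so all three loads can
; be scheduled ahead of the stores.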
define void @triple_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: triple_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    movprfx z3, z0
; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z0.b
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.b, p0/m, z4.b, z1.b
; CHECK-NEXT:    movprfx z5, z2
; CHECK-NEXT:    mul z5.b, p0/m, z5.b, z2.b
; CHECK-NEXT:    eor z0.d, z3.d, z0.d
; CHECK-NEXT:    eor z1.d, z4.d, z1.d
; CHECK-NEXT:    eor z2.d, z5.d, z2.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    st1b { z1.b }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    st1b { z2.b }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  %m9 = getelementptr inbounds i8, ptr %l9, i64 %l8
  %m11 = load <vscale x 16 x i8>, ptr %m9, align 16
  %m13 = mul <vscale x 16 x i8> %m11, %m11
  %m14 = xor <vscale x 16 x i8> %m13, %m11
  store <vscale x 16 x i8> %m14, ptr %m9, align 16
  ret void
}

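; With a stride of only a quarter vector (vscale x 4 bytes) the vscale x 16-byte
; accesses overlap, so none of the loads can be hoisted above the preceding stores.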
define void @negative_tripletooshort_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_tripletooshort_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT:    cnth x8
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  %m9 = getelementptr inbounds i8, ptr %l9, i64 %l8
  %m11 = load <vscale x 16 x i8>, ptr %m9, align 16
  %m13 = mul <vscale x 16 x i8> %m11, %m11
  %m14 = xor <vscale x 16 x i8> %m13, %m11
  store <vscale x 16 x i8> %m14, ptr %m9, align 16
  ret void
}

declare i64 @llvm.vscale.i64()