; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

; Test we can code generate patterns of the form:
;   fixed_length_vector = ISD::EXTRACT_SUBVECTOR scalable_vector, 0
;   scalable_vector = ISD::INSERT_SUBVECTOR scalable_vector, fixed_length_vector, 0
;
; NOTE: Currently shufflevector does not support scalable vectors so it cannot
; be used to model the above operations.  Instead these tests rely on knowing
; how fixed length operations are lowered to scalable ones, with multiple blocks
; ensuring insert/extract sequences are not folded away.
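;
; For reference, every test below has the same shape; this sketch is
; illustrative only (the DAG notes are an assumption about how the lowering is
; modelled, not verbatim output):
;
;   %a = load <8 x i32>, ptr %in    ; becomes a predicated scalable load; the
;   br label %bb1                   ; result is EXTRACT_SUBVECTOR'd at index 0
;
; bb1:
;   store <8 x i32> %a, ptr %out    ; operand is INSERT_SUBVECTOR'd back at
;   ret void                        ; index 0 to feed the scalable store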

target triple = "aarch64-unknown-linux-gnu"

define void @subvector_v8i16(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x i16>, ptr %in
  br label %bb1

bb1:
  store <8 x i16> %a, ptr %out
  ret void
}

define void @subvector_v16i16(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  br label %bb1

bb1:
  store <16 x i16> %a, ptr %out
  ret void
}

define void @subvector_v32i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  br label %bb1

bb1:
  store <32 x i16> %a, ptr %out
  ret void
}

define void @subvector_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  br label %bb1

bb1:
  store <64 x i16> %a, ptr %out
  ret void
}

define void @subvector_v8i32(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  br label %bb1

bb1:
  store <8 x i32> %a, ptr %out
  ret void
}

define void @subvector_v16i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  br label %bb1

bb1:
  store <16 x i32> %a, ptr %out
  ret void
}

define void @subvector_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  br label %bb1

bb1:
  store <32 x i32> %a, ptr %out
  ret void
}

define void @subvector_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i32>, ptr %in
  br label %bb1

bb1:
  store <64 x i32> %a, ptr %out
  ret void
}

define void @subvector_v8i64(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    mov x8, #4 // =0x4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  br label %bb1

bb1:
  store <8 x i64> %a, ptr %out
  ret void
}

define void @subvector_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  br label %bb1

bb1:
  store <16 x i64> %a, ptr %out
  ret void
}

define void @subvector_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  br label %bb1

bb1:
  store <32 x i64> %a, ptr %out
  ret void
}

define void @subvector_v8f16(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x half>, ptr %in
  br label %bb1

bb1:
  store <8 x half> %a, ptr %out
  ret void
}

define void @subvector_v16f16(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x half>, ptr %in
  br label %bb1

bb1:
  store <16 x half> %a, ptr %out
  ret void
}

define void @subvector_v32f16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x half>, ptr %in
  br label %bb1

bb1:
  store <32 x half> %a, ptr %out
  ret void
}

define void @subvector_v64f16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x half>, ptr %in
  br label %bb1

bb1:
  store <64 x half> %a, ptr %out
  ret void
}

define void @subvector_v8f32(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x float>, ptr %in
  br label %bb1

bb1:
  store <8 x float> %a, ptr %out
  ret void
}

define void @subvector_v16f32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x float>, ptr %in
  br label %bb1

bb1:
  store <16 x float> %a, ptr %out
  ret void
}

define void @subvector_v32f32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x float>, ptr %in
  br label %bb1

bb1:
  store <32 x float> %a, ptr %out
  ret void
}

define void @subvector_v64f32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x float>, ptr %in
  br label %bb1

bb1:
  store <64 x float> %a, ptr %out
  ret void
}

define void @subvector_v8f64(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x double>, ptr %in
  br label %bb1

bb1:
  store <8 x double> %a, ptr %out
  ret void
}

define void @subvector_v16f64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x double>, ptr %in
  br label %bb1

bb1:
  store <16 x double> %a, ptr %out
  ret void
}

define void @subvector_v32f64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x double>, ptr %in
  br label %bb1

bb1:
  store <32 x double> %a, ptr %out
  ret void
}

define <8 x i1> @no_warn_dropped_scalable(ptr %in) #0 {
; CHECK-LABEL: no_warn_dropped_scalable:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    cmpgt p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  br label %bb1

bb1:
  %cond = icmp sgt <8 x i32> %a, zeroinitializer
  ret <8 x i1> %cond
}

; binop(insert_subvec(a), insert_subvec(b)) -> insert_subvec(binop(a,b)) like
; combines remove redundant subvector operations. This test ensures such a
; combine is not performed when the input idiom is the result of operation
; legalisation. When not prevented, the test triggers an infinite
; combine->legalise->combine->... loop.
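;
; A sketch of the combine being suppressed (illustrative only; the base vector
; is shown as undef and the node names are informal, not a DAG dump), using the
; "or" from the test below:
;   or (insert_subvector undef, %a, 0), (insert_subvector undef, %b, 0)
;     --> insert_subvector undef, (or %a, %b), 0
; If the insert_subvectors only exist because legalisation wrapped fixed-length
; values in scalable containers, redoing the combine undoes that legalisation
; and the two steps can repeat forever.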
define void @no_subvector_binop_hang(ptr %in, ptr %out, i1 %cond) #0 {
; CHECK-LABEL: no_subvector_binop_hang:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tbz w2, #0, .LBB23_2
; CHECK-NEXT:  // %bb.1: // %bb.1
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:  .LBB23_2: // %bb.2
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = load <8 x i32>, ptr %out
  br i1 %cond, label %bb.1, label %bb.2

bb.1:
  %or = or <8 x i32> %a, %b
  store <8 x i32> %or, ptr %out
  br label %bb.2

bb.2:
  ret void
}

attributes #0 = { "target-features"="+sve" }