; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s
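; This file checks that the SME2 multi-vector rounding shift-left intrinsics
; (llvm.aarch64.sve.{s,u}rshl.{single.,}x{2,4}.*) are lowered to the SRSHL and
; URSHL (single- and multi-register) instructions in streaming mode.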

; SRSHL (Single, x2)

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_single_x2_s8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    srshl { z4.b, z5.b }, { z4.b, z5.b }, z3.b
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.srshl.single.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_single_x2_s16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    srshl { z4.h, z5.h }, { z4.h, z5.h }, z3.h
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.srshl.single.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_single_x2_s32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x2_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    srshl { z4.s, z5.s }, { z4.s, z5.s }, z3.s
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.srshl.single.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_single_x2_s64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x2_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    srshl { z4.d, z5.d }, { z4.d, z5.d }, z3.d
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.srshl.single.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

; SRSHL (Single, x4)

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_single_x4_s8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    srshl { z24.b - z27.b }, { z24.b - z27.b }, z5.b
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
              @llvm.aarch64.sve.srshl.single.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_single_x4_s16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    srshl { z24.h - z27.h }, { z24.h - z27.h }, z5.h
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
              @llvm.aarch64.sve.srshl.single.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_single_x4_s32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    srshl { z24.s - z27.s }, { z24.s - z27.s }, z5.s
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
              @llvm.aarch64.sve.srshl.single.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_single_x4_s64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    srshl { z24.d - z27.d }, { z24.d - z27.d }, z5.d
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
              @llvm.aarch64.sve.srshl.single.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

; URSHL (Single, x2)

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_single_x2_u8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x2_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    urshl { z4.b, z5.b }, { z4.b, z5.b }, z3.b
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.urshl.single.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_single_x2_u16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    urshl { z4.h, z5.h }, { z4.h, z5.h }, z3.h
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.urshl.single.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_single_x2_u32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x2_u32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    urshl { z4.s, z5.s }, { z4.s, z5.s }, z3.s
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.urshl.single.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_single_x2_u64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x2_u64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    urshl { z4.d, z5.d }, { z4.d, z5.d }, z3.d
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.urshl.single.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

; URSHL (Single, x4)

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_single_x4_u8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    urshl { z24.b - z27.b }, { z24.b - z27.b }, z5.b
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
              @llvm.aarch64.sve.urshl.single.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_single_x4_u16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    urshl { z24.h - z27.h }, { z24.h - z27.h }, z5.h
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
              @llvm.aarch64.sve.urshl.single.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_single_x4_u32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    urshl { z24.s - z27.s }, { z24.s - z27.s }, z5.s
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
              @llvm.aarch64.sve.urshl.single.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_single_x4_u64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    urshl { z24.d - z27.d }, { z24.d - z27.d }, z5.d
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
              @llvm.aarch64.sve.urshl.single.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

; SRSHL (Multi, x2)

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_x2_s8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) {
; CHECK-LABEL: multi_vec_rounding_shl_x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    srshl { z4.b, z5.b }, { z4.b, z5.b }, { z6.b, z7.b }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.srshl.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_x2_s16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) {
; CHECK-LABEL: multi_vec_rounding_shl_x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    srshl { z4.h, z5.h }, { z4.h, z5.h }, { z6.h, z7.h }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.srshl.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_x2_s32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) {
; CHECK-LABEL: multi_vec_rounding_shl_x2_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    srshl { z4.s, z5.s }, { z4.s, z5.s }, { z6.s, z7.s }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.srshl.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_x2_s64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) {
; CHECK-LABEL: multi_vec_rounding_shl_x2_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    srshl { z4.d, z5.d }, { z4.d, z5.d }, { z6.d, z7.d }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.srshl.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

; SRSHL (Multi, x4)
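; In the x4 "multi" tests below (and the URSHL x4 tests further down), the
; ninth scalable-vector argument (%zm4) does not fit in the z0-z7 argument
; registers, so it is expected to be passed indirectly and reloaded via
; ptrue + ld1 from [x0] before the shift.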

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_x4_s8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    srshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
              @llvm.aarch64.sve.srshl.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
                                                 <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_x4_s16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    srshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
              @llvm.aarch64.sve.srshl.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
                                                 <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_x4_s32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    srshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
              @llvm.aarch64.sve.srshl.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
                                                 <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_x4_s64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    srshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
              @llvm.aarch64.sve.srshl.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
                                                 <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

; URSHL (Multi, x2)

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_x2_u8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) {
; CHECK-LABEL: multi_vec_rounding_shl_x2_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    urshl { z4.b, z5.b }, { z4.b, z5.b }, { z6.b, z7.b }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.urshl.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_x2_u16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2) {
; CHECK-LABEL: multi_vec_rounding_shl_x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    urshl { z4.h, z5.h }, { z4.h, z5.h }, { z6.h, z7.h }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.urshl.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_x2_u32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) {
; CHECK-LABEL: multi_vec_rounding_shl_x2_u32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    urshl { z4.s, z5.s }, { z4.s, z5.s }, { z6.s, z7.s }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.urshl.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_x2_u64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) {
; CHECK-LABEL: multi_vec_rounding_shl_x2_u64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    urshl { z4.d, z5.d }, { z4.d, z5.d }, { z6.d, z7.d }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.urshl.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

; URSHL (Multi, x4)

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_x4_u8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_u8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    urshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
              @llvm.aarch64.sve.urshl.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
                                                 <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_x4_u16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    urshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
              @llvm.aarch64.sve.urshl.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
                                                 <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_x4_u32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_u32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    urshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
              @llvm.aarch64.sve.urshl.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
                                                 <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_x4_u64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_u64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    urshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
              @llvm.aarch64.sve.urshl.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
                                                 <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.srshl.single.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.srshl.single.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.srshl.single.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.srshl.single.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)

declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.srshl.single.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.srshl.single.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.srshl.single.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.srshl.single.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)

declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.urshl.single.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.urshl.single.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.urshl.single.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.urshl.single.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)

declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.urshl.single.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.urshl.single.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.urshl.single.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.urshl.single.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)

declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.srshl.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.srshl.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.srshl.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.srshl.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)

declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
 @llvm.aarch64.sve.srshl.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
 @llvm.aarch64.sve.srshl.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
 @llvm.aarch64.sve.srshl.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
 @llvm.aarch64.sve.srshl.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)

declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.urshl.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.urshl.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.urshl.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.urshl.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)

declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
 @llvm.aarch64.sve.urshl.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
 @llvm.aarch64.sve.urshl.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
 @llvm.aarch64.sve.urshl.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
 @llvm.aarch64.sve.urshl.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)