; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

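; The tests below check that a multiply of sign-extended operands, an
; arithmetic shift right by (element bits - 1) and a clamp to the signed
; maximum via the llvm.smin intrinsic are together recognised as a saturating
; doubling multiply-high and lowered to a single MVE vqdmulh.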
define arm_aapcs_vfpcc i32 @vqdmulh_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %l9)
  ret i32 %l10
}

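; The truncating form needs no reduction and should fold to a bare vqdmulh.s8.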
define arm_aapcs_vfpcc <16 x i8> @vqdmulh_v16i8_b(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v16i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <16 x i32> %l9 to <16 x i8>
  ret <16 x i8> %l10
}

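; A narrower-than-legal <8 x i8> should still form vqdmulh.s8, with vmovlb
; re-sign-extending the result lanes.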
define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v8i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i8> %s0 to <8 x i32>
  %l5 = sext <8 x i8> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <8 x i32> %l9 to <8 x i8>
  ret <8 x i8> %l10
}

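; The <4 x i8> case needs two vmovlb steps to re-extend the result through
; i16 to i32 lanes.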
define arm_aapcs_vfpcc <4 x i8> @vqdmulh_v4i8_b(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v4i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i8> %s0 to <4 x i32>
  %l5 = sext <4 x i8> %s1 to <4 x i32>
  %l6 = mul nsw <4 x i32> %l5, %l2
  %l7 = ashr <4 x i32> %l6, <i32 7, i32 7, i32 7, i32 7>
  %l9 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l7, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <4 x i32> %l9 to <4 x i8>
  ret <4 x i8> %l10
}

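; A double-width <32 x i8> input should simply be split into two vqdmulh.s8.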
define arm_aapcs_vfpcc <32 x i8> @vqdmulh_v32i8_b(<32 x i8> %s0, <32 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v32i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s8 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <32 x i8> %s0 to <32 x i32>
  %l5 = sext <32 x i8> %s1 to <32 x i32>
  %l6 = mul nsw <32 x i32> %l5, %l2
  %l7 = ashr <32 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <32 x i32> @llvm.smin.v32i32(<32 x i32> %l7, <32 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <32 x i32> %l9 to <32 x i8>
  ret <32 x i8> %l10
}

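; The 16-bit pattern shifts by 15 and clamps at 32767; here it is combined
; with an add reduction.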
define arm_aapcs_vfpcc i32 @vqdmulh_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l9)
  ret i32 %l10
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_b(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

define arm_aapcs_vfpcc <4 x i16> @vqdmulh_v4i16_b(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v4i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i16> %s0 to <4 x i32>
  %l5 = sext <4 x i16> %s1 to <4 x i32>
  %l6 = mul nsw <4 x i32> %l5, %l2
  %l7 = ashr <4 x i32> %l6, <i32 15, i32 15, i32 15, i32 15>
  %l9 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l7, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <4 x i32> %l9 to <4 x i16>
  ret <4 x i16> %l10
}

define arm_aapcs_vfpcc <16 x i16> @vqdmulh_v16i16_b(<16 x i16> %s0, <16 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v16i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s16 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i16> %s0 to <16 x i32>
  %l5 = sext <16 x i16> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <16 x i32> %l9 to <16 x i16>
  ret <16 x i16> %l10
}

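; Negative test: with an i22 intermediate type instead of i32 the pattern is
; not recognised, and the code stays fully expanded.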
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_c(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_c:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    vmov.u16 r2, q0[0]
; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.u16 r1, q1[4]
; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.u16 r1, q1[5]
; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmullb.s16 q2, q3, q2
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    vshl.i32 q2, q2, #10
; CHECK-NEXT:    vshr.s32 q2, q2, #10
; CHECK-NEXT:    vshr.s32 q2, q2, #15
; CHECK-NEXT:    vstrh.32 q2, [r0, #8]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
; CHECK-NEXT:    vmov.u16 r1, q1[2]
; CHECK-NEXT:    vmov.u16 r2, q1[0]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
; CHECK-NEXT:    vmov.u16 r1, q1[3]
; CHECK-NEXT:    vmov.u16 r2, q1[1]
; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
; CHECK-NEXT:    vmullb.s16 q0, q0, q2
; CHECK-NEXT:    vshl.i32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #15
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i22>
  %l5 = sext <8 x i16> %s1 to <8 x i22>
  %l6 = mul nsw <8 x i22> %l5, %l2
  %l7 = ashr <8 x i22> %l6, <i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15>
  %l9 = call <8 x i22> @llvm.smin.v8i22(<8 x i22> %l7, <8 x i22> <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>)
  %l10 = trunc <8 x i22> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

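; The deinterleaving and reinterleaving shuffles cancel out, so a single
; vqdmulh.s16 should remain.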
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_interleaved:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <8 x i16> %s0, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  %1 = sext <8 x i16> %0 to <8 x i32>
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %2 = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  %3 = sext <8 x i16> %2 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %3, %1
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  %4 = shufflevector <8 x i16> %l10, <8 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %4
}

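; Each deinterleaved half becomes a vqdmulh.s16, recombined with vrev32 and
; vmovnt.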
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved2(<4 x i32> %s0a, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_interleaved2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vqdmulh.s16 q2, q1, q0
; CHECK-NEXT:    vrev32.16 q1, q1
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vmovnt.i32 q2, q0
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    bx lr
  %s0 = trunc <4 x i32> %s0a to <4 x i16>
  %strided.vec = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %strided.vec44 = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %l7 = sext <4 x i16> %strided.vec to <4 x i32>
  %l8 = sext <4 x i16> %s0 to <4 x i32>
  %l9 = mul nsw <4 x i32> %l7, %l8
  %l10 = ashr <4 x i32> %l9, <i32 15, i32 15, i32 15, i32 15>
  %l12 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l10, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %l13 = trunc <4 x i32> %l12 to <4 x i16>
  %l14 = sext <4 x i16> %strided.vec44 to <4 x i32>
  %l15 = mul nsw <4 x i32> %l14, %l8
  %l16 = ashr <4 x i32> %l15, <i32 15, i32 15, i32 15, i32 15>
  %l18 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %l19 = trunc <4 x i32> %l18 to <4 x i16>
  %interleaved.vec = shufflevector <4 x i16> %l13, <4 x i16> %l19, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %interleaved.vec
}

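; The 32-bit pattern uses i64 intermediates, shifting by 31 and clamping at
; 2147483647.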
define arm_aapcs_vfpcc i64 @vqdmulh_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %l9)
  ret i64 %l10
}

define arm_aapcs_vfpcc <4 x i32> @vqdmulh_v4i32_b(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v4i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = trunc <4 x i64> %l9 to <4 x i32>
  ret <4 x i32> %l10
}

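; <2 x i32> still forms vqdmulh.s32, but the results are sign-extended back
; into 64-bit lanes through GPRs.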
define arm_aapcs_vfpcc <2 x i32> @vqdmulh_v2i32_b(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v2i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <2 x i32> %s0 to <2 x i64>
  %l5 = sext <2 x i32> %s1 to <2 x i64>
  %l6 = mul nsw <2 x i64> %l5, %l2
  %l7 = ashr <2 x i64> %l6, <i64 31, i64 31>
  %l9 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %l7, <2 x i64> <i64 2147483647, i64 2147483647>)
  %l10 = trunc <2 x i64> %l9 to <2 x i32>
  ret <2 x i32> %l10
}

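; <8 x i32> should be split into two vqdmulh.s32.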
define arm_aapcs_vfpcc <8 x i32> @vqdmulh_v8i32_b(<8 x i32> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v8i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s32 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i32> %s0 to <8 x i64>
  %l5 = sext <8 x i32> %s1 to <8 x i64>
  %l6 = mul nsw <8 x i64> %l5, %l2
  %l7 = ashr <8 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
  %l9 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %l7, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = trunc <8 x i64> %l9 to <8 x i32>
  ret <8 x i32> %l10
}

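; For <16 x i32> the second operand arrives on the stack, so each quarter is
; loaded before its vqdmulh.s32.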
define arm_aapcs_vfpcc <16 x i32> @vqdmulh_v16i32_b(<16 x i32> %s0, <16 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v16i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #32
; CHECK-NEXT:    vqdmulh.s32 q0, q4, q0
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #48
; CHECK-NEXT:    vqdmulh.s32 q1, q4, q1
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #64
; CHECK-NEXT:    vqdmulh.s32 q2, q4, q2
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vqdmulh.s32 q3, q4, q3
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i32> %s0 to <16 x i64>
  %l5 = sext <16 x i32> %s1 to <16 x i64>
  %l6 = mul nsw <16 x i64> %l5, %l2
  %l7 = ashr <16 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
  %l9 = call <16 x i64> @llvm.smin.v16i64(<16 x i64> %l7, <16 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = trunc <16 x i64> %l9 to <16 x i32>
  ret <16 x i32> %l10
}

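; The loop tests below check the same patterns in vectorized-loop form. The
; unused icmp in each body (presumably left over from an icmp+select form of
; the clamp) should not block the match: each loop should reduce to two loads,
; a vqdmulh and a store under a low-overhead loop.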
define void @vqdmulh_loop_i8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) local_unnamed_addr {
; CHECK-LABEL: vqdmulh_loop_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %x, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 1
  %1 = sext <16 x i8> %wide.load to <16 x i32>
  %2 = getelementptr inbounds i8, ptr %y, i32 %index
  %wide.load26 = load <16 x i8>, ptr %2, align 1
  %3 = sext <16 x i8> %wide.load26 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = ashr <16 x i32> %4, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %6 = icmp slt <16 x i32> %5, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %7 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %5, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %8 = trunc <16 x i32> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, ptr %z, i32 %index
  store <16 x i8> %8, ptr %9, align 1
  %index.next = add i32 %index, 16
  %10 = icmp eq i32 %index.next, 1024
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB18_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB18_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  %1 = sext <8 x i16> %wide.load to <8 x i32>
  %2 = getelementptr inbounds i16, ptr %y, i32 %index
  %wide.load30 = load <8 x i16>, ptr %2, align 2
  %3 = sext <8 x i16> %wide.load30 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = ashr <8 x i32> %4, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %6 = icmp slt <8 x i32> %5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %7 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %5, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, ptr %z, i32 %index
  store <8 x i16> %8, ptr %9, align 2
  %index.next = add i32 %index, 8
  %10 = icmp eq i32 %index.next, 1024
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB19_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB19_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = sext <4 x i32> %wide.load to <4 x i64>
  %2 = getelementptr inbounds i32, ptr %y, i32 %index
  %wide.load30 = load <4 x i32>, ptr %2, align 4
  %3 = sext <4 x i32> %wide.load30 to <4 x i64>
  %4 = mul nsw <4 x i64> %3, %1
  %5 = ashr <4 x i64> %4, <i64 31, i64 31, i64 31, i64 31>
  %6 = icmp slt <4 x i64> %5, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %7 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %5, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, ptr %z, i32 %index
  store <4 x i32> %8, ptr %9, align 4
  %index.next = add i32 %index, 4
  %10 = icmp eq i32 %index.next, 1024
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

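; Scalar case: there is no scalar vqdmulh, so this should stay a multiply,
; shift and compare sequence.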
define i32 @scalar(i16 %a) {
; CHECK-LABEL: scalar:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    smulbb r1, r0, r0
; CHECK-NEXT:    movs r0, #127
; CHECK-NEXT:    lsrs r2, r1, #7
; CHECK-NEXT:    cmp r2, #127
; CHECK-NEXT:    it lt
; CHECK-NEXT:    lsrlt r0, r1, #7
; CHECK-NEXT:    bx lr
  %e = sext i16 %a to i32
  %d = mul nsw i32 %e, %e
  %b = ashr i32 %d, 7
  %c = call i32 @llvm.smin.i32(i32 %b, i32 127)
  ret i32 %c
}

declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.smin.i32(i32, i32)
declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.smin.v16i64(<16 x i64>, <16 x i64>)
declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.smin.v32i32(<32 x i32>, <32 x i32>)
declare <8 x i22> @llvm.smin.v8i22(<8 x i22>, <8 x i22>)