; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

; This test only tests the legal types for a given vector width, as mulh nodes
; do not get generated for non-legal types.

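; A comment-only sketch (not part of the checked output) of the generic pattern
; that is matched to a mulh node, shown here for one legal type as an example:
; the operands are extended to double width, multiplied, shifted right by the
; original element size, and truncated back, i.e.
;   %1   = sext <16 x i8> %a to <16 x i16>
;   %2   = sext <16 x i8> %b to <16 x i16>
;   %mul = mul <16 x i16> %1, %2
;   %shr = lshr <16 x i16> %mul, <i16 8, ..., i16 8>   ; splat of 8
;   %res = trunc <16 x i16> %shr to <16 x i8>
; (The UMULH cases below use zext instead of sext.)
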
target triple = "aarch64-unknown-linux-gnu"

;
; SMULH
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %insert = insertelement <8 x i16> undef, i16 8, i64 0
  %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = sext <8 x i8> %op1 to <8 x i16>
  %2 = sext <8 x i8> %op2 to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <16 x i8> %op1 to <16 x i16>
  %2 = sext <16 x i8> %op2 to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %res
}

define void @smulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %1 = sext <32 x i8> %op1 to <32 x i16>
  %2 = sext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <32 x i16> %shr to <32 x i8>
  store <32 x i8> %res, ptr %a
  ret void
}

define void @smulh_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smulh z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smulh_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %insert = insertelement <64 x i16> undef, i16 8, i64 0
  %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
  %1 = sext <64 x i8> %op1 to <64 x i16>
  %2 = sext <64 x i8> %op2 to <64 x i16>
  %mul = mul <64 x i16> %1, %2
  %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <64 x i16> %shr to <64 x i8>
  store <64 x i8> %res, ptr %a
  ret void
}

define void @smulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %1 = sext <128 x i8> %op1 to <128 x i16>
  %2 = sext <128 x i8> %op2 to <128 x i16>
  %mul = mul <128 x i16> %1, %2
  %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <128 x i16> %shr to <128 x i8>
  store <128 x i8> %res, ptr %a
  ret void
}

define void @smulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %1 = sext <256 x i8> %op1 to <256 x i16>
  %2 = sext <256 x i8> %op2 to <256 x i16>
  %mul = mul <256 x i16> %1, %2
  %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <256 x i16> %shr to <256 x i8>
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <4 x i16> %op1 to <4 x i32>
  %2 = sext <4 x i16> %op2 to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
  %res = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <8 x i16> %op1 to <8 x i32>
  %2 = sext <8 x i16> %op2 to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %res
}

define void @smulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %1 = sext <16 x i16> %op1 to <16 x i32>
  %2 = sext <16 x i16> %op2 to <16 x i32>
  %mul = mul <16 x i32> %1, %2
  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <16 x i32> %shr to <16 x i16>
  store <16 x i16> %res, ptr %a
  ret void
}

define void @smulh_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smulh z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smulh_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %1 = sext <32 x i16> %op1 to <32 x i32>
  %2 = sext <32 x i16> %op2 to <32 x i32>
  %mul = mul <32 x i32> %1, %2
  %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <32 x i32> %shr to <32 x i16>
  store <32 x i16> %res, ptr %a
  ret void
}

define void @smulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %1 = sext <64 x i16> %op1 to <64 x i32>
  %2 = sext <64 x i16> %op2 to <64 x i32>
  %mul = mul <64 x i32> %1, %2
  %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <64 x i32> %shr to <64 x i16>
  store <64 x i16> %res, ptr %a
  ret void
}

define void @smulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %1 = sext <128 x i16> %op1 to <128 x i32>
  %2 = sext <128 x i16> %op2 to <128 x i32>
  %mul = mul <128 x i32> %1, %2
  %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <128 x i32> %shr to <128 x i16>
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <2 x i32> %op1 to <2 x i64>
  %2 = sext <2 x i32> %op2 to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
  %res = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <4 x i32> %op1 to <4 x i64>
  %2 = sext <4 x i32> %op2 to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %res
}

define void @smulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %1 = sext <8 x i32> %op1 to <8 x i64>
  %2 = sext <8 x i32> %op2 to <8 x i64>
  %mul = mul <8 x i64> %1, %2
  %shr = lshr <8 x i64> %mul,  <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <8 x i64> %shr to <8 x i32>
  store <8 x i32> %res, ptr %a
  ret void
}

define void @smulh_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smulh z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smulh_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %1 = sext <16 x i32> %op1 to <16 x i64>
  %2 = sext <16 x i32> %op2 to <16 x i64>
  %mul = mul <16 x i64> %1, %2
  %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <16 x i64> %shr to <16 x i32>
  store <16 x i32> %res, ptr %a
  ret void
}

define void @smulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %1 = sext <32 x i32> %op1 to <32 x i64>
  %2 = sext <32 x i32> %op2 to <32 x i64>
  %mul = mul <32 x i64> %1, %2
  %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <32 x i64> %shr to <32 x i32>
  store <32 x i32> %res, ptr %a
  ret void
}

define void @smulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %1 = sext <64 x i32> %op1 to <64 x i64>
  %2 = sext <64 x i32> %op2 to <64 x i64>
  %mul = mul <64 x i64> %1, %2
  %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <64 x i64> %shr to <64 x i32>
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %insert = insertelement <1 x i128> undef, i128 64, i128 0
  %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
  %1 = sext <1 x i64> %op1 to <1 x i128>
  %2 = sext <1 x i64> %op2 to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, %splat
  %res = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %res
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <2 x i64> %op1 to <2 x i128>
  %2 = sext <2 x i64> %op2 to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
  %res = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %res
}

define void @smulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %1 = sext <4 x i64> %op1 to <4 x i128>
  %2 = sext <4 x i64> %op2 to <4 x i128>
  %mul = mul <4 x i128> %1, %2
  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
  %res = trunc <4 x i128> %shr to <4 x i64>
  store <4 x i64> %res, ptr %a
  ret void
}

define void @smulh_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smulh z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smulh_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %1 = sext <8 x i64> %op1 to <8 x i128>
  %2 = sext <8 x i64> %op2 to <8 x i128>
  %mul = mul <8 x i128> %1, %2
  %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <8 x i128> %shr to <8 x i64>
  store <8 x i64> %res, ptr %a
  ret void
}

define void @smulh_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %1 = sext <16 x i64> %op1 to <16 x i128>
  %2 = sext <16 x i64> %op2 to <16 x i128>
  %mul = mul <16 x i128> %1, %2
  %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <16 x i128> %shr to <16 x i64>
  store <16 x i64> %res, ptr %a
  ret void
}

define void @smulh_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %1 = sext <32 x i64> %op1 to <32 x i128>
  %2 = sext <32 x i64> %op2 to <32 x i128>
  %mul = mul <32 x i128> %1, %2
  %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <32 x i128> %shr to <32 x i64>
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UMULH
;

; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <8 x i8> %op1 to <8 x i16>
  %2 = zext <8 x i8> %op2 to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <16 x i8> %op1 to <16 x i16>
  %2 = zext <16 x i8> %op2 to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %res
}

define void @umulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %1 = zext <32 x i8> %op1 to <32 x i16>
  %2 = zext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <32 x i16> %shr to <32 x i8>
  store <32 x i8> %res, ptr %a
  ret void
}

define void @umulh_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umulh z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umulh_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %1 = zext <64 x i8> %op1 to <64 x i16>
  %2 = zext <64 x i8> %op2 to <64 x i16>
  %mul = mul <64 x i16> %1, %2
  %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <64 x i16> %shr to <64 x i8>
  store <64 x i8> %res, ptr %a
  ret void
}

define void @umulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %insert = insertelement <128 x i16> undef, i16 8, i64 0
  %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
  %1 = zext <128 x i8> %op1 to <128 x i16>
  %2 = zext <128 x i8> %op2 to <128 x i16>
  %mul = mul <128 x i16> %1, %2
  %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <128 x i16> %shr to <128 x i8>
  store <128 x i8> %res, ptr %a
  ret void
}

define void @umulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %1 = zext <256 x i8> %op1 to <256 x i16>
  %2 = zext <256 x i8> %op2 to <256 x i16>
  %mul = mul <256 x i16> %1, %2
  %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <256 x i16> %shr to <256 x i8>
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <4 x i16> %op1 to <4 x i32>
  %2 = zext <4 x i16> %op2 to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
  %res = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <8 x i16> %op1 to <8 x i32>
  %2 = zext <8 x i16> %op2 to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %res
}

define void @umulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %1 = zext <16 x i16> %op1 to <16 x i32>
  %2 = zext <16 x i16> %op2 to <16 x i32>
  %mul = mul <16 x i32> %1, %2
  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <16 x i32> %shr to <16 x i16>
  store <16 x i16> %res, ptr %a
  ret void
}

define void @umulh_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umulh z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umulh_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %1 = zext <32 x i16> %op1 to <32 x i32>
  %2 = zext <32 x i16> %op2 to <32 x i32>
  %mul = mul <32 x i32> %1, %2
  %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <32 x i32> %shr to <32 x i16>
  store <32 x i16> %res, ptr %a
  ret void
}

define void @umulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %1 = zext <64 x i16> %op1 to <64 x i32>
  %2 = zext <64 x i16> %op2 to <64 x i32>
  %mul = mul <64 x i32> %1, %2
  %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <64 x i32> %shr to <64 x i16>
  store <64 x i16> %res, ptr %a
  ret void
}

define void @umulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %1 = zext <128 x i16> %op1 to <128 x i32>
  %2 = zext <128 x i16> %op2 to <128 x i32>
  %mul = mul <128 x i32> %1, %2
  %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <128 x i32> %shr to <128 x i16>
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <2 x i32> %op1 to <2 x i64>
  %2 = zext <2 x i32> %op2 to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
  %res = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <4 x i32> %op1 to <4 x i64>
  %2 = zext <4 x i32> %op2 to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %res
}

define void @umulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %insert = insertelement <8 x i64> undef, i64 32, i64 0
  %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
  %1 = zext <8 x i32> %op1 to <8 x i64>
  %2 = zext <8 x i32> %op2 to <8 x i64>
  %mul = mul <8 x i64> %1, %2
  %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <8 x i64> %shr to <8 x i32>
  store <8 x i32> %res, ptr %a
  ret void
}

define void @umulh_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umulh z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umulh_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %1 = zext <16 x i32> %op1 to <16 x i64>
  %2 = zext <16 x i32> %op2 to <16 x i64>
  %mul = mul <16 x i64> %1, %2
  %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <16 x i64> %shr to <16 x i32>
  store <16 x i32> %res, ptr %a
  ret void
}

define void @umulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %1 = zext <32 x i32> %op1 to <32 x i64>
  %2 = zext <32 x i32> %op2 to <32 x i64>
  %mul = mul <32 x i64> %1, %2
  %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <32 x i64> %shr to <32 x i32>
  store <32 x i32> %res, ptr %a
  ret void
}

define void @umulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %1 = zext <64 x i32> %op1 to <64 x i64>
  %2 = zext <64 x i32> %op2 to <64 x i64>
  %mul = mul <64 x i64> %1, %2
  %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <64 x i64> %shr to <64 x i32>
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <1 x i64> %op1 to <1 x i128>
  %2 = zext <1 x i64> %op2 to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, <i128 64>
  %res = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %res
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <2 x i64> %op1 to <2 x i128>
  %2 = zext <2 x i64> %op2 to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
  %res = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %res
}

define void @umulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %1 = zext <4 x i64> %op1 to <4 x i128>
  %2 = zext <4 x i64> %op2 to <4 x i128>
  %mul = mul <4 x i128> %1, %2
  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
  %res = trunc <4 x i128> %shr to <4 x i64>
  store <4 x i64> %res, ptr %a
  ret void
}

define void @umulh_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umulh z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umulh_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %1 = zext <8 x i64> %op1 to <8 x i128>
  %2 = zext <8 x i64> %op2 to <8 x i128>
  %mul = mul <8 x i128> %1, %2
  %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <8 x i128> %shr to <8 x i64>
  store <8 x i64> %res, ptr %a
  ret void
}

define void @umulh_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %1 = zext <16 x i64> %op1 to <16 x i128>
  %2 = zext <16 x i64> %op2 to <16 x i128>
  %mul = mul <16 x i128> %1, %2
  %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <16 x i128> %shr to <16 x i64>
  store <16 x i64> %res, ptr %a
  ret void
}

define void @umulh_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %1 = zext <32 x i64> %op1 to <32 x i128>
  %2 = zext <32 x i64> %op2 to <32 x i128>
  %mul = mul <32 x i128> %1, %2
  %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <32 x i128> %shr to <32 x i64>
  store <32 x i64> %res, ptr %a
  ret void
}
attributes #0 = { "target-features"="+sve" }
