xref: /llvm-project/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll (revision 61510b51c33464a6bc15e4cf5b1ee07e2e0ec1c9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4
; matrix_mul_unsigned: multiply zero-extended i16 data by a splat of zext(%val).
; Each iteration of %vector.body loads two <4 x i16> vectors from %A,
; zero-extends them to <4 x i32>, multiplies each by the splat and stores the
; two products to %C.
; The CHECK-SD lines expect each zext+mul pair to be emitted as umull; the
; CHECK-GI lines (-global-isel RUN line) expect ushll followed by mul.
5define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
6; CHECK-SD-LABEL: matrix_mul_unsigned:
7; CHECK-SD:       // %bb.0: // %vector.header
8; CHECK-SD-NEXT:    dup v0.4h, w3
9; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
10; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
11; CHECK-SD-NEXT:  .LBB0_1: // %vector.body
12; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
13; CHECK-SD-NEXT:    add x9, x2, w0, uxtw #1
14; CHECK-SD-NEXT:    subs x8, x8, #8
15; CHECK-SD-NEXT:    ldp d1, d2, [x9]
16; CHECK-SD-NEXT:    add x9, x1, w0, uxtw #2
17; CHECK-SD-NEXT:    add w0, w0, #8
18; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
19; CHECK-SD-NEXT:    umull v2.4s, v0.4h, v2.4h
20; CHECK-SD-NEXT:    stp q1, q2, [x9]
21; CHECK-SD-NEXT:    b.ne .LBB0_1
22; CHECK-SD-NEXT:  // %bb.2: // %for.end12
23; CHECK-SD-NEXT:    ret
24;
25; CHECK-GI-LABEL: matrix_mul_unsigned:
26; CHECK-GI:       // %bb.0: // %vector.header
27; CHECK-GI-NEXT:    and w8, w3, #0xffff
28; CHECK-GI-NEXT:    dup v0.4s, w8
29; CHECK-GI-NEXT:    mov w8, w0
30; CHECK-GI-NEXT:    and x8, x8, #0xfffffff8
31; CHECK-GI-NEXT:  .LBB0_1: // %vector.body
32; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
33; CHECK-GI-NEXT:    add x9, x2, w0, uxtw #1
34; CHECK-GI-NEXT:    subs x8, x8, #8
35; CHECK-GI-NEXT:    ldp d1, d2, [x9]
36; CHECK-GI-NEXT:    add x9, x1, w0, uxtw #2
37; CHECK-GI-NEXT:    add w0, w0, #8
38; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
39; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
40; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
41; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
42; CHECK-GI-NEXT:    stp q1, q2, [x9]
43; CHECK-GI-NEXT:    b.ne .LBB0_1
44; CHECK-GI-NEXT:  // %bb.2: // %for.end12
45; CHECK-GI-NEXT:    ret
46vector.header:
47  %conv4 = zext i16 %val to i32
48  %wide.trip.count = zext i32 %N to i64
  ; %0 feeds only %1 and %2; %min.iters.check, %1, %2 and %cmp.n have no users
  ; in this function.
49  %0 = add nsw i64 %wide.trip.count, -1
50  %min.iters.check = icmp ult i32 %N, 8
51  %1 = trunc i64 %0 to i32
52  %2 = icmp ugt i64 %0, 4294967295
  ; Loop bound: the trip count with its low three bits cleared.
53  %n.vec = and i64 %wide.trip.count, 4294967288
  ; Two identical splats of %conv4: the zeroinitializer mask selects lane 0 for
  ; every result lane.
54  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
55  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
56  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
57  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
58  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
59  br label %vector.body
60
61vector.body:                                      ; preds = %vector.header, %vector.body
62  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  ; Element offset is %N plus the low 32 bits of %index, zero-extended to i64.
63  %3 = trunc i64 %index to i32
64  %4 = add i32 %N, %3
65  %5 = zext i32 %4 to i64
66  %6 = getelementptr inbounds i16, ptr %A, i64 %5
67  %7 = bitcast ptr %6 to ptr
68  %wide.load = load <4 x i16>, ptr %7, align 2
  ; The second load starts 4 i16 elements after the first.
69  %8 = getelementptr inbounds i16, ptr %6, i64 4
70  %9 = bitcast ptr %8 to ptr
71  %wide.load30 = load <4 x i16>, ptr %9, align 2
  ; zext of the loaded data followed by mul with the splat: the pattern the
  ; CHECK lines above cover.
72  %10 = zext <4 x i16> %wide.load to <4 x i32>
73  %11 = zext <4 x i16> %wide.load30 to <4 x i32>
74  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
75  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
76  %14 = getelementptr inbounds i32, ptr %C, i64 %5
77  %15 = bitcast ptr %14 to ptr
78  store <4 x i32> %12, ptr %15, align 4
  ; The second store starts 4 i32 elements after the first.
79  %16 = getelementptr inbounds i32, ptr %14, i64 4
80  %17 = bitcast ptr %16 to ptr
81  store <4 x i32> %13, ptr %17, align 4
  ; Advance by 8 elements; leave the loop once %index.next equals %n.vec.
82  %index.next = add i64 %index, 8
83  %18 = icmp eq i64 %index.next, %n.vec
84  br i1 %18, label %for.end12, label %vector.body
85
86for.end12:                                        ; preds = %vector.body
87  ret void
88}
89
; matrix_mul_signed: multiply sign-extended i16 data by a splat of sext(%val).
; Each iteration of %vector.body loads two <4 x i16> vectors from %A,
; sign-extends them to <4 x i32>, multiplies each by the splat and stores the
; two products to %C.
; The CHECK-SD lines expect each sext+mul pair to be emitted as smull; the
; CHECK-GI lines (-global-isel RUN line) expect sshll followed by mul.
90define void @matrix_mul_signed(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
91; CHECK-SD-LABEL: matrix_mul_signed:
92; CHECK-SD:       // %bb.0: // %vector.header
93; CHECK-SD-NEXT:    dup v0.4h, w3
94; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
95; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
96; CHECK-SD-NEXT:  .LBB1_1: // %vector.body
97; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
98; CHECK-SD-NEXT:    add x9, x2, w0, sxtw #1
99; CHECK-SD-NEXT:    subs x8, x8, #8
100; CHECK-SD-NEXT:    ldp d1, d2, [x9]
101; CHECK-SD-NEXT:    add x9, x1, w0, sxtw #2
102; CHECK-SD-NEXT:    add w0, w0, #8
103; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
104; CHECK-SD-NEXT:    smull v2.4s, v0.4h, v2.4h
105; CHECK-SD-NEXT:    stp q1, q2, [x9]
106; CHECK-SD-NEXT:    b.ne .LBB1_1
107; CHECK-SD-NEXT:  // %bb.2: // %for.end12
108; CHECK-SD-NEXT:    ret
109;
110; CHECK-GI-LABEL: matrix_mul_signed:
111; CHECK-GI:       // %bb.0: // %vector.header
112; CHECK-GI-NEXT:    sxth w9, w3
113; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
114; CHECK-GI-NEXT:    sxtw x8, w0
115; CHECK-GI-NEXT:    dup v0.4s, w9
116; CHECK-GI-NEXT:    and x8, x8, #0xfffffff8
117; CHECK-GI-NEXT:  .LBB1_1: // %vector.body
118; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
119; CHECK-GI-NEXT:    add x9, x2, w0, sxtw #1
120; CHECK-GI-NEXT:    subs x8, x8, #8
121; CHECK-GI-NEXT:    ldp d1, d2, [x9]
122; CHECK-GI-NEXT:    add x9, x1, w0, sxtw #2
123; CHECK-GI-NEXT:    add w0, w0, #8
124; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
125; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
126; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
127; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
128; CHECK-GI-NEXT:    stp q1, q2, [x9]
129; CHECK-GI-NEXT:    b.ne .LBB1_1
130; CHECK-GI-NEXT:  // %bb.2: // %for.end12
131; CHECK-GI-NEXT:    ret
132vector.header:
133  %conv4 = sext i16 %val to i32
134  %wide.trip.count = sext i32 %N to i64
  ; %0 feeds only %1 and %2; %min.iters.check, %1, %2 and %cmp.n have no users
  ; in this function.
135  %0 = add nsw i64 %wide.trip.count, -1
136  %min.iters.check = icmp ult i32 %N, 8
137  %1 = trunc i64 %0 to i32
138  %2 = icmp ugt i64 %0, 4294967295
  ; Loop bound: the sign-extended trip count masked with 0xfffffff8.
139  %n.vec = and i64 %wide.trip.count, 4294967288
  ; Two identical splats of %conv4: the zeroinitializer mask selects lane 0 for
  ; every result lane.
140  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
141  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
142  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
143  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
144  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
145  br label %vector.body
146
147vector.body:                                      ; preds = %vector.header, %vector.body
148  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  ; Element offset is %N plus the low 32 bits of %index, sign-extended to i64.
149  %3 = trunc i64 %index to i32
150  %4 = add i32 %N, %3
151  %5 = sext i32 %4 to i64
152  %6 = getelementptr inbounds i16, ptr %A, i64 %5
153  %7 = bitcast ptr %6 to ptr
154  %wide.load = load <4 x i16>, ptr %7, align 2
  ; The second load starts 4 i16 elements after the first.
155  %8 = getelementptr inbounds i16, ptr %6, i64 4
156  %9 = bitcast ptr %8 to ptr
157  %wide.load30 = load <4 x i16>, ptr %9, align 2
  ; sext of the loaded data followed by mul with the splat: the pattern the
  ; CHECK lines above cover.
158  %10 = sext <4 x i16> %wide.load to <4 x i32>
159  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
160  %12 = mul nsw <4 x i32> %broadcast.splat, %10
161  %13 = mul nsw <4 x i32> %broadcast.splat32, %11
162  %14 = getelementptr inbounds i32, ptr %C, i64 %5
163  %15 = bitcast ptr %14 to ptr
164  store <4 x i32> %12, ptr %15, align 4
  ; The second store starts 4 i32 elements after the first.
165  %16 = getelementptr inbounds i32, ptr %14, i64 4
166  %17 = bitcast ptr %16 to ptr
167  store <4 x i32> %13, ptr %17, align 4
  ; Advance by 8 elements; leave the loop once %index.next equals %n.vec.
168  %index.next = add i64 %index, 8
169  %18 = icmp eq i64 %index.next, %n.vec
170  br i1 %18, label %for.end12, label %vector.body
171
172for.end12:                                        ; preds = %vector.body
173  ret void
174}
175
176
; matrix_mul_double_shuffle: multiply a splat of zext(%val) by a vector built
; from a single zero-extended i16 loaded inside the loop.
; The loaded value is inserted into lane 0 of an undef vector and then shuffled
; with mask <0, 1, 0, 1>, so the multiplicand is not a plain lane-0 splat.
; The CHECK-SD lines expect dup + umull; the CHECK-GI lines (-global-isel RUN
; line) expect a lane insert, a tbl using a constant-pool mask, and mul.
177define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
178; CHECK-SD-LABEL: matrix_mul_double_shuffle:
179; CHECK-SD:       // %bb.0: // %vector.header
180; CHECK-SD-NEXT:    dup v0.4h, w3
181; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
182; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
183; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 killed $x0 def $x0
184; CHECK-SD-NEXT:  .LBB2_1: // %vector.body
185; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
186; CHECK-SD-NEXT:    ldrh w9, [x2], #16
187; CHECK-SD-NEXT:    subs x8, x8, #8
188; CHECK-SD-NEXT:    dup v1.4h, w9
189; CHECK-SD-NEXT:    ubfiz x9, x0, #2, #32
190; CHECK-SD-NEXT:    add w0, w0, #8
191; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
192; CHECK-SD-NEXT:    str q1, [x1, x9]
193; CHECK-SD-NEXT:    b.ne .LBB2_1
194; CHECK-SD-NEXT:  // %bb.2: // %for.end12
195; CHECK-SD-NEXT:    ret
196;
197; CHECK-GI-LABEL: matrix_mul_double_shuffle:
198; CHECK-GI:       // %bb.0: // %vector.header
199; CHECK-GI-NEXT:    and w9, w3, #0xffff
200; CHECK-GI-NEXT:    adrp x8, .LCPI2_0
201; CHECK-GI-NEXT:    dup v0.4s, w9
202; CHECK-GI-NEXT:    mov w9, w0
203; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
204; CHECK-GI-NEXT:    and x8, x9, #0xfffffff8
205; CHECK-GI-NEXT:  .LBB2_1: // %vector.body
206; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
207; CHECK-GI-NEXT:    ldrh w9, [x2], #16
208; CHECK-GI-NEXT:    subs x8, x8, #8
209; CHECK-GI-NEXT:    mov v2.s[0], w9
210; CHECK-GI-NEXT:    mov w9, w0
211; CHECK-GI-NEXT:    add w0, w0, #8
212; CHECK-GI-NEXT:    lsl x9, x9, #2
213; CHECK-GI-NEXT:    tbl v2.16b, { v2.16b, v3.16b }, v1.16b
214; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
215; CHECK-GI-NEXT:    str q2, [x1, x9]
216; CHECK-GI-NEXT:    b.ne .LBB2_1
217; CHECK-GI-NEXT:  // %bb.2: // %for.end12
218; CHECK-GI-NEXT:    ret
219vector.header:
220  %conv4 = zext i16 %val to i32
221  %wide.trip.count = zext i32 %N to i64
  ; %0 feeds only %1 and %2; %min.iters.check, %1, %2 and %cmp.n have no users
  ; in this function.
222  %0 = add nsw i64 %wide.trip.count, -1
223  %min.iters.check = icmp ult i32 %N, 8
224  %1 = trunc i64 %0 to i32
225  %2 = icmp ugt i64 %0, 4294967295
  ; Loop bound: the trip count with its low three bits cleared.
226  %n.vec = and i64 %wide.trip.count, 4294967288
  ; Loop-invariant splat of %conv4 (mask zeroinitializer selects lane 0 for
  ; every result lane).
227  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
228  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
229  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
230  br label %vector.body
231
232vector.body:                                      ; preds = %vector.header, %vector.body
233  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  ; The load is indexed by %index directly; the store below uses %N + %index.
234  %g = getelementptr inbounds i16, ptr %A, i64 %index
235  %val1 = load i16, ptr %g
236  %splat.input.ext = zext i16 %val1 to i32
237  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0
  ; Mask indices 0 and 1 both refer to the first operand, so the second operand
  ; (%broadcast.splat) contributes no lanes; lane 1 of the first operand is
  ; still undef because only lane 0 was inserted.
238  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
239  %3 = trunc i64 %index to i32
240  %4 = add i32 %N, %3
241  %5 = zext i32 %4 to i64
242  %6 = mul nuw nsw <4 x i32> %broadcast.splat, %broadcast.splat32
243  %7 = getelementptr inbounds i32, ptr %C, i64 %5
244  %8 = bitcast ptr %7 to ptr
245  store <4 x i32> %6, ptr %8, align 4
  ; Advance by 8 elements; leave the loop once %index.next equals %n.vec.
246  %index.next = add i64 %index, 8
247  %9 = icmp eq i64 %index.next, %n.vec
248  br i1 %9, label %for.end12, label %vector.body
249
250for.end12:                                        ; preds = %vector.body
251  ret void
252}
253
254
; larger_smull: s[i] = sext(x[i]) * sext(y) for i in [0, n), written as a
; vector loop with a scalar remainder loop.
; The vector loop handles 16 elements per iteration as two <8 x i16> loads that
; are sign-extended to <8 x i32> and multiplied by a splat of sext(%y).
; The CHECK-SD lines expect smull/smull2 in the vector loop; the CHECK-GI lines
; (-global-isel RUN line) expect sshll/sshll2 followed by mul.
255define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr noalias nocapture noundef writeonly %s, i32 noundef %n) {
256; CHECK-SD-LABEL: larger_smull:
257; CHECK-SD:       // %bb.0: // %entry
258; CHECK-SD-NEXT:    cmp w3, #1
259; CHECK-SD-NEXT:    b.lt .LBB3_8
260; CHECK-SD-NEXT:  // %bb.1: // %for.body.preheader
261; CHECK-SD-NEXT:    cmp w3, #15
262; CHECK-SD-NEXT:    mov w8, w3
263; CHECK-SD-NEXT:    b.hi .LBB3_3
264; CHECK-SD-NEXT:  // %bb.2:
265; CHECK-SD-NEXT:    mov x9, xzr
266; CHECK-SD-NEXT:    b .LBB3_6
267; CHECK-SD-NEXT:  .LBB3_3: // %vector.ph
268; CHECK-SD-NEXT:    dup v0.8h, w1
269; CHECK-SD-NEXT:    and x9, x8, #0xfffffff0
270; CHECK-SD-NEXT:    add x10, x2, #32
271; CHECK-SD-NEXT:    add x11, x0, #16
272; CHECK-SD-NEXT:    mov x12, x9
273; CHECK-SD-NEXT:  .LBB3_4: // %vector.body
274; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
275; CHECK-SD-NEXT:    ldp q1, q2, [x11, #-16]
276; CHECK-SD-NEXT:    subs x12, x12, #16
277; CHECK-SD-NEXT:    add x11, x11, #32
278; CHECK-SD-NEXT:    smull2 v3.4s, v0.8h, v1.8h
279; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
280; CHECK-SD-NEXT:    smull2 v4.4s, v0.8h, v2.8h
281; CHECK-SD-NEXT:    smull v2.4s, v0.4h, v2.4h
282; CHECK-SD-NEXT:    stp q1, q3, [x10, #-32]
283; CHECK-SD-NEXT:    stp q2, q4, [x10], #64
284; CHECK-SD-NEXT:    b.ne .LBB3_4
285; CHECK-SD-NEXT:  // %bb.5: // %middle.block
286; CHECK-SD-NEXT:    cmp x9, x8
287; CHECK-SD-NEXT:    b.eq .LBB3_8
288; CHECK-SD-NEXT:  .LBB3_6: // %for.body.preheader1
289; CHECK-SD-NEXT:    sxth w10, w1
290; CHECK-SD-NEXT:    add x11, x2, x9, lsl #2
291; CHECK-SD-NEXT:    add x12, x0, x9, lsl #1
292; CHECK-SD-NEXT:    sub x8, x8, x9
293; CHECK-SD-NEXT:  .LBB3_7: // %for.body
294; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
295; CHECK-SD-NEXT:    ldrsh w9, [x12], #2
296; CHECK-SD-NEXT:    subs x8, x8, #1
297; CHECK-SD-NEXT:    mul w9, w9, w10
298; CHECK-SD-NEXT:    str w9, [x11], #4
299; CHECK-SD-NEXT:    b.ne .LBB3_7
300; CHECK-SD-NEXT:  .LBB3_8: // %for.cond.cleanup
301; CHECK-SD-NEXT:    ret
302;
303; CHECK-GI-LABEL: larger_smull:
304; CHECK-GI:       // %bb.0: // %entry
305; CHECK-GI-NEXT:    cmp w3, #0
306; CHECK-GI-NEXT:    b.le .LBB3_7
307; CHECK-GI-NEXT:  // %bb.1: // %for.body.preheader
308; CHECK-GI-NEXT:    sxth w8, w1
309; CHECK-GI-NEXT:    mov x9, xzr
310; CHECK-GI-NEXT:    cmp w3, #16
311; CHECK-GI-NEXT:    mov w10, w3
312; CHECK-GI-NEXT:    b.lo .LBB3_5
313; CHECK-GI-NEXT:  // %bb.2: // %vector.ph
314; CHECK-GI-NEXT:    dup v0.4s, w8
315; CHECK-GI-NEXT:    and x9, x10, #0xfffffff0
316; CHECK-GI-NEXT:    add x11, x2, #32
317; CHECK-GI-NEXT:    add x12, x0, #16
318; CHECK-GI-NEXT:    mov x13, x9
319; CHECK-GI-NEXT:  .LBB3_3: // %vector.body
320; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
321; CHECK-GI-NEXT:    ldp q1, q2, [x12, #-16]
322; CHECK-GI-NEXT:    mov x14, x11
323; CHECK-GI-NEXT:    subs x13, x13, #16
324; CHECK-GI-NEXT:    add x12, x12, #32
325; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
326; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
327; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
328; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
329; CHECK-GI-NEXT:    mul v3.4s, v0.4s, v3.4s
330; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
331; CHECK-GI-NEXT:    mul v4.4s, v0.4s, v4.4s
332; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
333; CHECK-GI-NEXT:    stp q3, q1, [x14, #-32]!
334; CHECK-GI-NEXT:    stp q4, q2, [x11], #64
335; CHECK-GI-NEXT:    b.ne .LBB3_3
336; CHECK-GI-NEXT:  // %bb.4: // %middle.block
337; CHECK-GI-NEXT:    cmp x9, x10
338; CHECK-GI-NEXT:    b.eq .LBB3_7
339; CHECK-GI-NEXT:  .LBB3_5: // %for.body.preheader1
340; CHECK-GI-NEXT:    add x11, x2, x9, lsl #2
341; CHECK-GI-NEXT:    add x12, x0, x9, lsl #1
342; CHECK-GI-NEXT:    sub x9, x10, x9
343; CHECK-GI-NEXT:  .LBB3_6: // %for.body
344; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
345; CHECK-GI-NEXT:    ldrsh w10, [x12], #2
346; CHECK-GI-NEXT:    subs x9, x9, #1
347; CHECK-GI-NEXT:    mul w10, w10, w8
348; CHECK-GI-NEXT:    str w10, [x11], #4
349; CHECK-GI-NEXT:    b.ne .LBB3_6
350; CHECK-GI-NEXT:  .LBB3_7: // %for.cond.cleanup
351; CHECK-GI-NEXT:    ret
352entry:
353  %conv1 = sext i16 %y to i32
  ; Nothing to do when %n is not positive (signed compare).
354  %cmp8 = icmp sgt i32 %n, 0
355  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
356
357for.body.preheader:                               ; preds = %entry
358  %wide.trip.count = zext i32 %n to i64
  ; Fewer than 16 elements: skip the vector loop and run only the scalar loop.
359  %min.iters.check = icmp ult i32 %n, 16
360  br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph
361
362vector.ph:                                        ; preds = %for.body.preheader
  ; Vector loop bound: the trip count with its low four bits cleared.
363  %n.vec = and i64 %wide.trip.count, 4294967280
  ; Two identical 8-lane splats of %conv1, one per multiply in the loop.
364  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
365  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
366  %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
367  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
368  br label %vector.body
369
370vector.body:                                      ; preds = %vector.body, %vector.ph
371  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
372  %0 = getelementptr inbounds i16, ptr %x, i64 %index
373  %1 = bitcast ptr %0 to ptr
374  %wide.load = load <8 x i16>, ptr %1, align 2
  ; The second load starts 8 i16 elements after the first.
375  %2 = getelementptr inbounds i16, ptr %0, i64 8
376  %3 = bitcast ptr %2 to ptr
377  %wide.load11 = load <8 x i16>, ptr %3, align 2
  ; sext of the loaded data followed by mul with the splat: the pattern the
  ; CHECK lines above cover.
378  %4 = sext <8 x i16> %wide.load to <8 x i32>
379  %5 = sext <8 x i16> %wide.load11 to <8 x i32>
380  %6 = mul nsw <8 x i32> %broadcast.splat, %4
381  %7 = mul nsw <8 x i32> %broadcast.splat13, %5
382  %8 = getelementptr inbounds i32, ptr %s, i64 %index
383  %9 = bitcast ptr %8 to ptr
384  store <8 x i32> %6, ptr %9, align 4
  ; The second store starts 8 i32 elements after the first.
385  %10 = getelementptr inbounds i32, ptr %8, i64 8
386  %11 = bitcast ptr %10 to ptr
387  store <8 x i32> %7, ptr %11, align 4
  ; Advance by 16 elements; leave the loop once %index.next equals %n.vec.
388  %index.next = add nuw i64 %index, 16
389  %12 = icmp eq i64 %index.next, %n.vec
390  br i1 %12, label %middle.block, label %vector.body
391
392middle.block:                                     ; preds = %vector.body
  ; Done if the vector loop covered every element; otherwise run the scalar loop.
393  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
394  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14
395
396for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
  ; Scalar loop start index: 0 when the vector loop was skipped, else %n.vec.
397  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
398  br label %for.body
399
400for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
401  ret void
402
403for.body:                                         ; preds = %for.body.preheader14, %for.body
  ; Scalar loop: one element per iteration until %wide.trip.count is reached.
404  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
405  %arrayidx = getelementptr inbounds i16, ptr %x, i64 %indvars.iv
406  %13 = load i16, ptr %arrayidx, align 2
407  %conv = sext i16 %13 to i32
408  %mul = mul nsw i32 %conv, %conv1
409  %arrayidx3 = getelementptr inbounds i32, ptr %s, i64 %indvars.iv
410  store i32 %mul, ptr %arrayidx3, align 4
411  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
412  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
413  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
414}
415
416
; larger_umull: s[i] = zext(x[i]) * zext(y) for i in [0, n), written as a
; vector loop with a scalar remainder loop.
; The vector loop handles 16 elements per iteration as two <8 x i16> loads that
; are zero-extended to <8 x i32> and multiplied by a splat of zext(%y).
; The CHECK-SD lines expect umull/umull2 in the vector loop; the CHECK-GI lines
; (-global-isel RUN line) expect ushll/ushll2 followed by mul, with the and/dup
; that build the splat appearing after the .LBB4_3 loop label.
417define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr noalias nocapture noundef writeonly %s, i32 noundef %n) {
418; CHECK-SD-LABEL: larger_umull:
419; CHECK-SD:       // %bb.0: // %entry
420; CHECK-SD-NEXT:    cmp w3, #1
421; CHECK-SD-NEXT:    b.lt .LBB4_8
422; CHECK-SD-NEXT:  // %bb.1: // %for.body.preheader
423; CHECK-SD-NEXT:    cmp w3, #15
424; CHECK-SD-NEXT:    mov w8, w3
425; CHECK-SD-NEXT:    b.hi .LBB4_3
426; CHECK-SD-NEXT:  // %bb.2:
427; CHECK-SD-NEXT:    mov x9, xzr
428; CHECK-SD-NEXT:    b .LBB4_6
429; CHECK-SD-NEXT:  .LBB4_3: // %vector.ph
430; CHECK-SD-NEXT:    dup v0.8h, w1
431; CHECK-SD-NEXT:    and x9, x8, #0xfffffff0
432; CHECK-SD-NEXT:    add x10, x2, #32
433; CHECK-SD-NEXT:    add x11, x0, #16
434; CHECK-SD-NEXT:    mov x12, x9
435; CHECK-SD-NEXT:  .LBB4_4: // %vector.body
436; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
437; CHECK-SD-NEXT:    ldp q1, q2, [x11, #-16]
438; CHECK-SD-NEXT:    subs x12, x12, #16
439; CHECK-SD-NEXT:    add x11, x11, #32
440; CHECK-SD-NEXT:    umull2 v3.4s, v0.8h, v1.8h
441; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
442; CHECK-SD-NEXT:    umull2 v4.4s, v0.8h, v2.8h
443; CHECK-SD-NEXT:    umull v2.4s, v0.4h, v2.4h
444; CHECK-SD-NEXT:    stp q1, q3, [x10, #-32]
445; CHECK-SD-NEXT:    stp q2, q4, [x10], #64
446; CHECK-SD-NEXT:    b.ne .LBB4_4
447; CHECK-SD-NEXT:  // %bb.5: // %middle.block
448; CHECK-SD-NEXT:    cmp x9, x8
449; CHECK-SD-NEXT:    b.eq .LBB4_8
450; CHECK-SD-NEXT:  .LBB4_6: // %for.body.preheader1
451; CHECK-SD-NEXT:    add x10, x2, x9, lsl #2
452; CHECK-SD-NEXT:    add x11, x0, x9, lsl #1
453; CHECK-SD-NEXT:    and w12, w1, #0xffff
454; CHECK-SD-NEXT:    sub x8, x8, x9
455; CHECK-SD-NEXT:  .LBB4_7: // %for.body
456; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
457; CHECK-SD-NEXT:    ldrh w9, [x11], #2
458; CHECK-SD-NEXT:    subs x8, x8, #1
459; CHECK-SD-NEXT:    mul w9, w9, w12
460; CHECK-SD-NEXT:    str w9, [x10], #4
461; CHECK-SD-NEXT:    b.ne .LBB4_7
462; CHECK-SD-NEXT:  .LBB4_8: // %for.cond.cleanup
463; CHECK-SD-NEXT:    ret
464;
465; CHECK-GI-LABEL: larger_umull:
466; CHECK-GI:       // %bb.0: // %entry
467; CHECK-GI-NEXT:    cmp w3, #0
468; CHECK-GI-NEXT:    b.le .LBB4_7
469; CHECK-GI-NEXT:  // %bb.1: // %for.body.preheader
470; CHECK-GI-NEXT:    mov x8, xzr
471; CHECK-GI-NEXT:    cmp w3, #16
472; CHECK-GI-NEXT:    mov w9, w3
473; CHECK-GI-NEXT:    b.lo .LBB4_5
474; CHECK-GI-NEXT:  // %bb.2: // %vector.ph
475; CHECK-GI-NEXT:    and x8, x9, #0xfffffff0
476; CHECK-GI-NEXT:    add x10, x2, #32
477; CHECK-GI-NEXT:    add x11, x0, #16
478; CHECK-GI-NEXT:    mov x12, x8
479; CHECK-GI-NEXT:  .LBB4_3: // %vector.body
480; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
481; CHECK-GI-NEXT:    ldp q0, q1, [x11, #-16]
482; CHECK-GI-NEXT:    and w13, w1, #0xffff
483; CHECK-GI-NEXT:    dup v2.4s, w13
484; CHECK-GI-NEXT:    mov x13, x10
485; CHECK-GI-NEXT:    subs x12, x12, #16
486; CHECK-GI-NEXT:    add x11, x11, #32
487; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
488; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
489; CHECK-GI-NEXT:    ushll v4.4s, v1.4h, #0
490; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
491; CHECK-GI-NEXT:    mul v3.4s, v2.4s, v3.4s
492; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
493; CHECK-GI-NEXT:    mul v4.4s, v2.4s, v4.4s
494; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v1.4s
495; CHECK-GI-NEXT:    stp q3, q0, [x13, #-32]!
496; CHECK-GI-NEXT:    stp q4, q1, [x10], #64
497; CHECK-GI-NEXT:    b.ne .LBB4_3
498; CHECK-GI-NEXT:  // %bb.4: // %middle.block
499; CHECK-GI-NEXT:    cmp x8, x9
500; CHECK-GI-NEXT:    b.eq .LBB4_7
501; CHECK-GI-NEXT:  .LBB4_5: // %for.body.preheader1
502; CHECK-GI-NEXT:    add x10, x2, x8, lsl #2
503; CHECK-GI-NEXT:    add x11, x0, x8, lsl #1
504; CHECK-GI-NEXT:    and w12, w1, #0xffff
505; CHECK-GI-NEXT:    sub x8, x9, x8
506; CHECK-GI-NEXT:  .LBB4_6: // %for.body
507; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
508; CHECK-GI-NEXT:    ldrh w9, [x11], #2
509; CHECK-GI-NEXT:    subs x8, x8, #1
510; CHECK-GI-NEXT:    mul w9, w9, w12
511; CHECK-GI-NEXT:    str w9, [x10], #4
512; CHECK-GI-NEXT:    b.ne .LBB4_6
513; CHECK-GI-NEXT:  .LBB4_7: // %for.cond.cleanup
514; CHECK-GI-NEXT:    ret
515entry:
516  %conv1 = zext i16 %y to i32
  ; Nothing to do when %n is not positive (signed compare).
517  %cmp8 = icmp sgt i32 %n, 0
518  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
519
520for.body.preheader:                               ; preds = %entry
521  %wide.trip.count = zext i32 %n to i64
  ; Fewer than 16 elements: skip the vector loop and run only the scalar loop.
522  %min.iters.check = icmp ult i32 %n, 16
523  br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph
524
525vector.ph:                                        ; preds = %for.body.preheader
  ; Vector loop bound: the trip count with its low four bits cleared.
526  %n.vec = and i64 %wide.trip.count, 4294967280
  ; Two identical 8-lane splats of %conv1, one per multiply in the loop.
527  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
528  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
529  %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
530  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
531  br label %vector.body
532
533vector.body:                                      ; preds = %vector.body, %vector.ph
534  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
535  %0 = getelementptr inbounds i16, ptr %x, i64 %index
536  %1 = bitcast ptr %0 to ptr
537  %wide.load = load <8 x i16>, ptr %1, align 2
  ; The second load starts 8 i16 elements after the first.
538  %2 = getelementptr inbounds i16, ptr %0, i64 8
539  %3 = bitcast ptr %2 to ptr
540  %wide.load11 = load <8 x i16>, ptr %3, align 2
  ; zext of the loaded data followed by mul with the splat: the pattern the
  ; CHECK lines above cover.
541  %4 = zext <8 x i16> %wide.load to <8 x i32>
542  %5 = zext <8 x i16> %wide.load11 to <8 x i32>
543  %6 = mul nuw <8 x i32> %broadcast.splat, %4
544  %7 = mul nuw <8 x i32> %broadcast.splat13, %5
545  %8 = getelementptr inbounds i32, ptr %s, i64 %index
546  %9 = bitcast ptr %8 to ptr
547  store <8 x i32> %6, ptr %9, align 4
  ; The second store starts 8 i32 elements after the first.
548  %10 = getelementptr inbounds i32, ptr %8, i64 8
549  %11 = bitcast ptr %10 to ptr
550  store <8 x i32> %7, ptr %11, align 4
  ; Advance by 16 elements; leave the loop once %index.next equals %n.vec.
551  %index.next = add nuw i64 %index, 16
552  %12 = icmp eq i64 %index.next, %n.vec
553  br i1 %12, label %middle.block, label %vector.body
554
555middle.block:                                     ; preds = %vector.body
  ; Done if the vector loop covered every element; otherwise run the scalar loop.
556  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
557  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14
558
559for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
  ; Scalar loop start index: 0 when the vector loop was skipped, else %n.vec.
560  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
561  br label %for.body
562
563for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
564  ret void
565
566for.body:                                         ; preds = %for.body.preheader14, %for.body
  ; Scalar loop: one element per iteration until %wide.trip.count is reached.
567  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
568  %arrayidx = getelementptr inbounds i16, ptr %x, i64 %indvars.iv
569  %13 = load i16, ptr %arrayidx, align 2
570  %conv = zext i16 %13 to i32
571  %mul = mul nuw i32 %conv, %conv1
572  %arrayidx3 = getelementptr inbounds i32, ptr %s, i64 %indvars.iv
573  store i32 %mul, ptr %arrayidx3, align 4
574  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
575  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
576  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
577}
578
579
580define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, i8 noundef %B, i32 noundef %n) {
581; CHECK-SD-LABEL: red_mla_dup_ext_u8_s8_s16:
582; CHECK-SD:       // %bb.0: // %entry
583; CHECK-SD-NEXT:    cbz w2, .LBB5_3
584; CHECK-SD-NEXT:  // %bb.1: // %for.body.preheader
585; CHECK-SD-NEXT:    sxtb w9, w1
586; CHECK-SD-NEXT:    cmp w2, #15
587; CHECK-SD-NEXT:    mov w10, w2
588; CHECK-SD-NEXT:    b.hi .LBB5_4
589; CHECK-SD-NEXT:  // %bb.2:
590; CHECK-SD-NEXT:    mov x11, xzr
591; CHECK-SD-NEXT:    mov w8, wzr
592; CHECK-SD-NEXT:    b .LBB5_7
593; CHECK-SD-NEXT:  .LBB5_3:
594; CHECK-SD-NEXT:    mov w8, wzr
595; CHECK-SD-NEXT:    mov w0, w8
596; CHECK-SD-NEXT:    ret
597; CHECK-SD-NEXT:  .LBB5_4: // %vector.ph
598; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
599; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
600; CHECK-SD-NEXT:    and x11, x10, #0xfffffff0
601; CHECK-SD-NEXT:    fmov s2, w9
602; CHECK-SD-NEXT:    add x8, x0, #8
603; CHECK-SD-NEXT:    mov x12, x11
604; CHECK-SD-NEXT:  .LBB5_5: // %vector.body
605; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
606; CHECK-SD-NEXT:    ldp d3, d4, [x8, #-8]
607; CHECK-SD-NEXT:    subs x12, x12, #16
608; CHECK-SD-NEXT:    add x8, x8, #16
609; CHECK-SD-NEXT:    ushll v3.8h, v3.8b, #0
610; CHECK-SD-NEXT:    ushll v4.8h, v4.8b, #0
611; CHECK-SD-NEXT:    mla v0.8h, v3.8h, v2.h[0]
612; CHECK-SD-NEXT:    mla v1.8h, v4.8h, v2.h[0]
613; CHECK-SD-NEXT:    b.ne .LBB5_5
614; CHECK-SD-NEXT:  // %bb.6: // %middle.block
615; CHECK-SD-NEXT:    add v0.8h, v1.8h, v0.8h
616; CHECK-SD-NEXT:    cmp x11, x10
617; CHECK-SD-NEXT:    addv h0, v0.8h
618; CHECK-SD-NEXT:    fmov w8, s0
619; CHECK-SD-NEXT:    b.eq .LBB5_9
620; CHECK-SD-NEXT:  .LBB5_7: // %for.body.preheader1
621; CHECK-SD-NEXT:    sub x10, x10, x11
622; CHECK-SD-NEXT:    add x11, x0, x11
623; CHECK-SD-NEXT:  .LBB5_8: // %for.body
624; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
625; CHECK-SD-NEXT:    ldrb w12, [x11], #1
626; CHECK-SD-NEXT:    subs x10, x10, #1
627; CHECK-SD-NEXT:    madd w8, w12, w9, w8
628; CHECK-SD-NEXT:    b.ne .LBB5_8
629; CHECK-SD-NEXT:  .LBB5_9: // %for.cond.cleanup
630; CHECK-SD-NEXT:    mov w0, w8
631; CHECK-SD-NEXT:    ret
632;
633; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s16:
634; CHECK-GI:       // %bb.0: // %entry
635; CHECK-GI-NEXT:    cbz w2, .LBB5_3
636; CHECK-GI-NEXT:  // %bb.1: // %for.body.preheader
637; CHECK-GI-NEXT:    cmp w2, #16
638; CHECK-GI-NEXT:    mov w8, w2
639; CHECK-GI-NEXT:    b.hs .LBB5_4
640; CHECK-GI-NEXT:  // %bb.2:
641; CHECK-GI-NEXT:    mov w10, #0 // =0x0
642; CHECK-GI-NEXT:    mov x9, xzr
643; CHECK-GI-NEXT:    fmov s0, w10
644; CHECK-GI-NEXT:    b .LBB5_8
645; CHECK-GI-NEXT:  .LBB5_3:
646; CHECK-GI-NEXT:    mov w0, wzr
647; CHECK-GI-NEXT:    ret
648; CHECK-GI-NEXT:  .LBB5_4: // %vector.ph
649; CHECK-GI-NEXT:    lsl w9, w1, #8
650; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
651; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
652; CHECK-GI-NEXT:    add x10, x0, #8
653; CHECK-GI-NEXT:    sbfx w9, w9, #8, #8
654; CHECK-GI-NEXT:    dup v2.8h, w9
655; CHECK-GI-NEXT:    and x9, x8, #0xfffffff0
656; CHECK-GI-NEXT:    mov x11, x9
657; CHECK-GI-NEXT:  .LBB5_5: // %vector.body
658; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
659; CHECK-GI-NEXT:    ldp d3, d4, [x10, #-8]
660; CHECK-GI-NEXT:    subs x11, x11, #16
661; CHECK-GI-NEXT:    add x10, x10, #16
662; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
663; CHECK-GI-NEXT:    ushll v4.8h, v4.8b, #0
664; CHECK-GI-NEXT:    mla v0.8h, v2.8h, v3.8h
665; CHECK-GI-NEXT:    mla v1.8h, v2.8h, v4.8h
666; CHECK-GI-NEXT:    b.ne .LBB5_5
667; CHECK-GI-NEXT:  // %bb.6: // %middle.block
668; CHECK-GI-NEXT:    add v0.8h, v1.8h, v0.8h
669; CHECK-GI-NEXT:    cmp x9, x8
670; CHECK-GI-NEXT:    addv h0, v0.8h
671; CHECK-GI-NEXT:    b.ne .LBB5_8
672; CHECK-GI-NEXT:  // %bb.7:
673; CHECK-GI-NEXT:    fmov w0, s0
674; CHECK-GI-NEXT:    ret
675; CHECK-GI-NEXT:  .LBB5_8: // %for.body.preheader1
676; CHECK-GI-NEXT:    sxtb w10, w1
677; CHECK-GI-NEXT:    sub x8, x8, x9
678; CHECK-GI-NEXT:    add x9, x0, x9
679; CHECK-GI-NEXT:  .LBB5_9: // %for.body
680; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
681; CHECK-GI-NEXT:    ldrb w11, [x9], #1
682; CHECK-GI-NEXT:    fmov w12, s0
683; CHECK-GI-NEXT:    subs x8, x8, #1
684; CHECK-GI-NEXT:    mul w11, w11, w10
685; CHECK-GI-NEXT:    add w0, w11, w12, uxth
686; CHECK-GI-NEXT:    fmov s0, w0
687; CHECK-GI-NEXT:    b.ne .LBB5_9
688; CHECK-GI-NEXT:  // %bb.10: // %for.cond.cleanup
689; CHECK-GI-NEXT:    ret
690entry:
691  %conv2 = sext i8 %B to i16
692  %cmp10.not = icmp eq i32 %n, 0
693  br i1 %cmp10.not, label %for.cond.cleanup, label %for.body.preheader
694
695for.body.preheader:                               ; preds = %entry
696  %wide.trip.count = zext i32 %n to i64
697  %min.iters.check = icmp ult i32 %n, 16
698  br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph
699
700vector.ph:                                        ; preds = %for.body.preheader
701  %n.vec = and i64 %wide.trip.count, 4294967280
702  %broadcast.splatinsert = insertelement <8 x i16> poison, i16 %conv2, i64 0
703  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
704  %broadcast.splatinsert15 = insertelement <8 x i16> poison, i16 %conv2, i64 0
705  %broadcast.splat16 = shufflevector <8 x i16> %broadcast.splatinsert15, <8 x i16> poison, <8 x i32> zeroinitializer
706  br label %vector.body
707
708vector.body:                                      ; preds = %vector.body, %vector.ph
709  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
710  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %8, %vector.body ]
711  %vec.phi13 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
712  %0 = getelementptr inbounds i8, ptr %A, i64 %index
713  %1 = bitcast ptr %0 to ptr
714  %wide.load = load <8 x i8>, ptr %1, align 1
715  %2 = getelementptr inbounds i8, ptr %0, i64 8
716  %3 = bitcast ptr %2 to ptr
717  %wide.load14 = load <8 x i8>, ptr %3, align 1
718  %4 = zext <8 x i8> %wide.load to <8 x i16>
719  %5 = zext <8 x i8> %wide.load14 to <8 x i16>
720  %6 = mul nsw <8 x i16> %broadcast.splat, %4
721  %7 = mul nsw <8 x i16> %broadcast.splat16, %5
722  %8 = add <8 x i16> %6, %vec.phi
723  %9 = add <8 x i16> %7, %vec.phi13
724  %index.next = add nuw i64 %index, 16
725  %10 = icmp eq i64 %index.next, %n.vec
726  br i1 %10, label %middle.block, label %vector.body
727
728middle.block:                                     ; preds = %vector.body
729  %bin.rdx = add <8 x i16> %9, %8
730  %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx)
731  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
732  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17
733
734for.body.preheader17:                             ; preds = %for.body.preheader, %middle.block
735  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
736  %s.011.ph = phi i16 [ 0, %for.body.preheader ], [ %11, %middle.block ]
737  br label %for.body
738
739for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
740  %s.0.lcssa = phi i16 [ 0, %entry ], [ %11, %middle.block ], [ %add, %for.body ]
741  ret i16 %s.0.lcssa
742
743for.body:                                         ; preds = %for.body.preheader17, %for.body
744  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
745  %s.011 = phi i16 [ %add, %for.body ], [ %s.011.ph, %for.body.preheader17 ]
746  %arrayidx = getelementptr inbounds i8, ptr %A, i64 %indvars.iv
747  %12 = load i8, ptr %arrayidx, align 1
748  %13 = zext i8 %12 to i16
749  %mul = mul nsw i16 %13, %conv2
750  %add = add i16 %mul, %s.011
751  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
752  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
753  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
754}
755
; sink_v2z64_1: the entry block zero-extends the <2 x i32> argument %a and
; splats lane 1 of the result; the loop multiplies that splat by a zero-extended
; <2 x i32> load, shifts right by 15 and truncates back to <2 x i32>.
; The SelectionDAG expectations use a by-element umull on v0.s[1] inside the
; loop; the GlobalISel expectations use two scalar 64-bit muls instead.
; NOTE(review): %hb is bitcast from %g, not %h, so the store goes back through
; %p and %h / %d have no effect on the output -- confirm this is intentional.
756define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
757; CHECK-SD-LABEL: sink_v2z64_1:
758; CHECK-SD:       // %bb.0: // %entry
759; CHECK-SD-NEXT:    mov x8, xzr
760; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
761; CHECK-SD-NEXT:  .LBB6_1: // %loop
762; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
763; CHECK-SD-NEXT:    ldr d1, [x0]
764; CHECK-SD-NEXT:    subs x2, x2, #8
765; CHECK-SD-NEXT:    add x8, x8, #8
766; CHECK-SD-NEXT:    umull v1.2d, v1.2s, v0.s[1]
767; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #15
768; CHECK-SD-NEXT:    str d1, [x0], #32
769; CHECK-SD-NEXT:    b.ne .LBB6_1
770; CHECK-SD-NEXT:  // %bb.2: // %exit
771; CHECK-SD-NEXT:    ret
772;
773; CHECK-GI-LABEL: sink_v2z64_1:
774; CHECK-GI:       // %bb.0: // %entry
775; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
776; CHECK-GI-NEXT:    mov x8, xzr
777; CHECK-GI-NEXT:    dup v0.2d, v0.d[1]
778; CHECK-GI-NEXT:    mov x9, v0.d[1]
779; CHECK-GI-NEXT:    fmov x10, d0
780; CHECK-GI-NEXT:  .LBB6_1: // %loop
781; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
782; CHECK-GI-NEXT:    ldr d0, [x0]
783; CHECK-GI-NEXT:    subs x2, x2, #8
784; CHECK-GI-NEXT:    add x8, x8, #8
785; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
786; CHECK-GI-NEXT:    fmov x11, d0
787; CHECK-GI-NEXT:    mov x12, v0.d[1]
788; CHECK-GI-NEXT:    mul x11, x11, x10
789; CHECK-GI-NEXT:    mul x12, x12, x9
790; CHECK-GI-NEXT:    mov v0.d[0], x11
791; CHECK-GI-NEXT:    mov v0.d[1], x12
792; CHECK-GI-NEXT:    shrn v0.2s, v0.2d, #15
793; CHECK-GI-NEXT:    str d0, [x0], #32
794; CHECK-GI-NEXT:    b.ne .LBB6_1
795; CHECK-GI-NEXT:  // %bb.2: // %exit
796; CHECK-GI-NEXT:    ret
797entry:
798  %ext = zext <2 x i32> %a to <2 x i64>
799  %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
800  br label %loop
801
802loop:
803  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
804  %g = getelementptr inbounds i32, ptr %p, i64 %index
805  %gb = bitcast ptr %g to ptr
806  %l = load <2 x i32>, ptr %gb, align 4
807  %e = zext <2 x i32> %l to <2 x i64>
808  %m = mul <2 x i64> %e, %broadcast.splat
809  %s = ashr <2 x i64> %m, <i64 15, i64 15>
810  %t = trunc <2 x i64> %s to <2 x i32>
811  %h = getelementptr inbounds i32, ptr %d, i64 %index
812  %hb = bitcast ptr %g to ptr
813  store <2 x i32> %t, ptr %hb, align 4
814  %index.next = add nuw i64 %index, 8
815  %c = icmp eq i64 %index.next, %n
816  br i1 %c, label %exit, label %loop
817
818exit:
819  ret void
820}
821
; sink_v4i64_1: signed, four-lane variant. The entry block sign-extends the
; <2 x i32> argument %a and splats lane 1 of the result to four lanes; the loop
; multiplies that splat by a sign-extended <4 x i32> load, shifts right by 15
; and truncates back to <4 x i32>.
; The SelectionDAG expectations use smull / smull2 by element on v0.s[1]; the
; GlobalISel expectations use four scalar 64-bit muls.
; NOTE(review): %hb is bitcast from %g, not %h, so the store goes back through
; %p and %h / %d have no effect on the output -- confirm this is intentional.
822define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
823; CHECK-SD-LABEL: sink_v4i64_1:
824; CHECK-SD:       // %bb.0: // %entry
825; CHECK-SD-NEXT:    mov x8, xzr
826; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
827; CHECK-SD-NEXT:  .LBB7_1: // %loop
828; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
829; CHECK-SD-NEXT:    ldr q1, [x0]
830; CHECK-SD-NEXT:    subs x2, x2, #8
831; CHECK-SD-NEXT:    add x8, x8, #8
832; CHECK-SD-NEXT:    smull v2.2d, v1.2s, v0.s[1]
833; CHECK-SD-NEXT:    smull2 v1.2d, v1.4s, v0.s[1]
834; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #15
835; CHECK-SD-NEXT:    shrn2 v2.4s, v1.2d, #15
836; CHECK-SD-NEXT:    str q2, [x0], #32
837; CHECK-SD-NEXT:    b.ne .LBB7_1
838; CHECK-SD-NEXT:  // %bb.2: // %exit
839; CHECK-SD-NEXT:    ret
840;
841; CHECK-GI-LABEL: sink_v4i64_1:
842; CHECK-GI:       // %bb.0: // %entry
843; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
844; CHECK-GI-NEXT:    mov x8, xzr
845; CHECK-GI-NEXT:    dup v0.2d, v0.d[1]
846; CHECK-GI-NEXT:    mov x9, v0.d[1]
847; CHECK-GI-NEXT:    fmov x10, d0
848; CHECK-GI-NEXT:  .LBB7_1: // %loop
849; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
850; CHECK-GI-NEXT:    ldr q0, [x0]
851; CHECK-GI-NEXT:    subs x2, x2, #8
852; CHECK-GI-NEXT:    add x8, x8, #8
853; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
854; CHECK-GI-NEXT:    sshll2 v0.2d, v0.4s, #0
855; CHECK-GI-NEXT:    fmov x11, d1
856; CHECK-GI-NEXT:    mov x12, v1.d[1]
857; CHECK-GI-NEXT:    fmov x13, d0
858; CHECK-GI-NEXT:    mov x14, v0.d[1]
859; CHECK-GI-NEXT:    mul x11, x11, x10
860; CHECK-GI-NEXT:    mul x13, x13, x10
861; CHECK-GI-NEXT:    mul x12, x12, x9
862; CHECK-GI-NEXT:    mov v0.d[0], x11
863; CHECK-GI-NEXT:    mul x11, x14, x9
864; CHECK-GI-NEXT:    mov v1.d[0], x13
865; CHECK-GI-NEXT:    mov v0.d[1], x12
866; CHECK-GI-NEXT:    mov v1.d[1], x11
867; CHECK-GI-NEXT:    shrn v0.2s, v0.2d, #15
868; CHECK-GI-NEXT:    shrn2 v0.4s, v1.2d, #15
869; CHECK-GI-NEXT:    str q0, [x0], #32
870; CHECK-GI-NEXT:    b.ne .LBB7_1
871; CHECK-GI-NEXT:  // %bb.2: // %exit
872; CHECK-GI-NEXT:    ret
873entry:
874  %ext = sext <2 x i32> %a to <2 x i64>
875  %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
876  br label %loop
877
878loop:
879  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
880  %g = getelementptr inbounds i32, ptr %p, i64 %index
881  %gb = bitcast ptr %g to ptr
882  %l = load <4 x i32>, ptr %gb, align 4
883  %e = sext <4 x i32> %l to <4 x i64>
884  %m = mul <4 x i64> %e, %broadcast.splat
885  %s = ashr <4 x i64> %m, <i64 15, i64 15, i64 15, i64 15>
886  %t = trunc <4 x i64> %s to <4 x i32>
887  %h = getelementptr inbounds i32, ptr %d, i64 %index
888  %hb = bitcast ptr %g to ptr
889  store <4 x i32> %t, ptr %hb, align 4
890  %index.next = add nuw i64 %index, 8
891  %c = icmp eq i64 %index.next, %n
892  br i1 %c, label %exit, label %loop
893
894exit:
895  ret void
896}
897
; sink_v8z16_0: the entry block zero-extends the <16 x i8> argument %a and
; splats lane 0 of the result to eight lanes; the loop multiplies that splat by
; a zero-extended <8 x i8> load, arithmetic-shifts right by 15 and truncates
; back to <8 x i8>.
; The SelectionDAG expectations splat the byte (dup from v0.b[0]) ahead of the
; loop and use umull inside it; the GlobalISel expectations widen with ushll
; and use a by-element mul on v0.h[0].
; NOTE(review): %hb is bitcast from %g, not %h, so the store goes back through
; %p and %h / %d have no effect on the output -- confirm this is intentional.
898define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
899; CHECK-SD-LABEL: sink_v8z16_0:
900; CHECK-SD:       // %bb.0: // %entry
901; CHECK-SD-NEXT:    dup v0.8b, v0.b[0]
902; CHECK-SD-NEXT:    mov x8, xzr
903; CHECK-SD-NEXT:  .LBB8_1: // %loop
904; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
905; CHECK-SD-NEXT:    ldr d1, [x0]
906; CHECK-SD-NEXT:    subs x2, x2, #8
907; CHECK-SD-NEXT:    add x8, x8, #8
908; CHECK-SD-NEXT:    umull v1.8h, v1.8b, v0.8b
909; CHECK-SD-NEXT:    cmlt v1.8h, v1.8h, #0
910; CHECK-SD-NEXT:    xtn v1.8b, v1.8h
911; CHECK-SD-NEXT:    str d1, [x0], #32
912; CHECK-SD-NEXT:    b.ne .LBB8_1
913; CHECK-SD-NEXT:  // %bb.2: // %exit
914; CHECK-SD-NEXT:    ret
915;
916; CHECK-GI-LABEL: sink_v8z16_0:
917; CHECK-GI:       // %bb.0: // %entry
918; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
919; CHECK-GI-NEXT:    mov x8, xzr
920; CHECK-GI-NEXT:  .LBB8_1: // %loop
921; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
922; CHECK-GI-NEXT:    ldr d1, [x0]
923; CHECK-GI-NEXT:    subs x2, x2, #8
924; CHECK-GI-NEXT:    add x8, x8, #8
925; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
926; CHECK-GI-NEXT:    mul v1.8h, v1.8h, v0.h[0]
927; CHECK-GI-NEXT:    sshr v1.8h, v1.8h, #15
928; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
929; CHECK-GI-NEXT:    str d1, [x0], #32
930; CHECK-GI-NEXT:    b.ne .LBB8_1
931; CHECK-GI-NEXT:  // %bb.2: // %exit
932; CHECK-GI-NEXT:    ret
933entry:
934  %ext = zext <16 x i8> %a to <16 x i16>
935  %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
936  br label %loop
937
938loop:
939  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
940  %g = getelementptr inbounds i32, ptr %p, i64 %index
941  %gb = bitcast ptr %g to ptr
942  %l = load <8 x i8>, ptr %gb, align 4
943  %e = zext <8 x i8> %l to <8 x i16>
944  %m = mul <8 x i16> %e, %broadcast.splat
945  %s = ashr <8 x i16> %m, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
946  %t = trunc <8 x i16> %s to <8 x i8>
947  %h = getelementptr inbounds i32, ptr %d, i64 %index
948  %hb = bitcast ptr %g to ptr
949  store <8 x i8> %t, ptr %hb, align 4
950  %index.next = add nuw i64 %index, 8
951  %c = icmp eq i64 %index.next, %n
952  br i1 %c, label %exit, label %loop
953
954exit:
955  ret void
956}
957
; sink_v16s16_8: signed, sixteen-lane variant. The entry block sign-extends the
; <16 x i8> argument %a and splats lane 10 of the result to sixteen lanes; the
; loop multiplies that splat by a sign-extended <16 x i8> load,
; arithmetic-shifts right by 15 and truncates back to <16 x i8>.
; The SelectionDAG expectations splat the byte (dup from v0.b[10]) ahead of the
; loop and use smull / smull2 inside it; the GlobalISel expectations widen the
; high half with sshll2 and use by-element muls on v0.h[2].
; NOTE(review): %hb is bitcast from %g, not %h, so the store goes back through
; %p and %h / %d have no effect on the output -- confirm this is intentional.
958define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
959; CHECK-SD-LABEL: sink_v16s16_8:
960; CHECK-SD:       // %bb.0: // %entry
961; CHECK-SD-NEXT:    dup v0.16b, v0.b[10]
962; CHECK-SD-NEXT:    mov x8, xzr
963; CHECK-SD-NEXT:  .LBB9_1: // %loop
964; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
965; CHECK-SD-NEXT:    ldr q1, [x0]
966; CHECK-SD-NEXT:    subs x2, x2, #8
967; CHECK-SD-NEXT:    add x8, x8, #8
968; CHECK-SD-NEXT:    smull v2.8h, v1.8b, v0.8b
969; CHECK-SD-NEXT:    smull2 v1.8h, v1.16b, v0.16b
970; CHECK-SD-NEXT:    cmlt v1.8h, v1.8h, #0
971; CHECK-SD-NEXT:    cmlt v2.8h, v2.8h, #0
972; CHECK-SD-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
973; CHECK-SD-NEXT:    str q1, [x0], #32
974; CHECK-SD-NEXT:    b.ne .LBB9_1
975; CHECK-SD-NEXT:  // %bb.2: // %exit
976; CHECK-SD-NEXT:    ret
977;
978; CHECK-GI-LABEL: sink_v16s16_8:
979; CHECK-GI:       // %bb.0: // %entry
980; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
981; CHECK-GI-NEXT:    mov x8, xzr
982; CHECK-GI-NEXT:  .LBB9_1: // %loop
983; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
984; CHECK-GI-NEXT:    ldr q1, [x0]
985; CHECK-GI-NEXT:    subs x2, x2, #8
986; CHECK-GI-NEXT:    add x8, x8, #8
987; CHECK-GI-NEXT:    sshll v2.8h, v1.8b, #0
988; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
989; CHECK-GI-NEXT:    mul v2.8h, v2.8h, v0.h[2]
990; CHECK-GI-NEXT:    mul v1.8h, v1.8h, v0.h[2]
991; CHECK-GI-NEXT:    sshr v2.8h, v2.8h, #15
992; CHECK-GI-NEXT:    sshr v1.8h, v1.8h, #15
993; CHECK-GI-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
994; CHECK-GI-NEXT:    str q1, [x0], #32
995; CHECK-GI-NEXT:    b.ne .LBB9_1
996; CHECK-GI-NEXT:  // %bb.2: // %exit
997; CHECK-GI-NEXT:    ret
998entry:
999  %ext = sext <16 x i8> %a to <16 x i16>
1000  %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <16 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
1001  br label %loop
1002
1003loop:
1004  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
1005  %g = getelementptr inbounds i32, ptr %p, i64 %index
1006  %gb = bitcast ptr %g to ptr
1007  %l = load <16 x i8>, ptr %gb, align 4
1008  %e = sext <16 x i8> %l to <16 x i16>
1009  %m = mul <16 x i16> %e, %broadcast.splat
1010  %s = ashr <16 x i16> %m, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
1011  %t = trunc <16 x i16> %s to <16 x i8>
1012  %h = getelementptr inbounds i32, ptr %d, i64 %index
1013  %hb = bitcast ptr %g to ptr
1014  store <16 x i8> %t, ptr %hb, align 4
1015  %index.next = add nuw i64 %index, 8
1016  %c = icmp eq i64 %index.next, %n
1017  br i1 %c, label %exit, label %loop
1018
1019exit:
1020  ret void
1021}
1022
; matrix_mul_unsigned_and: the scalar multiplier is the i32 argument %val
; masked with 65535 (an 'and', not a zext from i16) and splatted to <4 x i32>.
; Each loop iteration multiplies the splat by two zero-extended <4 x i16> loads
; from %A and stores the two <4 x i32> products to %C.
; The SelectionDAG expectations splat the low half-word (dup v0.4h, w3) and use
; umull; the GlobalISel expectations keep the explicit mask, widen the loads
; with ushll and use a 32-bit vector mul.
; %min.iters.check, %1, %2 and %cmp.n are computed in the header but have no
; users in this function.
1023define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
1024; CHECK-SD-LABEL: matrix_mul_unsigned_and:
1025; CHECK-SD:       // %bb.0: // %vector.header
1026; CHECK-SD-NEXT:    dup v0.4h, w3
1027; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
1028; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
1029; CHECK-SD-NEXT:  .LBB10_1: // %vector.body
1030; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
1031; CHECK-SD-NEXT:    add x9, x2, w0, uxtw #1
1032; CHECK-SD-NEXT:    subs x8, x8, #8
1033; CHECK-SD-NEXT:    ldp d1, d2, [x9]
1034; CHECK-SD-NEXT:    add x9, x1, w0, uxtw #2
1035; CHECK-SD-NEXT:    add w0, w0, #8
1036; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
1037; CHECK-SD-NEXT:    umull v2.4s, v0.4h, v2.4h
1038; CHECK-SD-NEXT:    stp q1, q2, [x9]
1039; CHECK-SD-NEXT:    b.ne .LBB10_1
1040; CHECK-SD-NEXT:  // %bb.2: // %for.end12
1041; CHECK-SD-NEXT:    ret
1042;
1043; CHECK-GI-LABEL: matrix_mul_unsigned_and:
1044; CHECK-GI:       // %bb.0: // %vector.header
1045; CHECK-GI-NEXT:    and w8, w3, #0xffff
1046; CHECK-GI-NEXT:    dup v0.4s, w8
1047; CHECK-GI-NEXT:    mov w8, w0
1048; CHECK-GI-NEXT:    and x8, x8, #0xfffffff8
1049; CHECK-GI-NEXT:  .LBB10_1: // %vector.body
1050; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
1051; CHECK-GI-NEXT:    add x9, x2, w0, uxtw #1
1052; CHECK-GI-NEXT:    subs x8, x8, #8
1053; CHECK-GI-NEXT:    ldp d1, d2, [x9]
1054; CHECK-GI-NEXT:    add x9, x1, w0, uxtw #2
1055; CHECK-GI-NEXT:    add w0, w0, #8
1056; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
1057; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
1058; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
1059; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
1060; CHECK-GI-NEXT:    stp q1, q2, [x9]
1061; CHECK-GI-NEXT:    b.ne .LBB10_1
1062; CHECK-GI-NEXT:  // %bb.2: // %for.end12
1063; CHECK-GI-NEXT:    ret
1064vector.header:
1065  %conv4 = and i32 %val, 65535
1066  %wide.trip.count = zext i32 %N to i64
1067  %0 = add nsw i64 %wide.trip.count, -1
1068  %min.iters.check = icmp ult i32 %N, 8
1069  %1 = trunc i64 %0 to i32
1070  %2 = icmp ugt i64 %0, 4294967295
1071  %n.vec = and i64 %wide.trip.count, 4294967288
1072  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
1073  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1074  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
1075  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
1076  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
1077  br label %vector.body
1078
1079vector.body:                                      ; preds = %vector.header, %vector.body
1080  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
1081  %3 = trunc i64 %index to i32
1082  %4 = add i32 %N, %3
1083  %5 = zext i32 %4 to i64
1084  %6 = getelementptr inbounds i16, ptr %A, i64 %5
1085  %7 = bitcast ptr %6 to ptr
1086  %wide.load = load <4 x i16>, ptr %7, align 2
1087  %8 = getelementptr inbounds i16, ptr %6, i64 4
1088  %9 = bitcast ptr %8 to ptr
1089  %wide.load30 = load <4 x i16>, ptr %9, align 2
1090  %10 = zext <4 x i16> %wide.load to <4 x i32>
1091  %11 = zext <4 x i16> %wide.load30 to <4 x i32>
1092  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
1093  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
1094  %14 = getelementptr inbounds i32, ptr %C, i64 %5
1095  %15 = bitcast ptr %14 to ptr
1096  store <4 x i32> %12, ptr %15, align 4
1097  %16 = getelementptr inbounds i32, ptr %14, i64 4
1098  %17 = bitcast ptr %16 to ptr
1099  store <4 x i32> %13, ptr %17, align 4
1100  %index.next = add i64 %index, 8
1101  %18 = icmp eq i64 %index.next, %n.vec
1102  br i1 %18, label %for.end12, label %vector.body
1103
1104for.end12:                                        ; preds = %vector.body
1105  ret void
1106}
1107
; matrix_mul_unsigned_and_double: eight-lane version of the masked-multiplier
; case. %val is masked with 65535 and splatted to <8 x i32>; each iteration
; multiplies the splat by two zero-extended <8 x i16> loads from %A and stores
; the two <8 x i32> products to %C.
; The second load starts 4 i16 elements after the first, so the two <8 x i16>
; loads overlap (the expected code loads the second one from [x9, #8]).
; The SelectionDAG expectations splat the low half-word (dup v0.8h, w3) and use
; umull / umull2; the GlobalISel expectations keep the explicit mask, widen
; with ushll / ushll2 and use 32-bit vector muls.
; %min.iters.check, %1, %2 and %cmp.n are computed in the header but have no
; users in this function.
1108define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
1109; CHECK-SD-LABEL: matrix_mul_unsigned_and_double:
1110; CHECK-SD:       // %bb.0: // %vector.header
1111; CHECK-SD-NEXT:    dup v0.8h, w3
1112; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
1113; CHECK-SD-NEXT:    and x8, x0, #0xfffffff0
1114; CHECK-SD-NEXT:  .LBB11_1: // %vector.body
1115; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
1116; CHECK-SD-NEXT:    add x9, x2, w0, uxtw #1
1117; CHECK-SD-NEXT:    subs x8, x8, #16
1118; CHECK-SD-NEXT:    ldr q1, [x9]
1119; CHECK-SD-NEXT:    ldur q2, [x9, #8]
1120; CHECK-SD-NEXT:    add x9, x1, w0, uxtw #2
1121; CHECK-SD-NEXT:    add w0, w0, #16
1122; CHECK-SD-NEXT:    umull2 v3.4s, v0.8h, v1.8h
1123; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
1124; CHECK-SD-NEXT:    umull2 v4.4s, v0.8h, v2.8h
1125; CHECK-SD-NEXT:    umull v2.4s, v0.4h, v2.4h
1126; CHECK-SD-NEXT:    stp q1, q3, [x9]
1127; CHECK-SD-NEXT:    stp q2, q4, [x9, #32]
1128; CHECK-SD-NEXT:    b.ne .LBB11_1
1129; CHECK-SD-NEXT:  // %bb.2: // %for.end12
1130; CHECK-SD-NEXT:    ret
1131;
1132; CHECK-GI-LABEL: matrix_mul_unsigned_and_double:
1133; CHECK-GI:       // %bb.0: // %vector.header
1134; CHECK-GI-NEXT:    and w8, w3, #0xffff
1135; CHECK-GI-NEXT:    dup v0.4s, w8
1136; CHECK-GI-NEXT:    mov w8, w0
1137; CHECK-GI-NEXT:    and x8, x8, #0xfffffff0
1138; CHECK-GI-NEXT:  .LBB11_1: // %vector.body
1139; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
1140; CHECK-GI-NEXT:    add x9, x2, w0, uxtw #1
1141; CHECK-GI-NEXT:    subs x8, x8, #16
1142; CHECK-GI-NEXT:    ldr q1, [x9]
1143; CHECK-GI-NEXT:    ldur q2, [x9, #8]
1144; CHECK-GI-NEXT:    add x9, x1, w0, uxtw #2
1145; CHECK-GI-NEXT:    add w0, w0, #16
1146; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
1147; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
1148; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
1149; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
1150; CHECK-GI-NEXT:    mul v3.4s, v0.4s, v3.4s
1151; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
1152; CHECK-GI-NEXT:    mul v4.4s, v0.4s, v4.4s
1153; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
1154; CHECK-GI-NEXT:    stp q3, q1, [x9]
1155; CHECK-GI-NEXT:    stp q4, q2, [x9, #32]!
1156; CHECK-GI-NEXT:    b.ne .LBB11_1
1157; CHECK-GI-NEXT:  // %bb.2: // %for.end12
1158; CHECK-GI-NEXT:    ret
1159vector.header:
1160  %conv4 = and i32 %val, 65535
1161  %wide.trip.count = zext i32 %N to i64
1162  %0 = add nsw i64 %wide.trip.count, -1
1163  %min.iters.check = icmp ult i32 %N, 16
1164  %1 = trunc i64 %0 to i32
1165  %2 = icmp ugt i64 %0, 4294967295
1166  %n.vec = and i64 %wide.trip.count, 4294967280
1167  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0
1168  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1169  %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0
1170  %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer
1171  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
1172  br label %vector.body
1173
1174vector.body:                                      ; preds = %vector.header, %vector.body
1175  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
1176  %3 = trunc i64 %index to i32
1177  %4 = add i32 %N, %3
1178  %5 = zext i32 %4 to i64
1179  %6 = getelementptr inbounds i16, ptr %A, i64 %5
1180  %7 = bitcast ptr %6 to ptr
1181  %wide.load = load <8 x i16>, ptr %7, align 2
1182  %8 = getelementptr inbounds i16, ptr %6, i64 4
1183  %9 = bitcast ptr %8 to ptr
1184  %wide.load30 = load <8 x i16>, ptr %9, align 2
1185  %10 = zext <8 x i16> %wide.load to <8 x i32>
1186  %11 = zext <8 x i16> %wide.load30 to <8 x i32>
1187  %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10
1188  %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11
1189  %14 = getelementptr inbounds i32, ptr %C, i64 %5
1190  %15 = bitcast ptr %14 to ptr
1191  store <8 x i32> %12, ptr %15, align 4
1192  %16 = getelementptr inbounds i32, ptr %14, i64 8
1193  %17 = bitcast ptr %16 to ptr
1194  store <8 x i32> %13, ptr %17, align 4
1195  %index.next = add i64 %index, 16
1196  %18 = icmp eq i64 %index.next, %n.vec
1197  br i1 %18, label %for.end12, label %vector.body
1198
1199for.end12:                                        ; preds = %vector.body
1200  ret void
1201}
1202
; matrix_mul_signed_and: mixed-extension case. The multiplier is %val masked
; with 65535 (zero-extended in effect) and splatted to <4 x i32>, while the two
; <4 x i16> loads from %A are sign-extended before the multiply.
; Neither set of expectations uses a widening multiply here: the SelectionDAG
; output keeps the mask, widens the loads with sshll and uses a by-element mul
; on v0.s[0]; the GlobalISel output widens with sshll and uses a 32-bit vector
; mul.
; %min.iters.check, %1, %2 and %cmp.n are computed in the header but have no
; users in this function.
1203define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
1204; CHECK-SD-LABEL: matrix_mul_signed_and:
1205; CHECK-SD:       // %bb.0: // %vector.header
1206; CHECK-SD-NEXT:    and w9, w3, #0xffff
1207; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
1208; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
1209; CHECK-SD-NEXT:    fmov s0, w9
1210; CHECK-SD-NEXT:  .LBB12_1: // %vector.body
1211; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
1212; CHECK-SD-NEXT:    add x9, x2, w0, uxtw #1
1213; CHECK-SD-NEXT:    subs x8, x8, #8
1214; CHECK-SD-NEXT:    ldp d1, d2, [x9]
1215; CHECK-SD-NEXT:    add x9, x1, w0, uxtw #2
1216; CHECK-SD-NEXT:    add w0, w0, #8
1217; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
1218; CHECK-SD-NEXT:    sshll v2.4s, v2.4h, #0
1219; CHECK-SD-NEXT:    mul v1.4s, v1.4s, v0.s[0]
1220; CHECK-SD-NEXT:    mul v2.4s, v2.4s, v0.s[0]
1221; CHECK-SD-NEXT:    stp q1, q2, [x9]
1222; CHECK-SD-NEXT:    b.ne .LBB12_1
1223; CHECK-SD-NEXT:  // %bb.2: // %for.end12
1224; CHECK-SD-NEXT:    ret
1225;
1226; CHECK-GI-LABEL: matrix_mul_signed_and:
1227; CHECK-GI:       // %bb.0: // %vector.header
1228; CHECK-GI-NEXT:    and w8, w3, #0xffff
1229; CHECK-GI-NEXT:    dup v0.4s, w8
1230; CHECK-GI-NEXT:    mov w8, w0
1231; CHECK-GI-NEXT:    and x8, x8, #0xfffffff8
1232; CHECK-GI-NEXT:  .LBB12_1: // %vector.body
1233; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
1234; CHECK-GI-NEXT:    add x9, x2, w0, uxtw #1
1235; CHECK-GI-NEXT:    subs x8, x8, #8
1236; CHECK-GI-NEXT:    ldp d1, d2, [x9]
1237; CHECK-GI-NEXT:    add x9, x1, w0, uxtw #2
1238; CHECK-GI-NEXT:    add w0, w0, #8
1239; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
1240; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
1241; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
1242; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
1243; CHECK-GI-NEXT:    stp q1, q2, [x9]
1244; CHECK-GI-NEXT:    b.ne .LBB12_1
1245; CHECK-GI-NEXT:  // %bb.2: // %for.end12
1246; CHECK-GI-NEXT:    ret
1247vector.header:
1248  %conv4 = and i32 %val, 65535
1249  %wide.trip.count = zext i32 %N to i64
1250  %0 = add nsw i64 %wide.trip.count, -1
1251  %min.iters.check = icmp ult i32 %N, 8
1252  %1 = trunc i64 %0 to i32
1253  %2 = icmp ugt i64 %0, 4294967295
1254  %n.vec = and i64 %wide.trip.count, 4294967288
1255  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
1256  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1257  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
1258  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
1259  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
1260  br label %vector.body
1261
1262vector.body:                                      ; preds = %vector.header, %vector.body
1263  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
1264  %3 = trunc i64 %index to i32
1265  %4 = add i32 %N, %3
1266  %5 = zext i32 %4 to i64
1267  %6 = getelementptr inbounds i16, ptr %A, i64 %5
1268  %7 = bitcast ptr %6 to ptr
1269  %wide.load = load <4 x i16>, ptr %7, align 2
1270  %8 = getelementptr inbounds i16, ptr %6, i64 4
1271  %9 = bitcast ptr %8 to ptr
1272  %wide.load30 = load <4 x i16>, ptr %9, align 2
1273  %10 = sext <4 x i16> %wide.load to <4 x i32>
1274  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
1275  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
1276  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
1277  %14 = getelementptr inbounds i32, ptr %C, i64 %5
1278  %15 = bitcast ptr %14 to ptr
1279  store <4 x i32> %12, ptr %15, align 4
1280  %16 = getelementptr inbounds i32, ptr %14, i64 4
1281  %17 = bitcast ptr %16 to ptr
1282  store <4 x i32> %13, ptr %17, align 4
1283  %index.next = add i64 %index, 8
1284  %18 = icmp eq i64 %index.next, %n.vec
1285  br i1 %18, label %for.end12, label %vector.body
1286
1287for.end12:                                        ; preds = %vector.body
1288  ret void
1289}
1290
; matrix_mul_signed_and_double: eight-lane version of the mixed-extension case.
; The multiplier is %val masked with 65535 and splatted to <8 x i32>, while the
; two <8 x i16> loads from %A are sign-extended before the multiply.
; The second load starts 4 i16 elements after the first, so the two <8 x i16>
; loads overlap (the expected code loads the second one from [x9, #8]).
; Neither set of expectations uses a widening multiply here: the SelectionDAG
; output widens with sshll / sshll2 and uses by-element muls on v0.s[0]; the
; GlobalISel output widens with sshll / sshll2 and uses 32-bit vector muls.
; %min.iters.check, %1, %2 and %cmp.n are computed in the header but have no
; users in this function.
1291define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
1292; CHECK-SD-LABEL: matrix_mul_signed_and_double:
1293; CHECK-SD:       // %bb.0: // %vector.header
1294; CHECK-SD-NEXT:    and w9, w3, #0xffff
1295; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
1296; CHECK-SD-NEXT:    and x8, x0, #0xfffffff0
1297; CHECK-SD-NEXT:    fmov s0, w9
1298; CHECK-SD-NEXT:  .LBB13_1: // %vector.body
1299; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
1300; CHECK-SD-NEXT:    add x9, x2, w0, uxtw #1
1301; CHECK-SD-NEXT:    subs x8, x8, #16
1302; CHECK-SD-NEXT:    ldr q1, [x9]
1303; CHECK-SD-NEXT:    ldur q2, [x9, #8]
1304; CHECK-SD-NEXT:    add x9, x1, w0, uxtw #2
1305; CHECK-SD-NEXT:    add w0, w0, #16
1306; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
1307; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
1308; CHECK-SD-NEXT:    sshll2 v4.4s, v2.8h, #0
1309; CHECK-SD-NEXT:    sshll v2.4s, v2.4h, #0
1310; CHECK-SD-NEXT:    mul v3.4s, v3.4s, v0.s[0]
1311; CHECK-SD-NEXT:    mul v1.4s, v1.4s, v0.s[0]
1312; CHECK-SD-NEXT:    mul v4.4s, v4.4s, v0.s[0]
1313; CHECK-SD-NEXT:    mul v2.4s, v2.4s, v0.s[0]
1314; CHECK-SD-NEXT:    stp q1, q3, [x9]
1315; CHECK-SD-NEXT:    stp q2, q4, [x9, #32]
1316; CHECK-SD-NEXT:    b.ne .LBB13_1
1317; CHECK-SD-NEXT:  // %bb.2: // %for.end12
1318; CHECK-SD-NEXT:    ret
1319;
1320; CHECK-GI-LABEL: matrix_mul_signed_and_double:
1321; CHECK-GI:       // %bb.0: // %vector.header
1322; CHECK-GI-NEXT:    and w8, w3, #0xffff
1323; CHECK-GI-NEXT:    dup v0.4s, w8
1324; CHECK-GI-NEXT:    mov w8, w0
1325; CHECK-GI-NEXT:    and x8, x8, #0xfffffff0
1326; CHECK-GI-NEXT:  .LBB13_1: // %vector.body
1327; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
1328; CHECK-GI-NEXT:    add x9, x2, w0, uxtw #1
1329; CHECK-GI-NEXT:    subs x8, x8, #16
1330; CHECK-GI-NEXT:    ldr q1, [x9]
1331; CHECK-GI-NEXT:    ldur q2, [x9, #8]
1332; CHECK-GI-NEXT:    add x9, x1, w0, uxtw #2
1333; CHECK-GI-NEXT:    add w0, w0, #16
1334; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
1335; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
1336; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
1337; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
1338; CHECK-GI-NEXT:    mul v3.4s, v0.4s, v3.4s
1339; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
1340; CHECK-GI-NEXT:    mul v4.4s, v0.4s, v4.4s
1341; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
1342; CHECK-GI-NEXT:    stp q3, q1, [x9]
1343; CHECK-GI-NEXT:    stp q4, q2, [x9, #32]!
1344; CHECK-GI-NEXT:    b.ne .LBB13_1
1345; CHECK-GI-NEXT:  // %bb.2: // %for.end12
1346; CHECK-GI-NEXT:    ret
1347vector.header:
1348  %conv4 = and i32 %val, 65535
1349  %wide.trip.count = zext i32 %N to i64
1350  %0 = add nsw i64 %wide.trip.count, -1
1351  %min.iters.check = icmp ult i32 %N, 16
1352  %1 = trunc i64 %0 to i32
1353  %2 = icmp ugt i64 %0, 4294967295
1354  %n.vec = and i64 %wide.trip.count, 4294967280
1355  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0
1356  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1357  %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0
1358  %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer
1359  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
1360  br label %vector.body
1361
1362vector.body:                                      ; preds = %vector.header, %vector.body
1363  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
1364  %3 = trunc i64 %index to i32
1365  %4 = add i32 %N, %3
1366  %5 = zext i32 %4 to i64
1367  %6 = getelementptr inbounds i16, ptr %A, i64 %5
1368  %7 = bitcast ptr %6 to ptr
1369  %wide.load = load <8 x i16>, ptr %7, align 2
1370  %8 = getelementptr inbounds i16, ptr %6, i64 4
1371  %9 = bitcast ptr %8 to ptr
1372  %wide.load30 = load <8 x i16>, ptr %9, align 2
1373  %10 = sext <8 x i16> %wide.load to <8 x i32>
1374  %11 = sext <8 x i16> %wide.load30 to <8 x i32>
1375  %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10
1376  %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11
1377  %14 = getelementptr inbounds i32, ptr %C, i64 %5
1378  %15 = bitcast ptr %14 to ptr
1379  store <8 x i32> %12, ptr %15, align 4
1380  %16 = getelementptr inbounds i32, ptr %14, i64 8
1381  %17 = bitcast ptr %16 to ptr
1382  store <8 x i32> %13, ptr %17, align 4
1383  %index.next = add i64 %index, 16
1384  %18 = icmp eq i64 %index.next, %n.vec
1385  br i1 %18, label %for.end12, label %vector.body
1386
1387for.end12:                                        ; preds = %vector.body
1388  ret void
1389}
1390
1391declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
1392
1393;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
1394; CHECK: {{.*}}
1395