; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; Unsigned splat-times-vector multiply.
;   %val is zero-extended to i32 and splatted across <4 x i32>; each loop
;   iteration loads two <4 x i16> vectors from %A (element offset %N + %index),
;   zero-extends them, multiplies by the splat and stores both products to %C.
;   The SelectionDAG assertions below pin a narrow "dup v0.4h" feeding "umull";
;   the GlobalISel assertions pin a wide "dup v0.4s" with "ushll" + "mul".
;   %min.iters.check, %1, %2 and %cmp.n have no users in this function.
define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned:
; CHECK-SD:       // %bb.0: // %vector.header
; CHECK-SD-NEXT:    dup v0.4h, w3
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
; CHECK-SD-NEXT:  .LBB0_1: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT:    subs x8, x8, #8
; CHECK-SD-NEXT:    ldp d1, d2, [x9]
; CHECK-SD-NEXT:    add x9, x1, w0, uxtw #2
; CHECK-SD-NEXT:    add w0, w0, #8
; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT:    stp q1, q2, [x9]
; CHECK-SD-NEXT:    b.ne .LBB0_1
; CHECK-SD-NEXT:  // %bb.2: // %for.end12
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: matrix_mul_unsigned:
; CHECK-GI:       // %bb.0: // %vector.header
; CHECK-GI-NEXT:    and w8, w3, #0xffff
; CHECK-GI-NEXT:    dup v0.4s, w8
; CHECK-GI-NEXT:    mov w8, w0
; CHECK-GI-NEXT:    and x8, x8, #0xfffffff8
; CHECK-GI-NEXT:  .LBB0_1: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT:    subs x8, x8, #8
; CHECK-GI-NEXT:    ldp d1, d2, [x9]
; CHECK-GI-NEXT:    add x9, x1, w0, uxtw #2
; CHECK-GI-NEXT:    add w0, w0, #8
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    stp q1, q2, [x9]
; CHECK-GI-NEXT:    b.ne .LBB0_1
; CHECK-GI-NEXT:  // %bb.2: // %for.end12
; CHECK-GI-NEXT:    ret
vector.header:
  %conv4 = zext i16 %val to i32
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967288
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <4 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <4 x i16>, ptr %9, align 2
  %10 = zext <4 x i16> %wide.load to <4 x i32>
  %11 = zext <4 x i16> %wide.load30 to <4 x i32>
  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <4 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 4
  %17 = bitcast ptr %16 to ptr
  store <4 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 8
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}

; Signed counterpart of @matrix_mul_unsigned.
;   %val, the trip count, the element offset and the loaded <4 x i16> vectors
;   are all sign-extended, and the multiplies carry only nsw.
;   The SelectionDAG assertions below pin "dup v0.4h" feeding "smull";
;   the GlobalISel assertions pin "sshll" + "mul" on a "dup v0.4s" splat.
define void @matrix_mul_signed(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_signed:
; CHECK-SD:       // %bb.0: // %vector.header
; CHECK-SD-NEXT:    dup v0.4h, w3
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
; CHECK-SD-NEXT:  .LBB1_1: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    add x9, x2, w0, sxtw #1
; CHECK-SD-NEXT:    subs x8, x8, #8
; CHECK-SD-NEXT:    ldp d1, d2, [x9]
; CHECK-SD-NEXT:    add x9, x1, w0, sxtw #2
; CHECK-SD-NEXT:    add w0, w0, #8
; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    smull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT:    stp q1, q2, [x9]
; CHECK-SD-NEXT:    b.ne .LBB1_1
; CHECK-SD-NEXT:  // %bb.2: // %for.end12
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: matrix_mul_signed:
; CHECK-GI:       // %bb.0: // %vector.header
; CHECK-GI-NEXT:    sxth w9, w3
; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-GI-NEXT:    sxtw x8, w0
; CHECK-GI-NEXT:    dup v0.4s, w9
; CHECK-GI-NEXT:    and x8, x8, #0xfffffff8
; CHECK-GI-NEXT:  .LBB1_1: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    add x9, x2, w0, sxtw #1
; CHECK-GI-NEXT:    subs x8, x8, #8
; CHECK-GI-NEXT:    ldp d1, d2, [x9]
; CHECK-GI-NEXT:    add x9, x1, w0, sxtw #2
; CHECK-GI-NEXT:    add w0, w0, #8
; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    stp q1, q2, [x9]
; CHECK-GI-NEXT:    b.ne .LBB1_1
; CHECK-GI-NEXT:  // %bb.2: // %for.end12
; CHECK-GI-NEXT:    ret
vector.header:
  %conv4 = sext i16 %val to i32
  %wide.trip.count = sext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967288
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = sext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <4 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <4 x i16>, ptr %9, align 2
  %10 = sext <4 x i16> %wide.load to <4 x i32>
  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
  %12 = mul nsw <4 x i32> %broadcast.splat, %10
  %13 = mul nsw <4 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <4 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 4
  %17 = bitcast ptr %16 to ptr
  store <4 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 8
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}


; Multiply of a splat by a non-splat shuffle of a zero-extended scalar load.
;   The second multiplicand is a shufflevector with mask <0, 1, 0, 1>, so every
;   selected lane comes from the first source (%broadcast.splatinsert31), of
;   which only lane 0 is defined (the zero-extended i16 loaded from %A).
;   The SelectionDAG assertions below still pin "dup v1.4h" + "umull";
;   the GlobalISel assertions pin a "tbl" permute followed by "mul".
define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_double_shuffle:
; CHECK-SD:       // %bb.0: // %vector.header
; CHECK-SD-NEXT:    dup v0.4h, w3
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 killed $x0 def $x0
; CHECK-SD-NEXT:  .LBB2_1: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldrh w9, [x2], #16
; CHECK-SD-NEXT:    subs x8, x8, #8
; CHECK-SD-NEXT:    dup v1.4h, w9
; CHECK-SD-NEXT:    ubfiz x9, x0, #2, #32
; CHECK-SD-NEXT:    add w0, w0, #8
; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    str q1, [x1, x9]
; CHECK-SD-NEXT:    b.ne .LBB2_1
; CHECK-SD-NEXT:  // %bb.2: // %for.end12
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: matrix_mul_double_shuffle:
; CHECK-GI:       // %bb.0: // %vector.header
; CHECK-GI-NEXT:    and w9, w3, #0xffff
; CHECK-GI-NEXT:    adrp x8, .LCPI2_0
; CHECK-GI-NEXT:    dup v0.4s, w9
; CHECK-GI-NEXT:    mov w9, w0
; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-GI-NEXT:    and x8, x9, #0xfffffff8
; CHECK-GI-NEXT:  .LBB2_1: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldrh w9, [x2], #16
; CHECK-GI-NEXT:    subs x8, x8, #8
; CHECK-GI-NEXT:    mov v2.s[0], w9
; CHECK-GI-NEXT:    mov w9, w0
; CHECK-GI-NEXT:    add w0, w0, #8
; CHECK-GI-NEXT:    lsl x9, x9, #2
; CHECK-GI-NEXT:    tbl v2.16b, { v2.16b, v3.16b }, v1.16b
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    str q2, [x1, x9]
; CHECK-GI-NEXT:    b.ne .LBB2_1
; CHECK-GI-NEXT:  // %bb.2: // %for.end12
; CHECK-GI-NEXT:    ret
vector.header:
  %conv4 = zext i16 %val to i32
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967288
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %g = getelementptr inbounds i16, ptr %A, i64 %index
  %val1 = load i16, ptr %g
  %splat.input.ext = zext i16 %val1 to i32
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = mul nuw nsw <4 x i32> %broadcast.splat, %broadcast.splat32
  %7 = getelementptr inbounds i32, ptr %C, i64 %5
  %8 = bitcast ptr %7 to ptr
  store <4 x i32> %6, ptr %8, align 4
  %index.next = add i64 %index, 8
  %9 = icmp eq i64 %index.next, %n.vec
  br i1 %9, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}


; Signed widening multiply on <8 x i32> results, with a scalar remainder loop.
;   The vector loop handles 16 elements per iteration: two <8 x i16> loads from
;   %x are sign-extended, multiplied by a splat of sext(%y) and stored to %s.
;   Trip counts below 16 and leftover elements go through %for.body.
;   The SelectionDAG assertions below pin "smull"/"smull2" pairs on a
;   "dup v0.8h" splat; the GlobalISel assertions pin "sshll"/"sshll2" + "mul".
define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr noalias nocapture noundef writeonly %s, i32 noundef %n) {
; CHECK-SD-LABEL: larger_smull:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    cmp w3, #1
; CHECK-SD-NEXT:    b.lt .LBB3_8
; CHECK-SD-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-SD-NEXT:    cmp w3, #15
; CHECK-SD-NEXT:    mov w8, w3
; CHECK-SD-NEXT:    b.hi .LBB3_3
; CHECK-SD-NEXT:  // %bb.2:
; CHECK-SD-NEXT:    mov x9, xzr
; CHECK-SD-NEXT:    b .LBB3_6
; CHECK-SD-NEXT:  .LBB3_3: // %vector.ph
; CHECK-SD-NEXT:    dup v0.8h, w1
; CHECK-SD-NEXT:    and x9, x8, #0xfffffff0
; CHECK-SD-NEXT:    add x10, x2, #32
; CHECK-SD-NEXT:    add x11, x0, #16
; CHECK-SD-NEXT:    mov x12, x9
; CHECK-SD-NEXT:  .LBB3_4: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldp q1, q2, [x11, #-16]
; CHECK-SD-NEXT:    subs x12, x12, #16
; CHECK-SD-NEXT:    add x11, x11, #32
; CHECK-SD-NEXT:    smull2 v3.4s, v0.8h, v1.8h
; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    smull2 v4.4s, v0.8h, v2.8h
; CHECK-SD-NEXT:    smull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT:    stp q1, q3, [x10, #-32]
; CHECK-SD-NEXT:    stp q2, q4, [x10], #64
; CHECK-SD-NEXT:    b.ne .LBB3_4
; CHECK-SD-NEXT:  // %bb.5: // %middle.block
; CHECK-SD-NEXT:    cmp x9, x8
; CHECK-SD-NEXT:    b.eq .LBB3_8
; CHECK-SD-NEXT:  .LBB3_6: // %for.body.preheader1
; CHECK-SD-NEXT:    sxth w10, w1
; CHECK-SD-NEXT:    add x11, x2, x9, lsl #2
; CHECK-SD-NEXT:    add x12, x0, x9, lsl #1
; CHECK-SD-NEXT:    sub x8, x8, x9
; CHECK-SD-NEXT:  .LBB3_7: // %for.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldrsh w9, [x12], #2
; CHECK-SD-NEXT:    subs x8, x8, #1
; CHECK-SD-NEXT:    mul w9, w9, w10
; CHECK-SD-NEXT:    str w9, [x11], #4
; CHECK-SD-NEXT:    b.ne .LBB3_7
; CHECK-SD-NEXT:  .LBB3_8: // %for.cond.cleanup
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: larger_smull:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    cmp w3, #0
; CHECK-GI-NEXT:    b.le .LBB3_7
; CHECK-GI-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-GI-NEXT:    sxth w8, w1
; CHECK-GI-NEXT:    mov x9, xzr
; CHECK-GI-NEXT:    cmp w3, #16
; CHECK-GI-NEXT:    mov w10, w3
; CHECK-GI-NEXT:    b.lo .LBB3_5
; CHECK-GI-NEXT:  // %bb.2: // %vector.ph
; CHECK-GI-NEXT:    dup v0.4s, w8
; CHECK-GI-NEXT:    and x9, x10, #0xfffffff0
; CHECK-GI-NEXT:    add x11, x2, #32
; CHECK-GI-NEXT:    add x12, x0, #16
; CHECK-GI-NEXT:    mov x13, x9
; CHECK-GI-NEXT:  .LBB3_3: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldp q1, q2, [x12, #-16]
; CHECK-GI-NEXT:    mov x14, x11
; CHECK-GI-NEXT:    subs x13, x13, #16
; CHECK-GI-NEXT:    add x12, x12, #32
; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT:    mul v3.4s, v0.4s, v3.4s
; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v4.4s, v0.4s, v4.4s
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    stp q3, q1, [x14, #-32]!
; CHECK-GI-NEXT:    stp q4, q2, [x11], #64
; CHECK-GI-NEXT:    b.ne .LBB3_3
; CHECK-GI-NEXT:  // %bb.4: // %middle.block
; CHECK-GI-NEXT:    cmp x9, x10
; CHECK-GI-NEXT:    b.eq .LBB3_7
; CHECK-GI-NEXT:  .LBB3_5: // %for.body.preheader1
; CHECK-GI-NEXT:    add x11, x2, x9, lsl #2
; CHECK-GI-NEXT:    add x12, x0, x9, lsl #1
; CHECK-GI-NEXT:    sub x9, x10, x9
; CHECK-GI-NEXT:  .LBB3_6: // %for.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldrsh w10, [x12], #2
; CHECK-GI-NEXT:    subs x9, x9, #1
; CHECK-GI-NEXT:    mul w10, w10, w8
; CHECK-GI-NEXT:    str w10, [x11], #4
; CHECK-GI-NEXT:    b.ne .LBB3_6
; CHECK-GI-NEXT:  .LBB3_7: // %for.cond.cleanup
; CHECK-GI-NEXT:    ret
entry:
  %conv1 = sext i16 %y to i32
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  %min.iters.check = icmp ult i32 %n, 16
  br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i64 %wide.trip.count, 4294967280
  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
  %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <8 x i16>, ptr %1, align 2
  %2 = getelementptr inbounds i16, ptr %0, i64 8
  %3 = bitcast ptr %2 to ptr
  %wide.load11 = load <8 x i16>, ptr %3, align 2
  %4 = sext <8 x i16> %wide.load to <8 x i32>
  %5 = sext <8 x i16> %wide.load11 to <8 x i32>
  %6 = mul nsw <8 x i32> %broadcast.splat, %4
  %7 = mul nsw <8 x i32> %broadcast.splat13, %5
  %8 = getelementptr inbounds i32, ptr %s, i64 %index
  %9 = bitcast ptr %8 to ptr
  store <8 x i32> %6, ptr %9, align 4
  %10 = getelementptr inbounds i32, ptr %8, i64 8
  %11 = bitcast ptr %10 to ptr
  store <8 x i32> %7, ptr %11, align 4
  %index.next = add nuw i64 %index, 16
  %12 = icmp eq i64 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14

for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader14, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
  %arrayidx = getelementptr inbounds i16, ptr %x, i64 %indvars.iv
  %13 = load i16, ptr %arrayidx, align 2
  %conv = sext i16 %13 to i32
  %mul = mul nsw i32 %conv, %conv1
  %arrayidx3 = getelementptr inbounds i32, ptr %s, i64 %indvars.iv
  store i32 %mul, ptr %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}


; Unsigned counterpart of @larger_smull.
;   Loads and %y are zero-extended and the multiplies carry only nuw.
;   The SelectionDAG assertions below pin "umull"/"umull2" pairs on a
;   "dup v0.8h" splat; the GlobalISel assertions pin "ushll"/"ushll2" + "mul",
;   with the splat ("and" + "dup v2.4s") emitted inside the vector loop.
define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr noalias nocapture noundef writeonly %s, i32 noundef %n) {
; CHECK-SD-LABEL: larger_umull:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    cmp w3, #1
; CHECK-SD-NEXT:    b.lt .LBB4_8
; CHECK-SD-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-SD-NEXT:    cmp w3, #15
; CHECK-SD-NEXT:    mov w8, w3
; CHECK-SD-NEXT:    b.hi .LBB4_3
; CHECK-SD-NEXT:  // %bb.2:
; CHECK-SD-NEXT:    mov x9, xzr
; CHECK-SD-NEXT:    b .LBB4_6
; CHECK-SD-NEXT:  .LBB4_3: // %vector.ph
; CHECK-SD-NEXT:    dup v0.8h, w1
; CHECK-SD-NEXT:    and x9, x8, #0xfffffff0
; CHECK-SD-NEXT:    add x10, x2, #32
; CHECK-SD-NEXT:    add x11, x0, #16
; CHECK-SD-NEXT:    mov x12, x9
; CHECK-SD-NEXT:  .LBB4_4: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldp q1, q2, [x11, #-16]
; CHECK-SD-NEXT:    subs x12, x12, #16
; CHECK-SD-NEXT:    add x11, x11, #32
; CHECK-SD-NEXT:    umull2 v3.4s, v0.8h, v1.8h
; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    umull2 v4.4s, v0.8h, v2.8h
; CHECK-SD-NEXT:    umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT:    stp q1, q3, [x10, #-32]
; CHECK-SD-NEXT:    stp q2, q4, [x10], #64
; CHECK-SD-NEXT:    b.ne .LBB4_4
; CHECK-SD-NEXT:  // %bb.5: // %middle.block
; CHECK-SD-NEXT:    cmp x9, x8
; CHECK-SD-NEXT:    b.eq .LBB4_8
; CHECK-SD-NEXT:  .LBB4_6: // %for.body.preheader1
; CHECK-SD-NEXT:    add x10, x2, x9, lsl #2
; CHECK-SD-NEXT:    add x11, x0, x9, lsl #1
; CHECK-SD-NEXT:    and w12, w1, #0xffff
; CHECK-SD-NEXT:    sub x8, x8, x9
; CHECK-SD-NEXT:  .LBB4_7: // %for.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldrh w9, [x11], #2
; CHECK-SD-NEXT:    subs x8, x8, #1
; CHECK-SD-NEXT:    mul w9, w9, w12
; CHECK-SD-NEXT:    str w9, [x10], #4
; CHECK-SD-NEXT:    b.ne .LBB4_7
; CHECK-SD-NEXT:  .LBB4_8: // %for.cond.cleanup
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: larger_umull:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    cmp w3, #0
; CHECK-GI-NEXT:    b.le .LBB4_7
; CHECK-GI-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-GI-NEXT:    mov x8, xzr
; CHECK-GI-NEXT:    cmp w3, #16
; CHECK-GI-NEXT:    mov w9, w3
; CHECK-GI-NEXT:    b.lo .LBB4_5
; CHECK-GI-NEXT:  // %bb.2: // %vector.ph
; CHECK-GI-NEXT:    and x8, x9, #0xfffffff0
; CHECK-GI-NEXT:    add x10, x2, #32
; CHECK-GI-NEXT:    add x11, x0, #16
; CHECK-GI-NEXT:    mov x12, x8
; CHECK-GI-NEXT:  .LBB4_3: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldp q0, q1, [x11, #-16]
; CHECK-GI-NEXT:    and w13, w1, #0xffff
; CHECK-GI-NEXT:    dup v2.4s, w13
; CHECK-GI-NEXT:    mov x13, x10
; CHECK-GI-NEXT:    subs x12, x12, #16
; CHECK-GI-NEXT:    add x11, x11, #32
; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    ushll v4.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    mul v3.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
; CHECK-GI-NEXT:    mul v4.4s, v2.4s, v4.4s
; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    stp q3, q0, [x13, #-32]!
; CHECK-GI-NEXT:    stp q4, q1, [x10], #64
; CHECK-GI-NEXT:    b.ne .LBB4_3
; CHECK-GI-NEXT:  // %bb.4: // %middle.block
; CHECK-GI-NEXT:    cmp x8, x9
; CHECK-GI-NEXT:    b.eq .LBB4_7
; CHECK-GI-NEXT:  .LBB4_5: // %for.body.preheader1
; CHECK-GI-NEXT:    add x10, x2, x8, lsl #2
; CHECK-GI-NEXT:    add x11, x0, x8, lsl #1
; CHECK-GI-NEXT:    and w12, w1, #0xffff
; CHECK-GI-NEXT:    sub x8, x9, x8
; CHECK-GI-NEXT:  .LBB4_6: // %for.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldrh w9, [x11], #2
; CHECK-GI-NEXT:    subs x8, x8, #1
; CHECK-GI-NEXT:    mul w9, w9, w12
; CHECK-GI-NEXT:    str w9, [x10], #4
; CHECK-GI-NEXT:    b.ne .LBB4_6
; CHECK-GI-NEXT:  .LBB4_7: // %for.cond.cleanup
; CHECK-GI-NEXT:    ret
entry:
  %conv1 = zext i16 %y to i32
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  %min.iters.check = icmp ult i32 %n, 16
  br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i64 %wide.trip.count, 4294967280
  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
  %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <8 x i16>, ptr %1, align 2
  %2 = getelementptr inbounds i16, ptr %0, i64 8
  %3 = bitcast ptr %2 to ptr
  %wide.load11 = load <8 x i16>, ptr %3, align 2
  %4 = zext <8 x i16> %wide.load to <8 x i32>
  %5 = zext <8 x i16> %wide.load11 to <8 x i32>
  %6 = mul nuw <8 x i32> %broadcast.splat, %4
  %7 = mul nuw <8 x i32> %broadcast.splat13, %5
  %8 = getelementptr inbounds i32, ptr %s, i64 %index
  %9 = bitcast ptr %8 to ptr
  store <8 x i32> %6, ptr %9, align 4
  %10 = getelementptr inbounds i32, ptr %8, i64 8
  %11 = bitcast ptr %10 to ptr
  store <8 x i32> %7, ptr %11, align 4
  %index.next = add nuw i64 %index, 16
  %12 = icmp eq i64 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14

for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader14, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
  %arrayidx = getelementptr inbounds i16, ptr %x, i64 %indvars.iv
  %13 = load i16, ptr %arrayidx, align 2
  %conv = zext i16 %13 to i32
  %mul = mul nuw i32 %conv, %conv1
  %arrayidx3 = getelementptr inbounds i32, ptr %s, i64 %indvars.iv
  store i32 %mul, ptr %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}


; Multiply-accumulate reduction mixing a sign-extended splat with zero-extended loads.
;   %B is sign-extended from i8 to i16 and splatted; two <8 x i8> loads from %A
;   are zero-extended to <8 x i16>, multiplied by the splat and added into two
;   vector accumulators, which %middle.block combines with
;   llvm.vector.reduce.add. A scalar loop covers trip counts below 16 and the
;   remainder; the i16 sum is returned.
;   The SelectionDAG assertions below pin "ushll" + by-element "mla ... v2.h[0]";
;   the GlobalISel assertions pin "dup v2.8h" + vector "mla".
define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, i8 noundef %B, i32 noundef %n) {
; CHECK-SD-LABEL: red_mla_dup_ext_u8_s8_s16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    cbz w2, .LBB5_3
; CHECK-SD-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-SD-NEXT:    sxtb w9, w1
; CHECK-SD-NEXT:    cmp w2, #15
; CHECK-SD-NEXT:    mov w10, w2
; CHECK-SD-NEXT:    b.hi .LBB5_4
; CHECK-SD-NEXT:  // %bb.2:
; CHECK-SD-NEXT:    mov x11, xzr
; CHECK-SD-NEXT:    mov w8, wzr
; CHECK-SD-NEXT:    b .LBB5_7
; CHECK-SD-NEXT:  .LBB5_3:
; CHECK-SD-NEXT:    mov w8, wzr
; CHECK-SD-NEXT:    mov w0, w8
; CHECK-SD-NEXT:    ret
; CHECK-SD-NEXT:  .LBB5_4: // %vector.ph
; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
; CHECK-SD-NEXT:    and x11, x10, #0xfffffff0
; CHECK-SD-NEXT:    fmov s2, w9
; CHECK-SD-NEXT:    add x8, x0, #8
; CHECK-SD-NEXT:    mov x12, x11
; CHECK-SD-NEXT:  .LBB5_5: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldp d3, d4, [x8, #-8]
; CHECK-SD-NEXT:    subs x12, x12, #16
; CHECK-SD-NEXT:    add x8, x8, #16
; CHECK-SD-NEXT:    ushll v3.8h, v3.8b, #0
; CHECK-SD-NEXT:    ushll v4.8h, v4.8b, #0
; CHECK-SD-NEXT:    mla v0.8h, v3.8h, v2.h[0]
; CHECK-SD-NEXT:    mla v1.8h, v4.8h, v2.h[0]
; CHECK-SD-NEXT:    b.ne .LBB5_5
; CHECK-SD-NEXT:  // %bb.6: // %middle.block
; CHECK-SD-NEXT:    add v0.8h, v1.8h, v0.8h
; CHECK-SD-NEXT:    cmp x11, x10
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    b.eq .LBB5_9
; CHECK-SD-NEXT:  .LBB5_7: // %for.body.preheader1
; CHECK-SD-NEXT:    sub x10, x10, x11
; CHECK-SD-NEXT:    add x11, x0, x11
; CHECK-SD-NEXT:  .LBB5_8: // %for.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldrb w12, [x11], #1
; CHECK-SD-NEXT:    subs x10, x10, #1
; CHECK-SD-NEXT:    madd w8, w12, w9, w8
; CHECK-SD-NEXT:    b.ne .LBB5_8
; CHECK-SD-NEXT:  .LBB5_9: // %for.cond.cleanup
; CHECK-SD-NEXT:    mov w0, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    cbz w2, .LBB5_3
; CHECK-GI-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-GI-NEXT:    cmp w2, #16
; CHECK-GI-NEXT:    mov w8, w2
; CHECK-GI-NEXT:    b.hs .LBB5_4
; CHECK-GI-NEXT:  // %bb.2:
; CHECK-GI-NEXT:    mov w10, #0 // =0x0
; CHECK-GI-NEXT:    mov x9, xzr
; CHECK-GI-NEXT:    fmov s0, w10
; CHECK-GI-NEXT:    b .LBB5_8
; CHECK-GI-NEXT:  .LBB5_3:
; CHECK-GI-NEXT:    mov w0, wzr
; CHECK-GI-NEXT:    ret
; CHECK-GI-NEXT:  .LBB5_4: // %vector.ph
; CHECK-GI-NEXT:    lsl w9, w1, #8
; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
; CHECK-GI-NEXT:    add x10, x0, #8
; CHECK-GI-NEXT:    sbfx w9, w9, #8, #8
; CHECK-GI-NEXT:    dup v2.8h, w9
; CHECK-GI-NEXT:    and x9, x8, #0xfffffff0
; CHECK-GI-NEXT:    mov x11, x9
; CHECK-GI-NEXT:  .LBB5_5: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldp d3, d4, [x10, #-8]
; CHECK-GI-NEXT:    subs x11, x11, #16
; CHECK-GI-NEXT:    add x10, x10, #16
; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
; CHECK-GI-NEXT:    ushll v4.8h, v4.8b, #0
; CHECK-GI-NEXT:    mla v0.8h, v2.8h, v3.8h
; CHECK-GI-NEXT:    mla v1.8h, v2.8h, v4.8h
; CHECK-GI-NEXT:    b.ne .LBB5_5
; CHECK-GI-NEXT:  // %bb.6: // %middle.block
; CHECK-GI-NEXT:    add v0.8h, v1.8h, v0.8h
; CHECK-GI-NEXT:    cmp x9, x8
; CHECK-GI-NEXT:    addv h0, v0.8h
; CHECK-GI-NEXT:    b.ne .LBB5_8
; CHECK-GI-NEXT:  // %bb.7:
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
; CHECK-GI-NEXT:  .LBB5_8: // %for.body.preheader1
; CHECK-GI-NEXT:    sxtb w10, w1
; CHECK-GI-NEXT:    sub x8, x8, x9
; CHECK-GI-NEXT:    add x9, x0, x9
; CHECK-GI-NEXT:  .LBB5_9: // %for.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldrb w11, [x9], #1
; CHECK-GI-NEXT:    fmov w12, s0
; CHECK-GI-NEXT:    subs x8, x8, #1
; CHECK-GI-NEXT:    mul w11, w11, w10
; CHECK-GI-NEXT:    add w0, w11, w12, uxth
; CHECK-GI-NEXT:    fmov s0, w0
; CHECK-GI-NEXT:    b.ne .LBB5_9
; CHECK-GI-NEXT:  // %bb.10: // %for.cond.cleanup
; CHECK-GI-NEXT:    ret
entry:
  %conv2 = sext i8 %B to i16
  %cmp10.not = icmp eq i32 %n, 0
  br i1 %cmp10.not, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  %min.iters.check = icmp ult i32 %n, 16
  br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i64 %wide.trip.count, 4294967280
  %broadcast.splatinsert = insertelement <8 x i16> poison, i16 %conv2, i64 0
  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
  %broadcast.splatinsert15 = insertelement <8 x i16> poison, i16 %conv2, i64 0
  %broadcast.splat16 = shufflevector <8 x i16> %broadcast.splatinsert15, <8 x i16> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %8, %vector.body ]
  %vec.phi13 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %A, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <8 x i8>, ptr %1, align 1
  %2 = getelementptr inbounds i8, ptr %0, i64 8
  %3 = bitcast ptr %2 to ptr
  %wide.load14 = load <8 x i8>, ptr %3, align 1
  %4 = zext <8 x i8> %wide.load to <8 x i16>
  %5 = zext <8 x i8> %wide.load14 to <8 x i16>
  %6 = mul nsw <8 x i16> %broadcast.splat, %4
  %7 = mul nsw <8 x i16> %broadcast.splat16, %5
  %8 = add <8 x i16> %6, %vec.phi
  %9 = add <8 x i16> %7, %vec.phi13
  %index.next = add nuw i64 %index, 16
  %10 = icmp eq i64 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %bin.rdx = add <8 x i16> %9, %8
  %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx)
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17

for.body.preheader17:                             ; preds = %for.body.preheader, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %s.011.ph = phi i16 [ 0, %for.body.preheader ], [ %11, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %11, %middle.block ], [ %add, %for.body ]
  ret i16 %s.0.lcssa

for.body:                                         ; preds = %for.body.preheader17, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
  %s.011 = phi i16 [ %add, %for.body ], [ %s.011.ph, %for.body.preheader17 ]
  %arrayidx = getelementptr inbounds i8, ptr %A, i64 %indvars.iv
  %12 = load i8, ptr %arrayidx, align 1
  %13 = zext i8 %12 to i16
  %mul = mul nsw i16 %13, %conv2
  %add = add i16 %mul, %s.011
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

define
void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; Multiply-by-splat loop, zero-extended, lane 1: %a is zero-extended from
; <2 x i32> to <2 x i64> and lane 1 is splatted in %entry. Each iteration
; loads <2 x i32>, zero-extends it, multiplies by the splat, shifts right
; arithmetically by 15 and truncates back to <2 x i32>. %index advances by 8
; until it equals %n.
; The SelectionDAG expectations use a by-element umull (v0.s[1]); the
; GlobalISel expectations multiply the two 64-bit lanes in general registers.
; CHECK-SD-LABEL: sink_v2z64_1:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: .LBB6_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
; CHECK-SD-NEXT: add x8, x8, #8
; CHECK-SD-NEXT: umull v1.2d, v1.2s, v0.s[1]
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-SD-NEXT: str d1, [x0], #32
; CHECK-SD-NEXT: b.ne .LBB6_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sink_v2z64_1:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: mov x9, v0.d[1]
; CHECK-GI-NEXT: fmov x10, d0
; CHECK-GI-NEXT: .LBB6_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
; CHECK-GI-NEXT: add x8, x8, #8
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: fmov x11, d0
; CHECK-GI-NEXT: mov x12, v0.d[1]
; CHECK-GI-NEXT: mul x11, x11, x10
; CHECK-GI-NEXT: mul x12, x12, x9
; CHECK-GI-NEXT: mov v0.d[0], x11
; CHECK-GI-NEXT: mov v0.d[1], x12
; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #15
; CHECK-GI-NEXT: str d0, [x0], #32
; CHECK-GI-NEXT: b.ne .LBB6_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
  %ext = zext <2 x i32> %a to <2 x i64>
  %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
  br label %loop

loop:
  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
  %g = getelementptr inbounds i32, ptr %p, i64 %index
  %gb = bitcast ptr %g to ptr
  %l = load <2 x i32>, ptr %gb, align 4
  %e = zext <2 x i32> %l to <2 x i64>
  %m = mul <2 x i64> %e, %broadcast.splat
  %s = ashr <2 x i64> %m, <i64 15, i64 15>
  %t = trunc <2 x i64> %s to <2 x i32>
  %h = getelementptr inbounds i32, ptr %d, i64 %index
  ; NOTE(review): %hb is built from %g, not %h, so the store goes back through
  ; the load address and %h is unused. The expectations above (load and store
  ; both via x0) were generated from this form.
  %hb = bitcast ptr %g to ptr
  store <2 x i32> %t, ptr %hb, align 4
  %index.next = add nuw i64 %index, 8
  %c = icmp eq i64 %index.next, %n
  br i1 %c, label %exit, label %loop

exit:
  ret void
}

; Sign-extended, four-lane variant: %a is sign-extended from <2 x i32> to
; <2 x i64> and lane 1 is splatted to four lanes. Each iteration loads
; <4 x i32>, sign-extends it, multiplies by the splat, shifts right
; arithmetically by 15 and truncates back to <4 x i32>.
; The SelectionDAG expectations use smull/smull2 by element (v0.s[1]); the
; GlobalISel expectations multiply each 64-bit lane in general registers.
define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-LABEL: sink_v4i64_1:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: .LBB7_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
; CHECK-SD-NEXT: add x8, x8, #8
; CHECK-SD-NEXT: smull v2.2d, v1.2s, v0.s[1]
; CHECK-SD-NEXT: smull2 v1.2d, v1.4s, v0.s[1]
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #15
; CHECK-SD-NEXT: shrn2 v2.4s, v1.2d, #15
; CHECK-SD-NEXT: str q2, [x0], #32
; CHECK-SD-NEXT: b.ne .LBB7_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sink_v4i64_1:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: mov x9, v0.d[1]
; CHECK-GI-NEXT: fmov x10, d0
; CHECK-GI-NEXT: .LBB7_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q0, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
; CHECK-GI-NEXT: add x8, x8, #8
; CHECK-GI-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0
; CHECK-GI-NEXT: fmov x11, d1
; CHECK-GI-NEXT: mov x12, v1.d[1]
; CHECK-GI-NEXT: fmov x13, d0
; CHECK-GI-NEXT: mov x14, v0.d[1]
; CHECK-GI-NEXT: mul x11, x11, x10
; CHECK-GI-NEXT: mul x13, x13, x10
; CHECK-GI-NEXT: mul x12, x12, x9
; CHECK-GI-NEXT: mov v0.d[0], x11
; CHECK-GI-NEXT: mul x11, x14, x9
; CHECK-GI-NEXT: mov v1.d[0], x13
; CHECK-GI-NEXT: mov v0.d[1], x12
; CHECK-GI-NEXT: mov v1.d[1], x11
; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #15
; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #15
; CHECK-GI-NEXT: str q0, [x0], #32
; CHECK-GI-NEXT: b.ne .LBB7_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
  %ext = sext <2 x i32> %a to <2 x i64>
  %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  br label %loop

loop:
  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
  %g = getelementptr inbounds i32, ptr %p, i64 %index
  %gb = bitcast ptr %g to ptr
  %l = load <4 x i32>, ptr %gb, align 4
  %e = sext <4 x i32> %l to <4 x i64>
  %m = mul <4 x i64> %e, %broadcast.splat
  %s = ashr <4 x i64> %m, <i64 15, i64 15, i64 15, i64 15>
  %t = trunc <4 x i64> %s to <4 x i32>
  %h = getelementptr inbounds i32, ptr %d, i64 %index
  ; NOTE(review): %hb is built from %g, not %h; %h is unused and the store
  ; goes through the load address, as the expectations above reflect.
  %hb = bitcast ptr %g to ptr
  store <4 x i32> %t, ptr %hb, align 4
  %index.next = add nuw i64 %index, 8
  %c = icmp eq i64 %index.next, %n
  br i1 %c, label %exit, label %loop

exit:
  ret void
}

; Zero-extended i8 -> i16 variant, lane 0: %a is zero-extended from <16 x i8>
; to <16 x i16> and lane 0 is splatted to eight lanes. Each iteration loads
; <8 x i8>, zero-extends it, multiplies by the splat, shifts right
; arithmetically by 15 and truncates back to <8 x i8>.
; The SelectionDAG expectations use umull on 8b operands followed by cmlt;
; the GlobalISel expectations use ushll, mul by element (v0.h[0]) and sshr.
define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-LABEL: sink_v8z16_0:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.8b, v0.b[0]
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: .LBB8_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
; CHECK-SD-NEXT: add x8, x8, #8
; CHECK-SD-NEXT: umull v1.8h, v1.8b, v0.8b
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT: xtn v1.8b, v1.8h
; CHECK-SD-NEXT: str d1, [x0], #32
; CHECK-SD-NEXT: b.ne .LBB8_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sink_v8z16_0:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: .LBB8_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
; CHECK-GI-NEXT: add x8, x8, #8
; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT: mul v1.8h, v1.8h, v0.h[0]
; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: str d1, [x0], #32
; CHECK-GI-NEXT: b.ne .LBB8_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
  %ext = zext <16 x i8> %a to <16 x i16>
  %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  br label %loop

loop:
  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
  ; The address is scaled as i32 (4 bytes per index step) although the
  ; loaded elements are i8.
  %g = getelementptr inbounds i32, ptr %p, i64 %index
  %gb = bitcast ptr %g to ptr
  %l = load <8 x i8>, ptr %gb, align 4
  %e = zext <8 x i8> %l to <8 x i16>
  %m = mul <8 x i16> %e, %broadcast.splat
  %s = ashr <8 x i16> %m, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %t = trunc <8 x i16> %s to <8 x i8>
  %h = getelementptr inbounds i32, ptr %d, i64 %index
  ; NOTE(review): %hb is built from %g, not %h; %h is unused and the store
  ; goes through the load address, as the expectations above reflect.
  %hb = bitcast ptr %g to ptr
  store <8 x i8> %t, ptr %hb, align 4
  %index.next = add nuw i64 %index, 8
  %c = icmp eq i64 %index.next, %n
  br i1 %c, label %exit, label %loop

exit:
  ret void
}

; Sign-extended i8 -> i16 variant, sixteen lanes: %a is sign-extended from
; <16 x i8> to <16 x i16> and lane 10 is splatted to all sixteen lanes. Each
; iteration loads <16 x i8>, sign-extends it, multiplies by the splat, shifts
; right arithmetically by 15 and truncates back to <16 x i8>.
; The SelectionDAG expectations use smull/smull2 followed by cmlt; the
; GlobalISel expectations use sshll/sshll2, mul by element (v0.h[2]) and sshr.
define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-LABEL: sink_v16s16_8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.16b, v0.b[10]
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: .LBB9_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
; CHECK-SD-NEXT: add x8, x8, #8
; CHECK-SD-NEXT: smull v2.8h, v1.8b, v0.8b
; CHECK-SD-NEXT: smull2 v1.8h, v1.16b, v0.16b
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: str q1, [x0], #32
; CHECK-SD-NEXT: b.ne .LBB9_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: sink_v16s16_8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: .LBB9_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
; CHECK-GI-NEXT: add x8, x8, #8
; CHECK-GI-NEXT: sshll v2.8h, v1.8b, #0
; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-NEXT: mul v2.8h, v2.8h, v0.h[2]
; CHECK-GI-NEXT: mul v1.8h, v1.8h, v0.h[2]
; CHECK-GI-NEXT: sshr v2.8h, v2.8h, #15
; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15
; CHECK-GI-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-GI-NEXT: str q1, [x0], #32
; CHECK-GI-NEXT: b.ne .LBB9_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
  %ext = sext <16 x i8> %a to <16 x i16>
  %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <16 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  br label %loop

loop:
  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
  ; The address is scaled as i32 (4 bytes per index step) although the
  ; loaded elements are i8.
  %g = getelementptr inbounds i32, ptr %p, i64 %index
  %gb = bitcast ptr %g to ptr
  %l = load <16 x i8>, ptr %gb, align 4
  %e = sext <16 x i8> %l to <16 x i16>
  %m = mul <16 x i16> %e, %broadcast.splat
  %s = ashr <16 x i16> %m, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %t = trunc <16 x i16> %s to <16 x i8>
  %h = getelementptr inbounds i32, ptr %d, i64 %index
  ; NOTE(review): %hb is built from %g, not %h; %h is unused and the store
  ; goes through the load address, as the expectations above reflect.
  %hb = bitcast ptr %g to ptr
  store <16 x i8> %t, ptr %hb, align 4
  %index.next = add nuw i64 %index, 8
  %c = icmp eq i64 %index.next, %n
  br i1 %c, label %exit, label %loop

exit:
  ret void
}

; Splat of a value masked to 16 bits (`and i32 %val, 65535`) multiplied with
; zero-extended <4 x i16> loads. Each iteration loads two adjacent <4 x i16>
; vectors from %A at element offset zext(%N + trunc(%index)), widens them to
; <4 x i32>, multiplies by the splat and stores both results to %C at the same
; element offset. %index advances by 8 until it equals %n.vec.
; The SelectionDAG expectations use umull on 4h operands; the GlobalISel
; expectations use ushll followed by a 4s mul.
; %min.iters.check, %1, %2 and %cmp.n are computed in the header but are not
; used anywhere else in this function.
define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned_and:
; CHECK-SD: // %bb.0: // %vector.header
; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: .LBB10_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
; CHECK-SD-NEXT: ldp d1, d2, [x9]
; CHECK-SD-NEXT: add x9, x1, w0, uxtw #2
; CHECK-SD-NEXT: add w0, w0, #8
; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q2, [x9]
; CHECK-SD-NEXT: b.ne .LBB10_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: matrix_mul_unsigned_and:
; CHECK-GI: // %bb.0: // %vector.header
; CHECK-GI-NEXT: and w8, w3, #0xffff
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
; CHECK-GI-NEXT: .LBB10_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
; CHECK-GI-NEXT: ldp d1, d2, [x9]
; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
; CHECK-GI-NEXT: add w0, w0, #8
; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q1, q2, [x9]
; CHECK-GI-NEXT: b.ne .LBB10_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
  %conv4 = and i32 %val, 65535
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967288
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body: ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <4 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <4 x i16>, ptr %9, align 2
  %10 = zext <4 x i16> %wide.load to <4 x i32>
  %11 = zext <4 x i16> %wide.load30 to <4 x i32>
  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <4 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 4
  %17 = bitcast ptr %16 to ptr
  store <4 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 8
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12: ; preds = %vector.body
  ret void
}

; Eight-lane version of the masked-splat, zero-extended multiply: the splat is
; <8 x i32>, the loads are <8 x i16> and %index advances by 16. The second
; load starts only 4 i16 elements after the first, so the two 8-element loads
; overlap; the second store is placed 8 i32 elements after the first.
; The SelectionDAG expectations use umull/umull2; the GlobalISel expectations
; use ushll/ushll2 followed by 4s mul.
; %min.iters.check, %1, %2 and %cmp.n are computed in the header but are not
; used anywhere else in this function.
define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned_and_double:
; CHECK-SD: // %bb.0: // %vector.header
; CHECK-SD-NEXT: dup v0.8h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
; CHECK-SD-NEXT: .LBB11_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
; CHECK-SD-NEXT: ldr q1, [x9]
; CHECK-SD-NEXT: ldur q2, [x9, #8]
; CHECK-SD-NEXT: add x9, x1, w0, uxtw #2
; CHECK-SD-NEXT: add w0, w0, #16
; CHECK-SD-NEXT: umull2 v3.4s, v0.8h, v1.8h
; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umull2 v4.4s, v0.8h, v2.8h
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
; CHECK-SD-NEXT: b.ne .LBB11_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: matrix_mul_unsigned_and_double:
; CHECK-GI: // %bb.0: // %vector.header
; CHECK-GI-NEXT: and w8, w3, #0xffff
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
; CHECK-GI-NEXT: .LBB11_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
; CHECK-GI-NEXT: ldr q1, [x9]
; CHECK-GI-NEXT: ldur q2, [x9, #8]
; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
; CHECK-GI-NEXT: add w0, w0, #16
; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s
; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q3, q1, [x9]
; CHECK-GI-NEXT: stp q4, q2, [x9, #32]!
; CHECK-GI-NEXT: b.ne .LBB11_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
  %conv4 = and i32 %val, 65535
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 16
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967280
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body: ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <8 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <8 x i16>, ptr %9, align 2
  %10 = zext <8 x i16> %wide.load to <8 x i32>
  %11 = zext <8 x i16> %wide.load30 to <8 x i32>
  %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10
  %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <8 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 8
  %17 = bitcast ptr %16 to ptr
  store <8 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 16
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12: ; preds = %vector.body
  ret void
}

; Same loop shape as @matrix_mul_unsigned_and, but the <4 x i16> loads are
; sign-extended while the splat is still masked with `and i32 %val, 65535`.
; Neither set of expectations uses a widening multiply here: both show sshll
; followed by a 4s mul (by element v0.s[0] for SelectionDAG).
; %min.iters.check, %1, %2 and %cmp.n are computed in the header but are not
; used anywhere else in this function.
define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_signed_and:
; CHECK-SD: // %bb.0: // %vector.header
; CHECK-SD-NEXT: and w9, w3, #0xffff
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: fmov s0, w9
; CHECK-SD-NEXT: .LBB12_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
; CHECK-SD-NEXT: ldp d1, d2, [x9]
; CHECK-SD-NEXT: add x9, x1, w0, uxtw #2
; CHECK-SD-NEXT: add w0, w0, #8
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll v2.4s, v2.4h, #0
; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q2, [x9]
; CHECK-SD-NEXT: b.ne .LBB12_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: matrix_mul_signed_and:
; CHECK-GI: // %bb.0: // %vector.header
; CHECK-GI-NEXT: and w8, w3, #0xffff
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
; CHECK-GI-NEXT: .LBB12_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
; CHECK-GI-NEXT: ldp d1, d2, [x9]
; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
; CHECK-GI-NEXT: add w0, w0, #8
; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q1, q2, [x9]
; CHECK-GI-NEXT: b.ne .LBB12_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
  %conv4 = and i32 %val, 65535
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967288
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body: ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <4 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <4 x i16>, ptr %9, align 2
  %10 = sext <4 x i16> %wide.load to <4 x i32>
  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <4 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 4
  %17 = bitcast ptr %16 to ptr
  store <4 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 8
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12: ; preds = %vector.body
  ret void
}

; Eight-lane version of @matrix_mul_signed_and: <8 x i16> loads are
; sign-extended to <8 x i32> and multiplied by the splat of the masked value;
; %index advances by 16. As in the unsigned eight-lane test, the second load
; starts 4 i16 elements after the first, so the two loads overlap.
; Both sets of expectations show sshll/sshll2 followed by 4s mul rather than a
; widening multiply.
; %min.iters.check, %1, %2 and %cmp.n are computed in the header but are not
; used anywhere else in this function.
define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_signed_and_double:
; CHECK-SD: // %bb.0: // %vector.header
; CHECK-SD-NEXT: and w9, w3, #0xffff
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
; CHECK-SD-NEXT: fmov s0, w9
; CHECK-SD-NEXT: .LBB13_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
; CHECK-SD-NEXT: ldr q1, [x9]
; CHECK-SD-NEXT: ldur q2, [x9, #8]
; CHECK-SD-NEXT: add x9, x1, w0, uxtw #2
; CHECK-SD-NEXT: add w0, w0, #16
; CHECK-SD-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll2 v4.4s, v2.8h, #0
; CHECK-SD-NEXT: sshll v2.4s, v2.4h, #0
; CHECK-SD-NEXT: mul v3.4s, v3.4s, v0.s[0]
; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT: mul v4.4s, v4.4s, v0.s[0]
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
; CHECK-SD-NEXT: b.ne .LBB13_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: matrix_mul_signed_and_double:
; CHECK-GI: // %bb.0: // %vector.header
; CHECK-GI-NEXT: and w8, w3, #0xffff
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
; CHECK-GI-NEXT: .LBB13_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
; CHECK-GI-NEXT: ldr q1, [x9]
; CHECK-GI-NEXT: ldur q2, [x9, #8]
; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2
; CHECK-GI-NEXT: add w0, w0, #16
; CHECK-GI-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s
; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q3, q1, [x9]
; CHECK-GI-NEXT: stp q4, q2, [x9, #32]!
; CHECK-GI-NEXT: b.ne .LBB13_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
  %conv4 = and i32 %val, 65535
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 16
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967280
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body: ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <8 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <8 x i16>, ptr %9, align 2
  %10 = sext <8 x i16> %wide.load to <8 x i32>
  %11 = sext <8 x i16> %wide.load30 to <8 x i32>
  %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10
  %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <8 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 8
  %17 = bitcast ptr %16 to ptr
  store <8 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 16
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12: ; preds = %vector.body
  ret void
}

declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}