1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=aarch64 | FileCheck %s 3 4define void @vld2(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) { 5; CHECK-LABEL: vld2: 6; CHECK: .Lfunc_begin0: 7; CHECK-NEXT: .cfi_startproc 8; CHECK-NEXT: // %bb.0: // %entry 9; CHECK-NEXT: mov x8, xzr 10; CHECK-NEXT: .LBB0_1: // %vector.body 11; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 12; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32 13; CHECK-NEXT: fmul v2.4s, v0.4s, v0.4s 14; CHECK-NEXT: fmla v2.4s, v1.4s, v1.4s 15; CHECK-NEXT: str q2, [x1, x8] 16; CHECK-NEXT: add x8, x8, #16 17; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096 18; CHECK-NEXT: b.ne .LBB0_1 19; CHECK-NEXT: // %bb.2: // %while.end 20; CHECK-NEXT: ret 21entry: 22 br label %vector.body 23 24vector.body: ; preds = %vector.body, %entry 25 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 26 %0 = shl i64 %index, 1 27 %next.gep = getelementptr float, ptr %pSrc, i64 %0 28 %next.gep19 = getelementptr float, ptr %pDst, i64 %index 29 %wide.vec = load <8 x float>, ptr %next.gep, align 4 30 %1 = fmul fast <8 x float> %wide.vec, %wide.vec 31 %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 32 %3 = fmul fast <8 x float> %wide.vec, %wide.vec 33 %4 = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 34 %5 = fadd fast <4 x float> %4, %2 35 store <4 x float> %5, ptr %next.gep19, align 4 36 %index.next = add i64 %index, 4 37 %6 = icmp eq i64 %index.next, 1024 38 br i1 %6, label %while.end, label %vector.body 39 40while.end: ; preds = %vector.body 41 ret void 42} 43 44define void @vld3(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) { 45; CHECK-LABEL: vld3: 46; CHECK: .Lfunc_begin1: 47; CHECK-NEXT: .cfi_startproc 48; CHECK-NEXT: // %bb.0: // %entry 49; CHECK-NEXT: mov x8, xzr 50; CHECK-NEXT: .LBB1_1: // %vector.body 51; CHECK-NEXT: // 
=>This Inner Loop Header: Depth=1 52; CHECK-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48 53; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s 54; CHECK-NEXT: fmla v3.4s, v1.4s, v1.4s 55; CHECK-NEXT: fmla v3.4s, v2.4s, v2.4s 56; CHECK-NEXT: str q3, [x1, x8] 57; CHECK-NEXT: add x8, x8, #16 58; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096 59; CHECK-NEXT: b.ne .LBB1_1 60; CHECK-NEXT: // %bb.2: // %while.end 61; CHECK-NEXT: ret 62entry: 63 br label %vector.body 64 65vector.body: ; preds = %vector.body, %entry 66 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 67 %0 = mul i64 %index, 3 68 %next.gep = getelementptr float, ptr %pSrc, i64 %0 69 %next.gep23 = getelementptr float, ptr %pDst, i64 %index 70 %wide.vec = load <12 x float>, ptr %next.gep, align 4 71 %1 = fmul fast <12 x float> %wide.vec, %wide.vec 72 %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 73 %3 = fmul fast <12 x float> %wide.vec, %wide.vec 74 %4 = shufflevector <12 x float> %3, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 75 %5 = fadd fast <4 x float> %4, %2 76 %6 = fmul fast <12 x float> %wide.vec, %wide.vec 77 %7 = shufflevector <12 x float> %6, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 78 %8 = fadd fast <4 x float> %5, %7 79 store <4 x float> %8, ptr %next.gep23, align 4 80 %index.next = add i64 %index, 4 81 %9 = icmp eq i64 %index.next, 1024 82 br i1 %9, label %while.end, label %vector.body 83 84while.end: ; preds = %vector.body 85 ret void 86} 87 88define void @vld4(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) { 89; CHECK-LABEL: vld4: 90; CHECK: .Lfunc_begin2: 91; CHECK-NEXT: .cfi_startproc 92; CHECK-NEXT: // %bb.0: // %entry 93; CHECK-NEXT: mov x8, xzr 94; CHECK-NEXT: .LBB2_1: // %vector.body 95; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 96; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64 97; CHECK-NEXT: add x9, x1, x8 98; CHECK-NEXT: add x8, x8, #32 99; CHECK-NEXT: cmp x8, 
#2, lsl #12 // =8192 100; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s 101; CHECK-NEXT: fmla v4.4s, v1.4s, v1.4s 102; CHECK-NEXT: fmul v5.4s, v2.4s, v2.4s 103; CHECK-NEXT: fmla v5.4s, v3.4s, v3.4s 104; CHECK-NEXT: st2 { v4.4s, v5.4s }, [x9] 105; CHECK-NEXT: b.ne .LBB2_1 106; CHECK-NEXT: // %bb.2: // %while.end 107; CHECK-NEXT: ret 108entry: 109 br label %vector.body 110 111vector.body: ; preds = %vector.body, %entry 112 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 113 %0 = shl i64 %index, 2 114 %next.gep = getelementptr float, ptr %pSrc, i64 %0 115 %1 = shl i64 %index, 1 116 %wide.vec = load <16 x float>, ptr %next.gep, align 4 117 %2 = fmul fast <16 x float> %wide.vec, %wide.vec 118 %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 119 %4 = fmul fast <16 x float> %wide.vec, %wide.vec 120 %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 121 %6 = fadd fast <4 x float> %5, %3 122 %7 = fmul fast <16 x float> %wide.vec, %wide.vec 123 %8 = shufflevector <16 x float> %7, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 124 %9 = fmul fast <16 x float> %wide.vec, %wide.vec 125 %10 = shufflevector <16 x float> %9, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 126 %11 = fadd fast <4 x float> %10, %8 127 %12 = getelementptr inbounds float, ptr %pDst, i64 %1 128 %interleaved.vec = shufflevector <4 x float> %6, <4 x float> %11, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 129 store <8 x float> %interleaved.vec, ptr %12, align 4 130 %index.next = add i64 %index, 4 131 %13 = icmp eq i64 %index.next, 1024 132 br i1 %13, label %while.end, label %vector.body 133 134while.end: ; preds = %vector.body 135 ret void 136} 137 138define void @twosrc(ptr nocapture readonly %pSrc, ptr nocapture readonly %pSrc2, ptr noalias nocapture %pDst, i32 %numSamples) { 139; CHECK-LABEL: twosrc: 140; CHECK: .Lfunc_begin3: 141; CHECK-NEXT: .cfi_startproc 
142; CHECK-NEXT: // %bb.0: // %entry 143; CHECK-NEXT: mov x8, xzr 144; CHECK-NEXT: .LBB3_1: // %vector.body 145; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 146; CHECK-NEXT: add x9, x0, x8 147; CHECK-NEXT: add x10, x1, x8 148; CHECK-NEXT: add x8, x8, #32 149; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x9] 150; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192 151; CHECK-NEXT: ld2 { v2.4s, v3.4s }, [x10] 152; CHECK-NEXT: fmul v4.4s, v2.4s, v0.4s 153; CHECK-NEXT: fmla v4.4s, v1.4s, v3.4s 154; CHECK-NEXT: str q4, [x2], #16 155; CHECK-NEXT: b.ne .LBB3_1 156; CHECK-NEXT: // %bb.2: // %while.end 157; CHECK-NEXT: ret 158entry: 159 br label %vector.body 160 161vector.body: ; preds = %vector.body, %entry 162 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 163 %0 = shl i64 %index, 1 164 %next.gep = getelementptr float, ptr %pSrc, i64 %0 165 %1 = shl i64 %index, 1 166 %next.gep23 = getelementptr float, ptr %pSrc2, i64 %1 167 %next.gep24 = getelementptr float, ptr %pDst, i64 %index 168 %wide.vec = load <8 x float>, ptr %next.gep, align 4 169 %wide.vec26 = load <8 x float>, ptr %next.gep23, align 4 170 %2 = fmul fast <8 x float> %wide.vec26, %wide.vec 171 %3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 172 %4 = fmul fast <8 x float> %wide.vec26, %wide.vec 173 %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 174 %6 = fadd fast <4 x float> %5, %3 175 store <4 x float> %6, ptr %next.gep24, align 4 176 %index.next = add i64 %index, 4 177 %7 = icmp eq i64 %index.next, 1024 178 br i1 %7, label %while.end, label %vector.body 179 180while.end: ; preds = %vector.body 181 ret void 182} 183 184define void @vld2_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) { 185; CHECK-LABEL: vld2_multiuse: 186; CHECK: .Lfunc_begin4: 187; CHECK-NEXT: .cfi_startproc 188; CHECK-NEXT: // %bb.0: // %entry 189; CHECK-NEXT: mov x8, xzr 190; CHECK-NEXT: .LBB4_1: // %vector.body 191; 
CHECK-NEXT: // =>This Inner Loop Header: Depth=1 192; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32 193; CHECK-NEXT: fmul v2.4s, v0.4s, v0.4s 194; CHECK-NEXT: fmla v2.4s, v1.4s, v1.4s 195; CHECK-NEXT: str q2, [x1, x8] 196; CHECK-NEXT: add x8, x8, #16 197; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096 198; CHECK-NEXT: b.ne .LBB4_1 199; CHECK-NEXT: // %bb.2: // %while.end 200; CHECK-NEXT: ret 201entry: 202 br label %vector.body 203 204vector.body: ; preds = %vector.body, %entry 205 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 206 %0 = shl i64 %index, 1 207 %next.gep = getelementptr float, ptr %pSrc, i64 %0 208 %next.gep19 = getelementptr float, ptr %pDst, i64 %index 209 %wide.vec = load <8 x float>, ptr %next.gep, align 4 210 %1 = fmul fast <8 x float> %wide.vec, %wide.vec 211 %2 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 212 %3 = shufflevector <8 x float> %1, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 213 %4 = fadd fast <4 x float> %3, %2 214 store <4 x float> %4, ptr %next.gep19, align 4 215 %index.next = add i64 %index, 4 216 %5 = icmp eq i64 %index.next, 1024 217 br i1 %5, label %while.end, label %vector.body 218 219while.end: ; preds = %vector.body 220 ret void 221} 222 223define void @vld3_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) { 224; CHECK-LABEL: vld3_multiuse: 225; CHECK: .Lfunc_begin5: 226; CHECK-NEXT: .cfi_startproc 227; CHECK-NEXT: // %bb.0: // %entry 228; CHECK-NEXT: mov x8, xzr 229; CHECK-NEXT: .LBB5_1: // %vector.body 230; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 231; CHECK-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48 232; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s 233; CHECK-NEXT: fmla v3.4s, v1.4s, v1.4s 234; CHECK-NEXT: fmla v3.4s, v2.4s, v2.4s 235; CHECK-NEXT: str q3, [x1, x8] 236; CHECK-NEXT: add x8, x8, #16 237; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096 238; CHECK-NEXT: b.ne .LBB5_1 239; CHECK-NEXT: // %bb.2: // %while.end 240; 
CHECK-NEXT: ret 241entry: 242 br label %vector.body 243 244vector.body: ; preds = %vector.body, %entry 245 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] 246 %0 = mul i64 %index, 3 247 %next.gep = getelementptr float, ptr %pSrc, i64 %0 248 %next.gep23 = getelementptr float, ptr %pDst, i64 %index 249 %wide.vec = load <12 x float>, ptr %next.gep, align 4 250 %1 = fmul fast <12 x float> %wide.vec, %wide.vec 251 %2 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 252 %3 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 253 %4 = fadd fast <4 x float> %3, %2 254 %5 = shufflevector <12 x float> %1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 255 %6 = fadd fast <4 x float> %4, %5 256 store <4 x float> %6, ptr %next.gep23, align 4 257 %index.next = add i64 %index, 4 258 %7 = icmp eq i64 %index.next, 1024 259 br i1 %7, label %while.end, label %vector.body 260 261while.end: ; preds = %vector.body 262 ret void 263} 264 265define void @vld4_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32 %numSamples) { 266; CHECK-LABEL: vld4_multiuse: 267; CHECK: .Lfunc_begin6: 268; CHECK-NEXT: .cfi_startproc 269; CHECK-NEXT: // %bb.0: // %entry 270; CHECK-NEXT: mov x8, xzr 271; CHECK-NEXT: .LBB6_1: // %vector.body 272; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 273; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64 274; CHECK-NEXT: add x9, x1, x8 275; CHECK-NEXT: add x8, x8, #32 276; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192 277; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s 278; CHECK-NEXT: fmla v4.4s, v1.4s, v1.4s 279; CHECK-NEXT: fmul v5.4s, v2.4s, v2.4s 280; CHECK-NEXT: fmla v5.4s, v3.4s, v3.4s 281; CHECK-NEXT: st2 { v4.4s, v5.4s }, [x9] 282; CHECK-NEXT: b.ne .LBB6_1 283; CHECK-NEXT: // %bb.2: // %while.end 284; CHECK-NEXT: ret 285entry: 286 br label %vector.body 287 288vector.body: ; preds = %vector.body, %entry 289 %index = phi i64 [ 0, %entry 
], [ %index.next, %vector.body ] 290 %0 = shl i64 %index, 2 291 %next.gep = getelementptr float, ptr %pSrc, i64 %0 292 %1 = shl i64 %index, 1 293 %wide.vec = load <16 x float>, ptr %next.gep, align 4 294 %2 = fmul fast <16 x float> %wide.vec, %wide.vec 295 %3 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 296 %4 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 297 %5 = fadd fast <4 x float> %4, %3 298 %6 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 299 %7 = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 300 %8 = fadd fast <4 x float> %7, %6 301 %9 = getelementptr inbounds float, ptr %pDst, i64 %1 302 %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %8, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 303 store <8 x float> %interleaved.vec, ptr %9, align 4 304 %index.next = add i64 %index, 4 305 %10 = icmp eq i64 %index.next, 1024 306 br i1 %10, label %while.end, label %vector.body 307 308while.end: ; preds = %vector.body 309 ret void 310} 311 312; This example has store(shuffle(shuffle(... that would be better to be treated 313; as a single store. This avoids the vld2 for data that is already shuffled. 
314define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) { 315; CHECK-LABEL: transpose_s16_8x8_simpler: 316; CHECK: .Lfunc_begin7: 317; CHECK-NEXT: .cfi_startproc 318; CHECK-NEXT: // %bb.0: // %entry 319; CHECK-NEXT: ldp q0, q1, [x0] 320; CHECK-NEXT: ldp q2, q3, [x0, #64] 321; CHECK-NEXT: ldp q4, q5, [x0, #32] 322; CHECK-NEXT: ldp q6, q7, [x0, #96] 323; CHECK-NEXT: trn1 v0.8h, v0.8h, v1.8h 324; CHECK-NEXT: trn1 v1.8h, v2.8h, v3.8h 325; CHECK-NEXT: trn1 v2.8h, v4.8h, v5.8h 326; CHECK-NEXT: trn1 v3.8h, v6.8h, v7.8h 327; CHECK-NEXT: trn1 v0.4s, v0.4s, v1.4s 328; CHECK-NEXT: trn1 v1.4s, v2.4s, v3.4s 329; CHECK-NEXT: zip2 v2.4s, v0.4s, v1.4s 330; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x0] 331; CHECK-NEXT: str q2, [x0, #64] 332; CHECK-NEXT: ret 333entry: 334 %0 = load <8 x i16>, ptr %a, align 16 335 %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1 336 %1 = load <8 x i16>, ptr %arrayidx1, align 16 337 %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef> 338 %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2 339 %2 = load <8 x i16>, ptr %arrayidx2, align 16 340 %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3 341 %3 = load <8 x i16>, ptr %arrayidx3, align 16 342 %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef> 343 %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4 344 %4 = load <8 x i16>, ptr %arrayidx5, align 16 345 %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5 346 %5 = load <8 x i16>, ptr %arrayidx6, align 16 347 %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef> 348 %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6 349 %6 = load <8 x i16>, ptr %arrayidx8, align 16 350 %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7 351 
%7 = load <8 x i16>, ptr %arrayidx9, align 16 352 %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef> 353 %8 = bitcast <8 x i16> %shuffle.i to <4 x i32> 354 %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32> 355 %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 356 %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32> 357 %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32> 358 %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 359 %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 360 %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 2, i32 6, i32 3, i32 7> 361 store <4 x i32> %vzip.i, ptr %a, align 16 362 store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16 363 ret void 364} 365 366; Same as above with some different shuffles 367define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) { 368; CHECK-LABEL: transpose_s16_8x8_simpler2: 369; CHECK: .Lfunc_begin8: 370; CHECK-NEXT: .cfi_startproc 371; CHECK-NEXT: // %bb.0: // %entry 372; CHECK-NEXT: ldp q0, q2, [x0] 373; CHECK-NEXT: ldp q3, q4, [x0, #64] 374; CHECK-NEXT: ldp q5, q6, [x0, #32] 375; CHECK-NEXT: ldp q7, q16, [x0, #96] 376; CHECK-NEXT: mov v0.h[5], v2.h[4] 377; CHECK-NEXT: zip1 v2.8h, v3.8h, v4.8h 378; CHECK-NEXT: zip1 v3.8h, v5.8h, v6.8h 379; CHECK-NEXT: mov v7.h[5], v16.h[4] 380; CHECK-NEXT: mov v0.s[1], v2.s[0] 381; CHECK-NEXT: uzp1 v1.4s, v3.4s, v7.4s 382; CHECK-NEXT: zip2 v2.4s, v0.4s, v1.4s 383; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x0] 384; CHECK-NEXT: str q2, [x0, #64] 385; CHECK-NEXT: ret 386entry: 387 %0 = load <8 x i16>, ptr %a, align 16 388 %arrayidx1 = getelementptr inbounds <8 x i16>, ptr %a, i64 1 389 %1 = load <8 x i16>, ptr %arrayidx1, align 16 390 %shuffle.i = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> <i32 0, 
i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef> 391 %arrayidx2 = getelementptr inbounds <8 x i16>, ptr %a, i64 2 392 %2 = load <8 x i16>, ptr %arrayidx2, align 16 393 %arrayidx3 = getelementptr inbounds <8 x i16>, ptr %a, i64 3 394 %3 = load <8 x i16>, ptr %arrayidx3, align 16 395 %shuffle.i34 = shufflevector <8 x i16> %2, <8 x i16> %3, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef> 396 %arrayidx5 = getelementptr inbounds <8 x i16>, ptr %a, i64 4 397 %4 = load <8 x i16>, ptr %arrayidx5, align 16 398 %arrayidx6 = getelementptr inbounds <8 x i16>, ptr %a, i64 5 399 %5 = load <8 x i16>, ptr %arrayidx6, align 16 400 %shuffle.i35 = shufflevector <8 x i16> %4, <8 x i16> %5, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef> 401 %arrayidx8 = getelementptr inbounds <8 x i16>, ptr %a, i64 6 402 %6 = load <8 x i16>, ptr %arrayidx8, align 16 403 %arrayidx9 = getelementptr inbounds <8 x i16>, ptr %a, i64 7 404 %7 = load <8 x i16>, ptr %arrayidx9, align 16 405 %shuffle.i36 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 undef, i32 undef> 406 %8 = bitcast <8 x i16> %shuffle.i to <4 x i32> 407 %9 = bitcast <8 x i16> %shuffle.i35 to <4 x i32> 408 %shuffle.i37 = shufflevector <4 x i32> %8, <4 x i32> %9, <4 x i32> <i32 1, i32 4, i32 2, i32 7> 409 %10 = bitcast <8 x i16> %shuffle.i34 to <4 x i32> 410 %11 = bitcast <8 x i16> %shuffle.i36 to <4 x i32> 411 %shuffle.i38 = shufflevector <4 x i32> %10, <4 x i32> %11, <4 x i32> <i32 0, i32 5, i32 3, i32 6> 412 %vzip.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 413 %vzip1.i = shufflevector <4 x i32> %shuffle.i37, <4 x i32> %shuffle.i38, <4 x i32> <i32 2, i32 6, i32 3, i32 7> 414 store <4 x i32> %vzip.i, ptr %a, align 16 415 store <4 x i32> %vzip1.i, ptr %arrayidx5, align 16 416 ret void 417} 418 419 420define void 
@transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %1, ptr nocapture noundef %2, ptr nocapture noundef %3, ptr nocapture noundef %4, ptr nocapture noundef %5, ptr nocapture noundef %6, ptr nocapture noundef %7) { 421; CHECK-LABEL: transpose_s16_8x8: 422; CHECK: .Lfunc_begin9: 423; CHECK-NEXT: .cfi_startproc 424; CHECK-NEXT: // %bb.0: 425; CHECK-NEXT: ldr q0, [x0] 426; CHECK-NEXT: ldr q1, [x1] 427; CHECK-NEXT: ldr q3, [x4] 428; CHECK-NEXT: ldr q4, [x5] 429; CHECK-NEXT: ldr q2, [x2] 430; CHECK-NEXT: ldr q5, [x3] 431; CHECK-NEXT: trn1 v16.8h, v0.8h, v1.8h 432; CHECK-NEXT: trn2 v0.8h, v0.8h, v1.8h 433; CHECK-NEXT: ldr q6, [x6] 434; CHECK-NEXT: ldr q7, [x7] 435; CHECK-NEXT: trn1 v17.8h, v3.8h, v4.8h 436; CHECK-NEXT: trn2 v1.8h, v3.8h, v4.8h 437; CHECK-NEXT: trn1 v18.8h, v2.8h, v5.8h 438; CHECK-NEXT: trn2 v2.8h, v2.8h, v5.8h 439; CHECK-NEXT: trn1 v19.8h, v6.8h, v7.8h 440; CHECK-NEXT: trn2 v3.8h, v6.8h, v7.8h 441; CHECK-NEXT: trn1 v4.4s, v16.4s, v17.4s 442; CHECK-NEXT: trn1 v6.4s, v0.4s, v1.4s 443; CHECK-NEXT: trn2 v16.4s, v16.4s, v17.4s 444; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s 445; CHECK-NEXT: trn1 v5.4s, v18.4s, v19.4s 446; CHECK-NEXT: trn1 v7.4s, v2.4s, v3.4s 447; CHECK-NEXT: trn2 v17.4s, v18.4s, v19.4s 448; CHECK-NEXT: trn2 v1.4s, v2.4s, v3.4s 449; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x0] 450; CHECK-NEXT: zip2 v2.4s, v4.4s, v5.4s 451; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s 452; CHECK-NEXT: zip2 v4.4s, v16.4s, v17.4s 453; CHECK-NEXT: st2 { v6.2s, v7.2s }, [x1] 454; CHECK-NEXT: st2 { v16.2s, v17.2s }, [x2] 455; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x3] 456; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s 457; CHECK-NEXT: str q2, [x4] 458; CHECK-NEXT: str q3, [x5] 459; CHECK-NEXT: str q4, [x6] 460; CHECK-NEXT: str q0, [x7] 461; CHECK-NEXT: ret 462 %9 = load <8 x i16>, ptr %0, align 16 463 %10 = load <8 x i16>, ptr %1, align 16 464 %11 = shufflevector <8 x i16> %9, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 465 %12 = shufflevector 
<8 x i16> %9, <8 x i16> %10, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 466 %13 = load <8 x i16>, ptr %2, align 16 467 %14 = load <8 x i16>, ptr %3, align 16 468 %15 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 469 %16 = shufflevector <8 x i16> %13, <8 x i16> %14, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 470 %17 = load <8 x i16>, ptr %4, align 16 471 %18 = load <8 x i16>, ptr %5, align 16 472 %19 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 473 %20 = shufflevector <8 x i16> %17, <8 x i16> %18, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 474 %21 = load <8 x i16>, ptr %6, align 16 475 %22 = load <8 x i16>, ptr %7, align 16 476 %23 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 477 %24 = shufflevector <8 x i16> %21, <8 x i16> %22, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 478 %25 = bitcast <8 x i16> %11 to <4 x i32> 479 %26 = bitcast <8 x i16> %19 to <4 x i32> 480 %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 481 %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 482 %29 = bitcast <8 x i16> %12 to <4 x i32> 483 %30 = bitcast <8 x i16> %20 to <4 x i32> 484 %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 485 %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 486 %33 = bitcast <8 x i16> %15 to <4 x i32> 487 %34 = bitcast <8 x i16> %23 to <4 x i32> 488 %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 489 %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 490 %37 = bitcast <8 x i16> %16 to <4 x i32> 491 %38 = bitcast <8 x 
i16> %24 to <4 x i32>
  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %41, ptr %0, align 16
  store <4 x i32> %43, ptr %1, align 16
  store <4 x i32> %45, ptr %2, align 16
  store <4 x i32> %47, ptr %3, align 16
  store <4 x i32> %42, ptr %4, align 16
  store <4 x i32> %44, ptr %5, align 16
  store <4 x i32> %46, ptr %6, align 16
  store <4 x i32> %48, ptr %7, align 16
  ret void
}

; Single-pointer variant of the test above: the eight <8 x i16> rows are read
; from consecutive 16-byte slots starting at %0 (getelementptr indices 0..7),
; run through the same i16-level and i32-level shuffle network, and the eight
; <4 x i32> results are stored back into those same slots. The assertions
; expect the row accesses to be paired into ldp/stp.
define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
; CHECK-LABEL: transpose_s16_8x8_:
; CHECK:       .Lfunc_begin10:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ldp q2, q3, [x0, #32]
; CHECK-NEXT:    ldp q4, q5, [x0, #64]
; CHECK-NEXT:    ldp q6, q7, [x0, #96]
; CHECK-NEXT:    trn1 v16.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
; CHECK-NEXT:    trn2 v2.8h, v2.8h, v3.8h
; CHECK-NEXT:    trn1 v17.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn2 v3.8h, v4.8h, v5.8h
; CHECK-NEXT:    trn1 v18.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn2 v4.8h, v6.8h, v7.8h
; CHECK-NEXT:    trn1 v5.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn1 v7.4s, v0.4s, v3.4s
; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
; CHECK-NEXT:    trn1 v6.4s, v1.4s, v18.4s
; CHECK-NEXT:    trn1 v19.4s, v2.4s, v4.4s
; CHECK-NEXT:    trn2 v1.4s, v1.4s, v18.4s
; CHECK-NEXT:    trn2 v0.4s, v0.4s, v3.4s
; CHECK-NEXT:    trn2 v2.4s, v2.4s, v4.4s
; CHECK-NEXT:    zip1 v3.4s, v5.4s, v6.4s
; CHECK-NEXT:    zip1 v4.4s, v7.4s, v19.4s
; CHECK-NEXT:    zip1 v17.4s, v16.4s, v1.4s
; CHECK-NEXT:    zip1 v18.4s, v0.4s, v2.4s
; CHECK-NEXT:    zip2 v5.4s, v5.4s, v6.4s
; CHECK-NEXT:    zip2 v1.4s, v16.4s, v1.4s
; CHECK-NEXT:    zip2 v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    stp q3, q4, [x0]
; CHECK-NEXT:    zip2 v3.4s, v7.4s, v19.4s
; CHECK-NEXT:    stp q17, q18, [x0, #32]
; CHECK-NEXT:    stp q1, q0, [x0, #96]
; CHECK-NEXT:    stp q5, q3, [x0, #64]
; CHECK-NEXT:    ret
  %2 = load <8 x i16>, ptr %0, align 16
  %3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
  ; Row 1 uses the same 16-byte alignment as every other row load and as the
  ; store through %3 further down.
  %4 = load <8 x i16>, ptr %3, align 16
  %5 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %6 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %7 = getelementptr inbounds <8 x i16>, ptr %0, i64 2
  %8 = load <8 x i16>, ptr %7, align 16
  %9 = getelementptr inbounds <8 x i16>, ptr %0, i64 3
  %10 = load <8 x i16>, ptr %9, align 16
  %11 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %12 = shufflevector <8 x i16> %8, <8 x i16> %10, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %13 = getelementptr inbounds <8 x i16>, ptr %0, i64 4
  %14 = load <8 x i16>, ptr %13, align 16
  %15 = getelementptr inbounds <8 x i16>, ptr %0, i64 5
  %16 = load <8 x i16>, ptr %15, align 16
  %17 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %18 = shufflevector <8 x i16> %14, <8 x i16> %16, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %19 = getelementptr inbounds <8 x i16>, ptr %0, i64 6
  %20 = load <8 x i16>, ptr %19, align 16
  %21 = getelementptr inbounds <8 x i16>, ptr %0, i64 7
  %22 = load <8 x i16>, ptr %21, align 16
  %23 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %24 = shufflevector <8 x i16> %20, <8 x i16> %22, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %25 = bitcast <8 x i16> %5 to <4 x i32>
  %26 = bitcast <8 x i16> %17 to <4 x i32>
  %27 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %28 = shufflevector <4 x i32> %25, <4 x i32> %26, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %29 = bitcast <8 x i16> %6 to <4 x i32>
  %30 = bitcast <8 x i16> %18 to <4 x i32>
  %31 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %32 = shufflevector <4 x i32> %29, <4 x i32> %30, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %33 = bitcast <8 x i16> %11 to <4 x i32>
  %34 = bitcast <8 x i16> %23 to <4 x i32>
  %35 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %36 = shufflevector <4 x i32> %33, <4 x i32> %34, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %37 = bitcast <8 x i16> %12 to <4 x i32>
  %38 = bitcast <8 x i16> %24 to <4 x i32>
  %39 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %40 = shufflevector <4 x i32> %37, <4 x i32> %38, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %41 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %42 = shufflevector <4 x i32> %27, <4 x i32> %35, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %43 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %44 = shufflevector <4 x i32> %31, <4 x i32> %39, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %45 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %46 = shufflevector <4 x i32> %28, <4 x i32> %36, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %47 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %48 = shufflevector <4 x i32> %32, <4 x i32> %40, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  store <4 x i32> %41, ptr %0, align 16
  store <4 x i32> %43, ptr %3, align 16
  store <4 x i32> %45, ptr %7, align 16
  store <4 x i32> %47, ptr %9, align 16
  store <4 x i32> %42, ptr %13, align 16
  store <4 x i32> %44, ptr %15, align 16
  store <4 x i32> %46, ptr %19, align 16
  store <4 x i32> %48, ptr %21, align 16
  ret void
}

define void @store_factor2(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2:
; CHECK:       .Lfunc_begin11:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v3.4s, v1.4s, v0.4s
; CHECK-NEXT:    st2 { v2.4s, v3.4s }, [x0]
; CHECK-NEXT:    ret
  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

define void @store_factor2_high(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: store_factor2_high:
; CHECK:       .Lfunc_begin12:
; CHECK-NEXT:    .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    trn1 v2.4s, v0.4s, v1.4s
; CHECK-NEXT:    trn1 v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    zip1 v1.4s, v2.4s, v0.4s
; CHECK-NEXT:    trn1 v1.4s, v1.4s, v0.4s
; CHECK-NEXT:    zip2 v0.4s, v2.4s, v0.4s
; CHECK-NEXT:    str q1, [x0]
;
CHECK-NEXT: str q0, [x1] 637; CHECK-NEXT: ret 638 %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 639 %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 640 %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 4, i32 1, i32 6> 641 %interleaved.vec2 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> 642 store <4 x i32> %interleaved.vec, ptr %ptr, align 4 643 store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4 644 ret void 645} 646 647define void @store_factor2_high2(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a1) { 648; CHECK-LABEL: store_factor2_high2: 649; CHECK: .Lfunc_begin13: 650; CHECK-NEXT: .cfi_startproc 651; CHECK-NEXT: // %bb.0: 652; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s 653; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s 654; CHECK-NEXT: trn1 v2.4s, v2.4s, v1.4s 655; CHECK-NEXT: str q2, [x0] 656; CHECK-NEXT: str q0, [x1] 657; CHECK-NEXT: ret 658 %interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 6> 659 %interleaved.vec2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> 660 store <4 x i32> %interleaved.vec, ptr %ptr, align 4 661 store <4 x i32> %interleaved.vec2, ptr %ptr2, align 4 662 ret void 663} 664 665define void @store_factor3(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { 666; CHECK-LABEL: store_factor3: 667; CHECK: .Lfunc_begin14: 668; CHECK-NEXT: .cfi_startproc 669; CHECK-NEXT: // %bb.0: 670; CHECK-NEXT: ext v3.16b, v0.16b, v1.16b, #12 671; CHECK-NEXT: ext v6.16b, v1.16b, v2.16b, #12 672; CHECK-NEXT: zip2 v3.4s, v0.4s, v3.4s 673; CHECK-NEXT: mov v3.s[0], v0.s[0] 674; CHECK-NEXT: ext v0.16b, v2.16b, v0.16b, #12 675; CHECK-NEXT: zip2 v4.4s, v1.4s, v6.4s 676; CHECK-NEXT: mov v4.s[0], v1.s[0] 677; CHECK-NEXT: zip2 v5.4s, v2.4s, v0.4s 678; CHECK-NEXT: mov v5.s[0], v2.s[0] 679; CHECK-NEXT: st3 { v3.4s, v4.4s, v5.4s }, [x0] 
680; CHECK-NEXT: ret 681 %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 5, i32 3, i32 6> 682 %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 5, i32 3, i32 6> 683 %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a0, <4 x i32> <i32 0, i32 5, i32 3, i32 6> 684 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 685 %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 686 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 687 store <12 x i32> %interleaved.vec, ptr %ptr, align 4 688 ret void 689} 690 691define void @store_factor4(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) { 692; CHECK-LABEL: store_factor4: 693; CHECK: .Lfunc_begin15: 694; CHECK-NEXT: .cfi_startproc 695; CHECK-NEXT: // %bb.0: 696; CHECK-NEXT: trn1 v4.4s, v0.4s, v1.4s 697; CHECK-NEXT: trn1 v5.4s, v1.4s, v2.4s 698; CHECK-NEXT: trn1 v6.4s, v2.4s, v3.4s 699; CHECK-NEXT: trn1 v7.4s, v3.4s, v0.4s 700; CHECK-NEXT: st4 { v4.4s, v5.4s, v6.4s, v7.4s }, [x0] 701; CHECK-NEXT: ret 702 %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 703 %v1 = shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 704 %v2 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 705 %v3 = shufflevector <4 x i32> %a3, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 706 %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 707 %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 708 %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 
1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15> 709 store <16 x i32> %interleaved.vec, ptr %ptr, align 4 710 ret void 711} 712 713define void @debuginfo(ptr nocapture noundef writeonly %buf, <8 x i16> noundef %a) { 714; CHECK-LABEL: debuginfo: 715; CHECK: .Lfunc_begin16: 716; CHECK-NEXT: .cfi_startproc 717; CHECK-NEXT: // %bb.0: // %entry 718; CHECK-NEXT: movi v1.2d, #0000000000000000 719; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h 720; CHECK-NEXT: zip2 v0.8h, v0.8h, v1.8h 721; CHECK-NEXT: stp q2, q0, [x0] 722; CHECK-NEXT: ret 723entry: 724 %vzip.i = shufflevector <8 x i16> %a, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> 725 %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> 726 store <8 x i16> %vzip.i, ptr %buf, align 4 727 call void @llvm.dbg.value(metadata <8 x i16> %vzip1.i, metadata !21, metadata !DIExpression()), !dbg !23 728 %add.ptr = getelementptr inbounds i32, ptr %buf, i64 4 729 store <8 x i16> %vzip1.i, ptr %add.ptr, align 4 730 ret void 731} 732 733declare void @llvm.dbg.value(metadata, metadata, metadata) 734 735!llvm.dbg.cu = !{!0} 736!llvm.module.flags = !{!6, !7, !8, !9, !10, !11} 737 738!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2, splitDebugInlining: false, nameTableKind: None) 739!1 = !DIFile(filename: "a64.c", directory: "", checksumkind: CSK_MD5, checksum: "a1a236fb20d703d1ea5963e75545b91a") 740!2 = !{!15} 741!3 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) 742!4 = !{!5} 743!5 = !DISubrange(count: 8) 744!6 = !{i32 7, !"Dwarf Version", i32 5} 745!7 = !{i32 2, !"Debug Info Version", i32 3} 746!8 = !{i32 1, !"wchar_size", i32 4} 747!9 
= !{i32 7, !"uwtable", i32 2} 748!10 = !{i32 7, !"frame-pointer", i32 1} 749!11 = !{i32 7, !"debug-info-assignment-tracking", i1 true} 750!12 = !DISubroutineType(types: !13) 751!13 = !{null, !14, !15} 752!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !3, size: 64) 753!15 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !16) 754!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "int16x8_t", file: !1, line: 57, baseType: !17) 755!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !18, size: 128, flags: DIFlagVector, elements: !4) 756!18 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed) 757!19 = distinct !DISubprogram(name: "store_s16q_to_tran_low_", scope: !1, file: !1, line: 13, type: !12, scopeLine: 13, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !20) 758!20 = !{!21} 759!21 = !DILocalVariable(name: "__s1", scope: !22, file: !1, line: 16, type: !16) 760!22 = distinct !DILexicalBlock(scope: !19, file: !1, line: 16, column: 3) 761!23 = !DILocation(line: 0, scope: !22) 762