; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s

; Tests of complex-arithmetic deinterleaving when an intermediate value has
; more than one use.
;
; Conventions used by every function in this file:
;   * A vector argument holds interleaved complex numbers: even lanes are the
;     real parts, odd lanes are the imaginary parts.
;   * shufflevector masks <0, 2> and <1, 3> split a vector into its real and
;     imaginary halves; mask <0, 2, 1, 3> interleaves two halves again.
;   * The pseudo-code above each function describes the complex expression the
;     IR computes. The expected-output lines are regenerated by the script
;     named on the first line and should not be edited by hand.

target triple = "aarch64"

; Expected to transform
; *p = (a * b);
; return (a * b) * a;
; The product a * b is used twice: it is stored in interleaved form and it is
; an operand of the second multiplication.
define <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b, ptr %p) {
; CHECK-LABEL: mul_triangle:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v3.2d, #0000000000000000
; CHECK-NEXT:    movi v2.2d, #0000000000000000
; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #0
; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #90
; CHECK-NEXT:    fcmla v2.4s, v3.4s, v0.4s, #0
; CHECK-NEXT:    str q3, [x0]
; CHECK-NEXT:    fcmla v2.4s, v3.4s, v0.4s, #90
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  ; Split a and b into real (even lanes) and imaginary (odd lanes) halves.
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %2 = real(a * b), %6 = imag(a * b).
  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
  %2 = fsub fast <2 x float> %0, %1
  %3 = fmul fast <2 x float> %2, %strided.vec35
  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
  %6 = fadd fast <2 x float> %4, %5
  ; Additional use of a * b: interleave both halves and store the full product.
  %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x float> %otheruse, ptr %p
  ; (a * b) * a: %11 is the real part, %8 is the imaginary part.
  %7 = fmul fast <2 x float> %6, %strided.vec
  %8 = fadd fast <2 x float> %3, %7
  %9 = fmul fast <2 x float> %2, %strided.vec
  %10 = fmul fast <2 x float> %6, %strided.vec35
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to not transform. Shows that external use prevents deinterleaving.
; *p = (a * b).imag();
; return (a * b) * a;
; The stored value %6 is the sum of the cross products, i.e. the imaginary
; half of a * b, so only one deinterleaved half escapes through the store.
define <4 x float> @mul_triangle_external_use(<4 x float> %a, <4 x float> %b, ptr %p) {
; CHECK-LABEL: mul_triangle_external_use:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    zip2 v4.2s, v0.2s, v2.2s
; CHECK-NEXT:    zip1 v5.2s, v1.2s, v3.2s
; CHECK-NEXT:    zip1 v0.2s, v0.2s, v2.2s
; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
; CHECK-NEXT:    fmul v2.2s, v4.2s, v5.2s
; CHECK-NEXT:    fmul v3.2s, v1.2s, v4.2s
; CHECK-NEXT:    fmla v2.2s, v0.2s, v1.2s
; CHECK-NEXT:    fneg v1.2s, v3.2s
; CHECK-NEXT:    fmul v3.2s, v2.2s, v4.2s
; CHECK-NEXT:    str d2, [x0]
; CHECK-NEXT:    fmla v1.2s, v0.2s, v5.2s
; CHECK-NEXT:    fmul v5.2s, v2.2s, v0.2s
; CHECK-NEXT:    fneg v3.2s, v3.2s
; CHECK-NEXT:    fmla v5.2s, v4.2s, v1.2s
; CHECK-NEXT:    fmla v3.2s, v0.2s, v1.2s
; CHECK-NEXT:    zip1 v0.4s, v3.4s, v5.4s
; CHECK-NEXT:    ret
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %2 = real(a * b), %6 = imag(a * b).
  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
  %2 = fsub fast <2 x float> %0, %1
  %3 = fmul fast <2 x float> %2, %strided.vec35
  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
  %6 = fadd fast <2 x float> %4, %5
  ; External use of a single deinterleaved half (no interleaving shuffle).
  store <2 x float> %6, ptr %p
  ; (a * b) * a: %11 is the real part, %8 is the imaginary part.
  %7 = fmul fast <2 x float> %6, %strided.vec
  %8 = fadd fast <2 x float> %3, %7
  %9 = fmul fast <2 x float> %2, %strided.vec
  %10 = fmul fast <2 x float> %6, %strided.vec35
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Expected to transform partially (only d * c). Shows that external use of shufflevector does not prevent deinterleaving.
; *p1 = (a * b).real();
; *p2 = (a * b) * c;
; return d * c;
; The deinterleaving shuffles of c feed both the (a * b) * c chain and the
; d * c chain.
define <4 x float> @multiple_muls_shuffle_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) {
; CHECK-LABEL: multiple_muls_shuffle_external:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    ext v6.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
; CHECK-NEXT:    zip2 v7.2s, v0.2s, v5.2s
; CHECK-NEXT:    zip1 v16.2s, v1.2s, v6.2s
; CHECK-NEXT:    zip2 v1.2s, v1.2s, v6.2s
; CHECK-NEXT:    zip1 v0.2s, v0.2s, v5.2s
; CHECK-NEXT:    fmul v5.2s, v16.2s, v7.2s
; CHECK-NEXT:    fmul v6.2s, v1.2s, v7.2s
; CHECK-NEXT:    fmla v5.2s, v0.2s, v1.2s
; CHECK-NEXT:    fneg v1.2s, v6.2s
; CHECK-NEXT:    zip1 v6.2s, v2.2s, v4.2s
; CHECK-NEXT:    zip2 v4.2s, v2.2s, v4.2s
; CHECK-NEXT:    fmla v1.2s, v0.2s, v16.2s
; CHECK-NEXT:    fmul v17.2s, v6.2s, v5.2s
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    fmul v5.2s, v4.2s, v5.2s
; CHECK-NEXT:    fmla v17.2s, v1.2s, v4.2s
; CHECK-NEXT:    fcmla v0.4s, v2.4s, v3.4s, #0
; CHECK-NEXT:    str d1, [x0]
; CHECK-NEXT:    fneg v16.2s, v5.2s
; CHECK-NEXT:    fcmla v0.4s, v2.4s, v3.4s, #90
; CHECK-NEXT:    fmla v16.2s, v1.2s, v6.2s
; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x1]
; CHECK-NEXT:    ret
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec88 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec90 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec91 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %2 = imag(a * b), %5 = real(a * b).
  %0 = fmul fast <2 x float> %strided.vec91, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec90, %strided.vec88
  %2 = fadd fast <2 x float> %0, %1
  %3 = fmul fast <2 x float> %strided.vec90, %strided.vec
  %4 = fmul fast <2 x float> %strided.vec91, %strided.vec88
  %5 = fsub fast <2 x float> %3, %4
  ; External use of the real half only.
  store <2 x float> %5, ptr %p1
  %strided.vec93 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec94 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; (a * b) * c: %11 is the real part, %8 is the imaginary part.
  %6 = fmul fast <2 x float> %strided.vec94, %5
  %7 = fmul fast <2 x float> %strided.vec93, %2
  %8 = fadd fast <2 x float> %6, %7
  %9 = fmul fast <2 x float> %strided.vec93, %5
  %10 = fmul fast <2 x float> %strided.vec94, %2
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x float> %interleaved.vec, ptr %p2
  %strided.vec96 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec97 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; d * c: %17 is the real part, %14 is the imaginary part.
  %12 = fmul fast <2 x float> %strided.vec96, %strided.vec94
  %13 = fmul fast <2 x float> %strided.vec97, %strided.vec93
  %14 = fadd fast <2 x float> %13, %12
  %15 = fmul fast <2 x float> %strided.vec96, %strided.vec93
  %16 = fmul fast <2 x float> %strided.vec97, %strided.vec94
  %17 = fsub fast <2 x float> %15, %16
  %interleaved.vec98 = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec98
}

; Same as above, but the data are loaded from memory instead of being passed as arguments.
; Expected to transform partially (only d * c).
; Shows that ld2 is not generated for `c` although it is used by both the complex `d * c` and the non-complex `(a * b) * c` instruction chains.
define <4 x float> @multiple_muls_shuffle_external_with_loads(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_d, ptr %p1, ptr %p2) {
; CHECK-LABEL: multiple_muls_shuffle_external_with_loads:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
; CHECK-NEXT:    ld2 { v2.2s, v3.2s }, [x1]
; CHECK-NEXT:    fmul v4.2s, v3.2s, v1.2s
; CHECK-NEXT:    fmul v6.2s, v2.2s, v1.2s
; CHECK-NEXT:    fneg v4.2s, v4.2s
; CHECK-NEXT:    fmla v6.2s, v0.2s, v3.2s
; CHECK-NEXT:    fmla v4.2s, v0.2s, v2.2s
; CHECK-NEXT:    str d4, [x4]
; CHECK-NEXT:    ldr q5, [x2]
; CHECK-NEXT:    ext v7.16b, v5.16b, v5.16b, #8
; CHECK-NEXT:    zip1 v0.2s, v5.2s, v7.2s
; CHECK-NEXT:    zip2 v1.2s, v5.2s, v7.2s
; CHECK-NEXT:    fmul v3.2s, v0.2s, v6.2s
; CHECK-NEXT:    fmul v6.2s, v1.2s, v6.2s
; CHECK-NEXT:    fmla v3.2s, v4.2s, v1.2s
; CHECK-NEXT:    fneg v2.2s, v6.2s
; CHECK-NEXT:    fmla v2.2s, v4.2s, v0.2s
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x5]
; CHECK-NEXT:    ldr q1, [x3]
; CHECK-NEXT:    fcmla v0.4s, v5.4s, v1.4s, #0
; CHECK-NEXT:    fcmla v0.4s, v5.4s, v1.4s, #90
; CHECK-NEXT:    ret
entry:
  %a = load <4 x float>, ptr %ptr_a
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec88 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b = load <4 x float>, ptr %ptr_b
  %strided.vec90 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec91 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %2 = imag(a * b), %5 = real(a * b).
  %0 = fmul fast <2 x float> %strided.vec91, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec90, %strided.vec88
  %2 = fadd fast <2 x float> %0, %1
  %3 = fmul fast <2 x float> %strided.vec90, %strided.vec
  %4 = fmul fast <2 x float> %strided.vec91, %strided.vec88
  %5 = fsub fast <2 x float> %3, %4
  ; External use of the real half only.
  store <2 x float> %5, ptr %p1
  %c = load <4 x float>, ptr %ptr_c
  %strided.vec93 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec94 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; (a * b) * c: %11 is the real part, %8 is the imaginary part.
  %6 = fmul fast <2 x float> %strided.vec94, %5
  %7 = fmul fast <2 x float> %strided.vec93, %2
  %8 = fadd fast <2 x float> %6, %7
  %9 = fmul fast <2 x float> %strided.vec93, %5
  %10 = fmul fast <2 x float> %strided.vec94, %2
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x float> %interleaved.vec, ptr %p2
  %d = load <4 x float>, ptr %ptr_d
  %strided.vec96 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec97 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; d * c: %17 is the real part, %14 is the imaginary part.
  %12 = fmul fast <2 x float> %strided.vec96, %strided.vec94
  %13 = fmul fast <2 x float> %strided.vec97, %strided.vec93
  %14 = fadd fast <2 x float> %13, %12
  %15 = fmul fast <2 x float> %strided.vec96, %strided.vec93
  %16 = fmul fast <2 x float> %strided.vec97, %strided.vec94
  %17 = fsub fast <2 x float> %15, %16
  %interleaved.vec98 = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec98
}

; Expected to not transform. Shows that external use prevents deinterleaving whole chain.
; *p1 = (a * b).real();
; *p2 = (a * b) * (d * c);
; return d * c;
; Unlike the functions above, the arithmetic here carries the
; `nnan ninf contract` flags rather than `fast`.
define <4 x float> @multiple_muls_mul_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) {
; CHECK-LABEL: multiple_muls_mul_external:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    ext v16.16b, v2.16b, v2.16b, #8
; CHECK-NEXT:    ext v17.16b, v3.16b, v3.16b, #8
; CHECK-NEXT:    zip2 v6.2s, v0.2s, v4.2s
; CHECK-NEXT:    zip2 v7.2s, v1.2s, v5.2s
; CHECK-NEXT:    zip1 v19.2s, v2.2s, v16.2s
; CHECK-NEXT:    zip2 v2.2s, v2.2s, v16.2s
; CHECK-NEXT:    zip2 v16.2s, v3.2s, v17.2s
; CHECK-NEXT:    zip1 v0.2s, v0.2s, v4.2s
; CHECK-NEXT:    zip1 v1.2s, v1.2s, v5.2s
; CHECK-NEXT:    zip1 v3.2s, v3.2s, v17.2s
; CHECK-NEXT:    fmul v18.2s, v6.2s, v7.2s
; CHECK-NEXT:    fmul v5.2s, v19.2s, v16.2s
; CHECK-NEXT:    fmul v16.2s, v2.2s, v16.2s
; CHECK-NEXT:    fmul v7.2s, v0.2s, v7.2s
; CHECK-NEXT:    fneg v4.2s, v18.2s
; CHECK-NEXT:    fmla v5.2s, v3.2s, v2.2s
; CHECK-NEXT:    fneg v2.2s, v16.2s
; CHECK-NEXT:    fmla v7.2s, v1.2s, v6.2s
; CHECK-NEXT:    fmla v4.2s, v1.2s, v0.2s
; CHECK-NEXT:    fmla v2.2s, v3.2s, v19.2s
; CHECK-NEXT:    fmul v0.2s, v7.2s, v5.2s
; CHECK-NEXT:    fmul v17.2s, v4.2s, v5.2s
; CHECK-NEXT:    str d4, [x0]
; CHECK-NEXT:    fmla v17.2s, v2.2s, v7.2s
; CHECK-NEXT:    fneg v16.2s, v0.2s
; CHECK-NEXT:    zip1 v0.4s, v2.4s, v5.4s
; CHECK-NEXT:    fmla v16.2s, v2.2s, v4.2s
; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x1]
; CHECK-NEXT:    ret
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec126 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec128 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec129 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %2 = imag(a * b), %5 = real(a * b).
  %0 = fmul nnan ninf contract <2 x float> %strided.vec, %strided.vec129
  %1 = fmul nnan ninf contract <2 x float> %strided.vec126, %strided.vec128
  %2 = fadd nnan ninf contract <2 x float> %1, %0
  %3 = fmul nnan ninf contract <2 x float> %strided.vec, %strided.vec128
  %4 = fmul nnan ninf contract <2 x float> %strided.vec126, %strided.vec129
  %5 = fsub nnan ninf contract <2 x float> %3, %4
  ; External use of the real half only.
  store <2 x float> %5, ptr %p1
  %strided.vec131 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec132 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec134 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec135 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; d * c: %11 is the real part, %8 is the imaginary part.
  %6 = fmul nnan ninf contract <2 x float> %strided.vec131, %strided.vec135
  %7 = fmul nnan ninf contract <2 x float> %strided.vec132, %strided.vec134
  %8 = fadd nnan ninf contract <2 x float> %7, %6
  %9 = fmul nnan ninf contract <2 x float> %strided.vec131, %strided.vec134
  %10 = fmul nnan ninf contract <2 x float> %strided.vec132, %strided.vec135
  %11 = fsub nnan ninf contract <2 x float> %9, %10
  ; (a * b) * (d * c): %17 is the real part, %14 is the imaginary part.
  %12 = fmul nnan ninf contract <2 x float> %5, %8
  %13 = fmul nnan ninf contract <2 x float> %2, %11
  %14 = fadd nnan ninf contract <2 x float> %13, %12
  %15 = fmul nnan ninf contract <2 x float> %5, %11
  %16 = fmul nnan ninf contract <2 x float> %2, %8
  %17 = fsub nnan ninf contract <2 x float> %15, %16
  %interleaved.vec = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x float> %interleaved.vec, ptr %p2
  %interleaved.vec136 = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec136
}

; Expected to transform. Shows that composite common subexpression is not generated twice.
; u[i] = a[i] * b[i] - (c[i] * d[i] + g[i] * h[i]);
; v[i] = e[i] * f[i] + (c[i] * d[i] + g[i] * h[i]);
; The shared term (c * d + g * h) is held in %14 (real half) and %17
; (imaginary half) and is consumed by both results.
define void @mul_add_common_mul_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e, <4 x double> %f, <4 x double> %g, <4 x double> %h, ptr %p1, ptr %p2) {
; CHECK-LABEL: mul_add_common_mul_add_mul:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi v16.2d, #0000000000000000
; CHECK-NEXT:    movi v17.2d, #0000000000000000
; CHECK-NEXT:    ldr q19, [sp, #112]
; CHECK-NEXT:    ldp q18, q20, [sp, #80]
; CHECK-NEXT:    ldr q21, [sp, #64]
; CHECK-NEXT:    movi v22.2d, #0000000000000000
; CHECK-NEXT:    fcmla v16.2d, v18.2d, v19.2d, #0
; CHECK-NEXT:    fcmla v17.2d, v21.2d, v20.2d, #0
; CHECK-NEXT:    fcmla v22.2d, v1.2d, v3.2d, #0
; CHECK-NEXT:    fcmla v16.2d, v18.2d, v19.2d, #90
; CHECK-NEXT:    movi v18.2d, #0000000000000000
; CHECK-NEXT:    fcmla v17.2d, v21.2d, v20.2d, #90
; CHECK-NEXT:    fcmla v22.2d, v1.2d, v3.2d, #90
; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #0
; CHECK-NEXT:    fcmla v18.2d, v0.2d, v2.2d, #0
; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #0
; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #90
; CHECK-NEXT:    fcmla v18.2d, v0.2d, v2.2d, #90
; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #90
; CHECK-NEXT:    ldp q3, q0, [sp, #32]
; CHECK-NEXT:    ldp q2, q1, [sp]
; CHECK-NEXT:    fsub v4.2d, v22.2d, v16.2d
; CHECK-NEXT:    fsub v5.2d, v18.2d, v17.2d
; CHECK-NEXT:    fcmla v16.2d, v0.2d, v1.2d, #0
; CHECK-NEXT:    fcmla v17.2d, v3.2d, v2.2d, #0
; CHECK-NEXT:    stp q5, q4, [x0]
; CHECK-NEXT:    fcmla v16.2d, v0.2d, v1.2d, #90
; CHECK-NEXT:    fcmla v17.2d, v3.2d, v2.2d, #90
; CHECK-NEXT:    stp q17, q16, [x1]
; CHECK-NEXT:    ret
entry:
  %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec123 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec125 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec126 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; Partial products of a * b; %3 = imag(a * b).
  %0 = fmul fast <2 x double> %strided.vec125, %strided.vec
  %1 = fmul fast <2 x double> %strided.vec126, %strided.vec
  %2 = fmul fast <2 x double> %strided.vec125, %strided.vec123
  %3 = fadd fast <2 x double> %1, %2
  %strided.vec128 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec129 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec131 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec132 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; %8 = real(c * d); %6 and %7 are the cross products of its imaginary part.
  %4 = fmul fast <2 x double> %strided.vec131, %strided.vec128
  %5 = fmul fast <2 x double> %strided.vec132, %strided.vec129
  %6 = fmul fast <2 x double> %strided.vec132, %strided.vec128
  %7 = fmul fast <2 x double> %strided.vec131, %strided.vec129
  %8 = fsub fast <2 x double> %4, %5
  %strided.vec134 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec135 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec137 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec138 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; %13 = real(g * h); %9 and %10 are the cross products of its imaginary part.
  %9 = fmul fast <2 x double> %strided.vec138, %strided.vec134
  %10 = fmul fast <2 x double> %strided.vec137, %strided.vec135
  %11 = fmul fast <2 x double> %strided.vec137, %strided.vec134
  %12 = fmul fast <2 x double> %strided.vec135, %strided.vec138
  %13 = fsub fast <2 x double> %11, %12
  ; Common term: %14 = real(c * d + g * h), %17 = imag(c * d + g * h).
  %14 = fadd fast <2 x double> %13, %8
  %15 = fadd fast <2 x double> %6, %7
  %16 = fadd fast <2 x double> %15, %9
  %17 = fadd fast <2 x double> %16, %10
  ; u = a * b - common: %20 is the real part, %21 is the imaginary part.
  %18 = fmul fast <2 x double> %strided.vec126, %strided.vec123
  %19 = fadd fast <2 x double> %18, %14
  %20 = fsub fast <2 x double> %0, %19
  %21 = fsub fast <2 x double> %3, %17
  %interleaved.vec = shufflevector <2 x double> %20, <2 x double> %21, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x double> %interleaved.vec, ptr %p1, align 8
  %strided.vec140 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec141 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec143 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec144 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 1, i32 3>
  ; v = e * f + common: %27 is the real part, %29 is the imaginary part.
  %22 = fmul fast <2 x double> %strided.vec143, %strided.vec140
  %23 = fmul fast <2 x double> %strided.vec144, %strided.vec140
  %24 = fmul fast <2 x double> %strided.vec143, %strided.vec141
  %25 = fadd fast <2 x double> %22, %14
  %26 = fmul fast <2 x double> %strided.vec144, %strided.vec141
  %27 = fsub fast <2 x double> %25, %26
  %28 = fadd fast <2 x double> %24, %17
  %29 = fadd fast <2 x double> %28, %23
  %interleaved.vec145 = shufflevector <2 x double> %27, <2 x double> %29, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x double> %interleaved.vec145, ptr %p2, align 8
  ret void
}