; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s

target triple = "thumbv8.1m.main-none-none-eabi"

; Conventions used throughout this file:
;  * Every <4 x float> operand is split with shufflevector into lanes <0, 2>
;    and lanes <1, 3>. The later tests name these halves ".real" and ".imag",
;    i.e. each vector is treated as two interleaved complex numbers.
;  * Results are rebuilt with the mask <0, 2, 1, 3>, which re-interleaves a
;    "real" <2 x float> with an "imaginary" <2 x float>.
;  * "Expected to transform" means the checks expect MVE complex instructions
;    (vcmul / vcmla / vcadd). "Expected to not transform" means the checks
;    expect ordinary vmul / vfma / vsub etc. plus scalar lane moves.
;  * The CHECK lines are regenerated by the script named in the NOTE line
;    above; do not edit them by hand.

; (a * b) * c
; Two chained complex multiplies; each one becomes a vcmul #0 + vcmla #90 pair.
; Expected to transform
define arm_aapcs_vfpcc <4 x float> @mul_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmul.f32 q3, q0, q1, #0
; CHECK-NEXT: vcmla.f32 q3, q0, q1, #90
; CHECK-NEXT: vcmul.f32 q0, q3, q2, #0
; CHECK-NEXT: vcmla.f32 q0, q3, q2, #90
; CHECK-NEXT: bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec151 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec153 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec154 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %strided.vec154, %strided.vec151
  %1 = fmul fast <2 x float> %strided.vec153, %strided.vec
  %2 = fmul fast <2 x float> %strided.vec154, %strided.vec
  %3 = fmul fast <2 x float> %strided.vec153, %strided.vec151
  ; %4 = imaginary part of a * b, %5 = real part of a * b
  %4 = fadd fast <2 x float> %3, %2
  %5 = fsub fast <2 x float> %1, %0
  %strided.vec156 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec157 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %8 = imaginary part of (a * b) * c, %11 = real part of (a * b) * c
  %6 = fmul fast <2 x float> %4, %strided.vec156
  %7 = fmul fast <2 x float> %5, %strided.vec157
  %8 = fadd fast <2 x float> %6, %7
  %9 = fmul fast <2 x float> %strided.vec156, %5
  %10 = fmul fast <2 x float> %4, %strided.vec157
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Multiplies c by a value whose "real" lanes come from (b - c) and whose
; "imaginary" lanes come from (b - a). The two halves of that operand do not
; come from the same vector, so no complex instruction is expected.
; Expected to not transform
define arm_aapcs_vfpcc <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: add_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vsub.f32 q3, q1, q2
; CHECK-NEXT: vsub.f32 q0, q1, q0
; CHECK-NEXT: vmov.f32 s16, s9
; CHECK-NEXT: vmov.f32 s13, s14
; CHECK-NEXT: vmov.f32 s17, s11
; CHECK-NEXT: vmov.f32 s0, s1
; CHECK-NEXT: vmul.f32 q1, q3, q4
; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vmov.f32 s9, s10
; CHECK-NEXT: vfma.f32 q1, q2, q0
; CHECK-NEXT: vmul.f32 q0, q4, q0
; CHECK-NEXT: vneg.f32 q4, q0
; CHECK-NEXT: vmov.f32 s1, s4
; CHECK-NEXT: vfma.f32 q4, q2, q3
; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vmov.f32 s0, s16
; CHECK-NEXT: vmov.f32 s2, s17
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  ; %1 = lanes <0, 2> of (b - c)
  %0 = fsub fast <4 x float> %b, %c
  %1 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec58 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec59 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %2 = fmul fast <2 x float> %1, %strided.vec59
  ; %4 = lanes <1, 3> of (b - a)
  %3 = fsub fast <4 x float> %b, %a
  %4 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %5 = fmul fast <2 x float> %strided.vec58, %4
  %6 = fadd fast <2 x float> %5, %2
  %7 = fmul fast <2 x float> %strided.vec58, %1
  %8 = fmul fast <2 x float> %strided.vec59, %4
  %9 = fsub fast <2 x float> %7, %8
  %interleaved.vec = shufflevector <2 x float> %9, <2 x float> %6, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; The first stage computes %2 = imag(b * c) and %6 = -real(b * c); the second
; stage multiplies by a using %2 as the real half and %6 as the imaginary
; half, i.e. (b * c) rotated by 270 degrees, then times a.
; Expected to not transform
define arm_aapcs_vfpcc <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_mul270_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d12}
; CHECK-NEXT: vpush {d12}
; CHECK-NEXT: .vsave {d10}
; CHECK-NEXT: vpush {d10}
; CHECK-NEXT: .vsave {d8}
; CHECK-NEXT: vpush {d8}
; CHECK-NEXT: vmov.f32 s20, s4
; CHECK-NEXT: vmov.f32 s16, s8
; CHECK-NEXT: vmov.f32 s17, s10
; CHECK-NEXT: vmov.f32 s21, s6
; CHECK-NEXT: vmul.f32 q3, q5, q4
; CHECK-NEXT: vmov.f32 s4, s5
; CHECK-NEXT: vneg.f32 q3, q3
; CHECK-NEXT: vmov.f32 s24, s9
; CHECK-NEXT: vmov.f32 s25, s11
; CHECK-NEXT: vmov.f32 s5, s7
; CHECK-NEXT: vmul.f32 q2, q1, q4
; CHECK-NEXT: vmov.f32 s16, s0
; CHECK-NEXT: vfma.f32 q3, q1, q6
; CHECK-NEXT: vmov.f32 s17, s2
; CHECK-NEXT: vmov.f32 s0, s1
; CHECK-NEXT: vfma.f32 q2, q5, q6
; CHECK-NEXT: vmul.f32 q1, q3, q4
; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vfma.f32 q1, q2, q0
; CHECK-NEXT: vmul.f32 q0, q3, q0
; CHECK-NEXT: vneg.f32 q3, q0
; CHECK-NEXT: vmov.f32 s1, s4
; CHECK-NEXT: vfma.f32 q3, q2, q4
; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vmov.f32 s0, s12
; CHECK-NEXT: vmov.f32 s2, s13
; CHECK-NEXT: vpop {d8}
; CHECK-NEXT: vpop {d10}
; CHECK-NEXT: vpop {d12}
; CHECK-NEXT: bx lr
entry:
  %strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec81 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec83 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec84 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %strided.vec84, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec83, %strided.vec81
  %2 = fadd fast <2 x float> %1, %0
  %strided.vec86 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec87 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %3 = fmul fast <2 x float> %2, %strided.vec87
  %4 = fmul fast <2 x float> %strided.vec84, %strided.vec81
  %5 = fmul fast <2 x float> %strided.vec83, %strided.vec
  ; Note the operand order: imag*imag - real*real, the negated real part.
  %6 = fsub fast <2 x float> %4, %5
  %7 = fmul fast <2 x float> %6, %strided.vec86
  %8 = fadd fast <2 x float> %3, %7
  %9 = fmul fast <2 x float> %2, %strided.vec86
  %10 = fmul fast <2 x float> %6, %strided.vec87
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; (a * b) * a
; The operand a is shared by both multiplies.
; Expected to transform
define arm_aapcs_vfpcc <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: mul_triangle:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmul.f32 q2, q1, q0, #0
; CHECK-NEXT: vcmla.f32 q2, q1, q0, #90
; CHECK-NEXT: vcmul.f32 q1, q0, q2, #0
; CHECK-NEXT: vcmla.f32 q1, q0, q2, #90
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; %2 = real part of a * b
  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
  %2 = fsub fast <2 x float> %0, %1
  %3 = fmul fast <2 x float> %2, %strided.vec35
  ; %6 = imaginary part of a * b
  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
  %6 = fadd fast <2 x float> %4, %5
  %7 = fmul fast <2 x float> %6, %strided.vec
  %8 = fadd fast <2 x float> %3, %7
  %9 = fmul fast <2 x float> %2, %strided.vec
  %10 = fmul fast <2 x float> %6, %strided.vec35
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}


; d * (b * a) * (c * a)
; The operand a feeds two separate multiplies whose results are then combined.
; Expected to transform
define arm_aapcs_vfpcc <4 x float> @mul_diamond(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
; CHECK-LABEL: mul_diamond:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vcmul.f32 q4, q1, q0, #0
; CHECK-NEXT: vcmla.f32 q4, q1, q0, #90
; CHECK-NEXT: vcmul.f32 q1, q4, q3, #0
; CHECK-NEXT: vcmla.f32 q1, q4, q3, #90
; CHECK-NEXT: vcmul.f32 q3, q2, q0, #0
; CHECK-NEXT: vcmla.f32 q3, q2, q0, #90
; CHECK-NEXT: vcmul.f32 q0, q3, q1, #0
; CHECK-NEXT: vcmla.f32 q0, q3, q1, #90
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %d.real = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %d.imag = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; b * a: %2 = imaginary part, %5 = real part
  %0 = fmul fast <2 x float> %a.imag, %b.real
  %1 = fmul fast <2 x float> %a.real, %b.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %a.real, %b.real
  %4 = fmul fast <2 x float> %b.imag, %a.imag
  %5 = fsub fast <2 x float> %3, %4
  ; d * (b * a): %10 = real part, %11 = imaginary part
  %6 = fmul fast <2 x float> %d.real, %5
  %7 = fmul fast <2 x float> %2, %d.imag
  %8 = fmul fast <2 x float> %d.real, %2
  %9 = fmul fast <2 x float> %5, %d.imag
  %10 = fsub fast <2 x float> %6, %7
  %11 = fadd fast <2 x float> %8, %9
  ; c * a: %14 = imaginary part, %18 = real part
  %12 = fmul fast <2 x float> %c.real, %a.imag
  %13 = fmul fast <2 x float> %c.imag, %a.real
  %14 = fadd fast <2 x float> %13, %12
  %15 = fmul fast <2 x float> %14, %10
  %16 = fmul fast <2 x float> %c.real, %a.real
  %17 = fmul fast <2 x float> %c.imag, %a.imag
  %18 = fsub fast <2 x float> %16, %17
  ; Final product: %20 = imaginary part, %23 = real part
  %19 = fmul fast <2 x float> %18, %11
  %20 = fadd fast <2 x float> %15, %19
  %21 = fmul fast <2 x float> %18, %10
  %22 = fmul fast <2 x float> %14, %11
  %23 = fsub fast <2 x float> %21, %22
  %interleaved.vec = shufflevector <2 x float> %23, <2 x float> %20, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; x = b * a and y = c * a are combined as (yr - xi, yi + xr), i.e. y plus x
; rotated by 90 degrees; the checks expect a single vcadd #90 for that step.
; Expected to transform
define arm_aapcs_vfpcc <4 x float> @mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_add90_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vcmul.f32 q3, q2, q0, #0
; CHECK-NEXT: vcmul.f32 q4, q1, q0, #0
; CHECK-NEXT: vcmla.f32 q4, q1, q0, #90
; CHECK-NEXT: vcmla.f32 q3, q2, q0, #90
; CHECK-NEXT: vcadd.f32 q0, q3, q4, #90
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>

  %i6 = fmul fast <2 x float> %br, %ar
  %i7 = fmul fast <2 x float> %bi, %ai
  %xr = fsub fast <2 x float> %i6, %i7
  %i9 = fmul fast <2 x float> %bi, %ar
  %i10 = fmul fast <2 x float> %br, %ai
  %xi = fadd fast <2 x float> %i9, %i10

  %j6 = fmul fast <2 x float> %cr, %ar
  %j7 = fmul fast <2 x float> %ci, %ai
  %yr = fsub fast <2 x float> %j6, %j7
  %j9 = fmul fast <2 x float> %ci, %ar
  %j10 = fmul fast <2 x float> %cr, %ai
  %yi = fadd fast <2 x float> %j9, %j10

  %zr = fsub fast <2 x float> %yr, %xi
  %zi = fadd fast <2 x float> %yi, %xr
  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Same shape as mul_add90_mul, but the y half reuses %i6 and %i9 from the x
; half instead of computing its own products (the %j6 / %j9 lines are
; commented out), so y is no longer a complete complex multiply.
; Expected to not transform
define arm_aapcs_vfpcc <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_triangle_addmul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vmov.f32 s16, s0
; CHECK-NEXT: vmov.f32 s20, s5
; CHECK-NEXT: vmov.f32 s17, s2
; CHECK-NEXT: vmov.f32 s21, s7
; CHECK-NEXT: vmov.f32 s5, s6
; CHECK-NEXT: vmul.f32 q3, q5, q4
; CHECK-NEXT: vmul.f32 q4, q1, q4
; CHECK-NEXT: vmov.f32 s0, s1
; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: vfms.f32 q6, q5, q0
; CHECK-NEXT: vmov q7, q3
; CHECK-NEXT: vfma.f32 q3, q1, q0
; CHECK-NEXT: vmov.f32 s20, s8
; CHECK-NEXT: vmov.f32 s21, s10
; CHECK-NEXT: vmov.f32 s4, s9
; CHECK-NEXT: vfma.f32 q7, q5, q0
; CHECK-NEXT: vmov.f32 s5, s11
; CHECK-NEXT: vadd.f32 q5, q7, q6
; CHECK-NEXT: vfms.f32 q4, q1, q0
; CHECK-NEXT: vmov.f32 s1, s20
; CHECK-NEXT: vsub.f32 q1, q4, q3
; CHECK-NEXT: vmov.f32 s3, s21
; CHECK-NEXT: vmov.f32 s0, s4
; CHECK-NEXT: vmov.f32 s2, s5
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>

  %i6 = fmul fast <2 x float> %br, %ar
  %i7 = fmul fast <2 x float> %bi, %ai
  %xr = fsub fast <2 x float> %i6, %i7
  %i9 = fmul fast <2 x float> %bi, %ar
  %i10 = fmul fast <2 x float> %br, %ai
  %xi = fadd fast <2 x float> %i9, %i10

  ;%j6 = fmul fast <2 x float> %cr, %ar
  %j7 = fmul fast <2 x float> %ci, %ai
  %yr = fsub fast <2 x float> %i6, %j7
  ;%j9 = fmul fast <2 x float> %ci, %ar
  %j10 = fmul fast <2 x float> %cr, %ai
  %yi = fadd fast <2 x float> %i9, %j10

  %zr = fsub fast <2 x float> %yr, %xi
  %zi = fadd fast <2 x float> %yi, %xr
  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; Same arithmetic as mul_triangle, but the intermediate real/imaginary halves
; (%2 and %6) have an extra use: they are concatenated (mask <0, 1, 2, 3>, not
; interleaved) and stored to %p.
; Expected to not transform
define arm_aapcs_vfpcc <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %p) {
; CHECK-LABEL: mul_triangle_multiuses:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.f32 s16, s4
; CHECK-NEXT: vmov.f32 s8, s1
; CHECK-NEXT: vmov.f32 s17, s6
; CHECK-NEXT: vmov.f32 s9, s3
; CHECK-NEXT: vmov.f32 s4, s5
; CHECK-NEXT: vmul.f32 q3, q2, q4
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmov.f32 s5, s7
; CHECK-NEXT: vfma.f32 q3, q1, q0
; CHECK-NEXT: vmul.f32 q1, q1, q2
; CHECK-NEXT: vneg.f32 q1, q1
; CHECK-NEXT: vfma.f32 q1, q4, q0
; CHECK-NEXT: vmov.f32 s18, s12
; CHECK-NEXT: vmov.f32 s16, s4
; CHECK-NEXT: vmov.f32 s17, s5
; CHECK-NEXT: vmov.f32 s19, s13
; CHECK-NEXT: vstrw.32 q4, [r0]
; CHECK-NEXT: vmul.f32 q4, q3, q0
; CHECK-NEXT: vfma.f32 q4, q1, q2
; CHECK-NEXT: vmul.f32 q2, q3, q2
; CHECK-NEXT: vneg.f32 q2, q2
; CHECK-NEXT: vfma.f32 q2, q1, q0
; CHECK-NEXT: vmov.f32 s1, s16
; CHECK-NEXT: vmov.f32 s0, s8
; CHECK-NEXT: vmov.f32 s2, s9
; CHECK-NEXT: vmov.f32 s3, s17
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
  %2 = fsub fast <2 x float> %0, %1
  %3 = fmul fast <2 x float> %2, %strided.vec35
  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
  %6 = fadd fast <2 x float> %4, %5
  ; Extra, non-interleaving use of the intermediate halves.
  %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %otheruse, ptr %p
  %7 = fmul fast <2 x float> %6, %strided.vec
  %8 = fadd fast <2 x float> %3, %7
  %9 = fmul fast <2 x float> %2, %strided.vec
  %10 = fmul fast <2 x float> %6, %strided.vec35
  %11 = fsub fast <2 x float> %9, %10
  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; The remaining functions are declared without arm_aapcs_vfpcc; their checks
; therefore also contain moves between r0-r3 / the stack and the q registers.

; (a * b) + c
; The addition is folded into the multiply: the checks expect two vcmla
; instructions accumulating into the register that holds c, and no vcmul.
; Expected to transform
define <4 x float> @mul_addequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_addequal:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: add.w r12, sp, #16
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vldrw.u32 q2, [r12]
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vcmla.f32 q2, q0, q1, #0
; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %6 = fadd fast <2 x float> %5, %c.real
  %7 = fadd fast <2 x float> %2, %c.imag
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; (a * b) - c
; The multiply becomes vcmul + vcmla; the subtraction stays a plain vsub.
; Expected to transform
define <4 x float> @mul_subequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_subequal:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vldrw.u32 q2, [r1]
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90
; CHECK-NEXT: vsub.f32 q0, q3, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %6 = fsub fast <2 x float> %5, %c.real
  %7 = fsub fast <2 x float> %2, %c.imag
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}


; (a * b) multiplied lane-wise by c (real * real, imag * imag), which is not a
; complex multiply. The inner a * b becomes vcmul + vcmla; the lane-wise
; multiply stays a plain vmul.
; Expected to transform
define <4 x float> @mul_mulequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_mulequal:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vldrw.u32 q2, [r1]
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90
; CHECK-NEXT: vmul.f32 q0, q3, q1
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %6 = fmul fast <2 x float> %5, %c.real
  %7 = fmul fast <2 x float> %2, %c.imag
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; (a * b) divided lane-wise by c. Here not even the inner multiply is expected
; to become vcmul/vcmla: the checks show vmul/vfma followed by scalar vdiv.
; Expected to not transform
define <4 x float> @mul_divequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: mul_divequal:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d10, d11}
; CHECK-NEXT: vpush {d10, d11}
; CHECK-NEXT: .vsave {d8}
; CHECK-NEXT: vpush {d8}
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: add r0, sp, #24
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vmov.f32 s16, s1
; CHECK-NEXT: add.w r12, sp, #40
; CHECK-NEXT: vmov.f32 s12, s5
; CHECK-NEXT: vmov.f32 s13, s7
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmov.f32 s8, s4
; CHECK-NEXT: vmul.f32 q5, q3, q0
; CHECK-NEXT: vmov.f32 s9, s6
; CHECK-NEXT: vldrw.u32 q1, [r12]
; CHECK-NEXT: vmov.f32 s17, s3
; CHECK-NEXT: vfma.f32 q5, q2, q4
; CHECK-NEXT: vmul.f32 q3, q4, q3
; CHECK-NEXT: vdiv.f32 s3, s21, s7
; CHECK-NEXT: vneg.f32 q3, q3
; CHECK-NEXT: vfma.f32 q3, q2, q0
; CHECK-NEXT: vdiv.f32 s1, s20, s5
; CHECK-NEXT: vdiv.f32 s2, s13, s6
; CHECK-NEXT: vdiv.f32 s0, s12, s4
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vpop {d8}
; CHECK-NEXT: vpop {d10, d11}
; CHECK-NEXT: bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %6 = fdiv fast <2 x float> %5, %c.real
  %7 = fdiv fast <2 x float> %2, %c.imag
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

; -(a * b)
; The negation is absorbed into the rotations: the checks expect vcmul #180
; and vcmla #270 instead of #0 / #90 followed by a separate negate.
; Expected to transform
define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: mul_negequal:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov d0, r0, r1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov d1, r2, r3
; CHECK-NEXT: vcmul.f32 q2, q0, q1, #180
; CHECK-NEXT: vcmla.f32 q2, q0, q1, #270
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: bx lr
entry:
  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fmul fast <2 x float> %b.imag, %strided.vec
  %1 = fmul fast <2 x float> %b.real, %a.imag
  %2 = fadd fast <2 x float> %1, %0
  %3 = fmul fast <2 x float> %b.real, %strided.vec
  %4 = fmul fast <2 x float> %a.imag, %b.imag
  %5 = fsub fast <2 x float> %3, %4
  %6 = fneg fast <2 x float> %5
  %7 = fneg fast <2 x float> %2
  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %7, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}