; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; This file exercises selection of the MVE vector-by-scalar multiply-accumulate
; forms for 32-, 16- and 8-bit integer lanes:
;   * "vmla"  tests: accumulator + (vector * splat(scalar))
;   * "vmlas" tests: (vector * vector) + splat(scalar)
; In every test the scalar is broadcast with the insertelement-into-lane-0 +
; shufflevector-with-zero-mask idiom, and the expected assembly takes the
; scalar directly from a core register instead of materialising the splat.

; vmla, 4 x i32: %A + (%B * splat(%X)), with the splat as the second mul operand
; and the product as the second add operand.
define arm_aapcs_vfpcc <4 x i32> @vmlau32(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlau32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %B, %1
  %3 = add nsw <4 x i32> %A, %2
  ret <4 x i32> %3
}

; Same computation as vmlau32 with the operands of both the mul and the add
; commuted; the expected assembly is identical.
define arm_aapcs_vfpcc <4 x i32> @vmlau32b(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlau32b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %1, %B
  %3 = add nsw <4 x i32> %2, %A
  ret <4 x i32> %3
}

; vmla, 8 x i16: %A + (%B * splat(%X)).
define arm_aapcs_vfpcc <8 x i16> @vmlau16(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlau16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %B, %1
  %3 = add nsw <8 x i16> %A, %2
  ret <8 x i16> %3
}

; Commuted-operand variant of vmlau16; the expected assembly is identical.
define arm_aapcs_vfpcc <8 x i16> @vmlau16b(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlau16b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %1, %B
  %3 = add nsw <8 x i16> %2, %A
  ret <8 x i16> %3
}

; vmla, 16 x i8: %A + (%B * splat(%X)).
define arm_aapcs_vfpcc <16 x i8> @vmlau8(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlau8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %B, %1
  %3 = add nsw <16 x i8> %A, %2
  ret <16 x i8> %3
}

; Commuted-operand variant of vmlau8; the expected assembly is identical.
define arm_aapcs_vfpcc <16 x i8> @vmlau8b(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlau8b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %1, %B
  %3 = add nsw <16 x i8> %2, %A
  ret <16 x i8> %3
}

; vmla inside a loop, 4 x i32. The splat of %x is built once in the entry block;
; each iteration computes d[index..] = d[index..] + s1[index..] * splat(%x) and
; advances %index by 4 until %index.next equals %n. The expected loop body uses
; the scalar register form of vmla, with no separate splat instruction in the
; function.
define void @vmla32_in_loop(ptr %s1, i32 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmla32_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vmla.i32 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %s1, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = mul nsw <4 x i32> %wide.load, %broadcast.splat9
  %2 = getelementptr inbounds i32, ptr %d, i32 %index
  %wide.load10 = load <4 x i32>, ptr %2, align 4
  %3 = add nsw <4 x i32> %wide.load10, %1
  store <4 x i32> %3, ptr %2, align 4
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; vmla inside a loop, 8 x i16. Same shape as vmla32_in_loop with a step of 8;
; here the mul/add carry no nsw flag and the product is the first add operand.
define void @vmla16_in_loop(ptr %s1, i16 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmla16_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vmla.i16 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <8 x i16> undef, i16 %x, i32 0
  %broadcast.splat12 = shufflevector <8 x i16> %broadcast.splatinsert11, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %s1, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  %1 = mul <8 x i16> %wide.load, %broadcast.splat12
  %2 = getelementptr inbounds i16, ptr %d, i32 %index
  %wide.load13 = load <8 x i16>, ptr %2, align 2
  %3 = add <8 x i16> %1, %wide.load13
  store <8 x i16> %3, ptr %2, align 2
  %index.next = add i32 %index, 8
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; vmla inside a loop, 16 x i8, with a step of 16.
; NOTE(review): the <16 x i8> loads and the store use `align 2`, and the
; expected assembly correspondingly shows halfword (vldrh.u16) loads. This
; looks carried over from the 16-bit variant - confirm it is intentional before
; changing it, since the expected output depends on it.
define void @vmla8_in_loop(ptr %s1, i8 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmla8_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    subs r3, #16
; CHECK-NEXT:    vmla.i8 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <16 x i8> undef, i8 %x, i32 0
  %broadcast.splat12 = shufflevector <16 x i8> %broadcast.splatinsert11, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %s1, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 2
  %1 = mul <16 x i8> %wide.load, %broadcast.splat12
  %2 = getelementptr inbounds i8, ptr %d, i32 %index
  %wide.load13 = load <16 x i8>, ptr %2, align 2
  %3 = add <16 x i8> %1, %wide.load13
  store <16 x i8> %3, ptr %2, align 2
  %index.next = add i32 %index, 16
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}


; vmlas, 4 x i32: (%A * %B) + splat(%X), with the splat as the first add operand.
define arm_aapcs_vfpcc <4 x i32> @vmlasu32(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlasu32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %A, %B
  %3 = add nsw <4 x i32> %1, %2
  ret <4 x i32> %3
}

; Same computation as vmlasu32 with the add operands commuted (product first);
; the expected assembly is identical.
define arm_aapcs_vfpcc <4 x i32> @vmlasu32b(<4 x i32> %A, <4 x i32> %B, i32 %X) nounwind {
; CHECK-LABEL: vmlasu32b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <4 x i32> undef, i32 %X, i32 0
  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = mul nsw <4 x i32> %A, %B
  %3 = add nsw <4 x i32> %2, %1
  ret <4 x i32> %3
}

; vmlas, 8 x i16: (%A * %B) + splat(%X).
define arm_aapcs_vfpcc <8 x i16> @vmlasu16(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlasu16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %A, %B
  %3 = add nsw <8 x i16> %1, %2
  ret <8 x i16> %3
}

; Commuted-add variant of vmlasu16; the expected assembly is identical.
define arm_aapcs_vfpcc <8 x i16> @vmlasu16b(<8 x i16> %A, <8 x i16> %B, i16 %X) nounwind {
; CHECK-LABEL: vmlasu16b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <8 x i16> undef, i16 %X, i32 0
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = mul nsw <8 x i16> %A, %B
  %3 = add nsw <8 x i16> %2, %1
  ret <8 x i16> %3
}

; vmlas, 16 x i8: (%A * %B) + splat(%X).
define arm_aapcs_vfpcc <16 x i8> @vmlasu8(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlasu8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %A, %B
  %3 = add nsw <16 x i8> %1, %2
  ret <16 x i8> %3
}

; Commuted-add variant of vmlasu8; the expected assembly is identical.
define arm_aapcs_vfpcc <16 x i8> @vmlasu8b(<16 x i8> %A, <16 x i8> %B, i8 %X) nounwind {
; CHECK-LABEL: vmlasu8b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = insertelement <16 x i8> undef, i8 %X, i32 0
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = mul nsw <16 x i8> %A, %B
  %3 = add nsw <16 x i8> %2, %1
  ret <16 x i8> %3
}

; vmlas inside a loop, 4 x i32. The splat of %x is built once in the entry
; block; each iteration computes d[index..] = s1[index..] * d[index..] +
; splat(%x) and advances %index by 4 until %index.next equals %n. The expected
; loop body uses the scalar register form of vmlas.
define void @vmlas32_in_loop(ptr %s1, i32 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmlas32_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB15_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vmlas.i32 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB15_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %s1, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = getelementptr inbounds i32, ptr %d, i32 %index
  %wide.load10 = load <4 x i32>, ptr %1, align 4
  %2 = mul nsw <4 x i32> %wide.load, %wide.load10
  %3 = add nsw <4 x i32> %broadcast.splat9, %2
  store <4 x i32> %3, ptr %1, align 4
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; vmlas inside a loop, 8 x i16. Same shape as vmlas32_in_loop with a step of 8;
; here the mul/add carry no nsw flag and the product is the first add operand.
define void @vmlas16_in_loop(ptr %s1, i16 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmlas16_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB16_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vmlas.i16 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB16_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <8 x i16> undef, i16 %x, i32 0
  %broadcast.splat12 = shufflevector <8 x i16> %broadcast.splatinsert11, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %s1, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  %1 = getelementptr inbounds i16, ptr %d, i32 %index
  %wide.load13 = load <8 x i16>, ptr %1, align 2
  %2 = mul <8 x i16> %wide.load, %wide.load13
  %3 = add <8 x i16> %2, %broadcast.splat12
  store <8 x i16> %3, ptr %1, align 2
  %index.next = add i32 %index, 8
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; vmlas inside a loop, 16 x i8, with a step of 16.
; NOTE(review): as in vmla8_in_loop, the <16 x i8> accesses use `align 2` and
; the expected assembly shows halfword (vldrh.u16) loads - confirm this is
; intentional before changing it.
define void @vmlas8_in_loop(ptr %s1, i8 %x, ptr %d, i32 %n) {
; CHECK-LABEL: vmlas8_in_loop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    subs r3, #16
; CHECK-NEXT:    vmlas.i8 q1, q0, r1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %broadcast.splatinsert11 = insertelement <16 x i8> undef, i8 %x, i32 0
  %broadcast.splat12 = shufflevector <16 x i8> %broadcast.splatinsert11, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %s1, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 2
  %1 = getelementptr inbounds i8, ptr %d, i32 %index
  %wide.load13 = load <16 x i8>, ptr %1, align 2
  %2 = mul <16 x i8> %wide.load, %wide.load13
  %3 = add <16 x i8> %2, %broadcast.splat12
  store <16 x i8> %3, ptr %1, align 2
  %index.next = add i32 %index, 16
  %4 = icmp eq i32 %index.next, %n
  br i1 %4, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}