; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s

; Splats of a scalar argument written as a chain of insertelement
; instructions that overwrites every lane of a zeroinitializer vector.

define <8 x i8> @v_dup8(i8 %A) nounwind {
; CHECK-LABEL: v_dup8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
  %ins1 = insertelement <8 x i8> %ins0, i8 %A, i32 1
  %ins2 = insertelement <8 x i8> %ins1, i8 %A, i32 2
  %ins3 = insertelement <8 x i8> %ins2, i8 %A, i32 3
  %ins4 = insertelement <8 x i8> %ins3, i8 %A, i32 4
  %ins5 = insertelement <8 x i8> %ins4, i8 %A, i32 5
  %ins6 = insertelement <8 x i8> %ins5, i8 %A, i32 6
  %ins7 = insertelement <8 x i8> %ins6, i8 %A, i32 7
  ret <8 x i8> %ins7
}

define <4 x i16> @v_dup16(i16 %A) nounwind {
; CHECK-LABEL: v_dup16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
  %ins1 = insertelement <4 x i16> %ins0, i16 %A, i32 1
  %ins2 = insertelement <4 x i16> %ins1, i16 %A, i32 2
  %ins3 = insertelement <4 x i16> %ins2, i16 %A, i32 3
  ret <4 x i16> %ins3
}

define <2 x i32> @v_dup32(i32 %A) nounwind {
; CHECK-LABEL: v_dup32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
  %ins1 = insertelement <2 x i32> %ins0, i32 %A, i32 1
  ret <2 x i32> %ins1
}

define <2 x float> @v_dupfloat(float %A) nounwind {
; CHECK-LABEL: v_dupfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <2 x float> zeroinitializer, float %A, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %A, i32 1
  ret <2 x float> %ins1
}

define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_dupQ8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
  %ins1 = insertelement <16 x i8> %ins0, i8 %A, i32 1
  %ins2 = insertelement <16 x i8> %ins1, i8 %A, i32 2
  %ins3 = insertelement <16 x i8> %ins2, i8 %A, i32 3
  %ins4 = insertelement <16 x i8> %ins3, i8 %A, i32 4
  %ins5 = insertelement <16 x i8> %ins4, i8 %A, i32 5
  %ins6 = insertelement <16 x i8> %ins5, i8 %A, i32 6
  %ins7 = insertelement <16 x i8> %ins6, i8 %A, i32 7
  %ins8 = insertelement <16 x i8> %ins7, i8 %A, i32 8
  %ins9 = insertelement <16 x i8> %ins8, i8 %A, i32 9
  %ins10 = insertelement <16 x i8> %ins9, i8 %A, i32 10
  %ins11 = insertelement <16 x i8> %ins10, i8 %A, i32 11
  %ins12 = insertelement <16 x i8> %ins11, i8 %A, i32 12
  %ins13 = insertelement <16 x i8> %ins12, i8 %A, i32 13
  %ins14 = insertelement <16 x i8> %ins13, i8 %A, i32 14
  %ins15 = insertelement <16 x i8> %ins14, i8 %A, i32 15
  ret <16 x i8> %ins15
}

define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_dupQ16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
  %ins1 = insertelement <8 x i16> %ins0, i16 %A, i32 1
  %ins2 = insertelement <8 x i16> %ins1, i16 %A, i32 2
  %ins3 = insertelement <8 x i16> %ins2, i16 %A, i32 3
  %ins4 = insertelement <8 x i16> %ins3, i16 %A, i32 4
  %ins5 = insertelement <8 x i16> %ins4, i16 %A, i32 5
  %ins6 = insertelement <8 x i16> %ins5, i16 %A, i32 6
  %ins7 = insertelement <8 x i16> %ins6, i16 %A, i32 7
  ret <8 x i16> %ins7
}

; For <4 x i32> the expected output contains no vdup: the scalar in r0 is
; simply copied into r1, r2 and r3.
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_dupQ32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 %A, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %A, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 %A, i32 3
  ret <4 x i32> %ins3
}

define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-LABEL: v_dupQfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <4 x float> zeroinitializer, float %A, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %A, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %A, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %A, i32 3
  ret <4 x float> %ins3
}

; Check to make sure it works with shuffles, too.
; Here the scalar is inserted into lane 0 of an undef vector and that lane is
; then broadcast by a shufflevector with an all-zero mask.

define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledup8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %head = insertelement <8 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <8 x i8> %head, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %splat
}

define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledup16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %head = insertelement <4 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <4 x i16> %head, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}

define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledup32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %head = insertelement <2 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <2 x i32> %head, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %splat
}

define <2 x float> @v_shuffledupfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %head = insertelement <2 x float> undef, float %A, i32 0
  %splat = shufflevector <2 x float> %head, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %splat
}

define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %head = insertelement <16 x i8> undef, i8 %A, i32 0
  %splat = shufflevector <16 x i8> %head, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %splat
}

define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %head = insertelement <8 x i16> undef, i16 %A, i32 0
  %splat = shufflevector <8 x i16> %head, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_shuffledupQ32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %head = insertelement <4 x i32> undef, i32 %A, i32 0
  %splat = shufflevector <4 x i32> %head, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
; CHECK-LABEL: v_shuffledupQfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %head = insertelement <4 x float> undef, float %A, i32 0
  %splat = shufflevector <4 x float> %head, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}

; Splats of lane 1 of a vector argument. These functions use the
; arm_aapcs_vfpcc calling convention and the expected output works on d0/q0
; directly.

define arm_aapcs_vfpcc <8 x i8> @vduplane8(<8 x i8> %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.8 d0, d0[1]
; CHECK-NEXT:    mov pc, lr
  %splat = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i8> %splat
}

define arm_aapcs_vfpcc <4 x i16> @vduplane16(<4 x i16> %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.16 d0, d0[1]
; CHECK-NEXT:    mov pc, lr
  %splat = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i16> %splat
}

define arm_aapcs_vfpcc <2 x i32> @vduplane32(<2 x i32> %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d0, d0[1]
; CHECK-NEXT:    mov pc, lr
  %splat = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i32> %splat
}

define arm_aapcs_vfpcc <2 x float> @vduplanefloat(<2 x float> %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 d0, d0[1]
; CHECK-NEXT:    mov pc, lr
  %splat = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x float> %splat
}

; The "Q" variants below return a vector with twice as many lanes as the
; argument has.

define arm_aapcs_vfpcc <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    vdup.8 q0, d0[1]
; CHECK-NEXT:    mov pc, lr
  %splat = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i8> %splat
}

define arm_aapcs_vfpcc <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    vdup.16 q0, d0[1]
; CHECK-NEXT:    mov pc, lr
  %splat = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i16> %splat
}

define arm_aapcs_vfpcc <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    vdup.32 q0, d0[1]
; CHECK-NEXT:    mov pc, lr
  %splat = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %splat
}

define arm_aapcs_vfpcc <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    vdup.32 q0, d0[1]
; CHECK-NEXT:    mov pc, lr
  %splat = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %splat
}

; Splats of one 64-bit lane of a two-lane vector. The expected output is a
; pair of GPR moves and contains no vdup.

define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: foo:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    mov pc, lr
entry:
  %dup = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %dup
}

define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    mov r3, r1
; CHECK-NEXT:    mov pc, lr
entry:
  %dup = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %dup
}

define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: baz:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    mov pc, lr
entry:
  %dup = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %dup
}

define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
; CHECK-LABEL: qux:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    mov r3, r1
; CHECK-NEXT:    mov pc, lr
entry:
  %dup = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %dup
}

; Radar 7373643
; Splat of the constant -128 stored through %ptr; the expected output
; materialises it with a single vmov.i8 of #0x80.
define void @redundantVdup(ptr %ptr) nounwind {
; CHECK-LABEL: redundantVdup:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov.i8 d16, #0x80
; CHECK-NEXT:    vstr d16, [r0]
; CHECK-NEXT:    mov pc, lr
  %head = insertelement <8 x i8> undef, i8 -128, i32 0
  %splat = shufflevector <8 x i8> %head, <8 x i8> undef, <8 x i32> zeroinitializer
  store <8 x i8> %splat, ptr %ptr, align 8
  ret void
}

; Partial splats: lanes 0-2 hold %x and lane 3 holds %y.

define <4 x i32> @tdupi(i32 %x, i32 %y) {
; CHECK-LABEL: tdupi:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r3, r1
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <4 x i32> undef, i32 %x, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 %x, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %x, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 %y, i32 3
  ret <4 x i32> %ins3
}

define <4 x float> @tdupf(float %x, float %y) {
; CHECK-LABEL: tdupf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    vmov s3, r1
; CHECK-NEXT:    vmov r0, r1, d0
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    mov pc, lr
  %ins0 = insertelement <4 x float> undef, float %x, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %x, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %x, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %y, i32 3
  ret <4 x float> %ins3
}

; This test checks that when splatting an element from a vector into another,
; the value isn't moved out to GPRs first.
; NOTE(review): the expected output for this test does contain a
; "vmov.32 r0, d16[1]" that moves the lane into a GPR; with -float-abi=soft
; that may simply be how the result is returned - confirm that the comment
; above still matches the intent.
define <4 x i32> @tduplane(<4 x i32> %invec) {
; CHECK-LABEL: tduplane:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    mov r3, #255
; CHECK-NEXT:    vmov.32 r0, d16[1]
; CHECK-NEXT:    mov r1, r0
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    mov pc, lr
  ; Lanes 0-2 take element 1 of %invec; lane 3 is the constant 255.
  %elt = extractelement <4 x i32> %invec, i32 1
  %ins0 = insertelement <4 x i32> undef, i32 %elt, i32 0
  %ins1 = insertelement <4 x i32> %ins0, i32 %elt, i32 1
  %ins2 = insertelement <4 x i32> %ins1, i32 %elt, i32 2
  %ins3 = insertelement <4 x i32> %ins2, i32 255, i32 3
  ret <4 x i32> %ins3
}

; Splats of a constant-index extractelement (element 3) taken from a vector
; with twice as many lanes as the result.

define <2 x float> @check_f32(<4 x float> %v) nounwind {
; CHECK-LABEL: check_f32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r2, r3
; CHECK-NEXT:    vdup.32 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %elt = extractelement <4 x float> %v, i32 3
  %ins0 = insertelement <2 x float> undef, float %elt, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %elt, i32 1
  ret <2 x float> %ins1
}

define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
; CHECK-LABEL: check_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r2, r3
; CHECK-NEXT:    vdup.32 d16, d16[1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %elt = extractelement <4 x i32> %v, i32 3
  %ins0 = insertelement <2 x i32> undef, i32 %elt, i32 0
  %ins1 = insertelement <2 x i32> %ins0, i32 %elt, i32 1
  ret <2 x i32> %ins1
}

define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
; CHECK-LABEL: check_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vdup.16 d16, d16[3]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  ; Only lanes 0 and 1 of the result are written; lanes 2 and 3 stay undef.
  %elt = extractelement <8 x i16> %v, i32 3
  %ins0 = insertelement <4 x i16> undef, i16 %elt, i32 0
  %ins1 = insertelement <4 x i16> %ins0, i16 %elt, i32 1
  ret <4 x i16> %ins1
}

define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
; CHECK-LABEL: check_i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vdup.8 d16, d16[3]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  ; Only lanes 0 and 1 of the result are written; lanes 2-7 stay undef.
  %elt = extractelement <16 x i8> %v, i32 3
  %ins0 = insertelement <8 x i8> undef, i8 %elt, i32 0
  %ins1 = insertelement <8 x i8> %ins0, i8 %elt, i32 1
  ret <8 x i8> %ins1
}

; Check that an SPR splat produces a vdup.
; The splatted scalar is the result of a sitofp, and the splat is the first
; operand of an fsub whose second operand is %p.

define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r2, r2, #16
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    asr r2, r2, #16
; CHECK-NEXT:    vmov s0, r2
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vdup.32 d17, d0[0]
; CHECK-NEXT:    vsub.f32 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %cvt = sitofp i16 %q to float
  %splat.ins = insertelement <2 x float> undef, float %cvt, i32 0
  %splat = shufflevector <2 x float> %splat.ins, <2 x float> undef, <2 x i32> zeroinitializer
  %diff = fsub <2 x float> %splat, %p
  ret <2 x float> %diff
}

define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrsh r12, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vmov s0, r12
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vdup.32 q9, d0[0]
; CHECK-NEXT:    vsub.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %cvt = sitofp i16 %q to float
  %splat.ins = insertelement <4 x float> undef, float %cvt, i32 0
  %splat = shufflevector <4 x float> %splat.ins, <4 x float> undef, <4 x i32> zeroinitializer
  %diff = fsub <4 x float> %splat, %p
  ret <4 x float> %diff
}
; Same codegen as the above test: the expected output splats the converted
; scalar with "vdup.32 q9, d0[0]" in both cases (no vld1 appears in it), so
; the shuffle index is irrelevant.
define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
; CHECK-LABEL: check_spr_splat4_lane1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrsh r12, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vmov s0, r12
; CHECK-NEXT:    vcvt.f32.s32 s0, s0
; CHECK-NEXT:    vdup.32 q9, d0[0]
; CHECK-NEXT:    vsub.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  ; The converted scalar is inserted into lane 1 and lane 1 is the one that
  ; gets broadcast.
  %cvt = sitofp i16 %q to float
  %splat.ins = insertelement <4 x float> undef, float %cvt, i32 1
  %splat = shufflevector <4 x float> %splat.ins, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %diff = fsub <4 x float> %splat, %p
  ret <4 x float> %diff
}

; Also make sure we don't barf on variable-index extractelts, where we almost
; could have generated a vdup.

define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
; CHECK-LABEL: check_i8_varidx:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r11}
; CHECK-NEXT:    push {r11}
; CHECK-NEXT:    .setfp r11, sp
; CHECK-NEXT:    mov r11, sp
; CHECK-NEXT:    .pad #28
; CHECK-NEXT:    sub sp, sp, #28
; CHECK-NEXT:    bic sp, sp, #15
; CHECK-NEXT:    ldr r12, [r11, #4]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    and r0, r12, #15
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1:128], r0
; CHECK-NEXT:    vld1.8 {d16[]}, [r1]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov sp, r11
; CHECK-NEXT:    pop {r11}
; CHECK-NEXT:    mov pc, lr
  ; The element index is the run-time value %idx; only lanes 0 and 1 of the
  ; result are written.
  %elt = extractelement <16 x i8> %v, i32 %idx
  %ins0 = insertelement <8 x i8> undef, i8 %elt, i32 0
  %ins1 = insertelement <8 x i8> %ins0, i8 %elt, i32 1
  ret <8 x i8> %ins1
}