; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s

; Load + splat of an i8 should select the all-lanes form vld1.8 {d16[]}.
define <8 x i8> @vld1dupi8(ptr %A) nounwind {
; CHECK-LABEL: vld1dupi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.8 {d16[]}, [r0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
;Check the (default) alignment value.
  %tmp1 = load i8, ptr %A, align 8
  %tmp2 = insertelement <8 x i8> undef, i8 %tmp1, i32 0
  %tmp3 = shufflevector <8 x i8> %tmp2, <8 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %tmp3
}

; The load address is computed (pre-increment) and the updated pointer is
; stored back through %a before the dup-load.
define <8 x i8> @vld1dupi8_preinc(ptr noalias nocapture %a, i32 %b) nounwind {
; CHECK-LABEL: vld1dupi8_preinc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    add r3, r2, r1
; CHECK-NEXT:    str r3, [r0]
; CHECK-NEXT:    vld1.8 {d16[]}, [r3]
; CHECK-NEXT:    vmov r2, r1, d16
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = load ptr, ptr %a, align 4
  %add.ptr = getelementptr inbounds i8, ptr %0, i32 %b
  %1 = load i8, ptr %add.ptr, align 1
  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  store ptr %add.ptr, ptr %a, align 4
  ret <8 x i8> %lane
}

; Fixed post-increment of the element size folds into the writeback form
; "vld1.8 {d16[]}, [r3]!".
define <8 x i8> @vld1dupi8_postinc_fixed(ptr noalias nocapture %a) nounwind {
; CHECK-LABEL: vld1dupi8_postinc_fixed:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    vld1.8 {d16[]}, [r3]!
; CHECK-NEXT:    str r3, [r0]
; CHECK-NEXT:    vmov r2, r1, d16
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = load ptr, ptr %a, align 4
  %1 = load i8, ptr %0, align 1
  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, ptr %0, i32 1
  store ptr %add.ptr, ptr %a, align 4
  ret <8 x i8> %lane
}

; Variable post-increment folds into the register-writeback form
; "vld1.8 {d16[]}, [r3], r1".
define <8 x i8> @vld1dupi8_postinc_register(ptr noalias nocapture %a, i32 %n) nounwind {
; CHECK-LABEL: vld1dupi8_postinc_register:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    vld1.8 {d16[]}, [r3], r1
; CHECK-NEXT:    str r3, [r0]
; CHECK-NEXT:    vmov r2, r1, d16
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = load ptr, ptr %a, align 4
  %1 = load i8, ptr %0, align 1
  %2 = insertelement <8 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, ptr %0, i32 %n
  store ptr %add.ptr, ptr %a, align 4
  ret <8 x i8> %lane
}

; Q-register (128-bit) variants: the all-lanes load covers both halves,
; "vld1.8 {d16[], d17[]}".
define <16 x i8> @vld1dupqi8_preinc(ptr noalias nocapture %a, i32 %b) nounwind {
; CHECK-LABEL: vld1dupqi8_preinc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    add lr, r2, r1
; CHECK-NEXT:    str lr, [r0]
; CHECK-NEXT:    vld1.8 {d16[], d17[]}, [lr]
; CHECK-NEXT:    vmov r12, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = load ptr, ptr %a, align 4
  %add.ptr = getelementptr inbounds i8, ptr %0, i32 %b
  %1 = load i8, ptr %add.ptr, align 1
  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
  store ptr %add.ptr, ptr %a, align 4
  ret <16 x i8> %lane
}

define <16 x i8> @vld1dupqi8_postinc_fixed(ptr noalias nocapture %a) nounwind {
; CHECK-LABEL: vld1dupqi8_postinc_fixed:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    ldr lr, [r0]
; CHECK-NEXT:    vld1.8 {d16[], d17[]}, [lr]!
; CHECK-NEXT:    str lr, [r0]
; CHECK-NEXT:    vmov r12, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = load ptr, ptr %a, align 4
  %1 = load i8, ptr %0, align 1
  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, ptr %0, i32 1
  store ptr %add.ptr, ptr %a, align 4
  ret <16 x i8> %lane
}

define <16 x i8> @vld1dupqi8_postinc_register(ptr noalias nocapture %a, i32 %n) nounwind {
; CHECK-LABEL: vld1dupqi8_postinc_register:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    ldr lr, [r0]
; CHECK-NEXT:    vld1.8 {d16[], d17[]}, [lr], r1
; CHECK-NEXT:    str lr, [r0]
; CHECK-NEXT:    vmov r12, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = load ptr, ptr %a, align 4
  %1 = load i8, ptr %0, align 1
  %2 = insertelement <16 x i8> undef, i8 %1, i32 0
  %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, ptr %0, i32 %n
  store ptr %add.ptr, ptr %a, align 4
  ret <16 x i8> %lane
}

define <4 x i16> @vld1dupi16(ptr %A) nounwind {
; CHECK-LABEL: vld1dupi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.16 {d16[]}, [r0:16]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
;Check the alignment value. Max for this instruction is 16 bits:
  %tmp1 = load i16, ptr %A, align 8
  %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp3
}

; align 1 on the scalar load: no alignment specifier may be emitted.
define <4 x i16> @vld1dupi16_misaligned(ptr %A) nounwind {
; CHECK-LABEL: vld1dupi16_misaligned:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.16 {d16[]}, [r0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load i16, ptr %A, align 1
  %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp3
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i16> @load_i16_dup_zext(ptr %A) nounwind {
; CHECK-LABEL: load_i16_dup_zext:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vdup.16 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load i8, ptr %A, align 1
  %tmp2 = zext i8 %tmp1 to i16
  %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp4
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i16> @load_i16_dup_sext(ptr %A) nounwind {
; CHECK-LABEL: load_i16_dup_sext:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrsb r0, [r0]
; CHECK-NEXT:    vdup.16 d16, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load i8, ptr %A, align 1
  %tmp2 = sext i8 %tmp1 to i16
  %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %tmp4
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <8 x i16> @load_i16_dupq_zext(ptr %A) nounwind {
; CHECK-LABEL: load_i16_dupq_zext:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vdup.16 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load i8, ptr %A, align 1
  %tmp2 = zext i8 %tmp1 to i16
  %tmp3 = insertelement <8 x i16> undef, i16 %tmp2, i32 0
  %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %tmp4
}

define <2 x i32> @vld1dupi32(ptr %A) nounwind {
; CHECK-LABEL: vld1dupi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.32 {d16[]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
;Check the alignment value. Max for this instruction is 32 bits:
  %tmp1 = load i32, ptr %A, align 8
  %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
  %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %tmp3
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i32> @load_i32_dup_zext(ptr %A) nounwind {
; CHECK-LABEL: load_i32_dup_zext:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load i8, ptr %A, align 1
  %tmp2 = zext i8 %tmp1 to i32
  %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
  %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %tmp4
}

; This sort of looks like a vld1dup, but there's an extension in the way.
define <4 x i32> @load_i32_dup_sext(ptr %A) nounwind {
; CHECK-LABEL: load_i32_dup_sext:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldrsb r0, [r0]
; CHECK-NEXT:    vdup.32 q8, r0
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load i8, ptr %A, align 1
  %tmp2 = sext i8 %tmp1 to i32
  %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
  %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %tmp4
}

; Float load + splat also selects the all-lanes vld1.32 form.
define <2 x float> @vld1dupf(ptr %A) nounwind {
; CHECK-LABEL: vld1dupf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.32 {d16[]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = load float, ptr %A
  %tmp1 = insertelement <2 x float> undef, float %tmp0, i32 0
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %tmp2
}

define <16 x i8> @vld1dupQi8(ptr %A) nounwind {
; CHECK-LABEL: vld1dupQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.8 {d16[], d17[]}, [r0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
;Check the (default) alignment value.
  %tmp1 = load i8, ptr %A, align 8
  %tmp2 = insertelement <16 x i8> undef, i8 %tmp1, i32 0
  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %tmp3
}

define <4 x float> @vld1dupQf(ptr %A) nounwind {
; CHECK-LABEL: vld1dupQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.32 {d16[], d17[]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = load float, ptr %A
  %tmp1 = insertelement <4 x float> undef, float %tmp0, i32 0
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %tmp2
}

; Two-register aggregates returned by the vld2lane intrinsics below.
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
%struct.__neon_int4x16x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int2x32x2_t = type { <2 x i32>, <2 x i32> }

; Only lane 0 of each result is used, but the lanes are combined before the
; splat, so this stays a lane load (vld2.8 {d16[0], d17[0]}), not a dup.
define <8 x i8> @vld2dupi8(ptr %A) nounwind {
; CHECK-LABEL: vld2dupi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld2.8 {d16[0], d17[0]}, [r0]
; CHECK-NEXT:    vadd.i8 d16, d16, d17
; CHECK-NEXT:    vdup.8 d16, d16[0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
;Check the (default) alignment value.
  %tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  %tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1
  %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp5 = add <8 x i8> %tmp2, %tmp4
  ret <8 x i8> %tmp5
}

; When each result is splatted independently, the all-lanes form
; vld2.8 {d16[], d17[]} is selected; result returned via sret.
define void @vld2dupi8_preinc(ptr noalias nocapture sret(%struct.__neon_int8x8x2_t) %agg.result, ptr noalias nocapture %a, i32 %b) nounwind {
; CHECK-LABEL: vld2dupi8_preinc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r3, [r1]
; CHECK-NEXT:    add r2, r3, r2
; CHECK-NEXT:    str r2, [r1]
; CHECK-NEXT:    vld2.8 {d16[], d17[]}, [r2]
; CHECK-NEXT:    vst1.8 {d16}, [r0:64]!
; CHECK-NEXT:    vstr d17, [r0]
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = load ptr, ptr %a, align 4
  %add.ptr = getelementptr inbounds i8, ptr %0, i32 %b
  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %add.ptr, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  store ptr %add.ptr, ptr %a, align 4
  store <8 x i8> %lane, ptr %agg.result, align 8
  %r11 = getelementptr inbounds %struct.__neon_int8x8x2_t, ptr %agg.result, i32 0, i32 1
  store <8 x i8> %lane1, ptr %r11, align 8
  ret void
}

; Fixed post-increment (2 = number of element bytes loaded) folds into the
; writeback form "[r2]!".
define void @vld2dupi8_postinc_fixed(ptr noalias nocapture sret(%struct.__neon_int8x8x2_t) %agg.result, ptr noalias nocapture %a) nounwind {
; CHECK-LABEL: vld2dupi8_postinc_fixed:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r1]
; CHECK-NEXT:    vld2.8 {d16[], d17[]}, [r2]!
; CHECK-NEXT:    str r2, [r1]
; CHECK-NEXT:    vst1.8 {d16}, [r0:64]!
; CHECK-NEXT:    vstr d17, [r0]
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = load ptr, ptr %a, align 4
  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, ptr %0, i32 2
  store ptr %add.ptr, ptr %a, align 4
  store <8 x i8> %lane, ptr %agg.result, align 8
  %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, ptr %agg.result, i32 0, i32 1
  store <8 x i8> %lane1, ptr %r10, align 8
  ret void
}

; Variable post-increment folds into the register-writeback form "[r3], r2".
define void @vld2dupi8_postinc_variable(ptr noalias nocapture sret(%struct.__neon_int8x8x2_t) %agg.result, ptr noalias nocapture %a, i32 %n) nounwind {
; CHECK-LABEL: vld2dupi8_postinc_variable:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r3, [r1]
; CHECK-NEXT:    vld2.8 {d16[], d17[]}, [r3], r2
; CHECK-NEXT:    str r3, [r1]
; CHECK-NEXT:    vst1.8 {d16}, [r0:64]!
; CHECK-NEXT:    vstr d17, [r0]
; CHECK-NEXT:    mov pc, lr
entry:
  %0 = load ptr, ptr %a, align 4
  %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
  %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  %add.ptr = getelementptr inbounds i8, ptr %0, i32 %n
  store ptr %add.ptr, ptr %a, align 4
  store <8 x i8> %lane, ptr %agg.result, align 8
  %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, ptr %agg.result, i32 0, i32 1
  store <8 x i8> %lane1, ptr %r10, align 8
  ret void
}

define <4 x i16> @vld2dupi16(ptr %A) nounwind {
; CHECK-LABEL: vld2dupi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld2.16 {d16[0], d17[0]}, [r0]
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vdup.16 d16, d16[0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
  %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
  %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = add <4 x i16> %tmp2, %tmp4
  ret <4 x i16> %tmp5
}

;Check for a post-increment updating load.
define <4 x i16> @vld2dupi16_update(ptr %ptr) nounwind {
; CHECK-LABEL: vld2dupi16_update:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    vld2.16 {d16[0], d17[0]}, [r3]!
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    str r3, [r0]
; CHECK-NEXT:    vdup.16 d16, d16[0]
; CHECK-NEXT:    vmov r2, r1, d16
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov pc, lr
  %A = load ptr, ptr %ptr
  %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
  %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = add <4 x i16> %tmp2, %tmp4
  %tmp6 = getelementptr i16, ptr %A, i32 2
  store ptr %tmp6, ptr %ptr
  ret <4 x i16> %tmp5
}

; Increment (3 x i16 = 6 bytes) does not match the loaded size, so the
; update uses the register form with a materialized #6.
define <4 x i16> @vld2dupi16_odd_update(ptr %ptr) nounwind {
; CHECK-LABEL: vld2dupi16_odd_update:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    mov r1, #6
; CHECK-NEXT:    vld2.16 {d16[0], d17[0]}, [r3], r1
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    str r3, [r0]
; CHECK-NEXT:    vdup.16 d16, d16[0]
; CHECK-NEXT:    vmov r2, r1, d16
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov pc, lr
  %A = load ptr, ptr %ptr
  %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
  %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = add <4 x i16> %tmp2, %tmp4
  %tmp6 = getelementptr i16, ptr %A, i32 3
  store ptr %tmp6, ptr %ptr
  ret <4 x i16> %tmp5
}

define <2 x i32> @vld2dupi32(ptr %A) nounwind {
; CHECK-LABEL: vld2dupi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld2.32 {d16[0], d17[0]}, [r0:64]
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vdup.32 d16, d16[0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
;Check the alignment value. Max for this instruction is 64 bits:
  %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
  %tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
  %tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp5 = add <2 x i32> %tmp2, %tmp4
  ret <2 x i32> %tmp5
}

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly

; Three-register aggregates returned by the vld3lane intrinsics below.
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }

;Check for a post-increment updating load with register increment.
define <8 x i8> @vld3dupi8_update(ptr %ptr, i32 %inc) nounwind {
; CHECK-LABEL: vld3dupi8_update:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    vld3.8 {d16[0], d17[0], d18[0]}, [r3], r1
; CHECK-NEXT:    vadd.i8 d20, d16, d17
; CHECK-NEXT:    vadd.i8 d16, d20, d18
; CHECK-NEXT:    str r3, [r0]
; CHECK-NEXT:    vdup.8 d16, d16[0]
; CHECK-NEXT:    vmov r2, r1, d16
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov pc, lr
  %A = load ptr, ptr %ptr
  %tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8)
  %tmp1 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 0
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 1
  %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 2
  %tmp6 = shufflevector <8 x i8> %tmp5, <8 x i8> undef, <8 x i32> zeroinitializer
  %tmp7 = add <8 x i8> %tmp2, %tmp4
  %tmp8 = add <8 x i8> %tmp7, %tmp6
  %tmp9 = getelementptr i8, ptr %A, i32 %inc
  store ptr %tmp9, ptr %ptr
  ret <8 x i8> %tmp8
}

define <4 x i16> @vld3dupi16(ptr %A) nounwind {
; CHECK-LABEL: vld3dupi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld3.16 {d16[0], d17[0], d18[0]}, [r0]
; CHECK-NEXT:    vadd.i16 d20, d16, d17
; CHECK-NEXT:    vadd.i16 d16, d20, d18
; CHECK-NEXT:    vdup.16 d16, d16[0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
;Check the (default) alignment value. VLD3 does not support alignment.
  %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
  %tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 2
  %tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp7 = add <4 x i16> %tmp2, %tmp4
  %tmp8 = add <4 x i16> %tmp7, %tmp6
  ret <4 x i16> %tmp8
}

declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly

; Four-register aggregates returned by the vld4lane intrinsics below.
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }

;Check for a post-increment updating load.
define <4 x i16> @vld4dupi16_update(ptr %ptr) nounwind {
; CHECK-LABEL: vld4dupi16_update:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    vld4.16 {d16[0], d17[0], d18[0], d19[0]}, [r3]!
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vadd.i16 d20, d18, d19
; CHECK-NEXT:    str r3, [r0]
; CHECK-NEXT:    vadd.i16 d16, d16, d20
; CHECK-NEXT:    vdup.16 d16, d16[0]
; CHECK-NEXT:    vmov r2, r1, d16
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov pc, lr
  %A = load ptr, ptr %ptr
  %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
  %tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
  %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 2
  %tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp7 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 3
  %tmp8 = shufflevector <4 x i16> %tmp7, <4 x i16> undef, <4 x i32> zeroinitializer
  %tmp9 = add <4 x i16> %tmp2, %tmp4
  %tmp10 = add <4 x i16> %tmp6, %tmp8
  %tmp11 = add <4 x i16> %tmp9, %tmp10
  %tmp12 = getelementptr i16, ptr %A, i32 4
  store ptr %tmp12, ptr %ptr
  ret <4 x i16> %tmp11
}

define <2 x i32> @vld4dupi32(ptr %A) nounwind {
; CHECK-LABEL: vld4dupi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld4.32 {d16[0], d17[0], d18[0], d19[0]}, [r0:64]
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vadd.i32 d20, d18, d19
; CHECK-NEXT:    vadd.i32 d16, d16, d20
; CHECK-NEXT:    vdup.32 d16, d16[0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
;Check the alignment value. An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
  %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0(ptr %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
  %tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
  %tmp4 = shufflevector <2 x i32> %tmp3, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 2
  %tmp6 = shufflevector <2 x i32> %tmp5, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp7 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 3
  %tmp8 = shufflevector <2 x i32> %tmp7, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp9 = add <2 x i32> %tmp2, %tmp4
  %tmp10 = add <2 x i32> %tmp6, %tmp8
  %tmp11 = add <2 x i32> %tmp9, %tmp10
  ret <2 x i32> %tmp11
}

declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly