; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

; Codegen tests for lowering interleaving shufflevector masks to the NEON
; VZIP instruction. Functions named *_Qres / *_QQres use one double-width
; shuffle; the others build the low and high interleave separately and
; combine them with an add.

; Interleaves the low halves (mask 0,8,1,9,...) and the high halves
; (mask 4,12,5,13,...) of two <8 x i8> vectors and adds the two results.
; The assertions expect a single vzip.8 on D registers feeding the add.
define <8 x i8> @vzipi8(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipi8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vzip.8 d17, d16
; CHECK-NEXT: vadd.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

; One shuffle producing the full 16-element interleave of two <8 x i8>
; inputs. The assertions expect a single vzip.8 with both of its D registers
; returned.
define <16 x i8> @vzipi8_Qres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipi8_Qres:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vzip.8 d16, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i8> %tmp3
}

; <4 x i16> version of vzipi8: low interleave (0,4,1,5) plus high interleave
; (2,6,3,7). The assertions expect a single vzip.16 on D registers.
define <4 x i16> @vzipi16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipi16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vzip.16 d17, d16
; CHECK-NEXT: vadd.i16 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

; One shuffle producing the full 8-element interleave of two <4 x i16>
; inputs. The assertions expect a single vzip.16 with both D registers
; returned.
define <8 x i16> @vzipi16_Qres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipi16_Qres:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vzip.16 d16, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %tmp3
}

; VZIP.32 is equivalent to VTRN.32 for 64-bit vectors.
; (Presumably this is why no <2 x i32> case appears in this file - confirm
; against the VTRN tests.)

; 128-bit version of vzipi8: both inputs are <16 x i8>. The assertions expect
; a single vzip.8 on Q registers feeding the add.
define <16 x i8> @vzipQi8(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQi8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.8 q9, q8
; CHECK-NEXT: vadd.i8 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = load <16 x i8>, ptr %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

; One shuffle producing the full 32-element interleave of two <16 x i8>
; inputs. The assertions expect the inputs to be loaded from r1/r2 and both
; Q registers of the vzip.8 result to be stored through r0.
define <32 x i8> @vzipQi8_QQres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQi8_QQres:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.8 q9, q8
; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = load <16 x i8>, ptr %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i8> %tmp3
}

; <8 x i16> low interleave (0,8,1,9,...) plus high interleave (4,12,5,13,...).
; The assertions expect a single vzip.16 on Q registers feeding the add.
define <8 x i16> @vzipQi16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQi16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.16 q9, q8
; CHECK-NEXT: vadd.i16 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i16>, ptr %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; One shuffle producing the full 16-element interleave of two <8 x i16>
; inputs. The assertions expect a single vzip.16 with both Q registers stored
; through r0.
define <16 x i16> @vzipQi16_QQres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQi16_QQres:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.16 q9, q8
; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i16>, ptr %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i16> %tmp3
}

; <4 x i32> low interleave (0,4,1,5) plus high interleave (2,6,3,7).
; The assertions expect a single vzip.32 on Q registers feeding the add.
define <4 x i32> @vzipQi32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQi32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.32 q9, q8
; CHECK-NEXT: vadd.i32 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i32>, ptr %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; One shuffle producing the full 8-element interleave of two <4 x i32>
; inputs. The assertions expect a single vzip.32 with both Q registers stored
; through r0.
define <8 x i32> @vzipQi32_QQres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQi32_QQres:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.32 q9, q8
; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i32>, ptr %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i32> %tmp3
}

; Floating-point version of vzipQi32: same masks on <4 x float>, combined
; with fadd. The assertions expect vzip.32 followed by vadd.f32.
define <4 x float> @vzipQf(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.32 q9, q8
; CHECK-NEXT: vadd.f32 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x float>, ptr %A
  %tmp2 = load <4 x float>, ptr %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

; One shuffle producing the full 8-element interleave of two <4 x float>
; inputs. The assertions expect a single vzip.32 with both Q registers stored
; through r0.
define <8 x float> @vzipQf_QQres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQf_QQres:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.32 q9, q8
; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x float>, ptr %A
  %tmp2 = load <4 x float>, ptr %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x float> %tmp3
}

; Undef shuffle indices should not prevent matching to VZIP:

; Same masks as vzipi8 with several indices replaced by undef. The expected
; instruction sequence is identical to the one for vzipi8.
define <8 x i8> @vzipi8_undef(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipi8_undef:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vzip.8 d17, d16
; CHECK-NEXT: vadd.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 1, i32 9, i32 undef, i32 10, i32 3, i32 11>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 undef, i32 undef, i32 15>
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

; Same mask as vzipi8_Qres with several indices replaced by undef. The
; expected instruction sequence is identical to the one for vzipi8_Qres.
define <16 x i8> @vzipi8_undef_Qres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipi8_undef_Qres:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vzip.8 d16, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 undef, i32 1, i32 9, i32 undef, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 undef, i32 undef, i32 15>
  ret <16 x i8> %tmp3
}

; Same masks as vzipQi8 with several indices replaced by undef. The expected
; instruction sequence is identical to the one for vzipQi8.
define <16 x i8> @vzipQi8_undef(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQi8_undef:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vzip.8 q9, q8
; CHECK-NEXT: vadd.i8 q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = load <16 x i8>, ptr %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 undef, i32 undef, i32 undef, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 undef, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 undef, i32 14, i32 30, i32 undef, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

; Same mask as vzipQi8_QQres with several indices replaced by undef. The
; expected instruction sequence is identical to the one for vzipQi8_QQres.
define <32 x i8> @vzipQi8_undef_QQres(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vzipQi8_undef_QQres:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
; CHECK-NEXT: vzip.8 q9, q8
; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = load <16 x i8>, ptr %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 16, i32 1, i32 undef, i32 undef, i32 undef, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 undef, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 undef, i32 14, i32 30, i32 undef, i32 31>
  ret <32 x i8> %tmp3
}

; Only the upper four result lanes are defined (2,6,3,7, i.e. the high half
; of the interleave); the lower four are undef. The assertions still expect a
; vzip.16. Note that d16, returned in r0/r1, is not written by any expected
; instruction, which is consistent with the low lanes being undef.
define <8 x i16> @vzip_lower_shufflemask_undef(ptr %A, ptr %B) {
; CHECK-LABEL: vzip_lower_shufflemask_undef:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d17, [r1]
; CHECK-NEXT: vldr d18, [r0]
; CHECK-NEXT: vzip.16 d18, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
entry:
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %0
}

; NOTE: The mask here looks like something that could be done with a vzip,
; but which the current handling of two-result vzip can't do - thus ending up
; as a vtrn.
; Only result lanes 4 and 5 are defined (indices 0 and 4); every other lane
; is undef.
define <8 x i16> @vzip_lower_shufflemask_undef_rev(ptr %A, ptr %B) {
; CHECK-LABEL: vzip_lower_shufflemask_undef_rev:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d19, [r0]
; CHECK-NEXT: vtrn.16 d19, d16
; CHECK-NEXT: vmov r0, r1, d18
; CHECK-NEXT: vmov r2, r3, d19
; CHECK-NEXT: mov pc, lr
entry:
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 4, i32 undef, i32 undef>
  ret <8 x i16> %0
}

; Widening shuffle of a single <2 x i32> value (used as both operands) with
; mask 0,0,1,0. The assertions expect a vdup.32 of lane 0 followed by a
; vzip.32 on Q registers.
define <4 x i32> @vzip_lower_shufflemask_zeroed(ptr %A) {
; CHECK-LABEL: vzip_lower_shufflemask_zeroed:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 q9, d16[0]
; CHECK-NEXT: vzip.32 q8, q9
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
entry:
  %tmp1 = load <2 x i32>, ptr %A
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
  ret <4 x i32> %0
}

; Like vzip_lower_shufflemask_zeroed but with mask 0,2,1,0. Both shuffle
; operands are %tmp1, so index 2 selects the same element as index 0, and the
; expected instruction sequence is the same vdup.32 + vzip.32.
define <4 x i32> @vzip_lower_shufflemask_vuzp(ptr %A) {
; CHECK-LABEL: vzip_lower_shufflemask_vuzp:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 q9, d16[0]
; CHECK-NEXT: vzip.32 q8, q9
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
entry:
  %tmp1 = load <2 x i32>, ptr %A
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 2, i32 1, i32 0>
  ret <4 x i32> %0
}

; Widening shuffle of a <2 x i32> value with an undef second operand and mask
; 1,1,0,0; the result is stored to %B. The assertions expect a register copy
; (vorr), a vzip.32, and a vext.32 #2 before the store.
define void @vzip_undef_rev_shufflemask_vtrn(ptr %A, ptr %B) {
; CHECK-LABEL: vzip_undef_rev_shufflemask_vtrn:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vorr q9, q8, q8
; CHECK-NEXT: vzip.32 q8, q9
; CHECK-NEXT: vext.32 q8, q8, q8, #2
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
entry:
  %tmp1 = load <2 x i32>, ptr %A
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
  store <4 x i32> %0, ptr %B
  ret void
}

; Narrowing shuffle from <8 x i16> to <4 x i16> with mask 4,4,5,3; the result
; is stored to %B. The assertions expect three vext.16 instructions and no
; vzip.
define void @vzip_vext_factor(ptr %A, ptr %B) {
; CHECK-LABEL: vzip_vext_factor:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vext.16 d18, d16, d17, #1
; CHECK-NEXT: vext.16 d16, d18, d17, #2
; CHECK-NEXT: vext.16 d16, d16, d16, #1
; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr
entry:
  %tmp1 = load <8 x i16>, ptr %A
  %0 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 4, i32 5, i32 3>
  store <4 x i16> %0, ptr %B
  ret void
}

; Loads one i8 from each pointer, splats each into lanes 0-3 of an <8 x i8>
; (lanes 4-7 undef), then interleaves the low halves of the two splats. The
; assertions expect two all-lanes vld1.8 loads and a single vzip.8.
define <8 x i8> @vdup_zip(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; CHECK-LABEL: vdup_zip:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vld1.8 {d16[]}, [r1]
; CHECK-NEXT: vld1.8 {d17[]}, [r0]
; CHECK-NEXT: vzip.8 d17, d16
; CHECK-NEXT: vmov r0, r1, d17
; CHECK-NEXT: mov pc, lr
entry:
  %0 = load i8, ptr %x, align 1
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = load i8, ptr %y, align 1
  %3 = insertelement <8 x i8> undef, i8 %2, i32 0
  %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>
  %vzip.i = shufflevector <8 x i8> %lane, <8 x i8> %lane3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i8> %vzip.i
}