; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon,+fullfp16 %s -o - | FileCheck %s

define <8 x i8> @test_vrev64D8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev64D16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %tmp2
}

define <4 x half> @test_vrev64Df16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Df16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x half>, ptr %A
  %tmp2 = shufflevector <4 x half> %tmp1, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x half> %tmp2
}

define <2 x i32> @test_vrev64D32(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.32 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %tmp2
}

define <2 x float> @test_vrev64Df(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Df:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.32 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, ptr %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %tmp2
}

define <16 x i8> @test_vrev64Q8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Q8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev64.8 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev64Q16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Q16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %tmp2
}

define <8 x half> @test_vrev64Qf16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Qf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev64.16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x half>, ptr %A
  %tmp2 = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x half> %tmp2
}

define <4 x i32> @test_vrev64Q32(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Q32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev64.32 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %tmp2
}

define <4 x float> @test_vrev64Qf(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Qf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev64.32 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, ptr %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %tmp2
}

define <8 x i8> @test_vrev32D8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32D8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev32.8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev32D16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32D16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev32.16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}

define <4 x half> @test_vrev32Df16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Df16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev32.16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x half>, ptr %A
  %tmp2 = shufflevector <4 x half> %tmp1, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x half> %tmp2
}

define <16 x i8> @test_vrev32Q8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Q8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev32.8 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Q16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}

define <8 x half> @test_vrev32Qf16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Qf16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x half>, ptr %A
  %tmp2 = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x half> %tmp2
}

define <8 x i8> @test_vrev16D8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev16D8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev16.8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev16Q8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev16.8 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}

; Undef shuffle indices should not prevent matching to VREV:

define <8 x i8> @test_vrev64D8_undef(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D8_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16_undef(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Q16_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x i16> %tmp2
}

define <8 x half> @test_vrev32Qf16_undef(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Qf16_undef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vrev32.16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x half>, ptr %A
  %tmp2 = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x half> %tmp2
}

; A vcombine feeding a VREV should not obscure things. Radar 8597007.

define void @test_with_vcombine(ptr %v) nounwind {
; CHECK-LABEL: test_with_vcombine:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    vadd.f32 d18, d17, d17
; CHECK-NEXT:    vrev64.32 d16, d16
; CHECK-NEXT:    vrev64.32 d17, d18
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, ptr %v, align 16
  %tmp2 = bitcast <4 x float> %tmp1 to <2 x double>
  %tmp3 = extractelement <2 x double> %tmp2, i32 0
  %tmp4 = bitcast double %tmp3 to <2 x float>
  %tmp5 = extractelement <2 x double> %tmp2, i32 1
  %tmp6 = bitcast double %tmp5 to <2 x float>
  %tmp7 = fadd <2 x float> %tmp6, %tmp6
  %tmp8 = shufflevector <2 x float> %tmp4, <2 x float> %tmp7, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  store <4 x float> %tmp8, ptr %v, align 16
  ret void
}

; The type <2 x i16> is legalized to <2 x i32> and need to be trunc-stored
; to <2 x i16> when stored to memory.
define void @test_vrev64(ptr nocapture %source, ptr nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.32 {d16, d17}, [r0]
; CHECK-NEXT:    vmov.u16 r0, d17[2]
; CHECK-NEXT:    vmov.u16 r2, d17[1]
; CHECK-NEXT:    vmov.32 d16[0], r0
; CHECK-NEXT:    vmov.32 d16[1], r2
; CHECK-NEXT:    vuzp.16 d16, d17
; CHECK-NEXT:    vst1.32 {d16[0]}, [r1:32]
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp2 = load <8 x i16>, ptr %source, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, ptr %dst, align 4
  ret void
}

; Test vrev of float4
define void @float_vrev64(ptr nocapture %source, ptr nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q8, #0x0
; CHECK-NEXT:    vld1.32 {d18, d19}, [r0]
; CHECK-NEXT:    add r0, r1, #176
; CHECK-NEXT:    vext.32 q8, q9, q8, #3
; CHECK-NEXT:    vrev64.32 q8, q8
; CHECK-NEXT:    vst1.32 {d16, d17}, [r0]
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp2 = load <4 x float>, ptr %source, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %dest, i32 11
  store <4 x float> %tmp5, ptr %arrayidx8, align 4
  ret void
}

define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vrev32.8 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone