; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

; Every function below adds two vectors, extracts a single lane of the sum,
; stores that lane through %p, and also returns it - either as a scalar or
; splatted across a whole vector. The lane therefore has two users (the store
; and the return value).
;
; Functions without an explicit calling convention receive their vector
; arguments in r0-r3 and on the stack (see the vmov/vldrw sequences in the
; expected assembly); functions marked arm_aapcs_vfpcc receive them in q0/q1.

; <8 x half>, lane 1, scalar return, default calling convention.
define half @extret1_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
; CHECK-LABEL: extret1_f16_sf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d0, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    ldr r0, [sp, #16]
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    bx lr
  %c = fadd <8 x half> %a, %b
  %e = extractelement <8 x half> %c, i32 1
  store half %e, ptr %p, align 2
  ret half %e
}

; <8 x half>, lane 4, scalar return, default calling convention.
define half @extret4_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
; CHECK-LABEL: extret4_f16_sf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmov d1, r2, r3
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    ldr r0, [sp, #16]
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    bx lr
  %c = fadd <8 x half> %a, %b
  %e = extractelement <8 x half> %c, i32 4
  store half %e, ptr %p, align 2
  ret half %e
}

; <8 x half>, lane 1, scalar return, arm_aapcs_vfpcc.
define arm_aapcs_vfpcc half @extret1_f16_hf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
; CHECK-LABEL: extret1_f16_hf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    bx lr
  %c = fadd <8 x half> %a, %b
  %e = extractelement <8 x half> %c, i32 1
  store half %e, ptr %p, align 2
  ret half %e
}

; <8 x half>, lane 4, scalar return, arm_aapcs_vfpcc.
define arm_aapcs_vfpcc half @extret4_f16_hf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
; CHECK-LABEL: extret4_f16_hf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vmov.f32 s0, s2
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    bx lr
  %c = fadd <8 x half> %a, %b
  %e = extractelement <8 x half> %c, i32 4
  store half %e, ptr %p, align 2
  ret half %e
}

; <8 x half>, lane 1, returned as a splat vector, arm_aapcs_vfpcc.
; The splat is built on poison placeholders: only lane 0 of %i is ever read.
define arm_aapcs_vfpcc <8 x half> @extret1_v8f16_hf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
; CHECK-LABEL: extret1_v8f16_hf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    vdup.16 q0, r1
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
  %c = fadd <8 x half> %a, %b
  %e = extractelement <8 x half> %c, i32 1
  store half %e, ptr %p, align 2
  %i = insertelement <8 x half> poison, half %e, i32 0
  %s = shufflevector <8 x half> %i, <8 x half> poison, <8 x i32> zeroinitializer
  ret <8 x half> %s
}

; <8 x half>, lane 4, returned as a splat vector, arm_aapcs_vfpcc.
; The splat is built on poison placeholders: only lane 0 of %i is ever read.
define arm_aapcs_vfpcc <8 x half> @extret4_v8f16_hf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) {
; CHECK-LABEL: extret4_v8f16_hf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    vdup.16 q0, r1
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    bx lr
  %c = fadd <8 x half> %a, %b
  %e = extractelement <8 x half> %c, i32 4
  store half %e, ptr %p, align 2
  %i = insertelement <8 x half> poison, half %e, i32 0
  %s = shufflevector <8 x half> %i, <8 x half> poison, <8 x i32> zeroinitializer
  ret <8 x half> %s
}


; <4 x float>, lane 1, scalar return, default calling convention.
define float @extret1_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
; CHECK-LABEL: extret1_f32_sf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d0, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    ldr r1, [sp, #16]
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vstr s1, [r1]
; CHECK-NEXT:    bx lr
  %c = fadd <4 x float> %a, %b
  %e = extractelement <4 x float> %c, i32 1
  store float %e, ptr %p, align 4
  ret float %e
}

; <4 x float>, lane 2, scalar return, default calling convention.
define float @extret2_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
; CHECK-LABEL: extret2_f32_sf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmov d1, r2, r3
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    ldr r1, [sp, #16]
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vstr s2, [r1]
; CHECK-NEXT:    bx lr
  %c = fadd <4 x float> %a, %b
  %e = extractelement <4 x float> %c, i32 2
  store float %e, ptr %p, align 4
  ret float %e
}

; <4 x float>, lane 1, scalar return, arm_aapcs_vfpcc.
define arm_aapcs_vfpcc float @extret1_f32_hf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
; CHECK-LABEL: extret1_f32_hf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vmov.f32 s0, s1
; CHECK-NEXT:    vstr s1, [r0]
; CHECK-NEXT:    bx lr
  %c = fadd <4 x float> %a, %b
  %e = extractelement <4 x float> %c, i32 1
  store float %e, ptr %p, align 4
  ret float %e
}


; <4 x float>, lane 2, scalar return, arm_aapcs_vfpcc.
define arm_aapcs_vfpcc float @extret2_f32_hf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
; CHECK-LABEL: extret2_f32_hf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vmov.f32 s0, s2
; CHECK-NEXT:    vstr s2, [r0]
; CHECK-NEXT:    bx lr
  %c = fadd <4 x float> %a, %b
  %e = extractelement <4 x float> %c, i32 2
  store float %e, ptr %p, align 4
  ret float %e
}

; <4 x float>, lane 1, returned as a splat vector, arm_aapcs_vfpcc.
; The splat is built on poison placeholders: only lane 0 of %i is ever read.
define arm_aapcs_vfpcc <4 x float> @extret1_v4f32_hf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
; CHECK-LABEL: extret1_v4f32_hf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f32 q1, q0, q1
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vstr s5, [r0]
; CHECK-NEXT:    vdup.32 q0, r1
; CHECK-NEXT:    bx lr
  %c = fadd <4 x float> %a, %b
  %e = extractelement <4 x float> %c, i32 1
  store float %e, ptr %p, align 4
  %i = insertelement <4 x float> poison, float %e, i32 0
  %s = shufflevector <4 x float> %i, <4 x float> poison, <4 x i32> zeroinitializer
  ret <4 x float> %s
}

; <4 x float>, lane 2, returned as a splat vector, arm_aapcs_vfpcc.
; The splat is built on poison placeholders: only lane 0 of %i is ever read.
define arm_aapcs_vfpcc <4 x float> @extret2_v4f32_hf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) {
; CHECK-LABEL: extret2_v4f32_hf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vadd.f32 q1, q0, q1
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vstr s6, [r0]
; CHECK-NEXT:    vdup.32 q0, r1
; CHECK-NEXT:    bx lr
  %c = fadd <4 x float> %a, %b
  %e = extractelement <4 x float> %c, i32 2
  store float %e, ptr %p, align 4
  %i = insertelement <4 x float> poison, float %e, i32 0
  %s = shufflevector <4 x float> %i, <4 x float> poison, <4 x i32> zeroinitializer
  ret <4 x float> %s
}