; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVEFP

; Vector half <-> float conversion tests (fpext / fptrunc), on their own and
; combined with interleaving shuffles, loads and stores. The file is compiled
; twice: once with +mve,+fullfp16 and once with +mve.fp. Functions whose
; expected assembly is the same for both invocations use the shared prefix;
; the others carry one expectation block per invocation.

; --- Plain extends -----------------------------------------------------------

; Extend <4 x half> to <4 x float>.
define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) {
; CHECK-LABEL: fpext_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
; CHECK-NEXT:    bx lr
entry:
  %out = fpext <4 x half> %src1 to <4 x float>
  ret <4 x float> %out
}

; Extend <8 x half> to <8 x float>.
define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
; CHECK-LABEL: fpext_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtt.f32.f16 s11, s1
; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
; CHECK-NEXT:    vcvtt.f32.f16 s9, s0
; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
; CHECK-NEXT:    vcvtt.f32.f16 s7, s3
; CHECK-NEXT:    vcvtb.f32.f16 s6, s3
; CHECK-NEXT:    vcvtt.f32.f16 s5, s2
; CHECK-NEXT:    vcvtb.f32.f16 s4, s2
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    bx lr
entry:
  %out = fpext <8 x half> %src1 to <8 x float>
  ret <8 x float> %out
}

; --- Plain truncates ---------------------------------------------------------

; Truncate <4 x float> to <4 x half>.
define arm_aapcs_vfpcc <4 x half> @fptrunc_4(<4 x float> %src1) {
; CHECK-LABEL: fptrunc_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-NEXT:    bx lr
entry:
  %out = fptrunc <4 x float> %src1 to <4 x half>
  ret <4 x half> %out
}

; Truncate <8 x float> to <8 x half>.
define arm_aapcs_vfpcc <8 x half> @fptrunc_8(<8 x float> %src1) {
; CHECK-LABEL: fptrunc_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-NEXT:    vcvtb.f16.f32 s2, s4
; CHECK-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-NEXT:    vcvtb.f16.f32 s3, s6
; CHECK-NEXT:    vcvtt.f16.f32 s2, s5
; CHECK-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-NEXT:    bx lr
entry:
  %out = fptrunc <8 x float> %src1 to <8 x half>
  ret <8 x half> %out
}

; --- Interleaving shuffle followed by truncate -------------------------------

; Interleave %src1 and %src2 lane by lane (%src1 in the even result lanes),
; then truncate the <8 x float> result.
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc1(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc1:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc1:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  ret <8 x half> %out
}

; Same as shuffle_trunc1 but with %src2 in the even result lanes.
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc2(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc2:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vmov q2, q0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc2:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-MVEFP-NEXT:    vmov q0, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  ret <8 x half> %out
}

; 8-wide version of shuffle_trunc1: interleave two <8 x float> inputs
; (%src1 in the even result lanes), then truncate the <16 x float> result.
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc3(<8 x float> %src1, <8 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc3:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s5, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s6, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s7, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s4, s12
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s5, s13
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s6, s14
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s7, s15
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc3:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  ret <16 x half> %out
}

; 8-wide version of shuffle_trunc2: %src2 in the even result lanes.
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc4(<8 x float> %src1, <8 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc4:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    .vsave {d8, d9}
; CHECK-MVE-NEXT:    vpush {d8, d9}
; CHECK-MVE-NEXT:    vmov q4, q0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s8, s12
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s9, s13
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s10, s14
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s11, s15
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s8, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s9, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s10, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s11, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s16
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s17
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s18
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s19
; CHECK-MVE-NEXT:    vmov q1, q2
; CHECK-MVE-NEXT:    vpop {d8, d9}
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc4:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q2, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q3, q1
; CHECK-MVEFP-NEXT:    vmov q0, q2
; CHECK-MVEFP-NEXT:    vmov q1, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  ret <16 x half> %out
}

; --- Truncate followed by interleaving shuffle -------------------------------
; shuffle_trunc5..8 mirror shuffle_trunc1..4 with the operation order swapped:
; each input is truncated first and the half vectors are interleaved
; afterwards. The expected assembly is the same as for the matching function
; above.

; Truncate both inputs, then interleave with %src1 in the even result lanes.
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc5(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc5:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc5:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %out1 = fptrunc <4 x float> %src1 to <4 x half>
  %out2 = fptrunc <4 x float> %src2 to <4 x half>
  %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x half> %s
}

; Truncate both inputs, then interleave with %src2 in the even result lanes.
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc6(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc6:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vmov q2, q0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc6:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-MVEFP-NEXT:    vmov q0, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %out1 = fptrunc <4 x float> %src1 to <4 x half>
  %out2 = fptrunc <4 x float> %src2 to <4 x half>
  %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  ret <8 x half> %s
}

; 8-wide version of shuffle_trunc5.
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc7(<8 x float> %src1, <8 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc7:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s5, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s6, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s7, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s4, s12
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s5, s13
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s6, s14
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s7, s15
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc7:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %out1 = fptrunc <8 x float> %src1 to <8 x half>
  %out2 = fptrunc <8 x float> %src2 to <8 x half>
  %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x half> %s
}

; 8-wide version of shuffle_trunc6.
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc8(<8 x float> %src1, <8 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    .vsave {d8, d9}
; CHECK-MVE-NEXT:    vpush {d8, d9}
; CHECK-MVE-NEXT:    vmov q4, q0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s8, s12
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s9, s13
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s10, s14
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s11, s15
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s8, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s9, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s10, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s11, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s16
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s17
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s18
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s19
; CHECK-MVE-NEXT:    vmov q1, q2
; CHECK-MVE-NEXT:    vpop {d8, d9}
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q2, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q3, q1
; CHECK-MVEFP-NEXT:    vmov q0, q2
; CHECK-MVEFP-NEXT:    vmov q1, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %out1 = fptrunc <8 x float> %src1 to <8 x half>
  %out2 = fptrunc <8 x float> %src2 to <8 x half>
  %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  ret <16 x half> %s
}

; --- Load followed by extend -------------------------------------------------

; Load <4 x half> (align 4) and extend to <4 x float>.
define arm_aapcs_vfpcc <4 x float> @load_ext_4(ptr %src) {
; CHECK-MVE-LABEL: load_ext_4:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    ldrd r0, r1, [r0]
; CHECK-MVE-NEXT:    vmov.32 q0[0], r0
; CHECK-MVE-NEXT:    vmov.32 q0[1], r1
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s3, s1
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s1
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s1, s0
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: load_ext_4:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vldrh.u32 q0, [r0]
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %wide.load = load <4 x half>, ptr %src, align 4
  %e = fpext <4 x half> %wide.load to <4 x float>
  ret <4 x float> %e
}

; Load <8 x half> (align 4) and extend to <8 x float>.
define arm_aapcs_vfpcc <8 x float> @load_ext_8(ptr %src) {
; CHECK-MVE-LABEL: load_ext_8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vldrw.u32 q2, [r0]
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s3, s9
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s9
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s1, s8
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s7, s11
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s6, s11
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s5, s10
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s4, s10
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: load_ext_8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vldrh.u32 q0, [r0]
; CHECK-MVEFP-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %wide.load = load <8 x half>, ptr %src, align 4
  %e = fpext <8 x half> %wide.load to <8 x float>
  ret <8 x float> %e
}

; Load <16 x half> (align 4) and extend to <16 x float>.
define arm_aapcs_vfpcc <16 x float> @load_ext_16(ptr %src) {
; CHECK-MVE-LABEL: load_ext_16:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    .vsave {d8, d9}
; CHECK-MVE-NEXT:    vpush {d8, d9}
; CHECK-MVE-NEXT:    vldrw.u32 q2, [r0], #16
; CHECK-MVE-NEXT:    vldrw.u32 q4, [r0]
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s3, s9
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s9
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s1, s8
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s7, s11
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s6, s11
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s5, s10
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s4, s10
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s11, s17
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s10, s17
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s9, s16
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s8, s16
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s15, s19
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s14, s19
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s13, s18
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s12, s18
; CHECK-MVE-NEXT:    vpop {d8, d9}
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: load_ext_16:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vldrh.u32 q0, [r0]
; CHECK-MVEFP-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-MVEFP-NEXT:    vldrh.u32 q2, [r0, #16]
; CHECK-MVEFP-NEXT:    vldrh.u32 q3, [r0, #24]
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q2, q2
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q3, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %wide.load = load <16 x half>, ptr %src, align 4
  %e = fpext <16 x half> %wide.load to <16 x float>
  ret <16 x float> %e
}

; Load <8 x half>, keep the even lanes (0, 2, 4, 6) and extend them.
; The mask never selects from the second shuffle operand, so it is only a
; placeholder and is written as poison.
define arm_aapcs_vfpcc <4 x float> @load_shuffleext_8(ptr %src) {
; CHECK-MVE-LABEL: load_shuffleext_8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s3, s3
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: load_shuffleext_8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vldrw.u32 q0, [r0]
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %wide.load = load <8 x half>, ptr %src, align 4
  %sh = shufflevector <8 x half> %wide.load, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %e = fpext <4 x half> %sh to <4 x float>
  ret <4 x float> %e
}

; Load <16 x half>, keep the even lanes (0, 2, ..., 14) and extend them.
; The mask never selects from the second shuffle operand, so it is only a
; placeholder and is written as poison.
define arm_aapcs_vfpcc <8 x float> @load_shuffleext_16(ptr %src) {
; CHECK-LABEL: load_shuffleext_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q2, q3}, [r0]
; CHECK-NEXT:    vld21.16 {q2, q3}, [r0]
; CHECK-NEXT:    vcvtt.f32.f16 s3, s9
; CHECK-NEXT:    vcvtb.f32.f16 s2, s9
; CHECK-NEXT:    vcvtt.f32.f16 s1, s8
; CHECK-NEXT:    vcvtb.f32.f16 s0, s8
; CHECK-NEXT:    vcvtt.f32.f16 s7, s11
; CHECK-NEXT:    vcvtb.f32.f16 s6, s11
; CHECK-NEXT:    vcvtt.f32.f16 s5, s10
; CHECK-NEXT:    vcvtb.f32.f16 s4, s10
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x half>, ptr %src, align 4
  %sh = shufflevector <16 x half> %wide.load, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %e = fpext <8 x half> %sh to <8 x float>
  ret <8 x float> %e
}

; --- Truncate followed by store ----------------------------------------------

; Truncate <4 x float> to <4 x half> and store it (align 4).
define arm_aapcs_vfpcc void @store_trunc_4(ptr %src, <4 x float> %val) {
; CHECK-MVE-LABEL: store_trunc_4:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-MVE-NEXT:    vmov r1, r2, d0
; CHECK-MVE-NEXT:    strd r1, r2, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_trunc_4:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vstrh.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %e = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %e, ptr %src, align 4
  ret void
}

; Truncate <8 x float> to <8 x half> and store it (align 4).
define arm_aapcs_vfpcc void @store_trunc_8(ptr %src, <8 x float> %val) {
; CHECK-MVE-LABEL: store_trunc_8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_trunc_8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-MVEFP-NEXT:    vstrh.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %e = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %e, ptr %src, align 4
  ret void
}

; Truncate <16 x float> to <16 x half> and store it (align 4).
define arm_aapcs_vfpcc void @store_trunc_16(ptr %src, <16 x float> %val) {
; CHECK-MVE-LABEL: store_trunc_16:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s10
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s12
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s14
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s11
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s13
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s15
; CHECK-MVE-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_trunc_16:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vstrh.32 q3, [r0, #24]
; CHECK-MVEFP-NEXT:    vstrh.32 q2, [r0, #16]
; CHECK-MVEFP-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-MVEFP-NEXT:    vstrh.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %e = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %e, ptr %src, align 4
  ret void
}

; Interleave %val1 and %val2 (%val1 in the even result lanes), truncate the
; <8 x float> result and store it (align 4).
define arm_aapcs_vfpcc void @store_shuffletrunc_8(ptr %src, <4 x float> %val1, <4 x float> %val2) {
; CHECK-MVE-LABEL: store_shuffletrunc_8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_shuffletrunc_8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-MVEFP-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %val1, <4 x float> %val2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  store <8 x half> %out, ptr %src, align 4
  ret void
}

; 8-wide version of store_shuffletrunc_8: interleave two <8 x float> inputs,
; truncate the <16 x float> result and store it (align 4).
define arm_aapcs_vfpcc void @store_shuffletrunc_16(ptr %src, <8 x float> %val1, <8 x float> %val2) {
; CHECK-MVE-LABEL: store_shuffletrunc_16:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s12
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s13
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s14
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s15
; CHECK-MVE-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_shuffletrunc_16:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-MVEFP-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-MVEFP-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %val1, <8 x float> %val2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  store <16 x half> %out, ptr %src, align 4
  ret void
}