; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

; Fixed-trip-count loops over 1024 elements that multiply a vector by a float
; constant while converting between f32 and f16, at 4, 8 and 16 lanes per
; iteration.
;
; The multiplier 0x4000CCCCC0000000 has the f32 bit pattern 0x40066666; the
; expected output builds it in r2 with movw #26214 (0x6666) followed by
; movt #16390 (0x4006).
;
; Naming: to_N   = load f32, multiply, truncate to f16, store
;         from_N = load f16, extend to f32, multiply, store
;         both_N = load f16, extend, multiply, truncate back to f16, store
;         *_I    = as both_N, but the IR splits even/odd lanes explicitly

; f32 -> f16, 4 lanes per iteration (1024 / 4 = 256 iterations in lr).
define void @to_4(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: to_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #8
; CHECK-NEXT:    le lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds float, ptr %x, i32 %i
  %in = load <4 x float>, ptr %src, align 4
  %scaled = fmul <4 x float> %in, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %out = fptrunc <4 x float> %scaled to <4 x half>
  %dst = getelementptr inbounds half, ptr %y, i32 %i
  store <4 x half> %out, ptr %dst, align 2
  %i.next = add i32 %i, 4
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; f32 -> f16, 8 lanes per iteration (128 iterations). The expected output
; handles each iteration as two 4-lane load/multiply/convert/store groups.
define void @to_8(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: to_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
; CHECK-NEXT:    vldrw.u32 q0, [r0], #32
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds float, ptr %x, i32 %i
  %in = load <8 x float>, ptr %src, align 4
  %scaled = fmul <8 x float> %in, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %out = fptrunc <8 x float> %scaled to <8 x half>
  %dst = getelementptr inbounds half, ptr %y, i32 %i
  store <8 x half> %out, ptr %dst, align 2
  %i.next = add i32 %i, 8
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; f32 -> f16, 16 lanes per iteration (64 iterations). The expected output
; handles each iteration as four 4-lane groups.
define void @to_16(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: to_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #24]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
; CHECK-NEXT:    vldrw.u32 q0, [r0], #64
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #32
; CHECK-NEXT:    le lr, .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds float, ptr %x, i32 %i
  %in = load <16 x float>, ptr %src, align 4
  %scaled = fmul <16 x float> %in, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %out = fptrunc <16 x float> %scaled to <16 x half>
  %dst = getelementptr inbounds half, ptr %y, i32 %i
  store <16 x half> %out, ptr %dst, align 2
  %i.next = add i32 %i, 16
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; f16 -> f32, 4 lanes per iteration (256 iterations).
define void @from_4(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: from_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB3_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB3_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds half, ptr %x, i32 %i
  %in = load <4 x half>, ptr %src, align 2
  %wide = fpext <4 x half> %in to <4 x float>
  %scaled = fmul <4 x float> %wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %dst = getelementptr inbounds float, ptr %y, i32 %i
  store <4 x float> %scaled, ptr %dst, align 4
  %i.next = add i32 %i, 4
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; f16 -> f32, 8 lanes per iteration (128 iterations). The expected output
; uses two 4-lane widening loads and two 4-lane stores per iteration.
define void @from_8(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: from_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB4_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #16
; CHECK-NEXT:    vldrh.u32 q1, [r0, #-8]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1], #32
; CHECK-NEXT:    le lr, .LBB4_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds half, ptr %x, i32 %i
  %in = load <8 x half>, ptr %src, align 2
  %wide = fpext <8 x half> %in to <8 x float>
  %scaled = fmul <8 x float> %wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %dst = getelementptr inbounds float, ptr %y, i32 %i
  store <8 x float> %scaled, ptr %dst, align 4
  %i.next = add i32 %i, 8
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; f16 -> f32, 16 lanes per iteration (64 iterations). The expected output
; uses four 4-lane widening loads and four 4-lane stores per iteration.
define void @from_16(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: from_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB5_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #32
; CHECK-NEXT:    vldrh.u32 q1, [r0, #-24]
; CHECK-NEXT:    vldrh.u32 q2, [r0, #-16]
; CHECK-NEXT:    vldrh.u32 q3, [r0, #-8]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    vcvtb.f32.f16 q2, q2
; CHECK-NEXT:    vcvtb.f32.f16 q3, q3
; CHECK-NEXT:    vmul.f32 q2, q2, r2
; CHECK-NEXT:    vmul.f32 q3, q3, r2
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1], #64
; CHECK-NEXT:    le lr, .LBB5_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds half, ptr %x, i32 %i
  %in = load <16 x half>, ptr %src, align 2
  %wide = fpext <16 x half> %in to <16 x float>
  %scaled = fmul <16 x float> %wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %dst = getelementptr inbounds float, ptr %y, i32 %i
  store <16 x float> %scaled, ptr %dst, align 4
  %i.next = add i32 %i, 16
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; f16 -> f32 -> f16 round trip, 4 lanes per iteration (256 iterations).
define void @both_4(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: both_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #8
; CHECK-NEXT:    le lr, .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds half, ptr %x, i32 %i
  %in = load <4 x half>, ptr %src, align 2
  %wide = fpext <4 x half> %in to <4 x float>
  %scaled = fmul <4 x float> %wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %out = fptrunc <4 x float> %scaled to <4 x half>
  %dst = getelementptr inbounds half, ptr %y, i32 %i
  store <4 x half> %out, ptr %dst, align 2
  %i.next = add i32 %i, 4
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; f16 -> f32 -> f16 round trip, 8 lanes per iteration (128 iterations). The
; expected output uses one 8-lane load/store with vcvtb/vcvtt pairs in between.
define void @both_8(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: both_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    le lr, .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds half, ptr %x, i32 %i
  %in = load <8 x half>, ptr %src, align 2
  %wide = fpext <8 x half> %in to <8 x float>
  %scaled = fmul <8 x float> %wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %out = fptrunc <8 x float> %scaled to <8 x half>
  %dst = getelementptr inbounds half, ptr %y, i32 %i
  store <8 x half> %out, ptr %dst, align 2
  %i.next = add i32 %i, 8
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; f16 -> f32 -> f16 round trip, 16 lanes per iteration (64 iterations). The
; expected output handles each iteration as two 8-lane groups.
define void @both_16(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: both_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrh.16 q1, [r1], #32
; CHECK-NEXT:    le lr, .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds half, ptr %x, i32 %i
  %in = load <16 x half>, ptr %src, align 2
  %wide = fpext <16 x half> %in to <16 x float>
  %scaled = fmul <16 x float> %wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %out = fptrunc <16 x float> %scaled to <16 x half>
  %dst = getelementptr inbounds half, ptr %y, i32 %i
  store <16 x half> %out, ptr %dst, align 2
  %i.next = add i32 %i, 16
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; As both_8, but the IR pulls the even and odd lanes apart with shufflevector,
; extends and multiplies each half separately, then re-interleaves the results
; before the truncate. The expected loop body has the same instruction sequence
; as both_8.
define void @both_8_I(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: both_8_I:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB9_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    le lr, .LBB9_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds half, ptr %x, i32 %i
  %in = load <8 x half>, ptr %src, align 2
  ; Split into even-indexed and odd-indexed lanes.
  %even = shufflevector <8 x half> %in, <8 x half> %in, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd = shufflevector <8 x half> %in, <8 x half> %in, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %even.wide = fpext <4 x half> %even to <4 x float>
  %odd.wide = fpext <4 x half> %odd to <4 x float>
  %even.scaled = fmul <4 x float> %even.wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %odd.scaled = fmul <4 x float> %odd.wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  ; Re-interleave so lane order matches the original load.
  %zipped = shufflevector <4 x float> %even.scaled, <4 x float> %odd.scaled, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %out = fptrunc <8 x float> %zipped to <8 x half>
  %dst = getelementptr inbounds half, ptr %y, i32 %i
  store <8 x half> %out, ptr %dst, align 2
  %i.next = add i32 %i, 8
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

; 16-lane variant of both_8_I: explicit even/odd split, per-half extend and
; multiply, re-interleave, truncate.
; NOTE(review): the index advances by 8 although each iteration loads and
; stores 16 halves, so consecutive iterations overlap by 8 elements. The
; expected output (128 iterations, 16-byte pointer writeback) matches the IR
; as written; confirm the step of 8 is intentional.
define void @both_16_I(ptr nocapture readonly %x, ptr noalias nocapture %y) {
; CHECK-LABEL: both_16_I:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB10_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]!
; CHECK-NEXT:    vstrh.16 q1, [r1]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1, #16]!
; CHECK-NEXT:    le lr, .LBB10_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %src = getelementptr inbounds half, ptr %x, i32 %i
  %in = load <16 x half>, ptr %src, align 2
  ; Split into even-indexed and odd-indexed lanes.
  %even = shufflevector <16 x half> %in, <16 x half> %in, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd = shufflevector <16 x half> %in, <16 x half> %in, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %even.wide = fpext <8 x half> %even to <8 x float>
  %odd.wide = fpext <8 x half> %odd to <8 x float>
  %even.scaled = fmul <8 x float> %even.wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %odd.scaled = fmul <8 x float> %odd.wide, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  ; Re-interleave so lane order matches the original load.
  %zipped = shufflevector <8 x float> %even.scaled, <8 x float> %odd.scaled, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %out = fptrunc <16 x float> %zipped to <16 x half>
  %dst = getelementptr inbounds half, ptr %y, i32 %i
  store <16 x half> %out, ptr %dst, align 2
  %i.next = add i32 %i, 8
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}