xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll (revision b5b663aac17415625340eb29c8010832bfc4c21c)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
3
; to_4: float -> half conversion loop, 4 lanes per iteration.
; Each iteration loads <4 x float> from %x, multiplies every lane by the splat
; constant 0x4000CCCCC0000000 (~2.1; the assertions expect its float bit
; pattern 0x40066666 to be built in r2 by movw #26214 / movt #16390),
; truncates to <4 x half> and stores the result to %y.
; %index steps by 4 until it reaches 1024, i.e. 256 iterations, which matches
; the expected `mov.w lr, #256` and the `le`-terminated loop.
4define void @to_4(ptr nocapture readonly %x, ptr noalias nocapture %y) {
5; CHECK-LABEL: to_4:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r7, lr}
8; CHECK-NEXT:    push {r7, lr}
9; CHECK-NEXT:    mov.w lr, #256
10; CHECK-NEXT:    movw r2, #26214
11; CHECK-NEXT:    movt r2, #16390
12; CHECK-NEXT:  .LBB0_1: @ %vector.body
13; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
14; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
15; CHECK-NEXT:    vmul.f32 q0, q0, r2
16; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
17; CHECK-NEXT:    vstrh.32 q0, [r1], #8
18; CHECK-NEXT:    le lr, .LBB0_1
19; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
20; CHECK-NEXT:    pop {r7, pc}
21entry:
22  br label %vector.body
23
24vector.body:                                      ; preds = %vector.body, %entry
25  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
26  %0 = getelementptr inbounds float, ptr %x, i32 %index
27  %wide.load = load <4 x float>, ptr %0, align 4
  ; Same constant in every lane, so the multiply has a splat operand.
28  %1 = fmul <4 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  ; Narrowing conversion under test: <4 x float> -> <4 x half>.
29  %2 = fptrunc <4 x float> %1 to <4 x half>
30  %3 = getelementptr inbounds half, ptr %y, i32 %index
31  store <4 x half> %2, ptr %3, align 2
32  %index.next = add i32 %index, 4
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
33  %4 = icmp eq i32 %index.next, 1024
34  br i1 %4, label %for.cond.cleanup, label %vector.body
35
36for.cond.cleanup:                                 ; preds = %vector.body
37  ret void
38}
39
; to_8: float -> half conversion loop, 8 lanes per iteration.
; Same shape as the 4-lane variant but operating on <8 x float> / <8 x half>:
; load from %x, multiply by the splat constant 0x4000CCCCC0000000 (~2.1),
; fptrunc, store to %y. %index steps by 8 up to 1024 (128 iterations, matching
; the expected `mov.w lr, #128`).
; The assertions expect the 8-lane operation to be emitted as two
; vldrw.u32 / vmul.f32 / vcvtb.f16.f32 / vstrh.32 groups per iteration: the
; first at offsets #16 (input) / #8 (output), the second using post-increment
; of #32 / #16 bytes.
40define void @to_8(ptr nocapture readonly %x, ptr noalias nocapture %y) {
41; CHECK-LABEL: to_8:
42; CHECK:       @ %bb.0: @ %entry
43; CHECK-NEXT:    .save {r7, lr}
44; CHECK-NEXT:    push {r7, lr}
45; CHECK-NEXT:    mov.w lr, #128
46; CHECK-NEXT:    movw r2, #26214
47; CHECK-NEXT:    movt r2, #16390
48; CHECK-NEXT:  .LBB1_1: @ %vector.body
49; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
50; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
51; CHECK-NEXT:    vmul.f32 q0, q0, r2
52; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
53; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
54; CHECK-NEXT:    vldrw.u32 q0, [r0], #32
55; CHECK-NEXT:    vmul.f32 q0, q0, r2
56; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
57; CHECK-NEXT:    vstrh.32 q0, [r1], #16
58; CHECK-NEXT:    le lr, .LBB1_1
59; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
60; CHECK-NEXT:    pop {r7, pc}
61entry:
62  br label %vector.body
63
64vector.body:                                      ; preds = %vector.body, %entry
65  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
66  %0 = getelementptr inbounds float, ptr %x, i32 %index
67  %wide.load = load <8 x float>, ptr %0, align 4
  ; Same constant in every lane, so the multiply has a splat operand.
68  %1 = fmul <8 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  ; Narrowing conversion under test: <8 x float> -> <8 x half>.
69  %2 = fptrunc <8 x float> %1 to <8 x half>
70  %3 = getelementptr inbounds half, ptr %y, i32 %index
71  store <8 x half> %2, ptr %3, align 2
72  %index.next = add i32 %index, 8
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
73  %4 = icmp eq i32 %index.next, 1024
74  br i1 %4, label %for.cond.cleanup, label %vector.body
75
76for.cond.cleanup:                                 ; preds = %vector.body
77  ret void
78}
79
; to_16: float -> half conversion loop, 16 lanes per iteration.
; Loads <16 x float> from %x, multiplies by the splat constant
; 0x4000CCCCC0000000 (~2.1), truncates to <16 x half> and stores to %y.
; %index steps by 16 up to 1024 (64 iterations, matching the expected
; `mov.w lr, #64`).
; The assertions expect four vldrw.u32 / vmul.f32 / vcvtb.f16.f32 / vstrh.32
; groups per iteration, processed from the highest offset downwards
; (#48/#24, #32/#16, #16/#8), with the last group carrying the post-increment
; of #64 input bytes and #32 output bytes.
80define void @to_16(ptr nocapture readonly %x, ptr noalias nocapture %y) {
81; CHECK-LABEL: to_16:
82; CHECK:       @ %bb.0: @ %entry
83; CHECK-NEXT:    .save {r7, lr}
84; CHECK-NEXT:    push {r7, lr}
85; CHECK-NEXT:    mov.w lr, #64
86; CHECK-NEXT:    movw r2, #26214
87; CHECK-NEXT:    movt r2, #16390
88; CHECK-NEXT:  .LBB2_1: @ %vector.body
89; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
90; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
91; CHECK-NEXT:    vmul.f32 q0, q0, r2
92; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
93; CHECK-NEXT:    vstrh.32 q0, [r1, #24]
94; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
95; CHECK-NEXT:    vmul.f32 q0, q0, r2
96; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
97; CHECK-NEXT:    vstrh.32 q0, [r1, #16]
98; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
99; CHECK-NEXT:    vmul.f32 q0, q0, r2
100; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
101; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
102; CHECK-NEXT:    vldrw.u32 q0, [r0], #64
103; CHECK-NEXT:    vmul.f32 q0, q0, r2
104; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
105; CHECK-NEXT:    vstrh.32 q0, [r1], #32
106; CHECK-NEXT:    le lr, .LBB2_1
107; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
108; CHECK-NEXT:    pop {r7, pc}
109entry:
110  br label %vector.body
111
112vector.body:                                      ; preds = %vector.body, %entry
113  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
114  %0 = getelementptr inbounds float, ptr %x, i32 %index
115  %wide.load = load <16 x float>, ptr %0, align 4
  ; Same constant in every lane, so the multiply has a splat operand.
116  %1 = fmul <16 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  ; Narrowing conversion under test: <16 x float> -> <16 x half>.
117  %2 = fptrunc <16 x float> %1 to <16 x half>
118  %3 = getelementptr inbounds half, ptr %y, i32 %index
119  store <16 x half> %2, ptr %3, align 2
120  %index.next = add i32 %index, 16
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
121  %4 = icmp eq i32 %index.next, 1024
122  br i1 %4, label %for.cond.cleanup, label %vector.body
123
124for.cond.cleanup:                                 ; preds = %vector.body
125  ret void
126}
127
; from_4: half -> float conversion loop, 4 lanes per iteration.
; Each iteration loads <4 x half> from %x, extends to <4 x float>, multiplies
; every lane by the splat constant 0x4000CCCCC0000000 (~2.1) and stores the
; <4 x float> result to %y. %index steps by 4 up to 1024 (256 iterations,
; matching the expected `mov.w lr, #256`).
; The assertions expect a vldrh.u32 load (post-increment #8), one
; vcvtb.f32.f16, one vmul.f32 by r2 and a vstrb.8 store (post-increment #16).
128define void @from_4(ptr nocapture readonly %x, ptr noalias nocapture %y) {
129; CHECK-LABEL: from_4:
130; CHECK:       @ %bb.0: @ %entry
131; CHECK-NEXT:    .save {r7, lr}
132; CHECK-NEXT:    push {r7, lr}
133; CHECK-NEXT:    mov.w lr, #256
134; CHECK-NEXT:    movw r2, #26214
135; CHECK-NEXT:    movt r2, #16390
136; CHECK-NEXT:  .LBB3_1: @ %vector.body
137; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
138; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
139; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
140; CHECK-NEXT:    vmul.f32 q0, q0, r2
141; CHECK-NEXT:    vstrb.8 q0, [r1], #16
142; CHECK-NEXT:    le lr, .LBB3_1
143; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
144; CHECK-NEXT:    pop {r7, pc}
145entry:
146  br label %vector.body
147
148vector.body:                                      ; preds = %vector.body, %entry
149  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
150  %0 = getelementptr inbounds half, ptr %x, i32 %index
151  %wide.load = load <4 x half>, ptr %0, align 2
  ; Widening conversion under test: <4 x half> -> <4 x float>.
152  %1 = fpext <4 x half> %wide.load to <4 x float>
  ; Same constant in every lane, so the multiply has a splat operand.
153  %2 = fmul <4 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
154  %3 = getelementptr inbounds float, ptr %y, i32 %index
155  store <4 x float> %2, ptr %3, align 4
156  %index.next = add i32 %index, 4
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
157  %4 = icmp eq i32 %index.next, 1024
158  br i1 %4, label %for.cond.cleanup, label %vector.body
159
160for.cond.cleanup:                                 ; preds = %vector.body
161  ret void
162}
163
; from_8: half -> float conversion loop, 8 lanes per iteration.
; Loads <8 x half> from %x, extends to <8 x float>, multiplies by the splat
; constant 0x4000CCCCC0000000 (~2.1) and stores to %y. %index steps by 8 up
; to 1024 (128 iterations, matching the expected `mov.w lr, #128`).
; The assertions expect two vldrh.u32 loads per iteration (the second
; addressed at #-8 after the first has post-incremented r0 by #16), a
; vcvtb.f32.f16 + vmul.f32 for each, and two vstrw.32 stores with the final
; one post-incrementing r1 by #32.
164define void @from_8(ptr nocapture readonly %x, ptr noalias nocapture %y) {
165; CHECK-LABEL: from_8:
166; CHECK:       @ %bb.0: @ %entry
167; CHECK-NEXT:    .save {r7, lr}
168; CHECK-NEXT:    push {r7, lr}
169; CHECK-NEXT:    mov.w lr, #128
170; CHECK-NEXT:    movw r2, #26214
171; CHECK-NEXT:    movt r2, #16390
172; CHECK-NEXT:  .LBB4_1: @ %vector.body
173; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
174; CHECK-NEXT:    vldrh.u32 q0, [r0], #16
175; CHECK-NEXT:    vldrh.u32 q1, [r0, #-8]
176; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
177; CHECK-NEXT:    vmul.f32 q0, q0, r2
178; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
179; CHECK-NEXT:    vmul.f32 q1, q1, r2
180; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
181; CHECK-NEXT:    vstrw.32 q0, [r1], #32
182; CHECK-NEXT:    le lr, .LBB4_1
183; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
184; CHECK-NEXT:    pop {r7, pc}
185entry:
186  br label %vector.body
187
188vector.body:                                      ; preds = %vector.body, %entry
189  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
190  %0 = getelementptr inbounds half, ptr %x, i32 %index
191  %wide.load = load <8 x half>, ptr %0, align 2
  ; Widening conversion under test: <8 x half> -> <8 x float>.
192  %1 = fpext <8 x half> %wide.load to <8 x float>
  ; Same constant in every lane, so the multiply has a splat operand.
193  %2 = fmul <8 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
194  %3 = getelementptr inbounds float, ptr %y, i32 %index
195  store <8 x float> %2, ptr %3, align 4
196  %index.next = add i32 %index, 8
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
197  %4 = icmp eq i32 %index.next, 1024
198  br i1 %4, label %for.cond.cleanup, label %vector.body
199
200for.cond.cleanup:                                 ; preds = %vector.body
201  ret void
202}
203
; from_16: half -> float conversion loop, 16 lanes per iteration.
; Loads <16 x half> from %x, extends to <16 x float>, multiplies by the splat
; constant 0x4000CCCCC0000000 (~2.1) and stores to %y. %index steps by 16 up
; to 1024 (64 iterations, matching the expected `mov.w lr, #64`).
; The assertions expect four vldrh.u32 loads into q0..q3 (the first
; post-increments r0 by #32, the rest use negative offsets #-24/#-16/#-8),
; four vcvtb.f32.f16, four vmul.f32 by r2 and four vstrw.32 stores, the last
; of which post-increments r1 by #64.
204define void @from_16(ptr nocapture readonly %x, ptr noalias nocapture %y) {
205; CHECK-LABEL: from_16:
206; CHECK:       @ %bb.0: @ %entry
207; CHECK-NEXT:    .save {r7, lr}
208; CHECK-NEXT:    push {r7, lr}
209; CHECK-NEXT:    mov.w lr, #64
210; CHECK-NEXT:    movw r2, #26214
211; CHECK-NEXT:    movt r2, #16390
212; CHECK-NEXT:  .LBB5_1: @ %vector.body
213; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
214; CHECK-NEXT:    vldrh.u32 q0, [r0], #32
215; CHECK-NEXT:    vldrh.u32 q1, [r0, #-24]
216; CHECK-NEXT:    vldrh.u32 q2, [r0, #-16]
217; CHECK-NEXT:    vldrh.u32 q3, [r0, #-8]
218; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
219; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
220; CHECK-NEXT:    vcvtb.f32.f16 q2, q2
221; CHECK-NEXT:    vcvtb.f32.f16 q3, q3
222; CHECK-NEXT:    vmul.f32 q2, q2, r2
223; CHECK-NEXT:    vmul.f32 q3, q3, r2
224; CHECK-NEXT:    vmul.f32 q1, q1, r2
225; CHECK-NEXT:    vmul.f32 q0, q0, r2
226; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
227; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
228; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
229; CHECK-NEXT:    vstrw.32 q0, [r1], #64
230; CHECK-NEXT:    le lr, .LBB5_1
231; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
232; CHECK-NEXT:    pop {r7, pc}
233entry:
234  br label %vector.body
235
236vector.body:                                      ; preds = %vector.body, %entry
237  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
238  %0 = getelementptr inbounds half, ptr %x, i32 %index
239  %wide.load = load <16 x half>, ptr %0, align 2
  ; Widening conversion under test: <16 x half> -> <16 x float>.
240  %1 = fpext <16 x half> %wide.load to <16 x float>
  ; Same constant in every lane, so the multiply has a splat operand.
241  %2 = fmul <16 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
242  %3 = getelementptr inbounds float, ptr %y, i32 %index
243  store <16 x float> %2, ptr %3, align 4
244  %index.next = add i32 %index, 16
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
245  %4 = icmp eq i32 %index.next, 1024
246  br i1 %4, label %for.cond.cleanup, label %vector.body
247
248for.cond.cleanup:                                 ; preds = %vector.body
249  ret void
250}
251
; both_4: half -> float -> half round trip, 4 lanes per iteration.
; Each iteration loads <4 x half> from %x, extends to <4 x float>, multiplies
; by the splat constant 0x4000CCCCC0000000 (~2.1), truncates back to
; <4 x half> and stores to %y. %index steps by 4 up to 1024 (256 iterations,
; matching the expected `mov.w lr, #256`).
; The assertions expect vldrh.u32, vcvtb.f32.f16, vmul.f32, vcvtb.f16.f32 and
; vstrh.32, with both pointers post-incremented by #8 bytes.
252define void @both_4(ptr nocapture readonly %x, ptr noalias nocapture %y) {
253; CHECK-LABEL: both_4:
254; CHECK:       @ %bb.0: @ %entry
255; CHECK-NEXT:    .save {r7, lr}
256; CHECK-NEXT:    push {r7, lr}
257; CHECK-NEXT:    mov.w lr, #256
258; CHECK-NEXT:    movw r2, #26214
259; CHECK-NEXT:    movt r2, #16390
260; CHECK-NEXT:  .LBB6_1: @ %vector.body
261; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
262; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
263; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
264; CHECK-NEXT:    vmul.f32 q0, q0, r2
265; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
266; CHECK-NEXT:    vstrh.32 q0, [r1], #8
267; CHECK-NEXT:    le lr, .LBB6_1
268; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
269; CHECK-NEXT:    pop {r7, pc}
270entry:
271  br label %vector.body
272
273vector.body:                                      ; preds = %vector.body, %entry
274  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
275  %0 = getelementptr inbounds half, ptr %x, i32 %index
276  %wide.load = load <4 x half>, ptr %0, align 2
  ; Widen, scale in single precision, then narrow again.
277  %1 = fpext <4 x half> %wide.load to <4 x float>
278  %2 = fmul <4 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
279  %3 = fptrunc <4 x float> %2 to <4 x half>
280  %4 = getelementptr inbounds half, ptr %y, i32 %index
281  store <4 x half> %3, ptr %4, align 2
282  %index.next = add i32 %index, 4
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
283  %5 = icmp eq i32 %index.next, 1024
284  br i1 %5, label %for.cond.cleanup, label %vector.body
285
286for.cond.cleanup:                                 ; preds = %vector.body
287  ret void
288}
289
; both_8: half -> float -> half round trip, 8 lanes per iteration.
; Loads <8 x half> from %x, extends to <8 x float>, multiplies by the splat
; constant 0x4000CCCCC0000000 (~2.1), truncates back to <8 x half> and stores
; to %y. %index steps by 8 up to 1024 (128 iterations, matching the expected
; `mov.w lr, #128`).
; The assertions expect a single full-width vldrh.u16 load, a
; vcvtb.f32.f16 / vcvtt.f32.f16 pair producing two float vectors, one
; vmul.f32 per vector, a vcvtb.f16.f32 / vcvtt.f16.f32 pair writing both
; results into q1, and a single vstrb.8 store; both pointers post-increment
; by #16 bytes.
290define void @both_8(ptr nocapture readonly %x, ptr noalias nocapture %y) {
291; CHECK-LABEL: both_8:
292; CHECK:       @ %bb.0: @ %entry
293; CHECK-NEXT:    .save {r7, lr}
294; CHECK-NEXT:    push {r7, lr}
295; CHECK-NEXT:    mov.w lr, #128
296; CHECK-NEXT:    movw r2, #26214
297; CHECK-NEXT:    movt r2, #16390
298; CHECK-NEXT:  .LBB7_1: @ %vector.body
299; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
300; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
301; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
302; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
303; CHECK-NEXT:    vmul.f32 q1, q1, r2
304; CHECK-NEXT:    vmul.f32 q0, q0, r2
305; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
306; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
307; CHECK-NEXT:    vstrb.8 q1, [r1], #16
308; CHECK-NEXT:    le lr, .LBB7_1
309; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
310; CHECK-NEXT:    pop {r7, pc}
311entry:
312  br label %vector.body
313
314vector.body:                                      ; preds = %vector.body, %entry
315  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
316  %0 = getelementptr inbounds half, ptr %x, i32 %index
317  %wide.load = load <8 x half>, ptr %0, align 2
  ; Widen, scale in single precision, then narrow again.
318  %1 = fpext <8 x half> %wide.load to <8 x float>
319  %2 = fmul <8 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
320  %3 = fptrunc <8 x float> %2 to <8 x half>
321  %4 = getelementptr inbounds half, ptr %y, i32 %index
322  store <8 x half> %3, ptr %4, align 2
323  %index.next = add i32 %index, 8
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
324  %5 = icmp eq i32 %index.next, 1024
325  br i1 %5, label %for.cond.cleanup, label %vector.body
326
327for.cond.cleanup:                                 ; preds = %vector.body
328  ret void
329}
330
; both_16: half -> float -> half round trip, 16 lanes per iteration.
; Loads <16 x half> from %x, extends to <16 x float>, multiplies by the splat
; constant 0x4000CCCCC0000000 (~2.1), truncates back to <16 x half> and
; stores to %y. %index steps by 16 up to 1024 (64 iterations, matching the
; expected `mov.w lr, #64`).
; The assertions expect the work to be emitted as two 8-half groups, each a
; vldrh.u16 load, vcvtb/vcvtt.f32.f16 pair, two vmul.f32, a
; vcvtb/vcvtt.f16.f32 pair and a vstrh.16 store. The first group uses offset
; #16 on both pointers; the second carries the post-increment of #32 bytes.
331define void @both_16(ptr nocapture readonly %x, ptr noalias nocapture %y) {
332; CHECK-LABEL: both_16:
333; CHECK:       @ %bb.0: @ %entry
334; CHECK-NEXT:    .save {r7, lr}
335; CHECK-NEXT:    push {r7, lr}
336; CHECK-NEXT:    mov.w lr, #64
337; CHECK-NEXT:    movw r2, #26214
338; CHECK-NEXT:    movt r2, #16390
339; CHECK-NEXT:  .LBB8_1: @ %vector.body
340; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
341; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
342; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
343; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
344; CHECK-NEXT:    vmul.f32 q1, q1, r2
345; CHECK-NEXT:    vmul.f32 q0, q0, r2
346; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
347; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
348; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
349; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
350; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
351; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
352; CHECK-NEXT:    vmul.f32 q1, q1, r2
353; CHECK-NEXT:    vmul.f32 q0, q0, r2
354; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
355; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
356; CHECK-NEXT:    vstrh.16 q1, [r1], #32
357; CHECK-NEXT:    le lr, .LBB8_1
358; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
359; CHECK-NEXT:    pop {r7, pc}
360entry:
361  br label %vector.body
362
363vector.body:                                      ; preds = %vector.body, %entry
364  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
365  %0 = getelementptr inbounds half, ptr %x, i32 %index
366  %wide.load = load <16 x half>, ptr %0, align 2
  ; Widen, scale in single precision, then narrow again.
367  %1 = fpext <16 x half> %wide.load to <16 x float>
368  %2 = fmul <16 x float> %1, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
369  %3 = fptrunc <16 x float> %2 to <16 x half>
370  %4 = getelementptr inbounds half, ptr %y, i32 %index
371  store <16 x half> %3, ptr %4, align 2
372  %index.next = add i32 %index, 16
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
373  %5 = icmp eq i32 %index.next, 1024
374  br i1 %5, label %for.cond.cleanup, label %vector.body
375
376for.cond.cleanup:                                 ; preds = %vector.body
377  ret void
378}
379
; both_8_I: half -> float -> half round trip on 8 halves per iteration, with
; the even and odd lanes handled as two separate <4 x ...> vectors.
; The <8 x half> load is split by two shufflevectors into even lanes
; (0,2,4,6) and odd lanes (1,3,5,7); each half-vector is extended to
; <4 x float> and multiplied by the splat constant 0x4000CCCCC0000000 (~2.1).
; A third shufflevector re-interleaves the two products (0,4,1,5,2,6,3,7)
; into <8 x float>, which is truncated to <8 x half> and stored to %y.
; %index steps by 8 up to 1024 (128 iterations, matching `mov.w lr, #128`).
; The expected loop body is one vldrh.u16 load, a vcvtb/vcvtt.f32.f16 pair,
; two vmul.f32, a vcvtb/vcvtt.f16.f32 pair and one vstrb.8 store, with no
; separate shuffle instructions.
380define void @both_8_I(ptr nocapture readonly %x, ptr noalias nocapture %y) {
381; CHECK-LABEL: both_8_I:
382; CHECK:       @ %bb.0: @ %entry
383; CHECK-NEXT:    .save {r7, lr}
384; CHECK-NEXT:    push {r7, lr}
385; CHECK-NEXT:    mov.w lr, #128
386; CHECK-NEXT:    movw r2, #26214
387; CHECK-NEXT:    movt r2, #16390
388; CHECK-NEXT:  .LBB9_1: @ %vector.body
389; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
390; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
391; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
392; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
393; CHECK-NEXT:    vmul.f32 q1, q1, r2
394; CHECK-NEXT:    vmul.f32 q0, q0, r2
395; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
396; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
397; CHECK-NEXT:    vstrb.8 q1, [r1], #16
398; CHECK-NEXT:    le lr, .LBB9_1
399; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
400; CHECK-NEXT:    pop {r7, pc}
401entry:
402  br label %vector.body
403
404vector.body:                                      ; preds = %vector.body, %entry
405  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
406  %0 = getelementptr inbounds half, ptr %x, i32 %index
407  %wide.load = load <8 x half>, ptr %0, align 2
  ; De-interleave: %1 takes the even-numbered lanes, %2 the odd-numbered ones.
408  %1 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
409  %2 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
410  %3 = fpext <4 x half> %1 to <4 x float>
411  %4 = fpext <4 x half> %2 to <4 x float>
412  %5 = fmul <4 x float> %3, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
413  %6 = fmul <4 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  ; Re-interleave the even and odd products back into their original order.
414  %7 = shufflevector <4 x float> %5, <4 x float> %6, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
415  %8 = fptrunc <8 x float> %7 to <8 x half>
416  %9 = getelementptr inbounds half, ptr %y, i32 %index
417  store <8 x half> %8, ptr %9, align 2
418  %index.next = add i32 %index, 8
  ; Fixed trip count: leave the loop once 1024 elements have been processed.
419  %10 = icmp eq i32 %index.next, 1024
420  br i1 %10, label %for.cond.cleanup, label %vector.body
421
422for.cond.cleanup:                                 ; preds = %vector.body
423  ret void
424}
425
; both_16_I: half -> float -> half round trip on a <16 x half> vector, with
; the even and odd lanes handled as two separate <8 x ...> vectors.
; The <16 x half> load is split by two shufflevectors into even lanes
; (0,2,...,14) and odd lanes (1,3,...,15); each is extended to <8 x float>
; and multiplied by the splat constant 0x4000CCCCC0000000 (~2.1). A third
; shufflevector re-interleaves the products (0,8,1,9,...,7,15) into
; <16 x float>, which is truncated to <16 x half> and stored to %y.
; The assertions expect two 8-half groups per iteration: the first accesses
; [r0] / [r1] directly, the second uses pre-indexed writeback `#16]!` on both
; pointers, so each pointer advances by 16 bytes (8 halves) per iteration.
; NOTE(review): %index advances by 8 although 16 halves are loaded and stored
; per iteration, so consecutive iterations overlap by 8 elements and the last
; iteration (%index = 1016) touches elements 1016..1031. The expected
; `mov.w lr, #128` and the 16-byte writeback agree with the IR as written;
; confirm whether the step of 8 is intentional before changing it.
426define void @both_16_I(ptr nocapture readonly %x, ptr noalias nocapture %y) {
427; CHECK-LABEL: both_16_I:
428; CHECK:       @ %bb.0: @ %entry
429; CHECK-NEXT:    .save {r7, lr}
430; CHECK-NEXT:    push {r7, lr}
431; CHECK-NEXT:    mov.w lr, #128
432; CHECK-NEXT:    movw r2, #26214
433; CHECK-NEXT:    movt r2, #16390
434; CHECK-NEXT:  .LBB10_1: @ %vector.body
435; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
436; CHECK-NEXT:    vldrh.u16 q0, [r0]
437; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
438; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
439; CHECK-NEXT:    vmul.f32 q1, q1, r2
440; CHECK-NEXT:    vmul.f32 q0, q0, r2
441; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
442; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
443; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]!
444; CHECK-NEXT:    vstrh.16 q1, [r1]
445; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
446; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
447; CHECK-NEXT:    vmul.f32 q1, q1, r2
448; CHECK-NEXT:    vmul.f32 q0, q0, r2
449; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
450; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
451; CHECK-NEXT:    vstrb.8 q1, [r1, #16]!
452; CHECK-NEXT:    le lr, .LBB10_1
453; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
454; CHECK-NEXT:    pop {r7, pc}
455entry:
456  br label %vector.body
457
458vector.body:                                      ; preds = %vector.body, %entry
459  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
460  %0 = getelementptr inbounds half, ptr %x, i32 %index
461  %wide.load = load <16 x half>, ptr %0, align 2
  ; De-interleave: %1 takes the even-numbered lanes, %2 the odd-numbered ones.
462  %1 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
463  %2 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
464  %3 = fpext <8 x half> %1 to <8 x float>
465  %4 = fpext <8 x half> %2 to <8 x float>
466  %5 = fmul <8 x float> %3, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
467  %6 = fmul <8 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  ; Re-interleave the even and odd products back into their original order.
468  %7 = shufflevector <8 x float> %5, <8 x float> %6, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
469  %8 = fptrunc <16 x float> %7 to <16 x half>
470  %9 = getelementptr inbounds half, ptr %y, i32 %index
471  store <16 x half> %8, ptr %9, align 2
  ; Step is 8, not 16 (see the review note in the function header).
472  %index.next = add i32 %index, 8
473  %10 = icmp eq i32 %index.next, 1024
474  br i1 %10, label %for.cond.cleanup, label %vector.body
475
476for.cond.cleanup:                                 ; preds = %vector.body
477  ret void
478}
479
479