xref: /llvm-project/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll (revision e0919b189bf2df4f97f22ba40260ab5153988b14)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
3
; Pair of i16 values passed by pointer as %ptCopySize. Both functions below
; load field 0 directly through the pointer (inner-loop element count) and
; field 1 through a GEP named %iHeight (outer-loop iteration count).
4%struct.arm_2d_size_t = type { i16, i16 }
; Codegen test input: walks a 2-D region of i16 elements and rewrites each one
; in place. The outer loop runs once per row (count = field 1 of %ptCopySize,
; pointer advanced by %iTargetStride elements per row); the inner loop handles
; 8 lanes per iteration under @llvm.get.active.lane.mask bounded by field 0.
; Per lane, three bit-fields of the loaded value are each multiplied by
; (256 - %chRatio), offset by the matching bit-field of %hwColour times
; %chRatio, then shifted, masked and OR-ed back into a single i16.
; The autogenerated assertions expect a dls/le hardware loop with an explicit
; vctp.16 + vpst predicate, q-register spills before the loops, and reloads
; plus q-register vmov copies inside the inner loop.
; NOTE(review): presumably an RGB565 alpha-blended colour fill (going by the
; function name) - confirm against the original C source.
5define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture %phwTargetBase, i16 signext %iTargetStride, ptr noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) {
6; CHECK-LABEL: __arm_2d_impl_rgb16_colour_filling_with_alpha:
7; CHECK:       @ %bb.0: @ %entry
8; CHECK-NEXT:    ldrsh.w r12, [r2, #2]
9; CHECK-NEXT:    cmp.w r12, #1
10; CHECK-NEXT:    it lt
11; CHECK-NEXT:    bxlt lr
12; CHECK-NEXT:  .LBB0_1: @ %for.cond3.preheader.lr.ph
13; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
14; CHECK-NEXT:    sub sp, #4
15; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
16; CHECK-NEXT:    sub sp, #64
17; CHECK-NEXT:    ldrsh.w r7, [r2]
18; CHECK-NEXT:    cmp r7, #1
19; CHECK-NEXT:    blt.w .LBB0_6
20; CHECK-NEXT:  @ %bb.2: @ %for.cond3.preheader.us.preheader
21; CHECK-NEXT:    movs r2, #252
22; CHECK-NEXT:    ldr r4, [sp, #152]
23; CHECK-NEXT:    and.w r6, r2, r3, lsr #3
24; CHECK-NEXT:    movs r2, #120
25; CHECK-NEXT:    and.w r5, r2, r3, lsr #9
26; CHECK-NEXT:    lsls r3, r3, #3
27; CHECK-NEXT:    uxtb r3, r3
28; CHECK-NEXT:    muls r6, r4, r6
29; CHECK-NEXT:    rsb.w r2, r4, #256
30; CHECK-NEXT:    vmov.i16 q2, #0xfc
31; CHECK-NEXT:    mul lr, r5, r4
32; CHECK-NEXT:    vdup.16 q4, r6
33; CHECK-NEXT:    mov.w r6, #2016
34; CHECK-NEXT:    vmov.i16 q6, #0xf8
35; CHECK-NEXT:    mul r5, r3, r4
36; CHECK-NEXT:    adds r3, r7, #7
37; CHECK-NEXT:    bic r3, r3, #7
38; CHECK-NEXT:    vdup.16 q3, lr
39; CHECK-NEXT:    subs r3, #8
40; CHECK-NEXT:    movs r4, #1
41; CHECK-NEXT:    vdup.16 q0, r5
42; CHECK-NEXT:    lsls r1, r1, #1
43; CHECK-NEXT:    add.w r3, r4, r3, lsr #3
44; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
45; CHECK-NEXT:    vmov.i16 q0, #0xf800
46; CHECK-NEXT:    movs r4, #0
47; CHECK-NEXT:    vdup.16 q5, r6
48; CHECK-NEXT:    vmov.i16 q7, #0x78
49; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
50; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
51; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
52; CHECK-NEXT:  .LBB0_3: @ %vector.ph
53; CHECK-NEXT:    @ =>This Loop Header: Depth=1
54; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
55; CHECK-NEXT:    mov r5, r0
56; CHECK-NEXT:    mov r6, r7
57; CHECK-NEXT:    dls lr, r3
58; CHECK-NEXT:  .LBB0_4: @ %vector.body
59; CHECK-NEXT:    @ Parent Loop BB0_3 Depth=1
60; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
61; CHECK-NEXT:    vctp.16 r6
62; CHECK-NEXT:    subs r6, #8
63; CHECK-NEXT:    vpst
64; CHECK-NEXT:    vldrht.u16 q0, [r5]
65; CHECK-NEXT:    vshr.u16 q1, q0, #3
66; CHECK-NEXT:    vand q1, q1, q2
67; CHECK-NEXT:    vmov q2, q4
68; CHECK-NEXT:    vmla.i16 q2, q1, r2
69; CHECK-NEXT:    vshr.u16 q1, q2, #5
70; CHECK-NEXT:    vshl.i16 q2, q0, #3
71; CHECK-NEXT:    vand q3, q1, q5
72; CHECK-NEXT:    vmov q1, q7
73; CHECK-NEXT:    vand q2, q2, q6
74; CHECK-NEXT:    vmov q7, q6
75; CHECK-NEXT:    vmov q6, q5
76; CHECK-NEXT:    vmov q5, q4
77; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
78; CHECK-NEXT:    vshr.u16 q0, q0, #9
79; CHECK-NEXT:    vmla.i16 q4, q2, r2
80; CHECK-NEXT:    vshr.u16 q2, q4, #11
81; CHECK-NEXT:    vmov q4, q5
82; CHECK-NEXT:    vmov q5, q6
83; CHECK-NEXT:    vmov q6, q7
84; CHECK-NEXT:    vmov q7, q1
85; CHECK-NEXT:    vorr q1, q3, q2
86; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
87; CHECK-NEXT:    vand q0, q0, q7
88; CHECK-NEXT:    vmla.i16 q2, q0, r2
89; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
90; CHECK-NEXT:    vand q0, q2, q0
91; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
92; CHECK-NEXT:    vorr q0, q1, q0
93; CHECK-NEXT:    vpst
94; CHECK-NEXT:    vstrht.16 q0, [r5], #16
95; CHECK-NEXT:    le lr, .LBB0_4
96; CHECK-NEXT:  @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
97; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
98; CHECK-NEXT:    adds r4, #1
99; CHECK-NEXT:    add r0, r1
100; CHECK-NEXT:    cmp r4, r12
101; CHECK-NEXT:    bne .LBB0_3
102; CHECK-NEXT:  .LBB0_6:
103; CHECK-NEXT:    add sp, #64
104; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
105; CHECK-NEXT:    add sp, #4
106; CHECK-NEXT:    pop.w {r4, r5, r6, r7, lr}
107; CHECK-NEXT:    bx lr
; Entry: load field 1 of %ptCopySize, derive the three masked bit-fields of
; %hwColour and the (256 - %chRatio) factor, and return early when field 1 is
; not positive.
108entry:
109  %iHeight = getelementptr inbounds %struct.arm_2d_size_t, ptr %ptCopySize, i32 0, i32 1
110  %0 = load i16, ptr %iHeight, align 2
111  %conv1 = sext i16 %0 to i32
112  %and.i = shl i16 %hwColour, 3
113  %shl.i = and i16 %and.i, 248
114  %1 = lshr i16 %hwColour, 9
115  %shl4.i = and i16 %1, 120
116  %2 = lshr i16 %hwColour, 3
117  %3 = and i16 %2, 252
118  %4 = trunc i32 %chRatio to i16
119  %5 = sub i16 256, %4
120  %conv30 = sext i16 %iTargetStride to i32
121  %cmp61 = icmp sgt i16 %0, 0
122  br i1 %cmp61, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
123
; Load field 0 (the per-row element count) and return early when it is not
; positive.
124for.cond3.preheader.lr.ph:                        ; preds = %entry
125  %6 = load i16, ptr %ptCopySize, align 2
126  %conv4 = sext i16 %6 to i32
127  %cmp558 = icmp sgt i16 %6, 0
128  br i1 %cmp558, label %for.cond3.preheader.us.preheader, label %for.cond.cleanup
129
; Loop-invariant setup: each %hwColour bit-field times %chRatio, the element
; count rounded up to a multiple of 8 (%n.vec), and the <8 x i16> splats that
; the inner loop consumes.
130for.cond3.preheader.us.preheader:                 ; preds = %for.cond3.preheader.lr.ph
131  %conv15.us = mul i16 %shl.i, %4
132  %conv15.us.1 = mul i16 %3, %4
133  %conv15.us.2 = mul i16 %shl4.i, %4
134  %n.rnd.up = add nsw i32 %conv4, 7
135  %n.vec = and i32 %n.rnd.up, -8
136  %broadcast.splatinsert75 = insertelement <8 x i16> poison, i16 %5, i32 0
137  %broadcast.splat76 = shufflevector <8 x i16> %broadcast.splatinsert75, <8 x i16> poison, <8 x i32> zeroinitializer
138  %broadcast.splatinsert77 = insertelement <8 x i16> poison, i16 %conv15.us, i32 0
139  %broadcast.splat78 = shufflevector <8 x i16> %broadcast.splatinsert77, <8 x i16> poison, <8 x i32> zeroinitializer
140  %broadcast.splatinsert79 = insertelement <8 x i16> poison, i16 %conv15.us.1, i32 0
141  %broadcast.splat80 = shufflevector <8 x i16> %broadcast.splatinsert79, <8 x i16> poison, <8 x i32> zeroinitializer
142  %broadcast.splatinsert81 = insertelement <8 x i16> poison, i16 %conv15.us.2, i32 0
143  %broadcast.splat82 = shufflevector <8 x i16> %broadcast.splatinsert81, <8 x i16> poison, <8 x i32> zeroinitializer
144  br label %vector.ph
145
; Outer-loop header: carries the current row pointer and the row counter.
146vector.ph:                                        ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.us.preheader
147  %phwTargetBase.addr.063.us = phi ptr [ %add.ptr.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ %phwTargetBase, %for.cond3.preheader.us.preheader ]
148  %y.062.us = phi i32 [ %inc32.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond3.preheader.us.preheader ]
149  br label %vector.body
150
; Inner loop: 8 x i16 per iteration. Masked load, per-field multiply-add and
; recombine, then masked store to the same address under the same lane mask.
; Exits once %index.next reaches %n.vec.
151vector.body:                                      ; preds = %vector.body, %vector.ph
152  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
153  %next.gep = getelementptr i16, ptr %phwTargetBase.addr.063.us, i32 %index
154  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %conv4)
155  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
156  %7 = shl <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
157  %8 = and <8 x i16> %7, <i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248>
158  %9 = lshr <8 x i16> %wide.masked.load, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
159  %10 = and <8 x i16> %9, <i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120>
160  %11 = lshr <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
161  %12 = and <8 x i16> %11, <i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252>
162  %13 = mul <8 x i16> %8, %broadcast.splat76
163  %14 = add <8 x i16> %13, %broadcast.splat78
164  %15 = lshr <8 x i16> %14, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
165  %16 = mul <8 x i16> %12, %broadcast.splat76
166  %17 = add <8 x i16> %16, %broadcast.splat80
167  %18 = mul <8 x i16> %10, %broadcast.splat76
168  %19 = add <8 x i16> %18, %broadcast.splat82
169  %20 = lshr <8 x i16> %17, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
170  %21 = and <8 x i16> %20, <i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016>
171  %22 = or <8 x i16> %21, %15
172  %23 = and <8 x i16> %19, <i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048>
173  %24 = or <8 x i16> %22, %23
174  call void @llvm.masked.store.v8i16.p0(<8 x i16> %24, ptr %next.gep, i32 2, <8 x i1> %active.lane.mask)
175  %index.next = add i32 %index, 8
176  %25 = icmp eq i32 %index.next, %n.vec
177  br i1 %25, label %for.cond3.for.cond.cleanup7_crit_edge.us, label %vector.body
178
; Outer-loop latch: step the row pointer by %conv30 elements and repeat until
; the row counter equals %conv1.
179for.cond3.for.cond.cleanup7_crit_edge.us:         ; preds = %vector.body
180  %add.ptr.us = getelementptr inbounds i16, ptr %phwTargetBase.addr.063.us, i32 %conv30
181  %inc32.us = add nuw nsw i32 %y.062.us, 1
182  %exitcond66.not = icmp eq i32 %inc32.us, %conv1
183  br i1 %exitcond66.not, label %for.cond.cleanup, label %vector.ph
184
185for.cond.cleanup:                                 ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.lr.ph, %entry
186  ret void
187}
; Same IR body as @__arm_2d_impl_rgb16_colour_filling_with_alpha; the only
; difference in the input is the "target-cpu"="cortex-m55" function attribute.
; The autogenerated assertions expect a different shape of output here: a
; dlstp.16/letp hardware loop with no vctp/vpst in the inner loop, stack
; reloads inside the loop, and d4/d5 copied to d8/d9 and back with vmov.f64
; pairs on every iteration.
; NOTE(review): presumably this variant exists to cover the CPU-specific
; scheduling path (going by the _sched suffix) - confirm with the commit that
; added it.
188define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias nocapture %phwTargetBase, i16 signext %iTargetStride, ptr noalias nocapture readonly %ptCopySize, i16 zeroext %hwColour, i32 %chRatio) "target-cpu"="cortex-m55" {
189; CHECK-LABEL: __arm_2d_impl_rgb16_colour_filling_with_alpha_sched:
190; CHECK:       @ %bb.0: @ %entry
191; CHECK-NEXT:    ldrsh.w r12, [r2, #2]
192; CHECK-NEXT:    cmp.w r12, #1
193; CHECK-NEXT:    blt.w .LBB1_7
194; CHECK-NEXT:  @ %bb.1: @ %for.cond3.preheader.lr.ph
195; CHECK-NEXT:    ldrsh.w r2, [r2]
196; CHECK-NEXT:    cmp r2, #1
197; CHECK-NEXT:    it lt
198; CHECK-NEXT:    bxlt lr
199; CHECK-NEXT:  .LBB1_2: @ %for.cond3.preheader.us.preheader
200; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
201; CHECK-NEXT:    sub sp, #4
202; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
203; CHECK-NEXT:    sub sp, #80
204; CHECK-NEXT:    ldr r7, [sp, #168]
205; CHECK-NEXT:    movs r5, #120
206; CHECK-NEXT:    lsls r6, r3, #3
207; CHECK-NEXT:    movs r4, #252
208; CHECK-NEXT:    and.w r5, r5, r3, lsr #9
209; CHECK-NEXT:    uxtb r6, r6
210; CHECK-NEXT:    and.w r3, r4, r3, lsr #3
211; CHECK-NEXT:    muls r6, r7, r6
212; CHECK-NEXT:    mul lr, r3, r7
213; CHECK-NEXT:    vdup.16 q0, r6
214; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
215; CHECK-NEXT:    vdup.16 q0, lr
216; CHECK-NEXT:    muls r5, r7, r5
217; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
218; CHECK-NEXT:    vmov.i16 q0, #0xfc
219; CHECK-NEXT:    mov.w r6, #2016
220; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
221; CHECK-NEXT:    vdup.16 q0, r5
222; CHECK-NEXT:    rsb.w r3, r7, #256
223; CHECK-NEXT:    lsls r7, r1, #1
224; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
225; CHECK-NEXT:    vdup.16 q0, r6
226; CHECK-NEXT:    vmov.i16 q2, #0xf8
227; CHECK-NEXT:    vmov.i16 q5, #0x78
228; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
229; CHECK-NEXT:    vmov.i16 q6, #0xf800
230; CHECK-NEXT:    movs r4, #0
231; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
232; CHECK-NEXT:    .p2align 2
233; CHECK-NEXT:  .LBB1_3: @ %vector.ph
234; CHECK-NEXT:    @ =>This Loop Header: Depth=1
235; CHECK-NEXT:    @ Child Loop BB1_4 Depth 2
236; CHECK-NEXT:    mov r5, r0
237; CHECK-NEXT:    dlstp.16 lr, r2
238; CHECK-NEXT:    .p2align 2
239; CHECK-NEXT:  .LBB1_4: @ %vector.body
240; CHECK-NEXT:    @ Parent Loop BB1_3 Depth=1
241; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
242; CHECK-NEXT:    vldrh.u16 q0, [r5]
243; CHECK-NEXT:    vshl.i16 q1, q0, #3
244; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
245; CHECK-NEXT:    vand q1, q1, q2
246; CHECK-NEXT:    vmla.i16 q3, q1, r3
247; CHECK-NEXT:    vmov.f64 d8, d4
248; CHECK-NEXT:    vmov.f64 d9, d5
249; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
250; CHECK-NEXT:    vshr.u16 q2, q0, #9
251; CHECK-NEXT:    vshr.u16 q0, q0, #3
252; CHECK-NEXT:    vand q0, q0, q1
253; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
254; CHECK-NEXT:    vmla.i16 q1, q0, r3
255; CHECK-NEXT:    vand q2, q2, q5
256; CHECK-NEXT:    vshr.u16 q0, q3, #11
257; CHECK-NEXT:    vldrw.u32 q3, [sp, #16] @ 16-byte Reload
258; CHECK-NEXT:    vshr.u16 q1, q1, #5
259; CHECK-NEXT:    vmla.i16 q3, q2, r3
260; CHECK-NEXT:    vand q1, q1, q7
261; CHECK-NEXT:    vorr q0, q1, q0
262; CHECK-NEXT:    vand q1, q3, q6
263; CHECK-NEXT:    vorr q0, q0, q1
264; CHECK-NEXT:    vstrh.16 q0, [r5], #16
265; CHECK-NEXT:    vmov.f64 d4, d8
266; CHECK-NEXT:    vmov.f64 d5, d9
267; CHECK-NEXT:    letp lr, .LBB1_4
268; CHECK-NEXT:  @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
269; CHECK-NEXT:    @ in Loop: Header=BB1_3 Depth=1
270; CHECK-NEXT:    adds r4, #1
271; CHECK-NEXT:    add r0, r7
272; CHECK-NEXT:    cmp r4, r12
273; CHECK-NEXT:    bne .LBB1_3
274; CHECK-NEXT:  @ %bb.6:
275; CHECK-NEXT:    add sp, #80
276; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
277; CHECK-NEXT:    add sp, #4
278; CHECK-NEXT:    pop.w {r4, r5, r6, r7, lr}
279; CHECK-NEXT:  .LBB1_7: @ %for.cond.cleanup
280; CHECK-NEXT:    bx lr
; Entry: read field 1 of %ptCopySize (outer trip count), precompute the masked
; bit-fields of %hwColour and the (256 - %chRatio) multiplier, and bail out if
; field 1 is zero or negative.
281entry:
282  %iHeight = getelementptr inbounds %struct.arm_2d_size_t, ptr %ptCopySize, i32 0, i32 1
283  %0 = load i16, ptr %iHeight, align 2
284  %conv1 = sext i16 %0 to i32
285  %and.i = shl i16 %hwColour, 3
286  %shl.i = and i16 %and.i, 248
287  %1 = lshr i16 %hwColour, 9
288  %shl4.i = and i16 %1, 120
289  %2 = lshr i16 %hwColour, 3
290  %3 = and i16 %2, 252
291  %4 = trunc i32 %chRatio to i16
292  %5 = sub i16 256, %4
293  %conv30 = sext i16 %iTargetStride to i32
294  %cmp61 = icmp sgt i16 %0, 0
295  br i1 %cmp61, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
296
; Read field 0 (elements per row) and bail out if it is zero or negative.
297for.cond3.preheader.lr.ph:                        ; preds = %entry
298  %6 = load i16, ptr %ptCopySize, align 2
299  %conv4 = sext i16 %6 to i32
300  %cmp558 = icmp sgt i16 %6, 0
301  br i1 %cmp558, label %for.cond3.preheader.us.preheader, label %for.cond.cleanup
302
; Values that stay constant across both loops: the three colour-field products,
; %n.vec (element count rounded up to a multiple of 8) and the four splats.
303for.cond3.preheader.us.preheader:                 ; preds = %for.cond3.preheader.lr.ph
304  %conv15.us = mul i16 %shl.i, %4
305  %conv15.us.1 = mul i16 %3, %4
306  %conv15.us.2 = mul i16 %shl4.i, %4
307  %n.rnd.up = add nsw i32 %conv4, 7
308  %n.vec = and i32 %n.rnd.up, -8
309  %broadcast.splatinsert75 = insertelement <8 x i16> poison, i16 %5, i32 0
310  %broadcast.splat76 = shufflevector <8 x i16> %broadcast.splatinsert75, <8 x i16> poison, <8 x i32> zeroinitializer
311  %broadcast.splatinsert77 = insertelement <8 x i16> poison, i16 %conv15.us, i32 0
312  %broadcast.splat78 = shufflevector <8 x i16> %broadcast.splatinsert77, <8 x i16> poison, <8 x i32> zeroinitializer
313  %broadcast.splatinsert79 = insertelement <8 x i16> poison, i16 %conv15.us.1, i32 0
314  %broadcast.splat80 = shufflevector <8 x i16> %broadcast.splatinsert79, <8 x i16> poison, <8 x i32> zeroinitializer
315  %broadcast.splatinsert81 = insertelement <8 x i16> poison, i16 %conv15.us.2, i32 0
316  %broadcast.splat82 = shufflevector <8 x i16> %broadcast.splatinsert81, <8 x i16> poison, <8 x i32> zeroinitializer
317  br label %vector.ph
318
; Outer-loop header: phis for the current row pointer and the row index.
319vector.ph:                                        ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.us.preheader
320  %phwTargetBase.addr.063.us = phi ptr [ %add.ptr.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ %phwTargetBase, %for.cond3.preheader.us.preheader ]
321  %y.062.us = phi i32 [ %inc32.us, %for.cond3.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond3.preheader.us.preheader ]
322  br label %vector.body
323
; Inner loop over one row, 8 lanes at a time: lane-masked load, three
; multiply-add chains recombined with shifts/ANDs/ORs, lane-masked store back
; to the address that was loaded. Terminates when %index.next == %n.vec.
324vector.body:                                      ; preds = %vector.body, %vector.ph
325  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
326  %next.gep = getelementptr i16, ptr %phwTargetBase.addr.063.us, i32 %index
327  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %conv4)
328  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
329  %7 = shl <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
330  %8 = and <8 x i16> %7, <i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248, i16 248>
331  %9 = lshr <8 x i16> %wide.masked.load, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
332  %10 = and <8 x i16> %9, <i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120, i16 120>
333  %11 = lshr <8 x i16> %wide.masked.load, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
334  %12 = and <8 x i16> %11, <i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252, i16 252>
335  %13 = mul <8 x i16> %8, %broadcast.splat76
336  %14 = add <8 x i16> %13, %broadcast.splat78
337  %15 = lshr <8 x i16> %14, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
338  %16 = mul <8 x i16> %12, %broadcast.splat76
339  %17 = add <8 x i16> %16, %broadcast.splat80
340  %18 = mul <8 x i16> %10, %broadcast.splat76
341  %19 = add <8 x i16> %18, %broadcast.splat82
342  %20 = lshr <8 x i16> %17, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
343  %21 = and <8 x i16> %20, <i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016, i16 2016>
344  %22 = or <8 x i16> %21, %15
345  %23 = and <8 x i16> %19, <i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048, i16 -2048>
346  %24 = or <8 x i16> %22, %23
347  call void @llvm.masked.store.v8i16.p0(<8 x i16> %24, ptr %next.gep, i32 2, <8 x i1> %active.lane.mask)
348  %index.next = add i32 %index, 8
349  %25 = icmp eq i32 %index.next, %n.vec
350  br i1 %25, label %for.cond3.for.cond.cleanup7_crit_edge.us, label %vector.body
351
; Outer-loop latch: move the row pointer forward by %conv30 elements and loop
; until %inc32.us equals %conv1.
352for.cond3.for.cond.cleanup7_crit_edge.us:         ; preds = %vector.body
353  %add.ptr.us = getelementptr inbounds i16, ptr %phwTargetBase.addr.063.us, i32 %conv30
354  %inc32.us = add nuw nsw i32 %y.062.us, 1
355  %exitcond66.not = icmp eq i32 %inc32.us, %conv1
356  br i1 %exitcond66.not, label %for.cond.cleanup, label %vector.ph
357
358for.cond.cleanup:                                 ; preds = %for.cond3.for.cond.cleanup7_crit_edge.us, %for.cond3.preheader.lr.ph, %entry
359  ret void
360}
361
; Intrinsics called from %vector.body in both functions above: the lane-mask
; generator, and the masked <8 x i16> load and store that consume that mask.
362declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
363declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>) #2
364declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>) #3
365