; xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll (revision b5b663aac17415625340eb29c8010832bfc4c21c)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

; Masked <4 x i32> load from %x + 4 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen folds the offset into the predicated load: [r0, #4].
define ptr @ldrwu32_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwu32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 4
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef)
  store <4 x i32> %0, ptr %y, align 4
  ret ptr %x
}
21
; Masked <4 x i32> load from %x + 3 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen forms the address first (adds r3, r0, #3) and loads from [r3];
; presumably #3 is not encodable for a word load - TODO confirm against the MVE ISA.
define ptr @ldrwu32_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwu32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 3
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef)
  store <4 x i32> %0, ptr %y, align 4
  ret ptr %x
}
39
; Masked <4 x i32> load from %x + 2 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen forms the address first (adds r3, r0, #2) and loads from [r3];
; presumably #2 is not encodable for a word load - TODO confirm against the MVE ISA.
define ptr @ldrwu32_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwu32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #2
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 2
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef)
  store <4 x i32> %0, ptr %y, align 4
  ret ptr %x
}
57
; Masked <4 x i32> load from %x + 508 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen folds the offset into the predicated load: [r0, #508].
define ptr @ldrwu32_508(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwu32_508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0, #508]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 508
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef)
  store <4 x i32> %0, ptr %y, align 4
  ret ptr %x
}
74
; Masked <4 x i32> load from %x + 512 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen forms the address first (add.w r3, r0, #512) and loads from [r3];
; presumably #512 is outside the word-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrwu32_512(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwu32_512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #512
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 512
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef)
  store <4 x i32> %0, ptr %y, align 4
  ret ptr %x
}
92
; Masked <4 x i32> load from %x - 508 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen folds the offset into the predicated load: [r0, #-508].
define ptr @ldrwu32_m508(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwu32_m508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0, #-508]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -508
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef)
  store <4 x i32> %0, ptr %y, align 4
  ret ptr %x
}
109
; Masked <4 x i32> load from %x - 512 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen forms the address first (sub.w r3, r0, #512) and loads from [r3];
; presumably #-512 is outside the word-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrwu32_m512(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwu32_m512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #512
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -512
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %z, i32 4, <4 x i1> %c, <4 x i32> undef)
  store <4 x i32> %0, ptr %y, align 4
  ret ptr %x
}
127
; Masked <4 x i16> load from %x + 4 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #4].
define ptr @ldrhu32_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 4
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = zext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
145
; Masked <4 x i16> load from %x + 3 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (adds r3, r0, #3) and loads from [r3];
; presumably #3 is not encodable for a halfword load - TODO confirm against the MVE ISA.
define ptr @ldrhu32_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 3
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = zext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
164
; Masked <4 x i16> load from %x + 2 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #2].
define ptr @ldrhu32_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0, #2]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 2
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = zext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
182
; Masked <4 x i16> load from %x + 254 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #254].
define ptr @ldrhu32_254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu32_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0, #254]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 254
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = zext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
200
; Masked <4 x i16> load from %x + 256 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (add.w r3, r0, #256) and loads from [r3];
; presumably #256 is outside the halfword-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrhu32_256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu32_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 256
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = zext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
219
; Masked <4 x i16> load from %x - 254 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #-254].
define ptr @ldrhu32_m254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu32_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r0, #-254]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -254
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = zext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
237
; Masked <4 x i16> load from %x - 256 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (sub.w r3, r0, #256) and loads from [r3];
; presumably #-256 is outside the halfword-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrhu32_m256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu32_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -256
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = zext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
256
; Masked <4 x i16> load from %x + 4 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #4].
define ptr @ldrhs32_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhs32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 4
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = sext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
274
; Masked <4 x i16> load from %x + 3 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (adds r3, r0, #3) and loads from [r3];
; presumably #3 is not encodable for a halfword load - TODO confirm against the MVE ISA.
define ptr @ldrhs32_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhs32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 3
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = sext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
293
; Masked <4 x i16> load from %x + 2 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #2].
define ptr @ldrhs32_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhs32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0, #2]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 2
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = sext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
311
; Masked <4 x i16> load from %x + 254 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #254].
define ptr @ldrhs32_254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhs32_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0, #254]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 254
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = sext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
329
; Masked <4 x i16> load from %x + 256 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (add.w r3, r0, #256) and loads from [r3];
; presumably #256 is outside the halfword-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrhs32_256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhs32_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 256
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = sext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
348
; Masked <4 x i16> load from %x - 254 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #-254].
define ptr @ldrhs32_m254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhs32_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r0, #-254]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -254
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = sext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
366
; Masked <4 x i16> load from %x - 256 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (sub.w r3, r0, #256) and loads from [r3];
; presumably #-256 is outside the halfword-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrhs32_m256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhs32_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrht.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -256
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %z, i32 2, <4 x i1> %c, <4 x i16> undef)
  %1 = sext <4 x i16> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
385
; Masked <8 x i16> load from %x + 4 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen folds the offset into the predicated load: [r0, #4].
define ptr @ldrhu16_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #4]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 4
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef)
  store <8 x i16> %0, ptr %y, align 2
  ret ptr %x
}
402
; Masked <8 x i16> load from %x + 3 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen forms the address first (adds r3, r0, #3) and loads from [r3];
; presumably #3 is not encodable for a halfword load - TODO confirm against the MVE ISA.
define ptr @ldrhu16_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 3
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef)
  store <8 x i16> %0, ptr %y, align 2
  ret ptr %x
}
420
; Masked <8 x i16> load from %x + 2 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen folds the offset into the predicated load: [r0, #2].
define ptr @ldrhu16_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu16_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #2]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 2
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef)
  store <8 x i16> %0, ptr %y, align 2
  ret ptr %x
}
437
; Masked <8 x i16> load from %x + 254 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen folds the offset into the predicated load: [r0, #254].
define ptr @ldrhu16_254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu16_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #254]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 254
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef)
  store <8 x i16> %0, ptr %y, align 2
  ret ptr %x
}
454
; Masked <8 x i16> load from %x + 256 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen forms the address first (add.w r3, r0, #256) and loads from [r3];
; presumably #256 is outside the halfword-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrhu16_256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu16_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #256
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 256
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef)
  store <8 x i16> %0, ptr %y, align 2
  ret ptr %x
}
472
; Masked <8 x i16> load from %x - 254 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen folds the offset into the predicated load: [r0, #-254].
define ptr @ldrhu16_m254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu16_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #-254]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -254
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef)
  store <8 x i16> %0, ptr %y, align 2
  ret ptr %x
}
489
; Masked <8 x i16> load from %x - 256 bytes under the lanes of %m that are non-zero;
; the loaded vector is stored to %y and %x is returned unchanged.
; Expected codegen forms the address first (sub.w r3, r0, #256) and loads from [r3];
; presumably #-256 is outside the halfword-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrhu16_m256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhu16_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #256
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -256
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %z, i32 2, <8 x i1> %c, <8 x i16> undef)
  store <8 x i16> %0, ptr %y, align 2
  ret ptr %x
}
507
; Masked <4 x i8> load from %x + 4 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #4].
define ptr @ldrbu32_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 4
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = zext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
525
; Masked <4 x i8> load from %x + 3 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #3].
define ptr @ldrbu32_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 3
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = zext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
543
; Masked <4 x i8> load from %x + 2 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #2].
define ptr @ldrbu32_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #2]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 2
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = zext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
561
; Masked <4 x i8> load from %x + 127 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #127].
define ptr @ldrbu32_127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu32_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #127]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 127
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = zext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
579
; Masked <4 x i8> load from %x + 128 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (add.w r3, r0, #128) and loads from [r3];
; presumably #128 is outside the byte-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrbu32_128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu32_128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #128
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 128
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = zext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
598
; Masked <4 x i8> load from %x - 127 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #-127].
define ptr @ldrbu32_m127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu32_m127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r0, #-127]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -127
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = zext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
616
; Masked <4 x i8> load from %x - 128 bytes under the lanes of %m that are non-zero,
; zero-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (sub.w r3, r0, #128) and loads from [r3];
; presumably #-128 is outside the byte-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrbu32_m128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu32_m128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #128
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -128
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = zext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
635
; Masked <4 x i8> load from %x + 4 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #4].
define ptr @ldrbs32_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 4
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = sext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
653
; Masked <4 x i8> load from %x + 3 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #3].
define ptr @ldrbs32_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 3
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = sext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
671
; Masked <4 x i8> load from %x + 2 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #2].
define ptr @ldrbs32_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #2]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 2
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = sext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
689
; Masked <4 x i8> load from %x + 127 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #127].
define ptr @ldrbs32_127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs32_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #127]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 127
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = sext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
707
; Masked <4 x i8> load from %x + 128 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (add.w r3, r0, #128) and loads from [r3];
; presumably #128 is outside the byte-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrbs32_128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs32_128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #128
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 128
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = sext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
726
; Masked <4 x i8> load from %x - 127 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #-127].
define ptr @ldrbs32_m127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs32_m127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r0, #-127]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -127
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = sext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
744
; Masked <4 x i8> load from %x - 128 bytes under the lanes of %m that are non-zero,
; sign-extended to <4 x i32> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (sub.w r3, r0, #128) and loads from [r3];
; presumably #-128 is outside the byte-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrbs32_m128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs32_m128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #128
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrbt.s32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -128
  %mask = load <4 x i32>, ptr %m, align 4
  %c = icmp ne <4 x i32> %mask, zeroinitializer
  %0 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %z, i32 1, <4 x i1> %c, <4 x i8> undef)
  %1 = sext <4 x i8> %0 to <4 x i32>
  store <4 x i32> %1, ptr %y, align 4
  ret ptr %x
}
763
; Masked <8 x i8> load from %x + 4 bytes under the lanes of %m that are non-zero,
; zero-extended to <8 x i16> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #4].
define ptr @ldrbu16_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #4]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 4
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef)
  %1 = zext <8 x i8> %0 to <8 x i16>
  store <8 x i16> %1, ptr %y, align 2
  ret ptr %x
}
781
; Masked <8 x i8> load from %x + 3 bytes under the lanes of %m that are non-zero,
; zero-extended to <8 x i16> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #3].
define ptr @ldrbu16_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 3
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef)
  %1 = zext <8 x i8> %0 to <8 x i16>
  store <8 x i16> %1, ptr %y, align 2
  ret ptr %x
}
799
; Masked <8 x i8> load from %x + 2 bytes under the lanes of %m that are non-zero,
; zero-extended to <8 x i16> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #2].
define ptr @ldrbu16_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu16_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #2]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 2
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef)
  %1 = zext <8 x i8> %0 to <8 x i16>
  store <8 x i16> %1, ptr %y, align 2
  ret ptr %x
}
817
; Masked <8 x i8> load from %x + 127 bytes under the lanes of %m that are non-zero,
; zero-extended to <8 x i16> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #127].
define ptr @ldrbu16_127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu16_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #127]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 127
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef)
  %1 = zext <8 x i8> %0 to <8 x i16>
  store <8 x i16> %1, ptr %y, align 2
  ret ptr %x
}
835
; Masked <8 x i8> load from %x + 128 bytes under the lanes of %m that are non-zero,
; zero-extended to <8 x i16> and stored to %y; %x is returned unchanged.
; Expected codegen forms the address first (add.w r3, r0, #128) and loads from [r3];
; presumably #128 is outside the byte-load immediate range - TODO confirm against the MVE ISA.
define ptr @ldrbu16_128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu16_128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #128
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 128
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef)
  %1 = zext <8 x i8> %0 to <8 x i16>
  store <8 x i16> %1, ptr %y, align 2
  ret ptr %x
}
854
; Masked <8 x i8> load from %x - 127 bytes under the lanes of %m that are non-zero,
; zero-extended to <8 x i16> and stored to %y; %x is returned unchanged.
; Expected codegen folds the offset into the widening predicated load: [r0, #-127].
define ptr @ldrbu16_m127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu16_m127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r0, #-127]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, ptr %x, i32 -127
  %mask = load <8 x i16>, ptr %m, align 2
  %c = icmp ne <8 x i16> %mask, zeroinitializer
  %0 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %z, i32 1, <8 x i1> %c, <8 x i8> undef)
  %1 = zext <8 x i8> %0 to <8 x i16>
  store <8 x i16> %1, ptr %y, align 2
  ret ptr %x
}
872
; Masked byte load, zero-extended to <8 x i16>, at byte offset -128 from %x.
; Expected output computes the address with a separate sub.w; the predicated
; vldrbt.u16 then uses no immediate offset.
define ptr @ldrbu16_m128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu16_m128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #128
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 -128
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %addr, i32 1, <8 x i1> %pred, <8 x i8> undef)
  %wide = zext <8 x i8> %loaded to <8 x i16>
  store <8 x i16> %wide, ptr %y, align 2
  ret ptr %x
}
891
; Masked byte load, sign-extended to <8 x i16>, at byte offset 4 from %x.
; Expected output folds the offset into the predicated vldrbt.s16 as an immediate.
define ptr @ldrbs16_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.s16 q0, [r0, #4]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 4
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %addr, i32 1, <8 x i1> %pred, <8 x i8> undef)
  %wide = sext <8 x i8> %loaded to <8 x i16>
  store <8 x i16> %wide, ptr %y, align 2
  ret ptr %x
}
909
; Masked byte load, sign-extended to <8 x i16>, at byte offset 3 from %x.
; Expected output folds the offset into the predicated vldrbt.s16 as an immediate.
define ptr @ldrbs16_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.s16 q0, [r0, #3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 3
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %addr, i32 1, <8 x i1> %pred, <8 x i8> undef)
  %wide = sext <8 x i8> %loaded to <8 x i16>
  store <8 x i16> %wide, ptr %y, align 2
  ret ptr %x
}
927
; Masked byte load, sign-extended to <8 x i16>, at byte offset 2 from %x.
; Expected output folds the offset into the predicated vldrbt.s16 as an immediate.
define ptr @ldrbs16_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs16_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.s16 q0, [r0, #2]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 2
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %addr, i32 1, <8 x i1> %pred, <8 x i8> undef)
  %wide = sext <8 x i8> %loaded to <8 x i16>
  store <8 x i16> %wide, ptr %y, align 2
  ret ptr %x
}
945
; Masked byte load, sign-extended to <8 x i16>, at byte offset 127 from %x.
; Expected output folds the offset into the predicated vldrbt.s16 as an immediate.
define ptr @ldrbs16_127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs16_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.s16 q0, [r0, #127]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 127
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %addr, i32 1, <8 x i1> %pred, <8 x i8> undef)
  %wide = sext <8 x i8> %loaded to <8 x i16>
  store <8 x i16> %wide, ptr %y, align 2
  ret ptr %x
}
963
; Masked byte load, sign-extended to <8 x i16>, at byte offset 128 from %x.
; Expected output computes the address with a separate add.w; the predicated
; vldrbt.s16 then uses no immediate offset.
define ptr @ldrbs16_128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs16_128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #128
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.s16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 128
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %addr, i32 1, <8 x i1> %pred, <8 x i8> undef)
  %wide = sext <8 x i8> %loaded to <8 x i16>
  store <8 x i16> %wide, ptr %y, align 2
  ret ptr %x
}
982
; Masked byte load, sign-extended to <8 x i16>, at byte offset -127 from %x.
; Expected output folds the negative offset into the predicated vldrbt.s16.
define ptr @ldrbs16_m127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs16_m127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.s16 q0, [r0, #-127]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 -127
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %addr, i32 1, <8 x i1> %pred, <8 x i8> undef)
  %wide = sext <8 x i8> %loaded to <8 x i16>
  store <8 x i16> %wide, ptr %y, align 2
  ret ptr %x
}
1000
; Masked byte load, sign-extended to <8 x i16>, at byte offset -128 from %x.
; Expected output computes the address with a separate sub.w; the predicated
; vldrbt.s16 then uses no immediate offset.
define ptr @ldrbs16_m128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbs16_m128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #128
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrbt.s16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 -128
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %addr, i32 1, <8 x i1> %pred, <8 x i8> undef)
  %wide = sext <8 x i8> %loaded to <8 x i16>
  store <8 x i16> %wide, ptr %y, align 2
  ret ptr %x
}
1019
; Masked <16 x i8> load at byte offset 4 from %x, with no extension.
; Expected output folds the offset into the predicated vldrbt.u8 as an immediate.
define ptr @ldrbu8_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu8_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r2]
; CHECK-NEXT:    vpt.i8 ne, q0, zr
; CHECK-NEXT:    vldrbt.u8 q0, [r0, #4]
; CHECK-NEXT:    vstrb.8 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 4
  %maskbits = load <16 x i8>, ptr %m, align 1
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <16 x i8> %maskbits, zeroinitializer
  %loaded = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 1, <16 x i1> %pred, <16 x i8> undef)
  store <16 x i8> %loaded, ptr %y, align 1
  ret ptr %x
}
1036
; Masked <16 x i8> load at byte offset 3 from %x, with no extension.
; Expected output folds the offset into the predicated vldrbt.u8 as an immediate.
define ptr @ldrbu8_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu8_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r2]
; CHECK-NEXT:    vpt.i8 ne, q0, zr
; CHECK-NEXT:    vldrbt.u8 q0, [r0, #3]
; CHECK-NEXT:    vstrb.8 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 3
  %maskbits = load <16 x i8>, ptr %m, align 1
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <16 x i8> %maskbits, zeroinitializer
  %loaded = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 1, <16 x i1> %pred, <16 x i8> undef)
  store <16 x i8> %loaded, ptr %y, align 1
  ret ptr %x
}
1053
; Masked <16 x i8> load at byte offset 2 from %x, with no extension.
; Expected output folds the offset into the predicated vldrbt.u8 as an immediate.
define ptr @ldrbu8_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu8_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r2]
; CHECK-NEXT:    vpt.i8 ne, q0, zr
; CHECK-NEXT:    vldrbt.u8 q0, [r0, #2]
; CHECK-NEXT:    vstrb.8 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 2
  %maskbits = load <16 x i8>, ptr %m, align 1
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <16 x i8> %maskbits, zeroinitializer
  %loaded = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 1, <16 x i1> %pred, <16 x i8> undef)
  store <16 x i8> %loaded, ptr %y, align 1
  ret ptr %x
}
1070
; Masked <16 x i8> load at byte offset 127 from %x, with no extension.
; Expected output folds the offset into the predicated vldrbt.u8 as an immediate.
define ptr @ldrbu8_127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu8_127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r2]
; CHECK-NEXT:    vpt.i8 ne, q0, zr
; CHECK-NEXT:    vldrbt.u8 q0, [r0, #127]
; CHECK-NEXT:    vstrb.8 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 127
  %maskbits = load <16 x i8>, ptr %m, align 1
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <16 x i8> %maskbits, zeroinitializer
  %loaded = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 1, <16 x i1> %pred, <16 x i8> undef)
  store <16 x i8> %loaded, ptr %y, align 1
  ret ptr %x
}
1087
; Masked <16 x i8> load at byte offset 128 from %x, with no extension.
; Expected output computes the address with a separate add.w; the predicated
; vldrbt.u8 then uses no immediate offset.
define ptr @ldrbu8_128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu8_128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #128
; CHECK-NEXT:    vpt.i8 ne, q0, zr
; CHECK-NEXT:    vldrbt.u8 q0, [r3]
; CHECK-NEXT:    vstrb.8 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 128
  %maskbits = load <16 x i8>, ptr %m, align 1
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <16 x i8> %maskbits, zeroinitializer
  %loaded = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 1, <16 x i1> %pred, <16 x i8> undef)
  store <16 x i8> %loaded, ptr %y, align 1
  ret ptr %x
}
1105
; Masked <16 x i8> load at byte offset -127 from %x, with no extension.
; Expected output folds the negative offset into the predicated vldrbt.u8.
define ptr @ldrbu8_m127(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu8_m127:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r2]
; CHECK-NEXT:    vpt.i8 ne, q0, zr
; CHECK-NEXT:    vldrbt.u8 q0, [r0, #-127]
; CHECK-NEXT:    vstrb.8 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 -127
  %maskbits = load <16 x i8>, ptr %m, align 1
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <16 x i8> %maskbits, zeroinitializer
  %loaded = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 1, <16 x i1> %pred, <16 x i8> undef)
  store <16 x i8> %loaded, ptr %y, align 1
  ret ptr %x
}
1122
; Masked <16 x i8> load at byte offset -128 from %x, with no extension.
; Expected output computes the address with a separate sub.w; the predicated
; vldrbt.u8 then uses no immediate offset.
define ptr @ldrbu8_m128(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrbu8_m128:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #128
; CHECK-NEXT:    vpt.i8 ne, q0, zr
; CHECK-NEXT:    vldrbt.u8 q0, [r3]
; CHECK-NEXT:    vstrb.8 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 -128
  %maskbits = load <16 x i8>, ptr %m, align 1
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <16 x i8> %maskbits, zeroinitializer
  %loaded = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %addr, i32 1, <16 x i1> %pred, <16 x i8> undef)
  store <16 x i8> %loaded, ptr %y, align 1
  ret ptr %x
}
1140
; Masked <4 x float> load at byte offset 4 from %x.
; Expected output folds the offset into the predicated vldrwt.u32 as an immediate.
define ptr @ldrwf32_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0, #4]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 4
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %loaded = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %pred, <4 x float> undef)
  store <4 x float> %loaded, ptr %y, align 4
  ret ptr %x
}
1157
; Masked <4 x float> load at byte offset 3 from %x.
; Expected output computes the address with a separate adds; the predicated
; vldrwt.u32 then uses no immediate offset.
define ptr @ldrwf32_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 3
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %loaded = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %pred, <4 x float> undef)
  store <4 x float> %loaded, ptr %y, align 4
  ret ptr %x
}
1175
; Masked <4 x float> load at byte offset 2 from %x.
; Expected output computes the address with a separate adds; the predicated
; vldrwt.u32 then uses no immediate offset.
define ptr @ldrwf32_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #2
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 2
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %loaded = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %pred, <4 x float> undef)
  store <4 x float> %loaded, ptr %y, align 4
  ret ptr %x
}
1193
; Masked <4 x float> load at byte offset 508 from %x.
; Expected output folds the offset into the predicated vldrwt.u32 as an immediate.
define ptr @ldrwf32_508(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0, #508]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 508
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %loaded = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %pred, <4 x float> undef)
  store <4 x float> %loaded, ptr %y, align 4
  ret ptr %x
}
1210
; Masked <4 x float> load at byte offset 512 from %x.
; Expected output computes the address with a separate add.w; the predicated
; vldrwt.u32 then uses no immediate offset.
define ptr @ldrwf32_512(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #512
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 512
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %loaded = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %pred, <4 x float> undef)
  store <4 x float> %loaded, ptr %y, align 4
  ret ptr %x
}
1228
; Masked <4 x float> load at byte offset -508 from %x.
; Expected output folds the negative offset into the predicated vldrwt.u32.
define ptr @ldrwf32_m508(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_m508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r0, #-508]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 -508
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %loaded = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %pred, <4 x float> undef)
  store <4 x float> %loaded, ptr %y, align 4
  ret ptr %x
}
1245
; Masked <4 x float> load at byte offset -512 from %x.
; Expected output computes the address with a separate sub.w; the predicated
; vldrwt.u32 then uses no immediate offset.
define ptr @ldrwf32_m512(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrwf32_m512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #512
; CHECK-NEXT:    vpt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r3]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 -512
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %loaded = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> %pred, <4 x float> undef)
  store <4 x float> %loaded, ptr %y, align 4
  ret ptr %x
}
1263
; Masked <8 x half> load at byte offset 4 from %x.
; Expected output folds the offset into the predicated vldrht.u16 as an immediate.
define ptr @ldrhf16_4(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #4]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 4
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 2, <8 x i1> %pred, <8 x half> undef)
  store <8 x half> %loaded, ptr %y, align 2
  ret ptr %x
}
1280
; Masked <8 x half> load at byte offset 3 from %x.
; Expected output computes the address with a separate adds; the predicated
; vldrht.u16 then uses no immediate offset.
define ptr @ldrhf16_3(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    adds r3, r0, #3
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 3
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 2, <8 x i1> %pred, <8 x half> undef)
  store <8 x half> %loaded, ptr %y, align 2
  ret ptr %x
}
1298
; Masked <8 x half> load at byte offset 2 from %x.
; Expected output folds the offset into the predicated vldrht.u16 as an immediate.
define ptr @ldrhf16_2(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #2]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 2
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 2, <8 x i1> %pred, <8 x half> undef)
  store <8 x half> %loaded, ptr %y, align 2
  ret ptr %x
}
1315
; Masked <8 x half> load at byte offset 254 from %x.
; Expected output folds the offset into the predicated vldrht.u16 as an immediate.
define ptr @ldrhf16_254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #254]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 254
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 2, <8 x i1> %pred, <8 x half> undef)
  store <8 x half> %loaded, ptr %y, align 2
  ret ptr %x
}
1332
; Masked <8 x half> load at byte offset 256 from %x.
; Expected output computes the address with a separate add.w; the predicated
; vldrht.u16 then uses no immediate offset.
define ptr @ldrhf16_256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    add.w r3, r0, #256
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 256
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 2, <8 x i1> %pred, <8 x half> undef)
  store <8 x half> %loaded, ptr %y, align 2
  ret ptr %x
}
1350
; Masked <8 x half> load at byte offset -254 from %x.
; Expected output folds the negative offset into the predicated vldrht.u16.
define ptr @ldrhf16_m254(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r0, #-254]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 -254
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 2, <8 x i1> %pred, <8 x half> undef)
  store <8 x half> %loaded, ptr %y, align 2
  ret ptr %x
}
1367
; Masked <8 x half> load at byte offset -256 from %x.
; Expected output computes the address with a separate sub.w; the predicated
; vldrht.u16 then uses no immediate offset.
define ptr @ldrhf16_m256(ptr %x, ptr %y, ptr %m) {
; CHECK-LABEL: ldrhf16_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r2]
; CHECK-NEXT:    sub.w r3, r0, #256
; CHECK-NEXT:    vpt.i16 ne, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r3]
; CHECK-NEXT:    vstrh.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %addr = getelementptr inbounds i8, ptr %x, i32 -256
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is active when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %loaded = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 2, <8 x i1> %pred, <8 x half> undef)
  store <8 x half> %loaded, ptr %y, align 2
  ret ptr %x
}
1385
1386
1387
1388
; Masked <4 x i32> store of the vector loaded from %x, at byte offset 4 from %y.
; Expected output folds the offset into the predicated vstrwt.32 as an immediate.
define ptr @strw32_4(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrwt.32 q0, [r0, #4]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 4
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %dst, i32 4, <4 x i1> %pred)
  ret ptr %y
}
1405
; Masked <4 x i32> store of the vector loaded from %x, at byte offset 3 from %y.
; Expected output computes the address with a separate adds; the predicated
; vstrwt.32 then uses no immediate offset.
define ptr @strw32_3(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrwt.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 3
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %dst, i32 4, <4 x i1> %pred)
  ret ptr %y
}
1423
; Masked <4 x i32> store of the vector loaded from %x, at byte offset 2 from %y.
; Expected output computes the address with a separate adds; the predicated
; vstrwt.32 then uses no immediate offset.
define ptr @strw32_2(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    adds r1, r0, #2
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrwt.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 2
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %dst, i32 4, <4 x i1> %pred)
  ret ptr %y
}
1441
; Masked <4 x i32> store of the vector loaded from %x, at byte offset 508 from %y.
; Expected output folds the offset into the predicated vstrwt.32 as an immediate.
define ptr @strw32_508(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrwt.32 q0, [r0, #508]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 508
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %dst, i32 4, <4 x i1> %pred)
  ret ptr %y
}
1458
; Masked <4 x i32> store of the vector loaded from %x, at byte offset 512 from %y.
; Expected output computes the address with a separate add.w; the predicated
; vstrwt.32 then uses no immediate offset.
define ptr @strw32_512(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    add.w r1, r0, #512
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrwt.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 512
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %dst, i32 4, <4 x i1> %pred)
  ret ptr %y
}
1476
; Masked <4 x i32> store of the vector loaded from %x, at byte offset -508 from %y.
; Expected output folds the negative offset into the predicated vstrwt.32.
define ptr @strw32_m508(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_m508:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrwt.32 q0, [r0, #-508]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 -508
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %dst, i32 4, <4 x i1> %pred)
  ret ptr %y
}
1493
; Masked <4 x i32> store of the vector loaded from %x, at byte offset -512 from %y.
; Expected output computes the address with a separate sub.w; the predicated
; vstrwt.32 then uses no immediate offset.
define ptr @strw32_m512(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strw32_m512:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    sub.w r1, r0, #512
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrwt.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 -512
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i32>, ptr %x, align 4
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %dst, i32 4, <4 x i1> %pred)
  ret ptr %y
}
1511
; Masked <4 x i16> store (mask taken from <4 x i32>) of the vector loaded from
; %x, at byte offset 4 from %y.
; Expected output folds the offset into the predicated vstrht.32 as an immediate.
define ptr @strh32_4(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrht.32 q0, [r0, #4]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 4
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %val, ptr %dst, i32 2, <4 x i1> %pred)
  ret ptr %y
}
1528
; Masked <4 x i16> store (mask taken from <4 x i32>) of the vector loaded from
; %x, at byte offset 3 from %y.
; Expected output computes the address with a separate adds; the predicated
; vstrht.32 then uses no immediate offset.
define ptr @strh32_3(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrht.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 3
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %val, ptr %dst, i32 2, <4 x i1> %pred)
  ret ptr %y
}
1546
; Masked <4 x i16> store (mask taken from <4 x i32>) of the vector loaded from
; %x, at byte offset 2 from %y.
; Expected output folds the offset into the predicated vstrht.32 as an immediate.
define ptr @strh32_2(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrht.32 q0, [r0, #2]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 2
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %val, ptr %dst, i32 2, <4 x i1> %pred)
  ret ptr %y
}
1563
; Masked <4 x i16> store (mask taken from <4 x i32>) of the vector loaded from
; %x, at byte offset 254 from %y.
; Expected output folds the offset into the predicated vstrht.32 as an immediate.
define ptr @strh32_254(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrht.32 q0, [r0, #254]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 254
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %val, ptr %dst, i32 2, <4 x i1> %pred)
  ret ptr %y
}
1580
; Masked <4 x i16> store (mask taken from <4 x i32>) of the vector loaded from
; %x, at byte offset 256 from %y.
; Expected output computes the address with a separate add.w; the predicated
; vstrht.32 then uses no immediate offset.
define ptr @strh32_256(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    add.w r1, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrht.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 256
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %val, ptr %dst, i32 2, <4 x i1> %pred)
  ret ptr %y
}
1598
; Masked <4 x i16> store (mask taken from <4 x i32>) of the vector loaded from
; %x, at byte offset -254 from %y.
; Expected output folds the negative offset into the predicated vstrht.32.
define ptr @strh32_m254(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrht.32 q0, [r0, #-254]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 -254
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %val, ptr %dst, i32 2, <4 x i1> %pred)
  ret ptr %y
}
1615
; Masked <4 x i16> store (mask taken from <4 x i32>) of the vector loaded from
; %x, at byte offset -256 from %y.
; Expected output computes the address with a separate sub.w; the predicated
; vstrht.32 then uses no immediate offset.
define ptr @strh32_m256(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh32_m256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    sub.w r1, r0, #256
; CHECK-NEXT:    vpt.i32 ne, q1, zr
; CHECK-NEXT:    vstrht.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 -256
  %maskbits = load <4 x i32>, ptr %m, align 4
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <4 x i32> %maskbits, zeroinitializer
  %val = load <4 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v4i16.p0(<4 x i16> %val, ptr %dst, i32 2, <4 x i1> %pred)
  ret ptr %y
}
1633
; Masked <8 x i16> store of the vector loaded from %x, at byte offset 4 from %y.
; Expected output folds the offset into the predicated vstrht.16 as an immediate.
define ptr @strh16_4(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vpt.i16 ne, q1, zr
; CHECK-NEXT:    vstrht.16 q0, [r0, #4]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 4
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %val = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %val, ptr %dst, i32 2, <8 x i1> %pred)
  ret ptr %y
}
1650
; Masked <8 x i16> store of the vector loaded from %x, at byte offset 3 from %y.
; Expected output computes the address with a separate adds; the predicated
; vstrht.16 then uses no immediate offset.
define ptr @strh16_3(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    vpt.i16 ne, q1, zr
; CHECK-NEXT:    vstrht.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 3
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %val = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %val, ptr %dst, i32 2, <8 x i1> %pred)
  ret ptr %y
}
1668
; Masked <8 x i16> store of the vector loaded from %x, at byte offset 2 from %y.
; Expected output folds the offset into the predicated vstrht.16 as an immediate.
define ptr @strh16_2(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vpt.i16 ne, q1, zr
; CHECK-NEXT:    vstrht.16 q0, [r0, #2]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 2
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %val = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %val, ptr %dst, i32 2, <8 x i1> %pred)
  ret ptr %y
}
1685
; Masked <8 x i16> store of the vector loaded from %x, at byte offset 254 from %y.
; Expected output folds the offset into the predicated vstrht.16 as an immediate.
define ptr @strh16_254(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vpt.i16 ne, q1, zr
; CHECK-NEXT:    vstrht.16 q0, [r0, #254]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 254
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %val = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %val, ptr %dst, i32 2, <8 x i1> %pred)
  ret ptr %y
}
1702
; Masked <8 x i16> store of the vector loaded from %x, at byte offset 256 from %y.
; Expected output computes the address with a separate add.w; the predicated
; vstrht.16 then uses no immediate offset.
define ptr @strh16_256(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_256:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    add.w r1, r0, #256
; CHECK-NEXT:    vpt.i16 ne, q1, zr
; CHECK-NEXT:    vstrht.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 256
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %val = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %val, ptr %dst, i32 2, <8 x i1> %pred)
  ret ptr %y
}
1720
; Masked <8 x i16> store of the vector loaded from %x, at byte offset -254 from %y.
; Expected output folds the negative offset into the predicated vstrht.16.
define ptr @strh16_m254(ptr %y, ptr %x, ptr %m) {
; CHECK-LABEL: strh16_m254:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r2]
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vpt.i16 ne, q1, zr
; CHECK-NEXT:    vstrht.16 q0, [r0, #-254]
; CHECK-NEXT:    bx lr
entry:
  %dst = getelementptr inbounds i8, ptr %y, i32 -254
  %maskbits = load <8 x i16>, ptr %m, align 2
  ; A lane is written when its mask element is non-zero.
  %pred = icmp ne <8 x i16> %maskbits, zeroinitializer
  %val = load <8 x i16>, ptr %x, align 2
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %val, ptr %dst, i32 2, <8 x i1> %pred)
  ret ptr %y
}
1737
; strh16_m256: masked store of the <8 x i16> loaded from %x (alignment 2) to
; %y - 256 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output computes the address separately (sub.w r1, r0, #256) and
; stores through [r1] with no immediate.
; NOTE(review): presumably -256 is beyond the offset this store can encode
; (-254 is used as an immediate in strh16_m254) - confirm against the MVE docs.
1738define ptr @strh16_m256(ptr %y, ptr %x, ptr %m) {
1739; CHECK-LABEL: strh16_m256:
1740; CHECK:       @ %bb.0: @ %entry
1741; CHECK-NEXT:    vldrh.u16 q1, [r2]
1742; CHECK-NEXT:    vldrh.u16 q0, [r1]
1743; CHECK-NEXT:    sub.w r1, r0, #256
1744; CHECK-NEXT:    vpt.i16 ne, q1, zr
1745; CHECK-NEXT:    vstrht.16 q0, [r1]
1746; CHECK-NEXT:    bx lr
1747entry:
1748  %z = getelementptr inbounds i8, ptr %y, i32 -256
1749  %mask = load <8 x i16>, ptr %m, align 2
1750  %c = icmp ne <8 x i16> %mask, zeroinitializer
1751  %0 = load <8 x i16>, ptr %x, align 2
1752  call void @llvm.masked.store.v8i16.p0(<8 x i16> %0, ptr %z, i32 2, <8 x i1> %c)
1753  ret ptr %y
1754}
1755
; strb32_4: masked store of the <4 x i8> loaded from %x (alignment 1) to
; %y + 4 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m, so
; the predicate has 4 lanes while the stored elements are bytes.
; Expected output loads the data with vldrb.u32 and uses the offset as the
; store immediate: vstrbt.32 q0, [r0, #4].
1756define ptr @strb32_4(ptr %y, ptr %x, ptr %m) {
1757; CHECK-LABEL: strb32_4:
1758; CHECK:       @ %bb.0: @ %entry
1759; CHECK-NEXT:    vldrw.u32 q1, [r2]
1760; CHECK-NEXT:    vldrb.u32 q0, [r1]
1761; CHECK-NEXT:    vpt.i32 ne, q1, zr
1762; CHECK-NEXT:    vstrbt.32 q0, [r0, #4]
1763; CHECK-NEXT:    bx lr
1764entry:
1765  %z = getelementptr inbounds i8, ptr %y, i32 4
1766  %mask = load <4 x i32>, ptr %m, align 4
1767  %c = icmp ne <4 x i32> %mask, zeroinitializer
1768  %0 = load <4 x i8>, ptr %x, align 1
1769  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
1770  ret ptr %y
1771}
1772
; strb32_3: masked store of the <4 x i8> loaded from %x (alignment 1) to
; %y + 3 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output uses the odd offset directly as the store immediate:
; vstrbt.32 q0, [r0, #3].
1773define ptr @strb32_3(ptr %y, ptr %x, ptr %m) {
1774; CHECK-LABEL: strb32_3:
1775; CHECK:       @ %bb.0: @ %entry
1776; CHECK-NEXT:    vldrw.u32 q1, [r2]
1777; CHECK-NEXT:    vldrb.u32 q0, [r1]
1778; CHECK-NEXT:    vpt.i32 ne, q1, zr
1779; CHECK-NEXT:    vstrbt.32 q0, [r0, #3]
1780; CHECK-NEXT:    bx lr
1781entry:
1782  %z = getelementptr inbounds i8, ptr %y, i32 3
1783  %mask = load <4 x i32>, ptr %m, align 4
1784  %c = icmp ne <4 x i32> %mask, zeroinitializer
1785  %0 = load <4 x i8>, ptr %x, align 1
1786  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
1787  ret ptr %y
1788}
1789
; strb32_2: masked store of the <4 x i8> loaded from %x (alignment 1) to
; %y + 2 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output uses the offset as the store immediate: vstrbt.32 q0, [r0, #2].
1790define ptr @strb32_2(ptr %y, ptr %x, ptr %m) {
1791; CHECK-LABEL: strb32_2:
1792; CHECK:       @ %bb.0: @ %entry
1793; CHECK-NEXT:    vldrw.u32 q1, [r2]
1794; CHECK-NEXT:    vldrb.u32 q0, [r1]
1795; CHECK-NEXT:    vpt.i32 ne, q1, zr
1796; CHECK-NEXT:    vstrbt.32 q0, [r0, #2]
1797; CHECK-NEXT:    bx lr
1798entry:
1799  %z = getelementptr inbounds i8, ptr %y, i32 2
1800  %mask = load <4 x i32>, ptr %m, align 4
1801  %c = icmp ne <4 x i32> %mask, zeroinitializer
1802  %0 = load <4 x i8>, ptr %x, align 1
1803  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
1804  ret ptr %y
1805}
1806
; strb32_127: masked store of the <4 x i8> loaded from %x (alignment 1) to
; %y + 127 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output uses the offset as the store immediate:
; vstrbt.32 q0, [r0, #127].
1807define ptr @strb32_127(ptr %y, ptr %x, ptr %m) {
1808; CHECK-LABEL: strb32_127:
1809; CHECK:       @ %bb.0: @ %entry
1810; CHECK-NEXT:    vldrw.u32 q1, [r2]
1811; CHECK-NEXT:    vldrb.u32 q0, [r1]
1812; CHECK-NEXT:    vpt.i32 ne, q1, zr
1813; CHECK-NEXT:    vstrbt.32 q0, [r0, #127]
1814; CHECK-NEXT:    bx lr
1815entry:
1816  %z = getelementptr inbounds i8, ptr %y, i32 127
1817  %mask = load <4 x i32>, ptr %m, align 4
1818  %c = icmp ne <4 x i32> %mask, zeroinitializer
1819  %0 = load <4 x i8>, ptr %x, align 1
1820  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
1821  ret ptr %y
1822}
1823
; strb32_128: masked store of the <4 x i8> loaded from %x (alignment 1) to
; %y + 128 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output computes the address separately (add.w r1, r0, #128) and
; stores through [r1] with no immediate.
; NOTE(review): presumably 128 is beyond the offset this store can encode
; (127 is used as an immediate in strb32_127) - confirm against the MVE docs.
1824define ptr @strb32_128(ptr %y, ptr %x, ptr %m) {
1825; CHECK-LABEL: strb32_128:
1826; CHECK:       @ %bb.0: @ %entry
1827; CHECK-NEXT:    vldrw.u32 q1, [r2]
1828; CHECK-NEXT:    vldrb.u32 q0, [r1]
1829; CHECK-NEXT:    add.w r1, r0, #128
1830; CHECK-NEXT:    vpt.i32 ne, q1, zr
1831; CHECK-NEXT:    vstrbt.32 q0, [r1]
1832; CHECK-NEXT:    bx lr
1833entry:
1834  %z = getelementptr inbounds i8, ptr %y, i32 128
1835  %mask = load <4 x i32>, ptr %m, align 4
1836  %c = icmp ne <4 x i32> %mask, zeroinitializer
1837  %0 = load <4 x i8>, ptr %x, align 1
1838  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
1839  ret ptr %y
1840}
1841
; strb32_m127: masked store of the <4 x i8> loaded from %x (alignment 1) to
; %y - 127 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output uses the negative offset as the store immediate:
; vstrbt.32 q0, [r0, #-127].
1842define ptr @strb32_m127(ptr %y, ptr %x, ptr %m) {
1843; CHECK-LABEL: strb32_m127:
1844; CHECK:       @ %bb.0: @ %entry
1845; CHECK-NEXT:    vldrw.u32 q1, [r2]
1846; CHECK-NEXT:    vldrb.u32 q0, [r1]
1847; CHECK-NEXT:    vpt.i32 ne, q1, zr
1848; CHECK-NEXT:    vstrbt.32 q0, [r0, #-127]
1849; CHECK-NEXT:    bx lr
1850entry:
1851  %z = getelementptr inbounds i8, ptr %y, i32 -127
1852  %mask = load <4 x i32>, ptr %m, align 4
1853  %c = icmp ne <4 x i32> %mask, zeroinitializer
1854  %0 = load <4 x i8>, ptr %x, align 1
1855  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
1856  ret ptr %y
1857}
1858
; strb32_m128: masked store of the <4 x i8> loaded from %x (alignment 1) to
; %y - 128 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output computes the address separately (sub.w r1, r0, #128) and
; stores through [r1] with no immediate.
; NOTE(review): presumably -128 is beyond the offset this store can encode
; (-127 is used as an immediate in strb32_m127) - confirm against the MVE docs.
1859define ptr @strb32_m128(ptr %y, ptr %x, ptr %m) {
1860; CHECK-LABEL: strb32_m128:
1861; CHECK:       @ %bb.0: @ %entry
1862; CHECK-NEXT:    vldrw.u32 q1, [r2]
1863; CHECK-NEXT:    vldrb.u32 q0, [r1]
1864; CHECK-NEXT:    sub.w r1, r0, #128
1865; CHECK-NEXT:    vpt.i32 ne, q1, zr
1866; CHECK-NEXT:    vstrbt.32 q0, [r1]
1867; CHECK-NEXT:    bx lr
1868entry:
1869  %z = getelementptr inbounds i8, ptr %y, i32 -128
1870  %mask = load <4 x i32>, ptr %m, align 4
1871  %c = icmp ne <4 x i32> %mask, zeroinitializer
1872  %0 = load <4 x i8>, ptr %x, align 1
1873  call void @llvm.masked.store.v4i8.p0(<4 x i8> %0, ptr %z, i32 1, <4 x i1> %c)
1874  ret ptr %y
1875}
1876
; strb16_4: masked store of the <8 x i8> loaded from %x (alignment 1) to
; %y + 4 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m, so
; the predicate has 8 lanes while the stored elements are bytes.
; Expected output loads the data with vldrb.u16 and uses the offset as the
; store immediate: vstrbt.16 q0, [r0, #4].
1877define ptr @strb16_4(ptr %y, ptr %x, ptr %m) {
1878; CHECK-LABEL: strb16_4:
1879; CHECK:       @ %bb.0: @ %entry
1880; CHECK-NEXT:    vldrh.u16 q1, [r2]
1881; CHECK-NEXT:    vldrb.u16 q0, [r1]
1882; CHECK-NEXT:    vpt.i16 ne, q1, zr
1883; CHECK-NEXT:    vstrbt.16 q0, [r0, #4]
1884; CHECK-NEXT:    bx lr
1885entry:
1886  %z = getelementptr inbounds i8, ptr %y, i32 4
1887  %mask = load <8 x i16>, ptr %m, align 2
1888  %c = icmp ne <8 x i16> %mask, zeroinitializer
1889  %0 = load <8 x i8>, ptr %x, align 1
1890  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
1891  ret ptr %y
1892}
1893
; strb16_3: masked store of the <8 x i8> loaded from %x (alignment 1) to
; %y + 3 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output uses the odd offset directly as the store immediate:
; vstrbt.16 q0, [r0, #3].
1894define ptr @strb16_3(ptr %y, ptr %x, ptr %m) {
1895; CHECK-LABEL: strb16_3:
1896; CHECK:       @ %bb.0: @ %entry
1897; CHECK-NEXT:    vldrh.u16 q1, [r2]
1898; CHECK-NEXT:    vldrb.u16 q0, [r1]
1899; CHECK-NEXT:    vpt.i16 ne, q1, zr
1900; CHECK-NEXT:    vstrbt.16 q0, [r0, #3]
1901; CHECK-NEXT:    bx lr
1902entry:
1903  %z = getelementptr inbounds i8, ptr %y, i32 3
1904  %mask = load <8 x i16>, ptr %m, align 2
1905  %c = icmp ne <8 x i16> %mask, zeroinitializer
1906  %0 = load <8 x i8>, ptr %x, align 1
1907  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
1908  ret ptr %y
1909}
1910
; strb16_2: masked store of the <8 x i8> loaded from %x (alignment 1) to
; %y + 2 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output uses the offset as the store immediate: vstrbt.16 q0, [r0, #2].
1911define ptr @strb16_2(ptr %y, ptr %x, ptr %m) {
1912; CHECK-LABEL: strb16_2:
1913; CHECK:       @ %bb.0: @ %entry
1914; CHECK-NEXT:    vldrh.u16 q1, [r2]
1915; CHECK-NEXT:    vldrb.u16 q0, [r1]
1916; CHECK-NEXT:    vpt.i16 ne, q1, zr
1917; CHECK-NEXT:    vstrbt.16 q0, [r0, #2]
1918; CHECK-NEXT:    bx lr
1919entry:
1920  %z = getelementptr inbounds i8, ptr %y, i32 2
1921  %mask = load <8 x i16>, ptr %m, align 2
1922  %c = icmp ne <8 x i16> %mask, zeroinitializer
1923  %0 = load <8 x i8>, ptr %x, align 1
1924  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
1925  ret ptr %y
1926}
1927
; strb16_127: masked store of the <8 x i8> loaded from %x (alignment 1) to
; %y + 127 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output uses the offset as the store immediate:
; vstrbt.16 q0, [r0, #127].
1928define ptr @strb16_127(ptr %y, ptr %x, ptr %m) {
1929; CHECK-LABEL: strb16_127:
1930; CHECK:       @ %bb.0: @ %entry
1931; CHECK-NEXT:    vldrh.u16 q1, [r2]
1932; CHECK-NEXT:    vldrb.u16 q0, [r1]
1933; CHECK-NEXT:    vpt.i16 ne, q1, zr
1934; CHECK-NEXT:    vstrbt.16 q0, [r0, #127]
1935; CHECK-NEXT:    bx lr
1936entry:
1937  %z = getelementptr inbounds i8, ptr %y, i32 127
1938  %mask = load <8 x i16>, ptr %m, align 2
1939  %c = icmp ne <8 x i16> %mask, zeroinitializer
1940  %0 = load <8 x i8>, ptr %x, align 1
1941  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
1942  ret ptr %y
1943}
1944
; strb16_128: masked store of the <8 x i8> loaded from %x (alignment 1) to
; %y + 128 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output computes the address separately (add.w r1, r0, #128) and
; stores through [r1] with no immediate.
; NOTE(review): presumably 128 is beyond the offset this store can encode
; (127 is used as an immediate in strb16_127) - confirm against the MVE docs.
1945define ptr @strb16_128(ptr %y, ptr %x, ptr %m) {
1946; CHECK-LABEL: strb16_128:
1947; CHECK:       @ %bb.0: @ %entry
1948; CHECK-NEXT:    vldrh.u16 q1, [r2]
1949; CHECK-NEXT:    vldrb.u16 q0, [r1]
1950; CHECK-NEXT:    add.w r1, r0, #128
1951; CHECK-NEXT:    vpt.i16 ne, q1, zr
1952; CHECK-NEXT:    vstrbt.16 q0, [r1]
1953; CHECK-NEXT:    bx lr
1954entry:
1955  %z = getelementptr inbounds i8, ptr %y, i32 128
1956  %mask = load <8 x i16>, ptr %m, align 2
1957  %c = icmp ne <8 x i16> %mask, zeroinitializer
1958  %0 = load <8 x i8>, ptr %x, align 1
1959  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
1960  ret ptr %y
1961}
1962
; strb16_m127: masked store of the <8 x i8> loaded from %x (alignment 1) to
; %y - 127 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output uses the negative offset as the store immediate:
; vstrbt.16 q0, [r0, #-127].
1963define ptr @strb16_m127(ptr %y, ptr %x, ptr %m) {
1964; CHECK-LABEL: strb16_m127:
1965; CHECK:       @ %bb.0: @ %entry
1966; CHECK-NEXT:    vldrh.u16 q1, [r2]
1967; CHECK-NEXT:    vldrb.u16 q0, [r1]
1968; CHECK-NEXT:    vpt.i16 ne, q1, zr
1969; CHECK-NEXT:    vstrbt.16 q0, [r0, #-127]
1970; CHECK-NEXT:    bx lr
1971entry:
1972  %z = getelementptr inbounds i8, ptr %y, i32 -127
1973  %mask = load <8 x i16>, ptr %m, align 2
1974  %c = icmp ne <8 x i16> %mask, zeroinitializer
1975  %0 = load <8 x i8>, ptr %x, align 1
1976  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
1977  ret ptr %y
1978}
1979
; strb16_m128: masked store of the <8 x i8> loaded from %x (alignment 1) to
; %y - 128 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output computes the address separately (sub.w r1, r0, #128) and
; stores through [r1] with no immediate.
; NOTE(review): presumably -128 is beyond the offset this store can encode
; (-127 is used as an immediate in strb16_m127) - confirm against the MVE docs.
1980define ptr @strb16_m128(ptr %y, ptr %x, ptr %m) {
1981; CHECK-LABEL: strb16_m128:
1982; CHECK:       @ %bb.0: @ %entry
1983; CHECK-NEXT:    vldrh.u16 q1, [r2]
1984; CHECK-NEXT:    vldrb.u16 q0, [r1]
1985; CHECK-NEXT:    sub.w r1, r0, #128
1986; CHECK-NEXT:    vpt.i16 ne, q1, zr
1987; CHECK-NEXT:    vstrbt.16 q0, [r1]
1988; CHECK-NEXT:    bx lr
1989entry:
1990  %z = getelementptr inbounds i8, ptr %y, i32 -128
1991  %mask = load <8 x i16>, ptr %m, align 2
1992  %c = icmp ne <8 x i16> %mask, zeroinitializer
1993  %0 = load <8 x i8>, ptr %x, align 1
1994  call void @llvm.masked.store.v8i8.p0(<8 x i8> %0, ptr %z, i32 1, <8 x i1> %c)
1995  ret ptr %y
1996}
1997
; strb8_4: masked store of the <16 x i8> loaded from %x (alignment 1) to
; %y + 4 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <16 x i8> loaded from %m.
; Expected output uses the offset as the store immediate: vstrbt.8 q0, [r0, #4].
1998define ptr @strb8_4(ptr %y, ptr %x, ptr %m) {
1999; CHECK-LABEL: strb8_4:
2000; CHECK:       @ %bb.0: @ %entry
2001; CHECK-NEXT:    vldrb.u8 q1, [r2]
2002; CHECK-NEXT:    vldrb.u8 q0, [r1]
2003; CHECK-NEXT:    vpt.i8 ne, q1, zr
2004; CHECK-NEXT:    vstrbt.8 q0, [r0, #4]
2005; CHECK-NEXT:    bx lr
2006entry:
2007  %z = getelementptr inbounds i8, ptr %y, i32 4
2008  %mask = load <16 x i8>, ptr %m, align 1
2009  %c = icmp ne <16 x i8> %mask, zeroinitializer
2010  %0 = load <16 x i8>, ptr %x, align 1
2011  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
2012  ret ptr %y
2013}
2014
; strb8_3: masked store of the <16 x i8> loaded from %x (alignment 1) to
; %y + 3 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <16 x i8> loaded from %m.
; Expected output uses the odd offset directly as the store immediate:
; vstrbt.8 q0, [r0, #3].
2015define ptr @strb8_3(ptr %y, ptr %x, ptr %m) {
2016; CHECK-LABEL: strb8_3:
2017; CHECK:       @ %bb.0: @ %entry
2018; CHECK-NEXT:    vldrb.u8 q1, [r2]
2019; CHECK-NEXT:    vldrb.u8 q0, [r1]
2020; CHECK-NEXT:    vpt.i8 ne, q1, zr
2021; CHECK-NEXT:    vstrbt.8 q0, [r0, #3]
2022; CHECK-NEXT:    bx lr
2023entry:
2024  %z = getelementptr inbounds i8, ptr %y, i32 3
2025  %mask = load <16 x i8>, ptr %m, align 1
2026  %c = icmp ne <16 x i8> %mask, zeroinitializer
2027  %0 = load <16 x i8>, ptr %x, align 1
2028  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
2029  ret ptr %y
2030}
2031
; strb8_2: masked store of the <16 x i8> loaded from %x (alignment 1) to
; %y + 2 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <16 x i8> loaded from %m.
; Expected output uses the offset as the store immediate: vstrbt.8 q0, [r0, #2].
2032define ptr @strb8_2(ptr %y, ptr %x, ptr %m) {
2033; CHECK-LABEL: strb8_2:
2034; CHECK:       @ %bb.0: @ %entry
2035; CHECK-NEXT:    vldrb.u8 q1, [r2]
2036; CHECK-NEXT:    vldrb.u8 q0, [r1]
2037; CHECK-NEXT:    vpt.i8 ne, q1, zr
2038; CHECK-NEXT:    vstrbt.8 q0, [r0, #2]
2039; CHECK-NEXT:    bx lr
2040entry:
2041  %z = getelementptr inbounds i8, ptr %y, i32 2
2042  %mask = load <16 x i8>, ptr %m, align 1
2043  %c = icmp ne <16 x i8> %mask, zeroinitializer
2044  %0 = load <16 x i8>, ptr %x, align 1
2045  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
2046  ret ptr %y
2047}
2048
; strb8_127: masked store of the <16 x i8> loaded from %x (alignment 1) to
; %y + 127 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <16 x i8> loaded from %m.
; Expected output uses the offset as the store immediate:
; vstrbt.8 q0, [r0, #127].
2049define ptr @strb8_127(ptr %y, ptr %x, ptr %m) {
2050; CHECK-LABEL: strb8_127:
2051; CHECK:       @ %bb.0: @ %entry
2052; CHECK-NEXT:    vldrb.u8 q1, [r2]
2053; CHECK-NEXT:    vldrb.u8 q0, [r1]
2054; CHECK-NEXT:    vpt.i8 ne, q1, zr
2055; CHECK-NEXT:    vstrbt.8 q0, [r0, #127]
2056; CHECK-NEXT:    bx lr
2057entry:
2058  %z = getelementptr inbounds i8, ptr %y, i32 127
2059  %mask = load <16 x i8>, ptr %m, align 1
2060  %c = icmp ne <16 x i8> %mask, zeroinitializer
2061  %0 = load <16 x i8>, ptr %x, align 1
2062  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
2063  ret ptr %y
2064}
2065
; strb8_128: masked store of the <16 x i8> loaded from %x (alignment 1) to
; %y + 128 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <16 x i8> loaded from %m.
; Expected output computes the address separately (add.w r1, r0, #128) and
; stores through [r1] with no immediate.
; NOTE(review): presumably 128 is beyond the offset this store can encode
; (127 is used as an immediate in strb8_127) - confirm against the MVE docs.
2066define ptr @strb8_128(ptr %y, ptr %x, ptr %m) {
2067; CHECK-LABEL: strb8_128:
2068; CHECK:       @ %bb.0: @ %entry
2069; CHECK-NEXT:    vldrb.u8 q1, [r2]
2070; CHECK-NEXT:    vldrb.u8 q0, [r1]
2071; CHECK-NEXT:    add.w r1, r0, #128
2072; CHECK-NEXT:    vpt.i8 ne, q1, zr
2073; CHECK-NEXT:    vstrbt.8 q0, [r1]
2074; CHECK-NEXT:    bx lr
2075entry:
2076  %z = getelementptr inbounds i8, ptr %y, i32 128
2077  %mask = load <16 x i8>, ptr %m, align 1
2078  %c = icmp ne <16 x i8> %mask, zeroinitializer
2079  %0 = load <16 x i8>, ptr %x, align 1
2080  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
2081  ret ptr %y
2082}
2083
; strb8_m127: masked store of the <16 x i8> loaded from %x (alignment 1) to
; %y - 127 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <16 x i8> loaded from %m.
; Expected output uses the negative offset as the store immediate:
; vstrbt.8 q0, [r0, #-127].
2084define ptr @strb8_m127(ptr %y, ptr %x, ptr %m) {
2085; CHECK-LABEL: strb8_m127:
2086; CHECK:       @ %bb.0: @ %entry
2087; CHECK-NEXT:    vldrb.u8 q1, [r2]
2088; CHECK-NEXT:    vldrb.u8 q0, [r1]
2089; CHECK-NEXT:    vpt.i8 ne, q1, zr
2090; CHECK-NEXT:    vstrbt.8 q0, [r0, #-127]
2091; CHECK-NEXT:    bx lr
2092entry:
2093  %z = getelementptr inbounds i8, ptr %y, i32 -127
2094  %mask = load <16 x i8>, ptr %m, align 1
2095  %c = icmp ne <16 x i8> %mask, zeroinitializer
2096  %0 = load <16 x i8>, ptr %x, align 1
2097  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
2098  ret ptr %y
2099}
2100
; strb8_m128: masked store of the <16 x i8> loaded from %x (alignment 1) to
; %y - 128 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <16 x i8> loaded from %m.
; Expected output computes the address separately (sub.w r1, r0, #128) and
; stores through [r1] with no immediate.
; NOTE(review): presumably -128 is beyond the offset this store can encode
; (-127 is used as an immediate in strb8_m127) - confirm against the MVE docs.
2101define ptr @strb8_m128(ptr %y, ptr %x, ptr %m) {
2102; CHECK-LABEL: strb8_m128:
2103; CHECK:       @ %bb.0: @ %entry
2104; CHECK-NEXT:    vldrb.u8 q1, [r2]
2105; CHECK-NEXT:    vldrb.u8 q0, [r1]
2106; CHECK-NEXT:    sub.w r1, r0, #128
2107; CHECK-NEXT:    vpt.i8 ne, q1, zr
2108; CHECK-NEXT:    vstrbt.8 q0, [r1]
2109; CHECK-NEXT:    bx lr
2110entry:
2111  %z = getelementptr inbounds i8, ptr %y, i32 -128
2112  %mask = load <16 x i8>, ptr %m, align 1
2113  %c = icmp ne <16 x i8> %mask, zeroinitializer
2114  %0 = load <16 x i8>, ptr %x, align 1
2115  call void @llvm.masked.store.v16i8.p0(<16 x i8> %0, ptr %z, i32 1, <16 x i1> %c)
2116  ret ptr %y
2117}
2118
; strwf32_4: masked store of the <4 x float> loaded from %x (alignment 4) to
; %y + 4 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output uses the offset as the store immediate: vstrwt.32 q0, [r0, #4].
2119define ptr @strwf32_4(ptr %y, ptr %x, ptr %m) {
2120; CHECK-LABEL: strwf32_4:
2121; CHECK:       @ %bb.0: @ %entry
2122; CHECK-NEXT:    vldrw.u32 q1, [r2]
2123; CHECK-NEXT:    vldrw.u32 q0, [r1]
2124; CHECK-NEXT:    vpt.i32 ne, q1, zr
2125; CHECK-NEXT:    vstrwt.32 q0, [r0, #4]
2126; CHECK-NEXT:    bx lr
2127entry:
2128  %z = getelementptr inbounds i8, ptr %y, i32 4
2129  %mask = load <4 x i32>, ptr %m, align 4
2130  %c = icmp ne <4 x i32> %mask, zeroinitializer
2131  %0 = load <4 x float>, ptr %x, align 4
2132  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c)
2133  ret ptr %y
2134}
2135
; strwf32_3: masked store of the <4 x float> loaded from %x (alignment 4) to
; %y + 3 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output computes the address separately (adds r1, r0, #3) and
; stores through [r1] with no immediate.
; NOTE(review): presumably because 3 is not a multiple of the 4-byte element
; size the word store's immediate requires - confirm against the MVE docs.
2136define ptr @strwf32_3(ptr %y, ptr %x, ptr %m) {
2137; CHECK-LABEL: strwf32_3:
2138; CHECK:       @ %bb.0: @ %entry
2139; CHECK-NEXT:    vldrw.u32 q1, [r2]
2140; CHECK-NEXT:    vldrw.u32 q0, [r1]
2141; CHECK-NEXT:    adds r1, r0, #3
2142; CHECK-NEXT:    vpt.i32 ne, q1, zr
2143; CHECK-NEXT:    vstrwt.32 q0, [r1]
2144; CHECK-NEXT:    bx lr
2145entry:
2146  %z = getelementptr inbounds i8, ptr %y, i32 3
2147  %mask = load <4 x i32>, ptr %m, align 4
2148  %c = icmp ne <4 x i32> %mask, zeroinitializer
2149  %0 = load <4 x float>, ptr %x, align 4
2150  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c)
2151  ret ptr %y
2152}
2153
; strwf32_2: masked store of the <4 x float> loaded from %x (alignment 4) to
; %y + 2 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output computes the address separately (adds r1, r0, #2) and
; stores through [r1] with no immediate.
; NOTE(review): presumably because 2 is not a multiple of the 4-byte element
; size the word store's immediate requires - confirm against the MVE docs.
2154define ptr @strwf32_2(ptr %y, ptr %x, ptr %m) {
2155; CHECK-LABEL: strwf32_2:
2156; CHECK:       @ %bb.0: @ %entry
2157; CHECK-NEXT:    vldrw.u32 q1, [r2]
2158; CHECK-NEXT:    vldrw.u32 q0, [r1]
2159; CHECK-NEXT:    adds r1, r0, #2
2160; CHECK-NEXT:    vpt.i32 ne, q1, zr
2161; CHECK-NEXT:    vstrwt.32 q0, [r1]
2162; CHECK-NEXT:    bx lr
2163entry:
2164  %z = getelementptr inbounds i8, ptr %y, i32 2
2165  %mask = load <4 x i32>, ptr %m, align 4
2166  %c = icmp ne <4 x i32> %mask, zeroinitializer
2167  %0 = load <4 x float>, ptr %x, align 4
2168  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c)
2169  ret ptr %y
2170}
2171
; strwf32_508: masked store of the <4 x float> loaded from %x (alignment 4) to
; %y + 508 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output uses the offset as the store immediate:
; vstrwt.32 q0, [r0, #508].
2172define ptr @strwf32_508(ptr %y, ptr %x, ptr %m) {
2173; CHECK-LABEL: strwf32_508:
2174; CHECK:       @ %bb.0: @ %entry
2175; CHECK-NEXT:    vldrw.u32 q1, [r2]
2176; CHECK-NEXT:    vldrw.u32 q0, [r1]
2177; CHECK-NEXT:    vpt.i32 ne, q1, zr
2178; CHECK-NEXT:    vstrwt.32 q0, [r0, #508]
2179; CHECK-NEXT:    bx lr
2180entry:
2181  %z = getelementptr inbounds i8, ptr %y, i32 508
2182  %mask = load <4 x i32>, ptr %m, align 4
2183  %c = icmp ne <4 x i32> %mask, zeroinitializer
2184  %0 = load <4 x float>, ptr %x, align 4
2185  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c)
2186  ret ptr %y
2187}
2188
; strwf32_512: masked store of the <4 x float> loaded from %x (alignment 4) to
; %y + 512 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output computes the address separately (add.w r1, r0, #512) and
; stores through [r1] with no immediate.
; NOTE(review): presumably 512 is beyond the offset this store can encode
; (508 is used as an immediate in strwf32_508) - confirm against the MVE docs.
2189define ptr @strwf32_512(ptr %y, ptr %x, ptr %m) {
2190; CHECK-LABEL: strwf32_512:
2191; CHECK:       @ %bb.0: @ %entry
2192; CHECK-NEXT:    vldrw.u32 q1, [r2]
2193; CHECK-NEXT:    vldrw.u32 q0, [r1]
2194; CHECK-NEXT:    add.w r1, r0, #512
2195; CHECK-NEXT:    vpt.i32 ne, q1, zr
2196; CHECK-NEXT:    vstrwt.32 q0, [r1]
2197; CHECK-NEXT:    bx lr
2198entry:
2199  %z = getelementptr inbounds i8, ptr %y, i32 512
2200  %mask = load <4 x i32>, ptr %m, align 4
2201  %c = icmp ne <4 x i32> %mask, zeroinitializer
2202  %0 = load <4 x float>, ptr %x, align 4
2203  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c)
2204  ret ptr %y
2205}
2206
; strwf32_m508: masked store of the <4 x float> loaded from %x (alignment 4)
; to %y - 508 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output uses the negative offset as the store immediate:
; vstrwt.32 q0, [r0, #-508].
2207define ptr @strwf32_m508(ptr %y, ptr %x, ptr %m) {
2208; CHECK-LABEL: strwf32_m508:
2209; CHECK:       @ %bb.0: @ %entry
2210; CHECK-NEXT:    vldrw.u32 q1, [r2]
2211; CHECK-NEXT:    vldrw.u32 q0, [r1]
2212; CHECK-NEXT:    vpt.i32 ne, q1, zr
2213; CHECK-NEXT:    vstrwt.32 q0, [r0, #-508]
2214; CHECK-NEXT:    bx lr
2215entry:
2216  %z = getelementptr inbounds i8, ptr %y, i32 -508
2217  %mask = load <4 x i32>, ptr %m, align 4
2218  %c = icmp ne <4 x i32> %mask, zeroinitializer
2219  %0 = load <4 x float>, ptr %x, align 4
2220  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c)
2221  ret ptr %y
2222}
2223
; strwf32_m512: masked store of the <4 x float> loaded from %x (alignment 4)
; to %y - 512 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <4 x i32> loaded from %m.
; Expected output computes the address separately (sub.w r1, r0, #512) and
; stores through [r1] with no immediate.
; NOTE(review): presumably -512 is beyond the offset this store can encode
; (-508 is used as an immediate in strwf32_m508) - confirm against the MVE docs.
2224define ptr @strwf32_m512(ptr %y, ptr %x, ptr %m) {
2225; CHECK-LABEL: strwf32_m512:
2226; CHECK:       @ %bb.0: @ %entry
2227; CHECK-NEXT:    vldrw.u32 q1, [r2]
2228; CHECK-NEXT:    vldrw.u32 q0, [r1]
2229; CHECK-NEXT:    sub.w r1, r0, #512
2230; CHECK-NEXT:    vpt.i32 ne, q1, zr
2231; CHECK-NEXT:    vstrwt.32 q0, [r1]
2232; CHECK-NEXT:    bx lr
2233entry:
2234  %z = getelementptr inbounds i8, ptr %y, i32 -512
2235  %mask = load <4 x i32>, ptr %m, align 4
2236  %c = icmp ne <4 x i32> %mask, zeroinitializer
2237  %0 = load <4 x float>, ptr %x, align 4
2238  call void @llvm.masked.store.v4f32.p0(<4 x float> %0, ptr %z, i32 4, <4 x i1> %c)
2239  ret ptr %y
2240}
2241
; strhf16_4: masked store of the <8 x half> loaded from %x (alignment 2) to
; %y + 4 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output uses the offset as the store immediate: vstrht.16 q0, [r0, #4].
2242define ptr @strhf16_4(ptr %y, ptr %x, ptr %m) {
2243; CHECK-LABEL: strhf16_4:
2244; CHECK:       @ %bb.0: @ %entry
2245; CHECK-NEXT:    vldrh.u16 q1, [r2]
2246; CHECK-NEXT:    vldrh.u16 q0, [r1]
2247; CHECK-NEXT:    vpt.i16 ne, q1, zr
2248; CHECK-NEXT:    vstrht.16 q0, [r0, #4]
2249; CHECK-NEXT:    bx lr
2250entry:
2251  %z = getelementptr inbounds i8, ptr %y, i32 4
2252  %mask = load <8 x i16>, ptr %m, align 2
2253  %c = icmp ne <8 x i16> %mask, zeroinitializer
2254  %0 = load <8 x half>, ptr %x, align 2
2255  call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c)
2256  ret ptr %y
2257}
2258
; strhf16_3: masked store of the <8 x half> loaded from %x (alignment 2) to
; %y + 3 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output computes the address separately (adds r1, r0, #3) and
; stores through [r1] with no immediate.
; NOTE(review): presumably because 3 is not a multiple of the 2-byte element
; size the halfword store's immediate requires - confirm against the MVE docs.
2259define ptr @strhf16_3(ptr %y, ptr %x, ptr %m) {
2260; CHECK-LABEL: strhf16_3:
2261; CHECK:       @ %bb.0: @ %entry
2262; CHECK-NEXT:    vldrh.u16 q1, [r2]
2263; CHECK-NEXT:    vldrh.u16 q0, [r1]
2264; CHECK-NEXT:    adds r1, r0, #3
2265; CHECK-NEXT:    vpt.i16 ne, q1, zr
2266; CHECK-NEXT:    vstrht.16 q0, [r1]
2267; CHECK-NEXT:    bx lr
2268entry:
2269  %z = getelementptr inbounds i8, ptr %y, i32 3
2270  %mask = load <8 x i16>, ptr %m, align 2
2271  %c = icmp ne <8 x i16> %mask, zeroinitializer
2272  %0 = load <8 x half>, ptr %x, align 2
2273  call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c)
2274  ret ptr %y
2275}
2276
; strhf16_2: masked store of the <8 x half> loaded from %x (alignment 2) to
; %y + 2 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output uses the offset as the store immediate: vstrht.16 q0, [r0, #2].
2277define ptr @strhf16_2(ptr %y, ptr %x, ptr %m) {
2278; CHECK-LABEL: strhf16_2:
2279; CHECK:       @ %bb.0: @ %entry
2280; CHECK-NEXT:    vldrh.u16 q1, [r2]
2281; CHECK-NEXT:    vldrh.u16 q0, [r1]
2282; CHECK-NEXT:    vpt.i16 ne, q1, zr
2283; CHECK-NEXT:    vstrht.16 q0, [r0, #2]
2284; CHECK-NEXT:    bx lr
2285entry:
2286  %z = getelementptr inbounds i8, ptr %y, i32 2
2287  %mask = load <8 x i16>, ptr %m, align 2
2288  %c = icmp ne <8 x i16> %mask, zeroinitializer
2289  %0 = load <8 x half>, ptr %x, align 2
2290  call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c)
2291  ret ptr %y
2292}
2293
; strhf16_254: masked store of the <8 x half> loaded from %x (alignment 2) to
; %y + 254 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output uses the offset as the store immediate:
; vstrht.16 q0, [r0, #254].
2294define ptr @strhf16_254(ptr %y, ptr %x, ptr %m) {
2295; CHECK-LABEL: strhf16_254:
2296; CHECK:       @ %bb.0: @ %entry
2297; CHECK-NEXT:    vldrh.u16 q1, [r2]
2298; CHECK-NEXT:    vldrh.u16 q0, [r1]
2299; CHECK-NEXT:    vpt.i16 ne, q1, zr
2300; CHECK-NEXT:    vstrht.16 q0, [r0, #254]
2301; CHECK-NEXT:    bx lr
2302entry:
2303  %z = getelementptr inbounds i8, ptr %y, i32 254
2304  %mask = load <8 x i16>, ptr %m, align 2
2305  %c = icmp ne <8 x i16> %mask, zeroinitializer
2306  %0 = load <8 x half>, ptr %x, align 2
2307  call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c)
2308  ret ptr %y
2309}
2310
; strhf16_256: masked store of the <8 x half> loaded from %x (alignment 2) to
; %y + 256 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output computes the address separately (add.w r1, r0, #256) and
; stores through [r1] with no immediate.
; NOTE(review): presumably 256 is beyond the offset this store can encode
; (254 is used as an immediate in strhf16_254) - confirm against the MVE docs.
2311define ptr @strhf16_256(ptr %y, ptr %x, ptr %m) {
2312; CHECK-LABEL: strhf16_256:
2313; CHECK:       @ %bb.0: @ %entry
2314; CHECK-NEXT:    vldrh.u16 q1, [r2]
2315; CHECK-NEXT:    vldrh.u16 q0, [r1]
2316; CHECK-NEXT:    add.w r1, r0, #256
2317; CHECK-NEXT:    vpt.i16 ne, q1, zr
2318; CHECK-NEXT:    vstrht.16 q0, [r1]
2319; CHECK-NEXT:    bx lr
2320entry:
2321  %z = getelementptr inbounds i8, ptr %y, i32 256
2322  %mask = load <8 x i16>, ptr %m, align 2
2323  %c = icmp ne <8 x i16> %mask, zeroinitializer
2324  %0 = load <8 x half>, ptr %x, align 2
2325  call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c)
2326  ret ptr %y
2327}
2328
; strhf16_m254: masked store of the <8 x half> loaded from %x (alignment 2) to
; %y - 254 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output uses the negative offset as the store immediate:
; vstrht.16 q0, [r0, #-254].
2329define ptr @strhf16_m254(ptr %y, ptr %x, ptr %m) {
2330; CHECK-LABEL: strhf16_m254:
2331; CHECK:       @ %bb.0: @ %entry
2332; CHECK-NEXT:    vldrh.u16 q1, [r2]
2333; CHECK-NEXT:    vldrh.u16 q0, [r1]
2334; CHECK-NEXT:    vpt.i16 ne, q1, zr
2335; CHECK-NEXT:    vstrht.16 q0, [r0, #-254]
2336; CHECK-NEXT:    bx lr
2337entry:
2338  %z = getelementptr inbounds i8, ptr %y, i32 -254
2339  %mask = load <8 x i16>, ptr %m, align 2
2340  %c = icmp ne <8 x i16> %mask, zeroinitializer
2341  %0 = load <8 x half>, ptr %x, align 2
2342  call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c)
2343  ret ptr %y
2344}
2345
; strhf16_m256: masked store of the <8 x half> loaded from %x (alignment 2) to
; %y - 256 bytes; the function returns %y unchanged.
; The mask is the lanewise "ne 0" compare of the <8 x i16> loaded from %m.
; Expected output computes the address separately (sub.w r1, r0, #256) and
; stores through [r1] with no immediate.
; NOTE(review): presumably -256 is beyond the offset this store can encode
; (-254 is used as an immediate in strhf16_m254) - confirm against the MVE docs.
2346define ptr @strhf16_m256(ptr %y, ptr %x, ptr %m) {
2347; CHECK-LABEL: strhf16_m256:
2348; CHECK:       @ %bb.0: @ %entry
2349; CHECK-NEXT:    vldrh.u16 q1, [r2]
2350; CHECK-NEXT:    vldrh.u16 q0, [r1]
2351; CHECK-NEXT:    sub.w r1, r0, #256
2352; CHECK-NEXT:    vpt.i16 ne, q1, zr
2353; CHECK-NEXT:    vstrht.16 q0, [r1]
2354; CHECK-NEXT:    bx lr
2355entry:
2356  %z = getelementptr inbounds i8, ptr %y, i32 -256
2357  %mask = load <8 x i16>, ptr %m, align 2
2358  %c = icmp ne <8 x i16> %mask, zeroinitializer
2359  %0 = load <8 x half>, ptr %x, align 2
2360  call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %z, i32 2, <8 x i1> %c)
2361  ret ptr %y
2362}
2363
; Masked-load intrinsic declarations, one per vector type.
; Operands: (pointer, i32 alignment, <N x i1> mask, pass-through vector);
; the result has the same type as the pass-through operand.
2364declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
2365declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
2366declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
2367declare <4 x i8> @llvm.masked.load.v4i8.p0(ptr, i32, <4 x i1>, <4 x i8>)
2368declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
2369declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
2370declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
2371declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32, <8 x i1>, <8 x half>)
2372
; Masked-store intrinsic declarations, one per vector type.
; Operands: (value vector, pointer, i32 alignment, <N x i1> mask); no result.
2373declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
2374declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
2375declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
2376declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
2377declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
2378declare void @llvm.masked.store.v4i8.p0(<4 x i8>, ptr, i32, <4 x i1>)
2379declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
2380declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32, <8 x i1>)
2381