xref: /llvm-project/llvm/test/Transforms/VectorCombine/X86/load-widening.ll (revision 497e2e8cf8d27d0488b089757f1569c4c7d8635e)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,SSE
3; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,AVX
4; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,SSE
5; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,AVX
6
7;-------------------------------------------------------------------------------
8; Here we know we can load 128 bits as per dereferenceability and alignment.
9
10; We don't widen scalar loads per se.
11define <1 x float> @scalar(ptr align 16 dereferenceable(16) %p) {
12; CHECK-LABEL: @scalar(
13; CHECK-NEXT:    [[R:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
14; CHECK-NEXT:    ret <1 x float> [[R]]
15;
16  %r = load <1 x float>, ptr %p, align 16
17  ret <1 x float> %r
18}
19
20; We don't widen single-element loads; these get scalarized.
21define <1 x float> @vec_with_1elt(ptr align 16 dereferenceable(16) %p) {
22; CHECK-LABEL: @vec_with_1elt(
23; CHECK-NEXT:    [[R:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
24; CHECK-NEXT:    ret <1 x float> [[R]]
25;
26  %r = load <1 x float>, ptr %p, align 16
27  ret <1 x float> %r
28}
29
; A 64-bit load; the CHECK lines show the plain load is left unchanged even
; though deref(16)+align 16 would permit a 128-bit load.
30define <2 x float> @vec_with_2elts(ptr align 16 dereferenceable(16) %p) {
31; CHECK-LABEL: @vec_with_2elts(
32; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, ptr [[P:%.*]], align 16
33; CHECK-NEXT:    ret <2 x float> [[R]]
34;
35  %r = load <2 x float>, ptr %p, align 16
36  ret <2 x float> %r
37}
38
; A 96-bit load; likewise left unchanged (no user shuffle to fold into).
39define <3 x float> @vec_with_3elts(ptr align 16 dereferenceable(16) %p) {
40; CHECK-LABEL: @vec_with_3elts(
41; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 16
42; CHECK-NEXT:    ret <3 x float> [[R]]
43;
44  %r = load <3 x float>, ptr %p, align 16
45  ret <3 x float> %r
46}
47
48; Full-vector load. All good already.
49define <4 x float> @vec_with_4elts(ptr align 16 dereferenceable(16) %p) {
50; CHECK-LABEL: @vec_with_4elts(
51; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
52; CHECK-NEXT:    ret <4 x float> [[R]]
53;
54  %r = load <4 x float>, ptr %p, align 16
55  ret <4 x float> %r
56}
57
58; We don't know we can load 256 bits though.
59define <5 x float> @vec_with_5elts(ptr align 16 dereferenceable(16) %p) {
60; CHECK-LABEL: @vec_with_5elts(
61; CHECK-NEXT:    [[R:%.*]] = load <5 x float>, ptr [[P:%.*]], align 16
62; CHECK-NEXT:    ret <5 x float> [[R]]
63;
64  %r = load <5 x float>, ptr %p, align 16
65  ret <5 x float> %r
66}
67
68;-------------------------------------------------------------------------------
69
70; We can load 128 bits, and the fact that it's underaligned isn't relevant.
71define <3 x float> @vec_with_3elts_underaligned(ptr align 8 dereferenceable(16) %p) {
72; CHECK-LABEL: @vec_with_3elts_underaligned(
73; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 8
74; CHECK-NEXT:    ret <3 x float> [[R]]
75;
76  %r = load <3 x float>, ptr %p, align 8
77  ret <3 x float> %r
78}
79
80; We don't know we can load 128 bits, but since the pointer is 16-byte aligned, we could still do a wide load.
81; FIXME: this should still get widened.
82define <3 x float> @vec_with_3elts_underdereferenceable(ptr align 16 dereferenceable(12) %p) {
83; CHECK-LABEL: @vec_with_3elts_underdereferenceable(
84; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 16
85; CHECK-NEXT:    ret <3 x float> [[R]]
86;
87  %r = load <3 x float>, ptr %p, align 16
88  ret <3 x float> %r
89}
90
91; We can't tell if we can load 128 bits.
; Neither the alignment (8) nor dereferenceable(12) covers 16 bytes, so the
; narrow load must stay (and the CHECK lines confirm it does).
92define <3 x float> @vec_with_3elts_underaligned_underdereferenceable(ptr align 8 dereferenceable(12) %p) {
93; CHECK-LABEL: @vec_with_3elts_underaligned_underdereferenceable(
94; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 8
95; CHECK-NEXT:    ret <3 x float> [[R]]
96;
97  %r = load <3 x float>, ptr %p, align 8
98  ret <3 x float> %r
99}
100
101;-------------------------------------------------------------------------------
102; Here we know we can load 256 bits as per dereferenceability and alignment.
103
; Sub-256-bit loads from a 32-byte aligned, dereferenceable(32) pointer.
; The CHECK lines show none of these plain loads are widened.
104define <1 x float> @vec_with_1elt_256bits(ptr align 32 dereferenceable(32) %p) {
105; CHECK-LABEL: @vec_with_1elt_256bits(
106; CHECK-NEXT:    [[R:%.*]] = load <1 x float>, ptr [[P:%.*]], align 32
107; CHECK-NEXT:    ret <1 x float> [[R]]
108;
109  %r = load <1 x float>, ptr %p, align 32
110  ret <1 x float> %r
111}
112
113define <2 x float> @vec_with_2elts_256bits(ptr align 32 dereferenceable(32) %p) {
114; CHECK-LABEL: @vec_with_2elts_256bits(
115; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, ptr [[P:%.*]], align 32
116; CHECK-NEXT:    ret <2 x float> [[R]]
117;
118  %r = load <2 x float>, ptr %p, align 32
119  ret <2 x float> %r
120}
121
122define <3 x float> @vec_with_3elts_256bits(ptr align 32 dereferenceable(32) %p) {
123; CHECK-LABEL: @vec_with_3elts_256bits(
124; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, ptr [[P:%.*]], align 32
125; CHECK-NEXT:    ret <3 x float> [[R]]
126;
127  %r = load <3 x float>, ptr %p, align 32
128  ret <3 x float> %r
129}
130
131define <4 x float> @vec_with_4elts_256bits(ptr align 32 dereferenceable(32) %p) {
132; CHECK-LABEL: @vec_with_4elts_256bits(
133; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, ptr [[P:%.*]], align 32
134; CHECK-NEXT:    ret <4 x float> [[R]]
135;
136  %r = load <4 x float>, ptr %p, align 32
137  ret <4 x float> %r
138}
139
140define <5 x float> @vec_with_5elts_256bits(ptr align 32 dereferenceable(32) %p) {
141; CHECK-LABEL: @vec_with_5elts_256bits(
142; CHECK-NEXT:    [[R:%.*]] = load <5 x float>, ptr [[P:%.*]], align 32
143; CHECK-NEXT:    ret <5 x float> [[R]]
144;
145  %r = load <5 x float>, ptr %p, align 32
146  ret <5 x float> %r
147}
148
149define <6 x float> @vec_with_6elts_256bits(ptr align 32 dereferenceable(32) %p) {
150; CHECK-LABEL: @vec_with_6elts_256bits(
151; CHECK-NEXT:    [[R:%.*]] = load <6 x float>, ptr [[P:%.*]], align 32
152; CHECK-NEXT:    ret <6 x float> [[R]]
153;
154  %r = load <6 x float>, ptr %p, align 32
155  ret <6 x float> %r
156}
157
158define <7 x float> @vec_with_7elts_256bits(ptr align 32 dereferenceable(32) %p) {
159; CHECK-LABEL: @vec_with_7elts_256bits(
160; CHECK-NEXT:    [[R:%.*]] = load <7 x float>, ptr [[P:%.*]], align 32
161; CHECK-NEXT:    ret <7 x float> [[R]]
162;
163  %r = load <7 x float>, ptr %p, align 32
164  ret <7 x float> %r
165}
166
167; Full-vector load. All good already.
168define <8 x float> @vec_with_8elts_256bits(ptr align 32 dereferenceable(32) %p) {
169; CHECK-LABEL: @vec_with_8elts_256bits(
170; CHECK-NEXT:    [[R:%.*]] = load <8 x float>, ptr [[P:%.*]], align 32
171; CHECK-NEXT:    ret <8 x float> [[R]]
172;
173  %r = load <8 x float>, ptr %p, align 32
174  ret <8 x float> %r
175}
176
177; We can't tell if we can load more than 256 bits.
178define <9 x float> @vec_with_9elts_256bits(ptr align 32 dereferenceable(32) %p) {
179; CHECK-LABEL: @vec_with_9elts_256bits(
180; CHECK-NEXT:    [[R:%.*]] = load <9 x float>, ptr [[P:%.*]], align 32
181; CHECK-NEXT:    ret <9 x float> [[R]]
182;
183  %r = load <9 x float>, ptr %p, align 32
184  ret <9 x float> %r
185}
186
187;-------------------------------------------------------------------------------
188
189; Weird types we don't deal with
190define <2 x i7> @vec_with_two_subbyte_elts(ptr align 16 dereferenceable(16) %p) {
191; CHECK-LABEL: @vec_with_two_subbyte_elts(
192; CHECK-NEXT:    [[R:%.*]] = load <2 x i7>, ptr [[P:%.*]], align 16
193; CHECK-NEXT:    ret <2 x i7> [[R]]
194;
195  %r = load <2 x i7>, ptr %p, align 16
196  ret <2 x i7> %r
197}
198
199define <2 x i9> @vec_with_two_nonbyte_sized_elts(ptr align 16 dereferenceable(16) %p) {
200; CHECK-LABEL: @vec_with_two_nonbyte_sized_elts(
201; CHECK-NEXT:    [[R:%.*]] = load <2 x i9>, ptr [[P:%.*]], align 16
202; CHECK-NEXT:    ret <2 x i9> [[R]]
203;
204  %r = load <2 x i9>, ptr %p, align 16
205  ret <2 x i9> %r
206}
207
208define <2 x i24> @vec_with_two_nonpoweroftwo_sized_elts(ptr align 16 dereferenceable(16) %p) {
209; CHECK-LABEL: @vec_with_two_nonpoweroftwo_sized_elts(
210; CHECK-NEXT:    [[R:%.*]] = load <2 x i24>, ptr [[P:%.*]], align 16
211; CHECK-NEXT:    ret <2 x i24> [[R]]
212;
213  %r = load <2 x i24>, ptr %p, align 16
214  ret <2 x i24> %r
215}
216
; Non-default address space: the CHECK lines show the load in addrspace(2)
; is left unchanged as well.
217define <2 x float> @vec_with_2elts_addressspace(ptr addrspace(2) align 16 dereferenceable(16) %p) {
218; CHECK-LABEL: @vec_with_2elts_addressspace(
219; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, ptr addrspace(2) [[P:%.*]], align 16
220; CHECK-NEXT:    ret <2 x float> [[R]]
221;
222  %r = load <2 x float>, ptr addrspace(2) %p, align 16
223  ret <2 x float> %r
224}
225
226;-------------------------------------------------------------------------------
227
228; Widening these would change the legalized type, so leave them alone.
229
; Tiny integer element types (i1/i2/i4): the CHECK lines show these loads
; are not widened.
230define <2 x i1> @vec_with_2elts_128bits_i1(ptr align 16 dereferenceable(16) %p) {
231; CHECK-LABEL: @vec_with_2elts_128bits_i1(
232; CHECK-NEXT:    [[R:%.*]] = load <2 x i1>, ptr [[P:%.*]], align 16
233; CHECK-NEXT:    ret <2 x i1> [[R]]
234;
235  %r = load <2 x i1>, ptr %p, align 16
236  ret <2 x i1> %r
237}
238define <2 x i2> @vec_with_2elts_128bits_i2(ptr align 16 dereferenceable(16) %p) {
239; CHECK-LABEL: @vec_with_2elts_128bits_i2(
240; CHECK-NEXT:    [[R:%.*]] = load <2 x i2>, ptr [[P:%.*]], align 16
241; CHECK-NEXT:    ret <2 x i2> [[R]]
242;
243  %r = load <2 x i2>, ptr %p, align 16
244  ret <2 x i2> %r
245}
246define <2 x i4> @vec_with_2elts_128bits_i4(ptr align 16 dereferenceable(16) %p) {
247; CHECK-LABEL: @vec_with_2elts_128bits_i4(
248; CHECK-NEXT:    [[R:%.*]] = load <2 x i4>, ptr [[P:%.*]], align 16
249; CHECK-NEXT:    ret <2 x i4> [[R]]
250;
251  %r = load <2 x i4>, ptr %p, align 16
252  ret <2 x i4> %r
253}
254
255; Load the 128-bit vector because there is no additional cost.
256
; load + identity-subvector-insert shuffle: the CHECK lines show the pair is
; folded into a single full-width <4 x float> load.
257define <4 x float> @load_v1f32_v4f32(ptr dereferenceable(16) %p) {
258; CHECK-LABEL: @load_v1f32_v4f32(
259; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
260; CHECK-NEXT:    ret <4 x float> [[S]]
261;
262  %l = load <1 x float>, ptr %p, align 16
263  %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
264  ret <4 x float> %s
265}
266
267; Load the 128-bit vector because there is no additional cost.
268; Alignment is taken from param attr.
269
; Note the IR load only has align 1; the CHECK line shows the widened load
; gets align 16 from the parameter attribute.
270define <4 x float> @load_v2f32_v4f32(ptr align 16 dereferenceable(16) %p) {
271; CHECK-LABEL: @load_v2f32_v4f32(
272; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
273; CHECK-NEXT:    ret <4 x float> [[S]]
274;
275  %l = load <2 x float>, ptr %p, align 1
276  %s = shufflevector <2 x float> %l, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
277  ret <4 x float> %s
278}
279
280; Load the 128-bit vector because there is no additional cost.
281
282define <4 x float> @load_v3f32_v4f32(ptr dereferenceable(16) %p) {
283; CHECK-LABEL: @load_v3f32_v4f32(
284; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 1
285; CHECK-NEXT:    ret <4 x float> [[S]]
286;
287  %l = load <3 x float>, ptr %p, align 1
288  %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
289  ret <4 x float> %s
290}
291
292; Negative test - the shuffle must be a simple subvector insert.
293
; The mask <1, 0, 2, undef> reorders elements, so this is not a plain
; subvector insert; CHECK lines show load + shuffle are kept as-is.
294define <4 x float> @load_v3f32_v4f32_wrong_mask(ptr dereferenceable(16) %p) {
295; CHECK-LABEL: @load_v3f32_v4f32_wrong_mask(
296; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
297; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 poison>
298; CHECK-NEXT:    ret <4 x float> [[S]]
299;
300  %l = load <3 x float>, ptr %p, align 1
301  %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 undef>
302  ret <4 x float> %s
303}
304
305; Negative test - must be dereferenceable to vector width.
306
; dereferenceable(15) is one byte short of the 16 needed for <4 x float>;
; CHECK lines show no widening happens.
307define <4 x float> @load_v3f32_v4f32_not_deref(ptr dereferenceable(15) %p) {
308; CHECK-LABEL: @load_v3f32_v4f32_not_deref(
309; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 16
310; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
311; CHECK-NEXT:    ret <4 x float> [[S]]
312;
313  %l = load <3 x float>, ptr %p, align 16
314  %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
315  ret <4 x float> %s
316}
317
318; Without AVX, the cost of loading 256-bits would be greater.
319
; Target-dependent outcome: the SSE prefix keeps the narrow load + shuffle,
; while the AVX prefix shows a single widened <8 x float> load.
320define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) {
321; SSE-LABEL: @load_v2f32_v8f32(
322; SSE-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
323; SSE-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
324; SSE-NEXT:    ret <8 x float> [[S]]
325;
326; AVX-LABEL: @load_v2f32_v8f32(
327; AVX-NEXT:    [[S:%.*]] = load <8 x float>, ptr [[P:%.*]], align 1
328; AVX-NEXT:    ret <8 x float> [[S]]
329;
330  %l = load <2 x float>, ptr %p, align 1
331  %s = shufflevector <2 x float> %l, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
332  ret <8 x float> %s
333}
334
335; Integer type is ok too.
336
337define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) {
338; CHECK-LABEL: @load_v2i32_v4i32(
339; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
340; CHECK-NEXT:    ret <4 x i32> [[S]]
341;
342  %l = load <2 x i32>, ptr %p, align 1
343  %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
344  ret <4 x i32> %s
345}
346
347; TODO: We assume the shuffle mask is canonical.
348
; Mask <0, 1, 2, undef> reads index 2 of a 2-element source (out of range of
; real elements), so it is not recognized; CHECK lines show no widening.
349define <4 x i32> @load_v2i32_v4i32_non_canonical_mask(ptr dereferenceable(16) %p) {
350; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask(
351; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
352; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
353; CHECK-NEXT:    ret <4 x i32> [[S]]
354;
355  %l = load <2 x i32>, ptr %p, align 1
356  %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
357  ret <4 x i32> %s
358}
359
360; Allow non-canonical commuted shuffle.
361
; The loaded value is the second shuffle operand (mask 2,3 selects it);
; CHECK lines show this still widens to one <4 x i32> load.
362define <4 x i32> @load_v2i32_v4i32_non_canonical_mask_commute(ptr dereferenceable(16) %p) {
363; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask_commute(
364; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
365; CHECK-NEXT:    ret <4 x i32> [[S]]
366;
367  %l = load <2 x i32>, ptr %p, align 1
368  %s = shufflevector <2 x i32> poison, <2 x i32> %l, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
369  ret <4 x i32> %s
370}
371
372; The wide load must be in the same addrspace as the original load.
373
; CHECK lines show the widened load stays in addrspace(42), the space the
; original load used after the addrspacecast.
374define <4 x i32> @load_v2i32_v4i32_addrspacecast(ptr addrspace(5) align 16 dereferenceable(16) %p) {
375; CHECK-LABEL: @load_v2i32_v4i32_addrspacecast(
376; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[P:%.*]] to ptr addrspace(42)
377; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr addrspace(42) [[TMP1]], align 16
378; CHECK-NEXT:    ret <4 x i32> [[S]]
379;
380  %asc = addrspacecast ptr addrspace(5) %p to ptr addrspace(42)
381  %l = load <2 x i32>, ptr addrspace(42) %asc, align 4
382  %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
383  ret <4 x i32> %s
384}
385
386; Negative-negative tests with msan, which should be OK with widening.
387
; sanitize_memory does not block the fold: CHECK lines show the widened
; <4 x float> load, same as the non-sanitized @load_v1f32_v4f32 above.
388define <4 x float> @load_v1f32_v4f32_msan(ptr dereferenceable(16) %p) sanitize_memory  {
389; CHECK-LABEL: @load_v1f32_v4f32_msan(
390; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
391; CHECK-NEXT:    ret <4 x float> [[S]]
392;
393  %l = load <1 x float>, ptr %p, align 16
394  %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
395  ret <4 x float> %s
396}
397
398; Negative tests with sanitizers.
399
; With sanitize_address / sanitize_hwaddress / sanitize_thread, the CHECK
; lines show the narrow load + shuffle are kept (no widened load appears).
400define <4 x float> @load_v1f32_v4f32_asan(ptr dereferenceable(16) %p) sanitize_address  {
401; CHECK-LABEL: @load_v1f32_v4f32_asan(
402; CHECK-NEXT:    [[L:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
403; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x float> [[L]], <1 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
404; CHECK-NEXT:    ret <4 x float> [[S]]
405;
406  %l = load <1 x float>, ptr %p, align 16
407  %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
408  ret <4 x float> %s
409}
410
411define <4 x float> @load_v2f32_v4f32_hwasan(ptr align 16 dereferenceable(16) %p) sanitize_hwaddress {
412; CHECK-LABEL: @load_v2f32_v4f32_hwasan(
413; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
414; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
415; CHECK-NEXT:    ret <4 x float> [[S]]
416;
417  %l = load <2 x float>, ptr %p, align 1
418  %s = shufflevector <2 x float> %l, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
419  ret <4 x float> %s
420}
421
422define <4 x float> @load_v3f32_v4f32_tsan(ptr dereferenceable(16) %p) sanitize_thread  {
423; CHECK-LABEL: @load_v3f32_v4f32_tsan(
424; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
425; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
426; CHECK-NEXT:    ret <4 x float> [[S]]
427;
428  %l = load <3 x float>, ptr %p, align 1
429  %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
430  ret <4 x float> %s
431}
432
433define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize_hwaddress {
434; CHECK-LABEL: @load_v2f32_v8f32_hwasan(
435; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
436; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
437; CHECK-NEXT:    ret <8 x float> [[S]]
438;
439  %l = load <2 x float>, ptr %p, align 1
440  %s = shufflevector <2 x float> %l, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
441  ret <8 x float> %s
442}
443
444define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
445; CHECK-LABEL: @load_v2i32_v4i32_asan(
446; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
447; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
448; CHECK-NEXT:    ret <4 x i32> [[S]]
449;
450  %l = load <2 x i32>, ptr %p, align 1
451  %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
452  ret <4 x i32> %s
453}
454
455define <4 x i32> @load_v2i32_v4i32_non_canonical_mask_commute_hwasan(ptr dereferenceable(16) %p) sanitize_hwaddress {
456; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask_commute_hwasan(
457; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
458; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> poison, <2 x i32> [[L]], <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
459; CHECK-NEXT:    ret <4 x i32> [[S]]
460;
461  %l = load <2 x i32>, ptr %p, align 1
462  %s = shufflevector <2 x i32> poison, <2 x i32> %l, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
463  ret <4 x i32> %s
464}
465