; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define float @matching_fp_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_scalar_volatile(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar_volatile(
; CHECK-NEXT:    [[R:%.*]] = load volatile float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load volatile float, ptr %p, align 16
  ret float %r
}

define double @larger_fp_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load double, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, ptr %p, align 16
  ret double %r
}

define float @smaller_fp_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_vector(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_vector_gep00(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector_gep00(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_vector_gep01(ptr align 16 dereferenceable(20) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 0, i64 1
  %r = load float, ptr %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep01_deref(ptr align 16 dereferenceable(19) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 0, i64 1
  %r = load float, ptr %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep10(ptr align 16 dereferenceable(32) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 1, i64 0
  %r = load float, ptr %gep, align 16
  ret float %r
}

define float @matching_fp_vector_gep10_deref(ptr align 16 dereferenceable(31) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 1, i64 0
  %r = load float, ptr %gep, align 16
  ret float %r
}

define float @nonmatching_int_vector(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @nonmatching_int_vector(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define double @less_aligned(ptr align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @less_aligned(
; CHECK-NEXT:    [[R:%.*]] = load double, ptr [[P:%.*]], align 4
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, ptr %p, align 4
  ret double %r
}

define float @matching_fp_scalar_small_deref(ptr align 16 dereferenceable(15) %p) {
; CHECK-LABEL: @matching_fp_scalar_small_deref(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define i64 @larger_int_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_int_scalar(
; CHECK-NEXT:    [[R:%.*]] = load i64, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret i64 [[R]]
;
  %r = load i64, ptr %p, align 16
  ret i64 %r
}

define i8 @smaller_int_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_int_scalar(
; CHECK-NEXT:    [[R:%.*]] = load i8, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret i8 [[R]]
;
  %r = load i8, ptr %p, align 16
  ret i8 %r
}

define double @larger_fp_scalar_256bit_vec(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @larger_fp_scalar_256bit_vec(
; CHECK-NEXT:    [[R:%.*]] = load double, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, ptr %p, align 32
  ret double %r
}

define <4 x float> @load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> poison, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @casted_load_f32_insert_v4f32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> poison, float %s, i32 0
  ret <4 x float> %r
}

; Element type does not change cost.

define <4 x i32> @load_i32_insert_v4i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <4 x i32> poison, i32 %s, i32 0
  ret <4 x i32> %r
}

; Pointer type does not change cost.

define <4 x i32> @casted_load_i32_insert_v4i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <4 x i32> poison, i32 %s, i32 0
  ret <4 x i32> %r
}

; This is canonical form for vector element access.

define <4 x float> @gep00_load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 16
  %r = insertelement <4 x float> poison, float %s, i64 0
  ret <4 x float> %r
}

; Should work with addrspace as well.

define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(ptr addrspace(44) align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr addrspace(44) [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr addrspace(44) %p, align 16
  %r = insertelement <4 x float> poison, float %s, i64 0
  ret <4 x float> %r
}

; Should work with addrspace even when peeking past unsafe loads through geps

define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenceable(16) %v3) {
; CHECK-LABEL: @unsafe_load_i32_insert_v4i32_addrspace(
; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[V3:%.*]] to ptr addrspace(42)
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(42) [[TMP1]], align 16
; CHECK-NEXT:    [[INSELT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x i32> [[INSELT]]
;
  %t0 = getelementptr inbounds i32, ptr %v3, i32 1
  %t1 = addrspacecast ptr %t0 to ptr addrspace(42)
  %t2 = getelementptr inbounds i32, ptr addrspace(42) %t1, i64 1
  %val = load i32, ptr addrspace(42) %t2, align 4
  %inselt = insertelement <4 x i32> poison, i32 %val, i32 0
  ret <4 x i32> %inselt
}

; If there are enough dereferenceable bytes, we can offset the vector load.

define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
  %s = load i16, ptr %gep, align 2
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(17) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
  %s = load i16, ptr %gep, align 2
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

; Verify that alignment of the new load is not over-specified.

define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
  %s = load i16, ptr %gep, align 8
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size.
; TODO: Could bitcast around this limitation.

define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
  %s = load i32, ptr %gep, align 1
  %r = insertelement <4 x i32> poison, i32 %s, i64 0
  ret <4 x i32> %r
}

define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 12
  %s = load i32, ptr %gep, align 1
  %r = insertelement <4 x i32> poison, i32 %s, i64 0
  ret <4 x i32> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size and the offset must be low enough to fit in the vector
; (bitcasting would not help this case).

define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 13
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 13
  %s = load i32, ptr %gep, align 1
  %r = insertelement <4 x i32> poison, i32 %s, i64 0
  ret <4 x i32> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.

define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under asan because widened load can cause spurious
; use-after-poison issues when __asan_poison_memory_region is used.

define <8 x i16> @gep10_load_i16_insert_v8i16_asan(ptr align 16 dereferenceable(32) %p) sanitize_address {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_asan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

; hwasan and memtag should be similarly suppressed.

define <8 x i16> @gep10_load_i16_insert_v8i16_hwasan(ptr align 16 dereferenceable(32) %p) sanitize_hwaddress {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_hwasan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

define <8 x i16> @gep10_load_i16_insert_v8i16_memtag(ptr align 16 dereferenceable(32) %p) sanitize_memtag {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_memtag(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under tsan because widened load may overlap bytes
; being concurrently modified. tsan does not know that some bytes are undef.

define <8 x i16> @gep10_load_i16_insert_v8i16_tsan(ptr align 16 dereferenceable(32) %p) sanitize_thread {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_tsan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - can't safely load the offset vector, but could load+shuffle.

define <8 x i16> @gep10_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(31) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - do not alter volatile.

define <4 x float> @load_f32_insert_v4f32_volatile(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
; CHECK-NEXT:    [[S:%.*]] = load volatile float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load volatile float, ptr %p, align 4
  %r = insertelement <4 x float> poison, float %s, i32 0
  ret <4 x float> %r
}

; Pointer is not as aligned as load, but that's ok.
; The new load uses the larger alignment value.

define <4 x float> @load_f32_insert_v4f32_align(ptr align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_align(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> poison, float %s, i32 0
  ret <4 x float> %r
}

; Negative test - not enough bytes.

define <4 x float> @load_f32_insert_v4f32_deref(ptr align 4 dereferenceable(15) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_deref(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> poison, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_i32_insert_v8i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <8 x i32> poison, i32 %s, i32 0
  ret <8 x i32> %r
}

define <8 x i32> @casted_load_i32_insert_v8i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <8 x i32> poison, i32 %s, i32 0
  ret <8 x i32> %r
}

define <16 x float> @load_f32_insert_v16f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v16f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <16 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <16 x float> poison, float %s, i32 0
  ret <16 x float> %r
}

define <2 x float> @load_f32_insert_v2f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v2f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <2 x float> poison, float %s, i32 0
  ret <2 x float> %r
}

; Negative test - suppress load widening for asan/hwasan/memtag/tsan.

define <2 x float> @load_f32_insert_v2f32_asan(ptr align 16 dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_f32_insert_v2f32_asan(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> poison, float [[S]], i32 0
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <2 x float> poison, float %s, i32 0
  ret <2 x float> %r
}

declare ptr @getscaleptr()
define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr nocapture nonnull readonly %opptr) nofree nosync {
; CHECK-LABEL: @PR47558_multiple_use_load(
; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
; CHECK-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
; CHECK-NEXT:    store <2 x float> [[T3]], ptr [[RESULTPTR:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
  %op = load <2 x float>, ptr %opptr, align 4
  %scale = load float, ptr %scaleptr, align 16
  %t1 = insertelement <2 x float> poison, float %scale, i32 0
  %t2 = insertelement <2 x float> %t1, float %scale, i32 1
  %t3 = fmul <2 x float> %op, %t2
  %t4 = extractelement <2 x float> %t3, i32 0
  %result0 = insertelement <2 x float> poison, float %t4, i32 0
  %t5 = extractelement <2 x float> %t3, i32 1
  %result1 = insertelement <2 x float> %result0, float %t5, i32 1
  store <2 x float> %result1, ptr %resultptr, align 8
  ret void
}

define <4 x float> @load_v2f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <2 x float>, ptr %p, align 4
  %s = extractelement <2 x float> %l, i32 0
  %r = insertelement <4 x float> poison, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @load_v8f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <8 x float>, ptr %p, align 4
  %s = extractelement <8 x float> %l, i32 0
  %r = insertelement <4 x float> poison, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 dereferenceable(16) %p, ptr %store_ptr) nofree nosync {
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT:    [[L:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    store <1 x i32> [[L]], ptr [[STORE_PTR:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <1 x i32> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> poison, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %l = load <1 x i32>, ptr %p, align 4
  store <1 x i32> %l, ptr %store_ptr
  %s = extractelement <1 x i32> %l, i32 0
  %r = insertelement <8 x i32> poison, i32 %s, i32 0
  ret <8 x i32> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(ptr align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <2 x i16>, ptr %p, i64 1
  %l = load <2 x i16>, ptr %gep, align 8
  %s = extractelement <2 x i16> %l, i32 0
  %r = insertelement <8 x i16> poison, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative sanitizer tests.

define <4 x i32> @load_i32_insert_v4i32_asan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_address  {
; CHECK-LABEL: @load_i32_insert_v4i32_asan(
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i32 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <4 x i32> poison, i32 %s, i32 0
  ret <4 x i32> %r
}

define <4 x float> @load_v2f32_extract_insert_v4f32_hwasan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_hwaddress  {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32_hwasan(
; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <2 x float>, ptr %p, align 4
  %s = extractelement <2 x float> %l, i32 0
  %r = insertelement <4 x float> poison, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @load_v2f32_extract_insert_v4f32_tsan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_thread  {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32_tsan(
; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <2 x float>, ptr %p, align 4
  %s = extractelement <2 x float> %l, i32 0
  %r = insertelement <4 x float> poison, float %s, i32 0
  ret <4 x float> %r
}

; Double negative msan tests, it's OK with the optimization.

define <2 x float> @load_f32_insert_v2f32_msan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_memory  {
; CHECK-LABEL: @load_f32_insert_v2f32_msan(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <2 x float> poison, float %s, i32 0
  ret <2 x float> %r
}

; PR30986 - split vector loads for scalarized operations
define <2 x i64> @PR30986(ptr %0) {
; CHECK-LABEL: @PR30986(
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i64>, ptr [[TMP0:%.*]], i32 0, i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 16
; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i64>, ptr [[TMP0]], i32 0, i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP8]], i32 1
; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
;
  %2 = load <2 x i64>, ptr %0, align 16
  %3 = extractelement <2 x i64> %2, i32 0
  %4 = tail call i64 @llvm.ctpop.i64(i64 %3)
  %5 = insertelement <2 x i64> poison, i64 %4, i32 0
  %6 = extractelement <2 x i64> %2, i32 1
  %7 = tail call i64 @llvm.ctpop.i64(i64 %6)
  %8 = insertelement <2 x i64> %5, i64 %7, i32 1
  ret <2 x i64> %8
}
declare i64 @llvm.ctpop.i64(i64)
