; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2

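; Summary (editorial note): the functions below exercise VectorCombine's handling
; of loads. Plain scalar loads with no vector users are expected to be left
; unchanged, while loads feeding extractelement/insertelement may be split,
; scalarized, or widened depending on dereferenceable bytes, alignment, the
; SSE2/AVX2 cost model, and sanitizer attributes. To refresh the CHECK lines,
; rerun llvm/utils/update_test_checks.py on this file (see the NOTE above; the
; exact invocation depends on where your build's opt binary lives).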
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define float @matching_fp_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_scalar_volatile(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar_volatile(
; CHECK-NEXT:    [[R:%.*]] = load volatile float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load volatile float, ptr %p, align 16
  ret float %r
}

define double @larger_fp_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load double, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, ptr %p, align 16
  ret double %r
}

define float @smaller_fp_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_vector(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_vector_gep00(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector_gep00(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define float @matching_fp_vector_gep01(ptr align 16 dereferenceable(20) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 0, i64 1
  %r = load float, ptr %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep01_deref(ptr align 16 dereferenceable(19) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 0, i64 1
  %r = load float, ptr %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep10(ptr align 16 dereferenceable(32) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 1, i64 0
  %r = load float, ptr %gep, align 16
  ret float %r
}

define float @matching_fp_vector_gep10_deref(ptr align 16 dereferenceable(31) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, ptr %p, i64 1, i64 0
  %r = load float, ptr %gep, align 16
  ret float %r
}

define float @nonmatching_int_vector(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @nonmatching_int_vector(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define double @less_aligned(ptr align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @less_aligned(
; CHECK-NEXT:    [[R:%.*]] = load double, ptr [[P:%.*]], align 4
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, ptr %p, align 4
  ret double %r
}

define float @matching_fp_scalar_small_deref(ptr align 16 dereferenceable(15) %p) {
; CHECK-LABEL: @matching_fp_scalar_small_deref(
; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, ptr %p, align 16
  ret float %r
}

define i64 @larger_int_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_int_scalar(
; CHECK-NEXT:    [[R:%.*]] = load i64, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret i64 [[R]]
;
  %r = load i64, ptr %p, align 16
  ret i64 %r
}

define i8 @smaller_int_scalar(ptr align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_int_scalar(
; CHECK-NEXT:    [[R:%.*]] = load i8, ptr [[P:%.*]], align 16
; CHECK-NEXT:    ret i8 [[R]]
;
  %r = load i8, ptr %p, align 16
  ret i8 %r
}

define double @larger_fp_scalar_256bit_vec(ptr align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @larger_fp_scalar_256bit_vec(
; CHECK-NEXT:    [[R:%.*]] = load double, ptr [[P:%.*]], align 32
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, ptr %p, align 32
  ret double %r
}

define <4 x float> @load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @casted_load_f32_insert_v4f32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Element type does not change cost.

define <4 x i32> @load_i32_insert_v4i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; Pointer type does not change cost.

define <4 x i32> @casted_load_i32_insert_v4i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; This is canonical form for vector element access.

define <4 x float> @gep00_load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; Should work with addrspace as well.

define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(ptr addrspace(44) align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr addrspace(44) [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr addrspace(44) %p, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.

define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 2
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
  %s = load i16, ptr %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(17) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 2
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
  %s = load i16, ptr %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Verify that alignment of the new load is not over-specified.

define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 8
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
  %s = load i16, ptr %gep, align 8
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size.
; TODO: Could bitcast around this limitation.

define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
  %s = load i32, ptr %gep, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 12
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 12
  %s = load i32, ptr %gep, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size and the offset must be low enough to fit in the vector
; (bitcasting would not help this case).

define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 13
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 13
  %s = load i32, ptr %gep, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.

define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under asan because widened load can cause spurious
; use-after-poison issues when __asan_poison_memory_region is used.

define <8 x i16> @gep10_load_i16_insert_v8i16_asan(ptr align 16 dereferenceable(32) %p) sanitize_address nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_asan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; hwasan and memtag should be similarly suppressed.

define <8 x i16> @gep10_load_i16_insert_v8i16_hwasan(ptr align 16 dereferenceable(32) %p) sanitize_hwaddress nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_hwasan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

define <8 x i16> @gep10_load_i16_insert_v8i16_memtag(ptr align 16 dereferenceable(32) %p) sanitize_memtag nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_memtag(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under tsan because widened load may overlap bytes
; being concurrently modified. tsan does not know that some bytes are undef.

define <8 x i16> @gep10_load_i16_insert_v8i16_tsan(ptr align 16 dereferenceable(32) %p) sanitize_thread nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_tsan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - can't safely load the offset vector, but could load+shuffle.

define <8 x i16> @gep10_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(31) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
  %s = load i16, ptr %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - do not alter volatile.

define <4 x float> @load_f32_insert_v4f32_volatile(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
; CHECK-NEXT:    [[S:%.*]] = load volatile float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load volatile float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Pointer is not as aligned as load, but that's ok.
; The new load uses the larger alignment value.

define <4 x float> @load_f32_insert_v4f32_align(ptr align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_align(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Negative test - not enough bytes.

define <4 x float> @load_f32_insert_v4f32_deref(ptr align 4 dereferenceable(15) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_deref(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_i32_insert_v8i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <8 x i32> @casted_load_i32_insert_v8i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(
; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %s = load i32, ptr %p, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <16 x float> @load_f32_insert_v16f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v16f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <16 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <16 x float> undef, float %s, i32 0
  ret <16 x float> %r
}

define <2 x float> @load_f32_insert_v2f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v2f32(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

; Negative test - suppress load widening for asan/hwasan/memtag/tsan.

define <2 x float> @load_f32_insert_v2f32_asan(ptr align 16 dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_f32_insert_v2f32_asan(
; CHECK-NEXT:    [[S:%.*]] = load float, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, ptr %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

declare ptr @getscaleptr()
define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr nocapture nonnull readonly %opptr) {
; CHECK-LABEL: @PR47558_multiple_use_load(
; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
; CHECK-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
  %op = load <2 x float>, ptr %opptr, align 4
  %scale = load float, ptr %scaleptr, align 16
  %t1 = insertelement <2 x float> undef, float %scale, i32 0
  %t2 = insertelement <2 x float> %t1, float %scale, i32 1
  %t3 = fmul <2 x float> %op, %t2
  %t4 = extractelement <2 x float> %t3, i32 0
  %result0 = insertelement <2 x float> undef, float %t4, i32 0
  %t5 = extractelement <2 x float> %t3, i32 1
  %result1 = insertelement <2 x float> %result0, float %t5, i32 1
  store <2 x float> %result1, ptr %resultptr, align 8
  ret void
}

define <4 x float> @load_v2f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <2 x float>, ptr %p, align 4
  %s = extractelement <2 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @load_v8f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; SSE2-LABEL: @load_v8f32_extract_insert_v4f32(
; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x float>, ptr [[P:%.*]], i32 0, i32 0
; SSE2-NEXT:    [[S:%.*]] = load float, ptr [[TMP1]], align 4
; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; SSE2-NEXT:    ret <4 x float> [[R]]
;
; AVX2-LABEL: @load_v8f32_extract_insert_v4f32(
; AVX2-NEXT:    [[L:%.*]] = load <8 x float>, ptr [[P:%.*]], align 4
; AVX2-NEXT:    [[S:%.*]] = extractelement <8 x float> [[L]], i32 0
; AVX2-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; AVX2-NEXT:    ret <4 x float> [[R]]
;
  %l = load <8 x float>, ptr %p, align 4
  %s = extractelement <8 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 dereferenceable(16) %p, ptr %store_ptr) nofree nosync {
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT:    [[L:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT:    store <1 x i32> [[L]], ptr [[STORE_PTR:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <1 x i32> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %l = load <1 x i32>, ptr %p, align 4
  store <1 x i32> %l, ptr %store_ptr
  %s = extractelement <1 x i32> %l, i32 0
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(ptr align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, ptr [[P:%.*]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, ptr [[GEP]], i32 0, i32 0
; CHECK-NEXT:    [[S:%.*]] = load i16, ptr [[TMP1]], align 8
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <2 x i16>, ptr %p, i64 1
  %l = load <2 x i16>, ptr %gep, align 8
  %s = extractelement <2 x i16> %l, i32 0
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; PR30986 - split vector loads for scalarized operations
define <2 x i64> @PR30986(ptr %0) {
; CHECK-LABEL: @PR30986(
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i64>, ptr [[TMP0:%.*]], i32 0, i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 16
; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i64>, ptr [[TMP0]], i32 0, i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP8]], i32 1
; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
;
  %2 = load <2 x i64>, ptr %0, align 16
  %3 = extractelement <2 x i64> %2, i32 0
  %4 = tail call i64 @llvm.ctpop.i64(i64 %3)
  %5 = insertelement <2 x i64> undef, i64 %4, i32 0
  %6 = extractelement <2 x i64> %2, i32 1
  %7 = tail call i64 @llvm.ctpop.i64(i64 %6)
  %8 = insertelement <2 x i64> %5, i64 %7, i32 1
  ret <2 x i64> %8
}
declare i64 @llvm.ctpop.i64(i64)