; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes="default<O3>" -S            < %s | FileCheck %s --check-prefix=SSE
; RUN: opt -passes="default<O3>" -S -mattr=avx < %s | FileCheck %s --check-prefix=AVX

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64--"

%union.ElementWiseAccess = type { <4 x float> }

$getAt = comdat any

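; The element accesses below go through the by-reference union cast; they
; should fold to a single 16-byte vector load plus a shufflevector (lane 3
; repeats element 2, matching the duplicated index in the IR).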
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT:    ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT:    ret <4 x float> [[TMP3]]
;
  %2 = alloca ptr, align 8
  %3 = alloca <4 x float>, align 16
  store ptr %0, ptr %2, align 8
  %4 = load ptr, ptr %2, align 8
  %5 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %4)
  %6 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %5, i32 noundef 0)
  %7 = insertelement <4 x float> undef, float %6, i32 0
  %8 = load ptr, ptr %2, align 8
  %9 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %8)
  %10 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %9, i32 noundef 1)
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = load ptr, ptr %2, align 8
  %13 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %12)
  %14 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %13, i32 noundef 2)
  %15 = insertelement <4 x float> %11, float %14, i32 2
  %16 = load ptr, ptr %2, align 8
  %17 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %16)
  %18 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %17, i32 noundef 2)
  %19 = insertelement <4 x float> %15, float %18, i32 3
  store <4 x float> %19, ptr %3, align 16
  %20 = load <4 x float>, ptr %3, align 16
  ret <4 x float> %20
}

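; The by-value cast returns the union as { double, double }, so (as the checks
; show) the repeated element is still rebuilt from an i64 load, trunc and
; bitcast rather than folding into a single shuffle.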
define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 {
; SSE-LABEL: @ConvertVectors_ByVal(
; SSE-NEXT:  entry:
; SSE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
; SSE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 8
; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
; SSE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
; SSE-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
; SSE-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
;
; AVX-LABEL: @ConvertVectors_ByVal(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
; AVX-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 8
; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
; AVX-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
; AVX-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
;
entry:
  %V.addr = alloca ptr, align 8
  %.compoundliteral = alloca <4 x float>, align 16
  %ref.tmp = alloca %union.ElementWiseAccess, align 16
  %ref.tmp2 = alloca %union.ElementWiseAccess, align 16
  %ref.tmp7 = alloca %union.ElementWiseAccess, align 16
  %ref.tmp12 = alloca %union.ElementWiseAccess, align 16
  store ptr %V, ptr %V.addr, align 8
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp) #4
  %0 = load ptr, ptr %V.addr, align 8
  %call = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %0)
  %coerce.dive = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp, i32 0, i32 0
  %1 = getelementptr inbounds { double, double }, ptr %coerce.dive, i32 0, i32 0
  %2 = extractvalue { double, double } %call, 0
  store double %2, ptr %1, align 16
  %3 = getelementptr inbounds { double, double }, ptr %coerce.dive, i32 0, i32 1
  %4 = extractvalue { double, double } %call, 1
  store double %4, ptr %3, align 8
  %call1 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp, i32 noundef 0)
  %vecinit = insertelement <4 x float> undef, float %call1, i32 0
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp2) #4
  %5 = load ptr, ptr %V.addr, align 8
  %call3 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %5)
  %coerce.dive4 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp2, i32 0, i32 0
  %6 = getelementptr inbounds { double, double }, ptr %coerce.dive4, i32 0, i32 0
  %7 = extractvalue { double, double } %call3, 0
  store double %7, ptr %6, align 16
  %8 = getelementptr inbounds { double, double }, ptr %coerce.dive4, i32 0, i32 1
  %9 = extractvalue { double, double } %call3, 1
  store double %9, ptr %8, align 8
  %call5 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp2, i32 noundef 1)
  %vecinit6 = insertelement <4 x float> %vecinit, float %call5, i32 1
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp7) #4
  %10 = load ptr, ptr %V.addr, align 8
  %call8 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %10)
  %coerce.dive9 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp7, i32 0, i32 0
  %11 = getelementptr inbounds { double, double }, ptr %coerce.dive9, i32 0, i32 0
  %12 = extractvalue { double, double } %call8, 0
  store double %12, ptr %11, align 16
  %13 = getelementptr inbounds { double, double }, ptr %coerce.dive9, i32 0, i32 1
  %14 = extractvalue { double, double } %call8, 1
  store double %14, ptr %13, align 8
  %call10 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp7, i32 noundef 2)
  %vecinit11 = insertelement <4 x float> %vecinit6, float %call10, i32 2
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp12) #4
  %15 = load ptr, ptr %V.addr, align 8
  %call13 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %15)
  %coerce.dive14 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp12, i32 0, i32 0
  %16 = getelementptr inbounds { double, double }, ptr %coerce.dive14, i32 0, i32 0
  %17 = extractvalue { double, double } %call13, 0
  store double %17, ptr %16, align 16
  %18 = getelementptr inbounds { double, double }, ptr %coerce.dive14, i32 0, i32 1
  %19 = extractvalue { double, double } %call13, 1
  store double %19, ptr %18, align 8
  %call15 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp12, i32 noundef 2)
  %vecinit16 = insertelement <4 x float> %vecinit11, float %call15, i32 3
  store <4 x float> %vecinit16, ptr %.compoundliteral, align 16
  %20 = load <4 x float>, ptr %.compoundliteral, align 16
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp12) #4
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp7) #4
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp2) #4
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp) #4
  ret <4 x float> %20
}

define internal { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %t) #1 {
entry:
  %retval = alloca %union.ElementWiseAccess, align 16
  %t.addr = alloca ptr, align 8
  store ptr %t, ptr %t.addr, align 8
  %0 = load ptr, ptr %t.addr, align 8
  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval, ptr align 16 %0, i64 16, i1 false)
  %coerce.dive = getelementptr inbounds %union.ElementWiseAccess, ptr %retval, i32 0, i32 0
  %1 = load { double, double }, ptr %coerce.dive, align 16
  ret { double, double } %1
}

declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3

define internal noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #1 {
  %2 = alloca ptr, align 8
  store ptr %0, ptr %2, align 8
  %3 = load ptr, ptr %2, align 8
  ret ptr %3
}

define linkonce_odr dso_local noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %0, i32 noundef %1) #1 comdat align 2 {
  %3 = alloca ptr, align 8
  %4 = alloca i32, align 4
  store ptr %0, ptr %3, align 8
  store i32 %1, ptr %4, align 4
  %5 = load ptr, ptr %3, align 8
  %6 = load i32, ptr %4, align 4
  %7 = sext i32 %6 to i64
  %8 = getelementptr inbounds [4 x float], ptr %5, i64 0, i64 %7
  %9 = load float, ptr %8, align 4
  ret float %9
}

define linkonce_odr noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %this, i32 noundef %i) #1 align 2 {
entry:
  %this.addr = alloca ptr, align 8
  %i.addr = alloca i32, align 4
  store ptr %this, ptr %this.addr, align 8
  store i32 %i, ptr %i.addr, align 4
  %this1 = load ptr, ptr %this.addr, align 8
  %0 = load i32, ptr %i.addr, align 4
  %idxprom = sext i32 %0 to i64
  %arrayidx = getelementptr inbounds [4 x float], ptr %this1, i64 0, i64 %idxprom
  %1 = load float, ptr %arrayidx, align 4
  ret float %1
}

; Vector combine + SLP should form a narrow load and a vector cast

define void @PR51397(ptr noundef %dst, ptr noundef %srcp) {
; SSE-LABEL: @PR51397(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[SRCP:%.*]], align 16
; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @PR51397(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[SRCP:%.*]], align 16
; AVX-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
; AVX-NEXT:    store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 16
; AVX-NEXT:    ret void
;
  %src = load <8 x i32>, ptr %srcp, align 16
  %vecext = extractelement <8 x i32> %src, i32 0
  %conv = sitofp i32 %vecext to float
  %vecinit = insertelement <4 x float> undef, float %conv, i32 0
  %vecext1 = extractelement <8 x i32> %src, i32 1
  %conv2 = sitofp i32 %vecext1 to float
  %vecinit3 = insertelement <4 x float> %vecinit, float %conv2, i32 1
  %vecext4 = extractelement <8 x i32> %src, i32 2
  %conv5 = sitofp i32 %vecext4 to float
  %vecinit6 = insertelement <4 x float> %vecinit3, float %conv5, i32 2
  %vecext7 = extractelement <8 x i32> %src, i32 3
  %conv8 = sitofp i32 %vecext7 to float
  %vecinit9 = insertelement <4 x float> %vecinit6, float %conv8, i32 3
  store <4 x float> %vecinit9, ptr %dst, align 16
  ret void
}